Bug 1816486 - Update libvpx to bc2965f r=webrtc-reviewers,ng

Run `./mach vendor media/libvpx/moz.yaml --patch-mode=none`, as described in
media/libvpx/README_MOZILLA.

The updated libvpx revision is bc2965ff72af7d7b21ffeab10549fcc67ed66ccf.

Differential Revision: https://phabricator.services.mozilla.com/D169840
Author: Chun-Min Chang, 2023-02-14 22:26:52 +00:00
Parent: bef38fea84
Commit: 1b617be5fa
72 changed files: 3743 additions, 2881 deletions


@ -77,7 +77,6 @@
.equ CONFIG_MULTI_RES_ENCODING , 1
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_VP9_TEMPORAL_DENOISING , 0
.equ CONFIG_CONSISTENT_RECODE , 0
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
.equ CONFIG_VP9_HIGHBITDEPTH , 0
.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
@ -90,4 +89,5 @@
.equ CONFIG_EMULATE_HARDWARE , 0
.equ CONFIG_NON_GREEDY_MV , 0
.equ CONFIG_RATE_CTRL , 0
.equ CONFIG_COLLECT_COMPONENT_TIMING , 0
.section .note.GNU-stack,"",%progbits


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -77,7 +77,6 @@
.equ CONFIG_MULTI_RES_ENCODING , 1
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_VP9_TEMPORAL_DENOISING , 0
.equ CONFIG_CONSISTENT_RECODE , 0
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
.equ CONFIG_VP9_HIGHBITDEPTH , 0
.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
@ -90,4 +89,5 @@
.equ CONFIG_EMULATE_HARDWARE , 0
.equ CONFIG_NON_GREEDY_MV , 0
.equ CONFIG_RATE_CTRL , 0
.equ CONFIG_COLLECT_COMPONENT_TIMING , 0
.section .note.GNU-stack,"",%progbits


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -77,7 +77,6 @@
.equ CONFIG_MULTI_RES_ENCODING , 1
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_VP9_TEMPORAL_DENOISING , 0
.equ CONFIG_CONSISTENT_RECODE , 0
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
.equ CONFIG_VP9_HIGHBITDEPTH , 0
.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
@ -90,4 +89,5 @@
.equ CONFIG_EMULATE_HARDWARE , 0
.equ CONFIG_NON_GREEDY_MV , 0
.equ CONFIG_RATE_CTRL , 0
.equ CONFIG_COLLECT_COMPONENT_TIMING , 0
.section .note.GNU-stack,"",%progbits


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -74,7 +74,6 @@
%define CONFIG_MULTI_RES_ENCODING 1
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_VP9_TEMPORAL_DENOISING 0
%define CONFIG_CONSISTENT_RECODE 0
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
%define CONFIG_VP9_HIGHBITDEPTH 0
%define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -87,3 +86,4 @@
%define CONFIG_EMULATE_HARDWARE 0
%define CONFIG_NON_GREEDY_MV 0
%define CONFIG_RATE_CTRL 0
%define CONFIG_COLLECT_COMPONENT_TIMING 0


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -74,7 +74,6 @@
%define CONFIG_MULTI_RES_ENCODING 1
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_VP9_TEMPORAL_DENOISING 0
%define CONFIG_CONSISTENT_RECODE 0
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
%define CONFIG_VP9_HIGHBITDEPTH 0
%define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -87,3 +86,4 @@
%define CONFIG_EMULATE_HARDWARE 0
%define CONFIG_NON_GREEDY_MV 0
%define CONFIG_RATE_CTRL 0
%define CONFIG_COLLECT_COMPONENT_TIMING 0


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -74,7 +74,6 @@
%define CONFIG_MULTI_RES_ENCODING 1
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_VP9_TEMPORAL_DENOISING 0
%define CONFIG_CONSISTENT_RECODE 0
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
%define CONFIG_VP9_HIGHBITDEPTH 0
%define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -87,3 +86,4 @@
%define CONFIG_EMULATE_HARDWARE 0
%define CONFIG_NON_GREEDY_MV 0
%define CONFIG_RATE_CTRL 0
%define CONFIG_COLLECT_COMPONENT_TIMING 0


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -74,7 +74,6 @@
%define CONFIG_MULTI_RES_ENCODING 1
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_VP9_TEMPORAL_DENOISING 0
%define CONFIG_CONSISTENT_RECODE 0
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
%define CONFIG_VP9_HIGHBITDEPTH 0
%define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -87,3 +86,4 @@
%define CONFIG_EMULATE_HARDWARE 0
%define CONFIG_NON_GREEDY_MV 0
%define CONFIG_RATE_CTRL 0
%define CONFIG_COLLECT_COMPONENT_TIMING 0


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -1,8 +1,8 @@
// This file is generated. Do not edit.
#define VERSION_MAJOR 1
#define VERSION_MINOR 12
#define VERSION_MINOR 13
#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
#define VERSION_STRING_NOSP "v1.12.0"
#define VERSION_STRING " v1.12.0"
#define VERSION_STRING_NOSP "v1.13.0"
#define VERSION_STRING " v1.13.0"


@ -77,7 +77,6 @@
.equ CONFIG_MULTI_RES_ENCODING , 1
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_VP9_TEMPORAL_DENOISING , 0
.equ CONFIG_CONSISTENT_RECODE , 0
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
.equ CONFIG_VP9_HIGHBITDEPTH , 0
.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
@ -90,4 +89,5 @@
.equ CONFIG_EMULATE_HARDWARE , 0
.equ CONFIG_NON_GREEDY_MV , 0
.equ CONFIG_RATE_CTRL , 0
.equ CONFIG_COLLECT_COMPONENT_TIMING , 0
.section .note.GNU-stack,"",%progbits


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -74,7 +74,6 @@
%define CONFIG_MULTI_RES_ENCODING 1
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_VP9_TEMPORAL_DENOISING 0
%define CONFIG_CONSISTENT_RECODE 0
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
%define CONFIG_VP9_HIGHBITDEPTH 0
%define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -87,3 +86,4 @@
%define CONFIG_EMULATE_HARDWARE 0
%define CONFIG_NON_GREEDY_MV 0
%define CONFIG_RATE_CTRL 0
%define CONFIG_COLLECT_COMPONENT_TIMING 0


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -74,7 +74,6 @@
%define CONFIG_MULTI_RES_ENCODING 1
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_VP9_TEMPORAL_DENOISING 0
%define CONFIG_CONSISTENT_RECODE 0
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
%define CONFIG_VP9_HIGHBITDEPTH 0
%define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -87,3 +86,4 @@
%define CONFIG_EMULATE_HARDWARE 0
%define CONFIG_NON_GREEDY_MV 0
%define CONFIG_RATE_CTRL 0
%define CONFIG_COLLECT_COMPONENT_TIMING 0


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -25,6 +25,7 @@ Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
Johann <johann@duck.com> <johann.koenig@gmail.com>
John Koleszar <jkoleszar@google.com>
Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
Konstantinos Margaritis <konma@vectorcamp.gr> <konstantinos@vectorcamp.gr>
Marco Paniconi <marpan@google.com>
Marco Paniconi <marpan@google.com> <marpan@chromium.org>
Martin Storsjö <martin@martin.st>


@ -21,6 +21,7 @@ Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com>
Andrew Lewis <andrewlewis@google.com>
Andrew Russell <anrussell@google.com>
Andrew Salkeld <andrew.salkeld@arm.com>
Angie Chen <yunqi@google.com>
Angie Chiang <angiebird@google.com>
Anton Venema <anton.venema@liveswitch.com>
@ -175,7 +176,9 @@ Rob Bradford <rob@linux.intel.com>
Ronald S. Bultje <rsbultje@gmail.com>
Rui Ueyama <ruiu@google.com>
Sai Deng <sdeng@google.com>
Salome Thirot <salome.thirot@arm.com>
Sami Pietilä <samipietila@google.com>
Sam James <sam@gentoo.org>
Sarah Parker <sarahparker@google.com>
Sasi Inguva <isasi@google.com>
Scott Graham <scottmg@chromium.org>


@ -1,3 +1,39 @@
2023-01-31 v1.13.0 "Ugly Duckling"
This release includes more Neon and AVX2 optimizations, adds a new codec
control to set per frame QP, upgrades GoogleTest to v1.12.1, and includes
numerous bug fixes.
- Upgrading:
This release is ABI incompatible with the previous release.
New codec control VP9E_SET_QUANTIZER_ONE_PASS to set per frame QP.
GoogleTest is upgraded to v1.12.1.
.clang-format is upgraded to clang-format-11.
VPX_EXT_RATECTRL_ABI_VERSION was bumped due to incompatible changes to the
feature of using external rate control models for vp9.
- Enhancement:
Numerous improvements on Neon optimizations.
Numerous improvements on AVX2 optimizations.
Additional ARM targets added for Visual Studio.
- Bug fixes:
Fix to calculating internal stats when frame dropped.
Fix to segfault for external resize test in vp9.
Fix to build system with replacing egrep with grep -E.
Fix to a few bugs with external RTC rate control library.
Fix to make SVC work with VBR.
Fix to key frame setting in VP9 external RC.
Fix to -Wimplicit-int (Clang 16).
Fix to VP8 external RC for buffer levels.
Fix to VP8 external RC for dynamic update of layers.
Fix to VP9 auto level.
Fix to off-by-one error of max w/h in validate_config.
Fix to make SVC work for Profile 1.
2022-06-17 v1.12.0 "Torrent Duck"
This release adds optimizations for Loongarch, adds support for vp8 in the
real-time rate control library, upgrades GoogleTest to v1.11.0, updates
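As an illustration of the new per-frame QP control mentioned in the v1.13.0 notes above, here is a minimal sketch of how a caller might use it; `set_frame_qp` is an illustrative helper (not part of this change), and `encoder` is assumed to be an initialized VP9 `vpx_codec_ctx_t`:

```c
#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Sketch: request a fixed quantizer for the next frame of a one-pass
 * encode via the codec control added in libvpx v1.13.0. */
static vpx_codec_err_t set_frame_qp(vpx_codec_ctx_t *encoder, int qp) {
  return vpx_codec_control(encoder, VP9E_SET_QUANTIZER_ONE_PASS, qp);
}
```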


@ -1,4 +1,4 @@
v1.12.0 Torrent Duck
v1.13.0 Ugly Duckling
Welcome to the WebM VP8/VP9 Codec SDK!

media/libvpx/libvpx/configure (vendored)

@ -293,6 +293,7 @@ EXPERIMENT_LIST="
emulate_hardware
non_greedy_mv
rate_ctrl
collect_component_timing
"
CONFIG_LIST="
dependency_tracking
@ -342,7 +343,6 @@ CONFIG_LIST="
multi_res_encoding
temporal_denoising
vp9_temporal_denoising
consistent_recode
coefficient_range_checking
vp9_highbitdepth
better_hw_compatibility
@ -406,7 +406,6 @@ CMDLINE_SELECT="
multi_res_encoding
temporal_denoising
vp9_temporal_denoising
consistent_recode
coefficient_range_checking
better_hw_compatibility
vp9_highbitdepth


@ -312,8 +312,8 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
# To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current
# SO_VERSION_* then follow the rules in the link to detemine the new version
# (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1
SO_VERSION_MAJOR := 7
SO_VERSION_MINOR := 1
SO_VERSION_MAJOR := 8
SO_VERSION_MINOR := 0
SO_VERSION_PATCH := 0
ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib


@ -260,5 +260,11 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_sse2>));
#endif // HAVE_SSE2
#if HAVE_NEON
INSTANTIATE_TEST_SUITE_P(
NEON, AvgPredTestHBD,
::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_neon>));
#endif // HAVE_NEON
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace


@ -1572,6 +1572,10 @@ INSTANTIATE_TEST_SUITE_P(
12),
SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon,
12),
SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_neon,
12),
SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_neon,
12),
SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon,
10),
SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon,
@ -1594,6 +1598,10 @@ INSTANTIATE_TEST_SUITE_P(
10),
SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon,
10),
SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_neon,
10),
SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_neon,
10),
SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon,
8),
SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon,
@ -1613,7 +1621,9 @@ INSTANTIATE_TEST_SUITE_P(
SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon,
8),
SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8),
SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon,
SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8),
SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_neon, 8),
SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_neon,
8)));
INSTANTIATE_TEST_SUITE_P(
@ -1652,6 +1662,12 @@ INSTANTIATE_TEST_SUITE_P(
SubpelAvgVarianceParams(3, 2,
&vpx_highbd_12_sub_pixel_avg_variance8x4_neon,
12),
SubpelAvgVarianceParams(2, 3,
&vpx_highbd_12_sub_pixel_avg_variance4x8_neon,
12),
SubpelAvgVarianceParams(2, 2,
&vpx_highbd_12_sub_pixel_avg_variance4x4_neon,
12),
SubpelAvgVarianceParams(6, 6,
&vpx_highbd_10_sub_pixel_avg_variance64x64_neon,
10),
@ -1685,6 +1701,12 @@ INSTANTIATE_TEST_SUITE_P(
SubpelAvgVarianceParams(3, 2,
&vpx_highbd_10_sub_pixel_avg_variance8x4_neon,
10),
SubpelAvgVarianceParams(2, 3,
&vpx_highbd_10_sub_pixel_avg_variance4x8_neon,
10),
SubpelAvgVarianceParams(2, 2,
&vpx_highbd_10_sub_pixel_avg_variance4x4_neon,
10),
SubpelAvgVarianceParams(6, 6,
&vpx_highbd_8_sub_pixel_avg_variance64x64_neon,
8),
@ -1717,6 +1739,12 @@ INSTANTIATE_TEST_SUITE_P(
8),
SubpelAvgVarianceParams(3, 2,
&vpx_highbd_8_sub_pixel_avg_variance8x4_neon,
8),
SubpelAvgVarianceParams(2, 3,
&vpx_highbd_8_sub_pixel_avg_variance4x8_neon,
8),
SubpelAvgVarianceParams(2, 2,
&vpx_highbd_8_sub_pixel_avg_variance4x4_neon,
8)));
#endif // CONFIG_VP9_HIGHBITDEPTH


@ -127,14 +127,15 @@ class Vp8RcInterfaceTest
encoder->Control(VP8E_SET_CPUUSED, -6);
encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1);
encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000);
} else if (frame_params_.frame_type == INTER_FRAME) {
} else if (frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) {
// Disable golden frame update.
frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
}
}
frame_params_.frame_type =
video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME;
frame_params_.frame_type = video->frame() % key_interval_ == 0
? libvpx::RcFrameType::kKeyFrame
: libvpx::RcFrameType::kInterFrame;
encoder_exit_ = video->frame() == test_video_.frames;
}


@ -41,7 +41,7 @@ constexpr int kDefaultMaxGfInterval = 16;
constexpr int kReadMinGfInterval = 5;
constexpr int kReadMaxGfInterval = 13;
const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv";
const double kPsnrThreshold = 30.50;
const double kPsnrThreshold = 30.4;
struct ToyRateCtrl {
int magic_number;


@ -57,9 +57,11 @@ class RcInterfaceTest
encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000);
encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1);
}
frame_params_.frame_type =
video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME;
if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) {
frame_params_.frame_type = video->frame() % key_interval_ == 0
? libvpx::RcFrameType::kKeyFrame
: libvpx::RcFrameType::kInterFrame;
if (rc_cfg_.rc_mode == VPX_CBR &&
frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) {
// Disable golden frame update.
frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
@ -183,8 +185,9 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
encoder->Control(VP9E_SET_SVC, 1);
encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
}
frame_params_.frame_type =
video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME;
frame_params_.frame_type = video->frame() % key_interval_ == 0
? libvpx::RcFrameType::kKeyFrame
: libvpx::RcFrameType::kInterFrame;
encoder_exit_ = video->frame() == kNumFrames;
current_superframe_ = video->frame();
if (dynamic_spatial_layers_ == 1) {
@ -247,7 +250,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
else
frame_params_.temporal_layer_id = 0;
rc_api_->ComputeQP(frame_params_);
frame_params_.frame_type = INTER_FRAME;
frame_params_.frame_type = libvpx::RcFrameType::kInterFrame;
rc_api_->PostEncodeUpdate(sizes_[sl]);
}
}


@ -26,13 +26,6 @@ struct VP8_COMP;
/* Create/destroy static data structures. */
typedef enum {
NORMAL = 0,
FOURFIVE = 1,
THREEFIVE = 2,
ONETWO = 3
} VPX_SCALING;
typedef enum {
USAGE_LOCAL_FILE_PLAYBACK = 0x0,
USAGE_STREAM_FROM_SERVER = 0x1,
@ -58,19 +51,19 @@ typedef enum {
#include <assert.h>
static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {
switch (mode) {
case NORMAL:
case VP8E_NORMAL:
*hr = 1;
*hs = 1;
break;
case FOURFIVE:
case VP8E_FOURFIVE:
*hr = 4;
*hs = 5;
break;
case THREEFIVE:
case VP8E_THREEFIVE:
*hr = 3;
*hs = 5;
break;
case ONETWO:
case VP8E_ONETWO:
*hr = 1;
*hs = 2;
break;
@ -273,8 +266,8 @@ int vp8_set_roimap(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows,
unsigned int threshold[4]);
int vp8_set_active_map(struct VP8_COMP *cpi, unsigned char *map,
unsigned int rows, unsigned int cols);
int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING horiz_mode,
VPX_SCALING vert_mode);
int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode,
VPX_SCALING_MODE vert_mode);
int vp8_get_quantizer(struct VP8_COMP *cpi);
#ifdef __cplusplus
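For context, the renamed VP8E_* constants above are the public scaling-mode enum that reaches this code through the VP8E_SET_SCALEMODE codec control. A minimal sketch of how an application might request internal downscaling follows; `request_downscale` is an illustrative helper and the chosen ratios are only an example:

```c
#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Sketch: ask the encoder to code frames at 3/5 width and 1/2 height.
 * The VP8E_* constants replace the removed internal
 * NORMAL/FOURFIVE/THREEFIVE/ONETWO enum used by Scale2Ratio(). */
static vpx_codec_err_t request_downscale(vpx_codec_ctx_t *encoder) {
  vpx_scaling_mode_t mode;
  mode.h_scaling_mode = VP8E_THREEFIVE; /* Scale2Ratio(): hr = 3, hs = 5 */
  mode.v_scaling_mode = VP8E_ONETWO;    /* Scale2Ratio(): hr = 1, hs = 2 */
  return vpx_codec_control(encoder, VP8E_SET_SCALEMODE, &mode);
}
```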


@ -92,8 +92,7 @@ typedef struct macroblock {
signed int last_act_zbin_adj;
int *mvcost[2];
/* MSVC generates code that thinks this is 16-byte aligned */
DECLARE_ALIGNED(16, int*, mvsadcost[2]);
int *mvsadcost[2];
int (*mbmode_cost)[MB_MODE_COUNT];
int (*intra_uv_mode_cost)[MB_MODE_COUNT];
int (*bmode_costs)[10][10];


@ -2990,8 +2990,8 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
}
/* Set back to unscaled by defaults */
cpi->common.horiz_scale = NORMAL;
cpi->common.vert_scale = NORMAL;
cpi->common.horiz_scale = VP8E_NORMAL;
cpi->common.vert_scale = VP8E_NORMAL;
/* Calculate Average bits per frame. */
av_bits_per_frame = cpi->oxcf.target_bandwidth /


@ -1667,7 +1667,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
cm->sharpness_level = cpi->oxcf.Sharpness;
if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) {
if (cm->horiz_scale != VP8E_NORMAL || cm->vert_scale != VP8E_NORMAL) {
int hr, hs, vr, vs;
Scale2Ratio(cm->horiz_scale, &hr, &hs);
@ -2504,15 +2504,17 @@ static int resize_key_frame(VP8_COMP *cpi) {
if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark *
cpi->oxcf.optimal_buffer_level / 100)) {
cm->horiz_scale =
(cm->horiz_scale < ONETWO) ? cm->horiz_scale + 1 : ONETWO;
cm->vert_scale = (cm->vert_scale < ONETWO) ? cm->vert_scale + 1 : ONETWO;
(cm->horiz_scale < VP8E_ONETWO) ? cm->horiz_scale + 1 : VP8E_ONETWO;
cm->vert_scale =
(cm->vert_scale < VP8E_ONETWO) ? cm->vert_scale + 1 : VP8E_ONETWO;
}
/* Should we now start scaling back up */
else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark *
cpi->oxcf.optimal_buffer_level / 100)) {
cm->horiz_scale =
(cm->horiz_scale > NORMAL) ? cm->horiz_scale - 1 : NORMAL;
cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL;
(cm->horiz_scale > VP8E_NORMAL) ? cm->horiz_scale - 1 : VP8E_NORMAL;
cm->vert_scale =
(cm->vert_scale > VP8E_NORMAL) ? cm->vert_scale - 1 : VP8E_NORMAL;
}
/* Get the new height and width */
@ -5380,15 +5382,15 @@ int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows,
}
}
int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING horiz_mode,
VPX_SCALING vert_mode) {
if (horiz_mode <= ONETWO) {
int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode,
VPX_SCALING_MODE vert_mode) {
if (horiz_mode <= VP8E_ONETWO) {
cpi->common.horiz_scale = horiz_mode;
} else {
return -1;
}
if (vert_mode <= ONETWO) {
if (vert_mode <= VP8E_ONETWO) {
cpi->common.vert_scale = vert_mode;
} else {
return -1;


@ -947,19 +947,10 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
if (img != NULL) {
res = image2yuvconfig(img, &sd);
if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) {
/* from vpx_encoder.h for g_w/g_h:
"Note that the frames passed as input to the encoder must have this
resolution"
*/
ctx->base.err_detail = "Invalid input frame resolution";
res = VPX_CODEC_INVALID_PARAM;
} else {
if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags,
&sd, dst_time_stamp, dst_end_time_stamp)) {
VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
res = update_error_state(ctx, &cpi->common.error);
}
if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, &sd,
dst_time_stamp, dst_end_time_stamp)) {
VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
res = update_error_state(ctx, &cpi->common.error);
}
/* reset for next frame */
@ -1233,8 +1224,8 @@ static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
if (data) {
int res;
vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
res = vp8_set_internal_size(ctx->cpi, (VPX_SCALING)scalemode.h_scaling_mode,
(VPX_SCALING)scalemode.v_scaling_mode);
res = vp8_set_internal_size(ctx->cpi, scalemode.h_scaling_mode,
scalemode.v_scaling_mode);
if (!res) {
/*force next frame a key frame to effect scaling mode */


@ -10,7 +10,9 @@
#include <math.h>
#include <new>
#include "vp8/common/common.h"
#include "vp8/vp8_ratectrl_rtc.h"
#include "vp8/encoder/onyx_int.h"
#include "vp8/encoder/ratectrl.h"
#include "vpx_ports/system_state.h"
@ -65,6 +67,13 @@ std::unique_ptr<VP8RateControlRTC> VP8RateControlRTC::Create(
return rc_api;
}
VP8RateControlRTC::~VP8RateControlRTC() {
if (cpi_) {
vpx_free(cpi_->gf_active_flags);
vpx_free(cpi_);
}
}
void VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) {
VP8_COMMON *cm = &cpi_->common;
VP8_CONFIG *oxcf = &cpi_->oxcf;
@ -203,7 +212,7 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) {
vp8_restore_layer_context(cpi_, layer);
vp8_new_framerate(cpi_, cpi_->layer_context[layer].framerate);
}
cm->frame_type = frame_params.frame_type;
cm->frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type);
cm->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
cm->refresh_alt_ref_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
if (cm->frame_type == KEY_FRAME && cpi_->common.current_video_frame > 0) {


@ -12,23 +12,24 @@
#define VPX_VP8_RATECTRL_RTC_H_
#include <cstdint>
#include <cstring>
#include <memory>
#include "vp8/encoder/onyx_int.h"
#include "vp8/common/common.h"
#include "vpx/internal/vpx_ratectrl_rtc.h"
struct VP8_COMP;
namespace libvpx {
struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig {
public:
VP8RateControlRtcConfig() {
vp8_zero(layer_target_bitrate);
vp8_zero(ts_rate_decimator);
memset(&layer_target_bitrate, 0, sizeof(layer_target_bitrate));
memset(&ts_rate_decimator, 0, sizeof(ts_rate_decimator));
}
};
struct VP8FrameParamsQpRTC {
FRAME_TYPE frame_type;
RcFrameType frame_type;
int temporal_layer_id;
};
@ -36,12 +37,7 @@ class VP8RateControlRTC {
public:
static std::unique_ptr<VP8RateControlRTC> Create(
const VP8RateControlRtcConfig &cfg);
~VP8RateControlRTC() {
if (cpi_) {
vpx_free(cpi_->gf_active_flags);
vpx_free(cpi_);
}
}
~VP8RateControlRTC();
void UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg);
// GetQP() needs to be called after ComputeQP() to get the latest QP
@ -54,7 +50,7 @@ class VP8RateControlRTC {
private:
VP8RateControlRTC() {}
void InitRateControl(const VP8RateControlRtcConfig &cfg);
VP8_COMP *cpi_;
struct VP8_COMP *cpi_;
int q_;
};


@ -220,7 +220,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
// Look up the component cost of the residual motion vector
{
uint32_t cost[4];
int16_t __attribute__((aligned(16))) rowcol[8];
DECLARE_ALIGNED(16, int16_t, rowcol[8]);
vst1q_s16(rowcol, v_diff_mv_w);
// Note: This is a use case for gather instruction


@ -1980,6 +1980,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
int64_t best_rd = INT64_MAX;
vpx_clear_system_state();
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, rd_pick_sb_modes_time);
#endif
// Use the lower precision, but faster, 32x32 fdct for mode selection.
x->use_lp32x32fdct = 1;
@ -2047,15 +2050,27 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
} else {
if (bsize >= BLOCK_8X8) {
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, vp9_rd_pick_inter_mode_sb_time);
#endif
if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP))
vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
ctx, best_rd);
else
vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
bsize, ctx, best_rd);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, vp9_rd_pick_inter_mode_sb_time);
#endif
} else {
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time);
#endif
vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost,
bsize, ctx, best_rd);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time);
#endif
}
}
@ -2078,6 +2093,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
ctx->rate = rd_cost->rate;
ctx->dist = rd_cost->dist;
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, rd_pick_sb_modes_time);
#endif
}
#endif // !CONFIG_REALTIME_ONLY
@ -4411,8 +4429,14 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
if (should_encode_sb && pc_tree->index != 3) {
int output_enabled = (bsize == BLOCK_64X64);
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, encode_sb_time);
#endif
encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
pc_tree);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, encode_sb_time);
#endif
#if CONFIG_RATE_CTRL
if (oxcf->use_simple_encode_api) {
// Store partition, motion vector of the superblock.
@ -4539,8 +4563,15 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td,
&x->min_partition_size, &x->max_partition_size);
}
td->pc_root->none.rdcost = 0;
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, rd_pick_partition_time);
#endif
rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rdc, dummy_rdc, td->pc_root);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, rd_pick_partition_time);
#endif
}
(*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row,
sb_col_in_tile, num_sb_cols);
@ -5810,14 +5841,7 @@ void vp9_init_tile_data(VP9_COMP *cpi) {
for (i = 0; i < BLOCK_SIZES; ++i) {
for (j = 0; j < MAX_MODES; ++j) {
tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT;
#if CONFIG_RATE_CTRL
if (cpi->oxcf.use_simple_encode_api) {
tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT;
}
#endif // CONFIG_RATE_CTRL
#if CONFIG_CONSISTENT_RECODE
tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT;
#endif // CONFIG_CONSISTENT_RECODE
tile_data->mode_map[i][j] = j;
}
}
@ -6037,9 +6061,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
#endif // CONFIG_VP9_HIGHBITDEPTH
x->inv_txfm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
#if CONFIG_CONSISTENT_RECODE
x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1;
#endif
if (xd->lossless) x->optimize = 0;
x->sharpness = cpi->oxcf.sharpness;
x->adjust_rdmult_by_segment = (cpi->oxcf.aq_mode == VARIANCE_AQ);
@ -6184,13 +6206,11 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) {
return sum_delta / (cm->mi_rows * cm->mi_cols);
}
#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
static void restore_encode_params(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
int tile_col, tile_row;
int tile_idx;
int i, j;
TileDataEnc *tile_data;
RD_OPT *rd_opt = &cpi->rd;
for (i = 0; i < MAX_REF_FRAMES; i++) {
for (j = 0; j < REFERENCE_MODES; j++)
@ -6201,35 +6221,19 @@ static void restore_encode_params(VP9_COMP *cpi) {
rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j];
}
if (cpi->tile_data != NULL) {
for (tile_row = 0; tile_row < tile_rows; ++tile_row)
for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
TileDataEnc *tile_data =
&cpi->tile_data[tile_row * tile_cols + tile_col];
for (i = 0; i < BLOCK_SIZES; ++i) {
for (j = 0; j < MAX_MODES; ++j) {
tile_data->thresh_freq_fact[i][j] =
tile_data->thresh_freq_fact_prev[i][j];
}
}
}
for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) {
assert(cpi->tile_data);
tile_data = &cpi->tile_data[tile_idx];
vp9_copy(tile_data->thresh_freq_fact, tile_data->thresh_freq_fact_prev);
}
cm->interp_filter = cpi->sf.default_interp_filter;
}
#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
void vp9_encode_frame(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
#if CONFIG_RATE_CTRL
if (cpi->oxcf.use_simple_encode_api) {
restore_encode_params(cpi);
}
#endif // CONFIG_RATE_CTRL
#if CONFIG_CONSISTENT_RECODE
restore_encode_params(cpi);
#endif
#if CONFIG_MISMATCH_DEBUG
mismatch_reset_frame(MAX_MB_PLANE);
@ -6283,7 +6287,13 @@ void vp9_encode_frame(VP9_COMP *cpi) {
if (cm->interp_filter == SWITCHABLE)
cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref);
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, encode_frame_internal_time);
#endif
encode_frame_internal(cpi);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, encode_frame_internal_time);
#endif
for (i = 0; i < REFERENCE_MODES; ++i)
mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;

The diff for this file is not shown because it is too large.


@ -90,13 +90,6 @@ typedef enum {
ENCODE_BREAKOUT_LIMITED = 2
} ENCODE_BREAKOUT_TYPE;
typedef enum {
NORMAL = 0,
FOURFIVE = 1,
THREEFIVE = 2,
ONETWO = 3
} VPX_SCALING;
typedef enum {
// Good Quality Fast Encoding. The encoder balances quality with the amount of
// time it takes to encode the output. Speed setting controls how fast.
@ -336,9 +329,7 @@ typedef struct TplDepFrame {
typedef struct TileDataEnc {
TileInfo tile_info;
int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES];
#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
int8_t mode_map[BLOCK_SIZES][MAX_MODES];
FIRSTPASS_DATA fp_data;
VP9RowMTSync row_mt_sync;
@ -659,6 +650,72 @@ static INLINE int get_num_unit_4x4(int size) { return (size + 3) >> 2; }
static INLINE int get_num_unit_16x16(int size) { return (size + 15) >> 4; }
#endif // CONFIG_RATE_CTRL
#if CONFIG_COLLECT_COMPONENT_TIMING
#include "vpx_ports/vpx_timer.h"
// Adjust the following to add new components.
typedef enum {
vp9_get_compressed_data_time,
vp9_temporal_filter_time,
vp9_rc_get_second_pass_params_time,
setup_tpl_stats_time,
Pass2Encode_time,
encode_with_recode_loop_time,
loopfilter_frame_time,
vp9_pack_bitstream_time,
encode_frame_internal_time,
rd_pick_partition_time,
rd_pick_sb_modes_time,
encode_sb_time,
vp9_rd_pick_inter_mode_sb_time,
vp9_rd_pick_inter_mode_sub8x8_time,
intra_mode_search_time,
handle_inter_mode_time,
single_motion_search_time,
joint_motion_search_time,
interp_filter_time,
kTimingComponents,
} TIMING_COMPONENT;
static INLINE char const *get_component_name(int index) {
switch (index) {
case vp9_get_compressed_data_time: return "vp9_get_compressed_data_time";
case vp9_temporal_filter_time: return "vp9_temporal_filter_time";
case vp9_rc_get_second_pass_params_time:
return "vp9_rc_get_second_pass_params_time";
case setup_tpl_stats_time: return "setup_tpl_stats_time";
case Pass2Encode_time: return "Pass2Encode_time";
case encode_with_recode_loop_time: return "encode_with_recode_loop_time";
case loopfilter_frame_time: return "loopfilter_frame_time";
case vp9_pack_bitstream_time: return "vp9_pack_bitstream_time";
case encode_frame_internal_time: return "encode_frame_internal_time";
case rd_pick_partition_time: return "rd_pick_partition_time";
case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time";
case encode_sb_time: return "encode_sb_time";
case vp9_rd_pick_inter_mode_sb_time:
return "vp9_rd_pick_inter_mode_sb_time";
case vp9_rd_pick_inter_mode_sub8x8_time:
return "vp9_rd_pick_inter_mode_sub8x8_time";
case intra_mode_search_time: return "intra_mode_search_time";
case handle_inter_mode_time: return "handle_inter_mode_time";
case single_motion_search_time: return "single_motion_search_time";
case joint_motion_search_time: return "joint_motion_search_time";
case interp_filter_time: return "interp_filter_time";
default: assert(0);
}
return "error";
}
#endif
typedef struct VP9_COMP {
FRAME_INFO frame_info;
QUANTS quants;
@ -973,6 +1030,22 @@ typedef struct VP9_COMP {
EXT_RATECTRL ext_ratectrl;
int fixed_qp_onepass;
#if CONFIG_COLLECT_COMPONENT_TIMING
/*!
* component_time[] are initialized to zero while encoder starts.
*/
uint64_t component_time[kTimingComponents];
/*!
* Stores timing for individual components between calls of start_timing()
* and end_timing().
*/
struct vpx_usec_timer component_timer[kTimingComponents];
/*!
* frame_component_time[] are initialized to zero at beginning of each frame.
*/
uint64_t frame_component_time[kTimingComponents];
#endif
} VP9_COMP;
#if CONFIG_RATE_CTRL
@ -1154,8 +1227,8 @@ int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
int cols);
int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode,
VPX_SCALING vert_mode);
int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode,
VPX_SCALING_MODE vert_mode);
int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
unsigned int height);
@ -1392,6 +1465,38 @@ int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr);
#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) {
RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows ||
new_fb_ptr->mi_cols < cm->mi_cols) {
vpx_free(new_fb_ptr->mvs);
CHECK_MEM_ERROR(cm, new_fb_ptr->mvs,
(MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
sizeof(*new_fb_ptr->mvs)));
new_fb_ptr->mi_rows = cm->mi_rows;
new_fb_ptr->mi_cols = cm->mi_cols;
}
}
#if CONFIG_COLLECT_COMPONENT_TIMING
static INLINE void start_timing(VP9_COMP *cpi, int component) {
vpx_usec_timer_start(&cpi->component_timer[component]);
}
static INLINE void end_timing(VP9_COMP *cpi, int component) {
vpx_usec_timer_mark(&cpi->component_timer[component]);
cpi->frame_component_time[component] +=
vpx_usec_timer_elapsed(&cpi->component_timer[component]);
}
static INLINE char const *get_frame_type_enum(int type) {
switch (type) {
case 0: return "KEY_FRAME";
case 1: return "INTER_FRAME";
default: assert(0);
}
return "error";
}
#endif
#ifdef __cplusplus
} // extern "C"
#endif
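To make the "Adjust the following to add new components" note above concrete, here is a sketch of how a new component would be instrumented. `my_new_component_time` and `do_work()` are hypothetical names, not part of this change; the snippet assumes it sits inside encoder code where `cpi` is in scope, mirroring the existing usages in this patch:

```c
/* 1. Add my_new_component_time to the TIMING_COMPONENT enum (before
 *    kTimingComponents) and to get_component_name().
 * 2. Bracket the code to be measured; the elapsed time is accumulated
 *    into cpi->frame_component_time[my_new_component_time]. */
#if CONFIG_COLLECT_COMPONENT_TIMING
  start_timing(cpi, my_new_component_time);
#endif
  do_work(cpi); /* hypothetical work being timed */
#if CONFIG_COLLECT_COMPONENT_TIMING
  end_timing(cpi, my_new_component_time);
#endif
```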


@ -121,11 +121,9 @@ typedef struct RD_OPT {
int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES];
int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
int RDMULT;
int RDDIV;
double r0;


@ -2832,8 +2832,14 @@ static int64_t handle_inter_mode(
frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, joint_motion_search_time);
#endif
joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col,
single_newmv, &rate_mv);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, joint_motion_search_time);
#endif
} else {
rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
&x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
@ -2845,7 +2851,13 @@ static int64_t handle_inter_mode(
*rate2 += rate_mv;
} else {
int_mv tmp_mv;
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, single_motion_search_time);
#endif
single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, single_motion_search_time);
#endif
if (tmp_mv.as_int == INVALID_MV) return INT64_MAX;
frame_mv[refs[0]].as_int = xd->mi[0]->bmi[0].as_mv[0].as_int =
@ -2908,6 +2920,9 @@ static int64_t handle_inter_mode(
intpel_mv = !mv_has_subpel(&mi->mv[0].as_mv);
if (is_comp_pred) intpel_mv &= !mv_has_subpel(&mi->mv[1].as_mv);
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, interp_filter_time);
#endif
// Search for best switchable filter by checking the variance of
// pred error irrespective of whether the filter will be used
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX;
@ -3005,6 +3020,9 @@ static int64_t handle_inter_mode(
restore_dst_buf(xd, orig_dst, orig_dst_stride);
}
}
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, interp_filter_time);
#endif
// Set the appropriate filter
mi->interp_filter =
cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter;
@ -3707,19 +3725,30 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
if (ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx;
struct macroblockd_plane *const pd = &xd->plane[1];
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, intra_mode_search_time);
#endif
memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize,
best_rd, recon);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, intra_mode_search_time);
#endif
if (rate_y == INT_MAX) continue;
uv_tx = uv_txsize_lookup[bsize][mi->tx_size][pd->subsampling_x]
[pd->subsampling_y];
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, intra_mode_search_time);
#endif
if (rate_uv_intra[uv_tx] == INT_MAX) {
choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
&rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx],
&skip_uv[uv_tx], &mode_uv[uv_tx]);
}
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, intra_mode_search_time);
#endif
rate_uv = rate_uv_tokenonly[uv_tx];
distortion_uv = dist_uv[uv_tx];
skippable = skippable && skip_uv[uv_tx];
@ -3730,11 +3759,17 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
rate2 += intra_cost_penalty;
distortion2 = distortion_y + distortion_uv;
} else {
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, handle_inter_mode_time);
#endif
this_rd = handle_inter_mode(
cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv,
recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv,
single_inter_filter, single_skippable, &total_sse, best_rd,
&mask_filter, filter_cache);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, handle_inter_mode_time);
#endif
if (this_rd == INT64_MAX) continue;
compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
@ -3970,13 +4005,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
}
if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
// If adaptive interp filter is enabled, then the current leaf node of 8x8
// data is needed for sub8x8. Hence preserve the context.
#if CONFIG_CONSISTENT_RECODE
// If adaptive interp filter is enabled, then the current leaf node of 8x8
// data is needed for sub8x8. Hence preserve the context.
if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0];
#else
if (cpi->row_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0];
#endif
rd_cost->rate = INT_MAX;
rd_cost->rdcost = INT64_MAX;
return;


@ -16,8 +16,11 @@
#include "vpx_dsp/vpx_dsp_common.h"
// Mesh search patters for various speed settings
static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = {
{ 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 }
// Define 2 mesh density levels for FC_GRAPHICS_ANIMATION content type and non
// FC_GRAPHICS_ANIMATION content type.
static MESH_PATTERN best_quality_mesh_pattern[2][MAX_MESH_STEP] = {
{ { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } },
{ { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
};
#if !CONFIG_REALTIME_ONLY
@ -209,15 +212,18 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
const int boosted = frame_is_boosted(cpi);
int i;
sf->tx_size_search_breakout = 1;
sf->adaptive_pred_interp_filter = 1;
sf->adaptive_rd_thresh = 1;
sf->adaptive_rd_thresh_row_mt = 0;
sf->allow_skip_recode = 1;
sf->less_rectangular_check = 1;
sf->use_square_partition_only = !boosted;
sf->mv.auto_mv_step_size = 1;
sf->prune_ref_frame_for_rect_partitions = 1;
sf->rd_ml_partition.var_pruning = 1;
sf->temporal_filter_search_method = NSTEP;
sf->tx_size_search_breakout = 1;
sf->use_square_partition_only = !boosted;
sf->rd_ml_partition.var_pruning = 1;
sf->rd_ml_partition.prune_rect_thresh[0] = -1;
sf->rd_ml_partition.prune_rect_thresh[1] = 350;
sf->rd_ml_partition.prune_rect_thresh[2] = 325;
@ -238,7 +244,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
}
if (speed >= 1) {
sf->temporal_filter_search_method = NSTEP;
sf->rd_ml_partition.var_pruning = !boosted;
sf->rd_ml_partition.prune_rect_thresh[1] = 225;
sf->rd_ml_partition.prune_rect_thresh[2] = 225;
@ -263,11 +268,9 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
sf->less_rectangular_check = 1;
sf->use_rd_breakout = 1;
sf->adaptive_motion_search = 1;
sf->mv.auto_mv_step_size = 1;
sf->adaptive_rd_thresh = 2;
sf->mv.subpel_search_level = 1;
if (cpi->oxcf.content != VP9E_CONTENT_FILM) sf->mode_skip_start = 10;
sf->adaptive_pred_interp_filter = 1;
sf->allow_acl = 0;
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
@ -991,10 +994,14 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) {
sf->exhaustive_searches_thresh =
(cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20)
: INT_MAX;
if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
{
const int mesh_density_level =
(cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? 0 : 1;
for (i = 0; i < MAX_MESH_STEP; ++i) {
sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range;
sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval;
sf->mesh_patterns[i].range =
best_quality_mesh_pattern[mesh_density_level][i].range;
sf->mesh_patterns[i].interval =
best_quality_mesh_pattern[mesh_density_level][i].interval;
}
}

The diff for this file is not shown because it is too large.


@ -0,0 +1,44 @@
/*
* Copyright (c) 2023 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
#define VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
#ifdef __cplusplus
extern "C" {
#endif
#ifndef M_LOG2_E
#define M_LOG2_E 0.693147180559945309417
#endif
#define log2f(x) (log(x) / (float)M_LOG2_E)
typedef struct GF_PICTURE {
YV12_BUFFER_CONFIG *frame;
int ref_frame[3];
FRAME_UPDATE_TYPE update_type;
} GF_PICTURE;
void vp9_init_tpl_buffer(VP9_COMP *cpi);
void vp9_setup_tpl_stats(VP9_COMP *cpi);
void vp9_free_tpl_buffer(VP9_COMP *cpi);
void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
TX_SIZE tx_size);
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
TX_SIZE tx_size);
#endif
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VPX_VP9_ENCODER_VP9_TPL_MODEL_H_


@ -48,6 +48,29 @@ std::unique_ptr<VP9RateControlRTC> VP9RateControlRTC::Create(
return rc_api;
}
VP9RateControlRTC::~VP9RateControlRTC() {
if (cpi_) {
if (cpi_->svc.number_spatial_layers > 1 ||
cpi_->svc.number_temporal_layers > 1) {
for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) {
for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) {
int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers);
LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer];
vpx_free(lc->map);
vpx_free(lc->last_coded_q_map);
vpx_free(lc->consec_zero_mv);
}
}
}
if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
vpx_free(cpi_->segmentation_map);
cpi_->segmentation_map = NULL;
vp9_cyclic_refresh_free(cpi_->cyclic_refresh);
}
vpx_free(cpi_);
}
}
void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) {
VP9_COMMON *cm = &cpi_->common;
VP9EncoderConfig *oxcf = &cpi_->oxcf;
@ -157,7 +180,7 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) {
cm->height = height;
}
vp9_set_mb_mi(cm, cm->width, cm->height);
cm->frame_type = frame_params.frame_type;
cm->frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type);
// This is needed to ensure key frame does not get unset in rc_get_svc_params.
cpi_->frame_flags = (cm->frame_type == KEY_FRAME) ? FRAMEFLAGS_KEY : 0;
cpi_->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;


@ -19,14 +19,14 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/vp9_iface_common.h"
#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/vp9_cx_iface.h"
#include "vpx/internal/vpx_ratectrl_rtc.h"
#include "vpx_mem/vpx_mem.h"
namespace libvpx {
struct VP9_COMP;
namespace libvpx {
struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
public:
VP9RateControlRtcConfig() {
@ -53,7 +53,7 @@ struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
};
struct VP9FrameParamsQpRTC {
FRAME_TYPE frame_type;
RcFrameType frame_type;
int spatial_layer_id;
int temporal_layer_id;
};
@ -90,28 +90,7 @@ class VP9RateControlRTC {
public:
static std::unique_ptr<VP9RateControlRTC> Create(
const VP9RateControlRtcConfig &cfg);
~VP9RateControlRTC() {
if (cpi_) {
if (cpi_->svc.number_spatial_layers > 1 ||
cpi_->svc.number_temporal_layers > 1) {
for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) {
for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) {
int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers);
LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer];
vpx_free(lc->map);
vpx_free(lc->last_coded_q_map);
vpx_free(lc->consec_zero_mv);
}
}
}
if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
vpx_free(cpi_->segmentation_map);
cpi_->segmentation_map = NULL;
vp9_cyclic_refresh_free(cpi_->cyclic_refresh);
}
vpx_free(cpi_);
}
}
~VP9RateControlRTC();
void UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg);
// GetQP() needs to be called after ComputeQP() to get the latest QP
@ -125,7 +104,7 @@ class VP9RateControlRTC {
private:
VP9RateControlRTC() {}
void InitRateControl(const VP9RateControlRtcConfig &cfg);
VP9_COMP *cpi_;
struct VP9_COMP *cpi_;
};
} // namespace libvpx


@ -1372,22 +1372,13 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
timebase_units_to_ticks(timestamp_ratio, pts + duration);
res = image2yuvconfig(img, &sd);
if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) {
/* from vpx_encoder.h for g_w/g_h:
"Note that the frames passed as input to the encoder must have this
resolution"
*/
ctx->base.err_detail = "Invalid input frame resolution";
res = VPX_CODEC_INVALID_PARAM;
} else {
// Store the original flags in to the frame buffer. Will extract the
// key frame flag when we actually encode this frame.
if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
// Store the original flags in to the frame buffer. Will extract the
// key frame flag when we actually encode this frame.
if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
dst_time_stamp, dst_end_time_stamp)) {
res = update_error_state(ctx, &cpi->common.error);
}
ctx->next_frame_flags = 0;
res = update_error_state(ctx, &cpi->common.error);
}
ctx->next_frame_flags = 0;
}
cx_data = ctx->cx_data;
@ -1684,9 +1675,8 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx,
vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *);
if (mode) {
const int res =
vp9_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode,
(VPX_SCALING)mode->v_scaling_mode);
const int res = vp9_set_internal_size(ctx->cpi, mode->h_scaling_mode,
mode->v_scaling_mode);
return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM;
}
return VPX_CODEC_INVALID_PARAM;


@ -104,6 +104,8 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c
endif
VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.c
VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
VP9_CX_SRCS-yes += encoder/vp9_tpl_model.c
VP9_CX_SRCS-yes += encoder/vp9_tpl_model.h
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h


@ -14,6 +14,9 @@
#include "vpx/vpx_encoder.h"
namespace libvpx {
enum class RcFrameType { kKeyFrame = 0, kInterFrame = 1 };
struct VpxRateControlRtcConfig {
public:
VpxRateControlRtcConfig() {


@ -165,8 +165,8 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
// Transpose top left and top right quarters into one contiguous location to
// process to the top half.
transpose_s16_8x8_new(&temp0[0], &temp2[0]);
transpose_s16_8x8_new(&temp1[0], &temp2[8]);
transpose_s16_8x8q(&temp0[0], &temp2[0]);
transpose_s16_8x8q(&temp1[0], &temp2[8]);
partial_round_shift(temp2);
cross_input(temp2, temp3);
vpx_fdct8x16_body(temp3, temp2);
@ -180,7 +180,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
// Transpose bottom left and bottom right quarters into one contiguous
// location to process to the bottom half.
transpose_s16_8x8_new(&temp0[8], &temp1[0]);
transpose_s16_8x8q(&temp0[8], &temp1[0]);
transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
&temp1[13], &temp1[14], &temp1[15]);


@ -60,10 +60,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
dct_body_first_pass(temp5, temp4);
// Generate the top row by munging the first set of 8 from each one together.
transpose_s16_8x8_new(&temp1[0], &temp0[0]);
transpose_s16_8x8_new(&temp2[0], &temp0[8]);
transpose_s16_8x8_new(&temp3[0], &temp0[16]);
transpose_s16_8x8_new(&temp4[0], &temp0[24]);
transpose_s16_8x8q(&temp1[0], &temp0[0]);
transpose_s16_8x8q(&temp2[0], &temp0[8]);
transpose_s16_8x8q(&temp3[0], &temp0[16]);
transpose_s16_8x8q(&temp4[0], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@ -78,10 +78,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
store(output, temp5);
// Second row of 8x32.
transpose_s16_8x8_new(&temp1[8], &temp0[0]);
transpose_s16_8x8_new(&temp2[8], &temp0[8]);
transpose_s16_8x8_new(&temp3[8], &temp0[16]);
transpose_s16_8x8_new(&temp4[8], &temp0[24]);
transpose_s16_8x8q(&temp1[8], &temp0[0]);
transpose_s16_8x8q(&temp2[8], &temp0[8]);
transpose_s16_8x8q(&temp3[8], &temp0[16]);
transpose_s16_8x8q(&temp4[8], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@ -96,10 +96,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
store(output + 8 * 32, temp5);
// Third row of 8x32
transpose_s16_8x8_new(&temp1[16], &temp0[0]);
transpose_s16_8x8_new(&temp2[16], &temp0[8]);
transpose_s16_8x8_new(&temp3[16], &temp0[16]);
transpose_s16_8x8_new(&temp4[16], &temp0[24]);
transpose_s16_8x8q(&temp1[16], &temp0[0]);
transpose_s16_8x8q(&temp2[16], &temp0[8]);
transpose_s16_8x8q(&temp3[16], &temp0[16]);
transpose_s16_8x8q(&temp4[16], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@ -114,10 +114,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
store(output + 16 * 32, temp5);
// Final row of 8x32.
transpose_s16_8x8_new(&temp1[24], &temp0[0]);
transpose_s16_8x8_new(&temp2[24], &temp0[8]);
transpose_s16_8x8_new(&temp3[24], &temp0[16]);
transpose_s16_8x8_new(&temp4[24], &temp0[24]);
transpose_s16_8x8q(&temp1[24], &temp0[0]);
transpose_s16_8x8q(&temp2[24], &temp0[8]);
transpose_s16_8x8q(&temp3[24], &temp0[16]);
transpose_s16_8x8q(&temp4[24], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@ -159,10 +159,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
dct_body_first_pass(temp5, temp4);
// Generate the top row by munging the first set of 8 from each one together.
transpose_s16_8x8_new(&temp1[0], &temp0[0]);
transpose_s16_8x8_new(&temp2[0], &temp0[8]);
transpose_s16_8x8_new(&temp3[0], &temp0[16]);
transpose_s16_8x8_new(&temp4[0], &temp0[24]);
transpose_s16_8x8q(&temp1[0], &temp0[0]);
transpose_s16_8x8q(&temp2[0], &temp0[8]);
transpose_s16_8x8q(&temp3[0], &temp0[16]);
transpose_s16_8x8q(&temp4[0], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
@ -177,10 +177,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
store(output, temp5);
// Second row of 8x32.
transpose_s16_8x8_new(&temp1[8], &temp0[0]);
transpose_s16_8x8_new(&temp2[8], &temp0[8]);
transpose_s16_8x8_new(&temp3[8], &temp0[16]);
transpose_s16_8x8_new(&temp4[8], &temp0[24]);
transpose_s16_8x8q(&temp1[8], &temp0[0]);
transpose_s16_8x8q(&temp2[8], &temp0[8]);
transpose_s16_8x8q(&temp3[8], &temp0[16]);
transpose_s16_8x8q(&temp4[8], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
@ -195,10 +195,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
store(output + 8 * 32, temp5);
// Third row of 8x32
transpose_s16_8x8_new(&temp1[16], &temp0[0]);
transpose_s16_8x8_new(&temp2[16], &temp0[8]);
transpose_s16_8x8_new(&temp3[16], &temp0[16]);
transpose_s16_8x8_new(&temp4[16], &temp0[24]);
transpose_s16_8x8q(&temp1[16], &temp0[0]);
transpose_s16_8x8q(&temp2[16], &temp0[8]);
transpose_s16_8x8q(&temp3[16], &temp0[16]);
transpose_s16_8x8q(&temp4[16], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
@ -213,10 +213,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
store(output + 16 * 32, temp5);
// Final row of 8x32.
transpose_s16_8x8_new(&temp1[24], &temp0[0]);
transpose_s16_8x8_new(&temp2[24], &temp0[8]);
transpose_s16_8x8_new(&temp3[24], &temp0[16]);
transpose_s16_8x8_new(&temp4[24], &temp0[24]);
transpose_s16_8x8q(&temp1[24], &temp0[0]);
transpose_s16_8x8q(&temp2[24], &temp0[8]);
transpose_s16_8x8q(&temp3[24], &temp0[16]);
transpose_s16_8x8q(&temp4[24], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);


@ -0,0 +1,64 @@
/*
* Copyright (c) 2023 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred,
int width, int height, const uint16_t *ref,
int ref_stride) {
int i = height;
if (width > 8) {
do {
int j = 0;
do {
const uint16x8_t p = vld1q_u16(pred + j);
const uint16x8_t r = vld1q_u16(ref + j);
uint16x8_t avg = vrhaddq_u16(p, r);
vst1q_u16(comp_pred + j, avg);
j += 8;
} while (j < width);
comp_pred += width;
pred += width;
ref += ref_stride;
} while (--i != 0);
} else if (width == 8) {
do {
const uint16x8_t p = vld1q_u16(pred);
const uint16x8_t r = vld1q_u16(ref);
uint16x8_t avg = vrhaddq_u16(p, r);
vst1q_u16(comp_pred, avg);
comp_pred += width;
pred += width;
ref += ref_stride;
} while (--i != 0);
} else {
assert(width == 4);
do {
const uint16x4_t p = vld1_u16(pred);
const uint16x4_t r = vld1_u16(ref);
uint16x4_t avg = vrhadd_u16(p, r);
vst1_u16(comp_pred, avg);
comp_pred += width;
pred += width;
ref += ref_stride;
} while (--i != 0);
}
}


@ -0,0 +1,233 @@
/*
* Copyright (c) 2023 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/sum_neon.h"
static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_ptr[4],
int ref_stride, uint32_t res[4],
int h) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
int i = 0;
do {
uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride);
sum[0] = vabal_u16(sum[0], s, r0);
sum[1] = vabal_u16(sum[1], s, r1);
sum[2] = vabal_u16(sum[2], s, r2);
sum[3] = vabal_u16(sum[3], s, r3);
} while (++i < h);
vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
}
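// Accumulation strategy for the 8-wide and wider paths below: vabdq_u16
// produces per-lane absolute differences, and vpadalq_u16 pairwise-adds
// adjacent 16-bit results into 32-bit accumulator lanes, avoiding the 16-bit
// overflow that a plain uint16x8_t accumulator would hit for larger blocks
// and higher bit depths.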
static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref,
uint32x4_t *const sad_sum) {
uint16x8_t abs_diff = vabdq_u16(src, ref);
*sad_sum = vpadalq_u16(*sad_sum, abs_diff);
}
static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_ptr[4],
int ref_stride, uint32_t res[4],
int h) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
int i = 0;
do {
uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]);
sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]);
sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]);
sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]);
} while (++i < h);
vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
}
static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *const ref_ptr[4],
int ref_stride, uint32_t res[4],
int h) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
uint32x4_t sum[4];
int i = 0;
do {
uint16x8_t s0, s1;
s0 = vld1q_u16(src16_ptr + i * src_stride);
sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]);
s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]);
} while (++i < h);
sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
}
static INLINE void highbd_sadwxhx4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_ptr[4],
int ref_stride, uint32_t res[4], int w,
int h) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
uint32x4_t sum[4];
int i = 0;
do {
int j = 0;
do {
uint16x8_t s0, s1, s2, s3;
s0 = vld1q_u16(src16_ptr + i * src_stride + j);
sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]);
s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]);
s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
&sum_lo[0]);
sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
&sum_lo[1]);
sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
&sum_lo[2]);
sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16),
&sum_lo[3]);
s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
&sum_hi[0]);
sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
&sum_hi[1]);
sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
&sum_hi[2]);
sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24),
&sum_hi[3]);
j += 32;
} while (j < w);
} while (++i < h);
sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
}
static INLINE void highbd_sad64xhx4d_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *const ref_ptr[4],
int ref_stride, uint32_t res[4],
int h) {
highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64, h);
}
static INLINE void highbd_sad32xhx4d_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *const ref_ptr[4],
int ref_stride, uint32_t res[4],
int h) {
highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32, h);
}
#define HBD_SAD_WXH_4D_NEON(w, h) \
void vpx_highbd_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \
const uint8_t *const ref[4], \
int ref_stride, uint32_t res[4]) { \
highbd_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \
}
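// For example, HBD_SAD_WXH_4D_NEON(16, 8) below expands to
// vpx_highbd_sad16x8x4d_neon(), which simply forwards to
// highbd_sad16xhx4d_neon(..., 8).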
HBD_SAD_WXH_4D_NEON(4, 4)
HBD_SAD_WXH_4D_NEON(4, 8)
HBD_SAD_WXH_4D_NEON(8, 4)
HBD_SAD_WXH_4D_NEON(8, 8)
HBD_SAD_WXH_4D_NEON(8, 16)
HBD_SAD_WXH_4D_NEON(16, 8)
HBD_SAD_WXH_4D_NEON(16, 16)
HBD_SAD_WXH_4D_NEON(16, 32)
HBD_SAD_WXH_4D_NEON(32, 16)
HBD_SAD_WXH_4D_NEON(32, 32)
HBD_SAD_WXH_4D_NEON(32, 64)
HBD_SAD_WXH_4D_NEON(64, 32)
HBD_SAD_WXH_4D_NEON(64, 64)


@ -17,209 +17,363 @@
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/sum_neon.h"
static VPX_FORCE_INLINE uint32_t highbd_sad4_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int width,
int height) {
int i, j;
uint32x4_t sum_abs_diff = vdupq_n_u32(0);
static INLINE uint32_t highbd_sad4xh_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j += 4) {
const uint16x4_t src_u16 = vld1_u16(src16_ptr + j);
const uint16x4_t ref_u16 = vld1_u16(ref16_ptr + j);
sum_abs_diff = vabal_u16(sum_abs_diff, src_u16, ref_u16);
}
uint32x4_t sum = vdupq_n_u32(0);
int i = h;
do {
uint16x4_t s = vld1_u16(src16_ptr);
uint16x4_t r = vld1_u16(ref16_ptr);
sum = vabal_u16(sum, s, r);
src16_ptr += src_stride;
ref16_ptr += ref_stride;
}
} while (--i != 0);
return horizontal_add_uint32x4(sum_abs_diff);
return horizontal_add_uint32x4(sum);
}
static VPX_FORCE_INLINE uint32_t highbd_sad8_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int width,
int height) {
int i, j;
uint32x4_t sum_abs_diff = vdupq_n_u32(0);
static INLINE uint32_t highbd_sad8xh_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j += 8) {
const uint16x8_t src_u16 = vld1q_u16(src16_ptr + j);
const uint16x8_t ref_u16 = vld1q_u16(ref16_ptr + j);
sum_abs_diff =
vabal_u16(sum_abs_diff, vget_low_u16(src_u16), vget_low_u16(ref_u16));
sum_abs_diff = vabal_u16(sum_abs_diff, vget_high_u16(src_u16),
vget_high_u16(ref_u16));
}
uint32x4_t sum = vdupq_n_u32(0);
int i = h;
do {
uint16x8_t s = vld1q_u16(src16_ptr);
uint16x8_t r = vld1q_u16(ref16_ptr);
uint16x8_t diff = vabdq_u16(s, r);
sum = vpadalq_u16(sum, diff);
src16_ptr += src_stride;
ref16_ptr += ref_stride;
}
} while (--i != 0);
return horizontal_add_uint32x4(sum_abs_diff);
return horizontal_add_uint32x4(sum);
}
static VPX_FORCE_INLINE uint32_t highbd_sad4_avg_neon(
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
int ref_stride, const uint8_t *second_pred, int width, int height) {
int i, j;
uint32x4_t sum_abs_diff = vdupq_n_u32(0);
static INLINE uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j += 4) {
const uint16x4_t a_u16 = vld1_u16(src16_ptr + j);
const uint16x4_t b_u16 = vld1_u16(ref16_ptr + j);
const uint16x4_t c_u16 = vld1_u16(pred_ptr + j);
const uint16x4_t avg = vrhadd_u16(b_u16, c_u16);
sum_abs_diff = vabal_u16(sum_abs_diff, a_u16, avg);
}
uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
int i = h;
do {
uint16x8_t s0, s1, r0, r1;
uint16x8_t diff0, diff1;
s0 = vld1q_u16(src16_ptr);
r0 = vld1q_u16(ref16_ptr);
diff0 = vabdq_u16(s0, r0);
sum[0] = vpadalq_u16(sum[0], diff0);
s1 = vld1q_u16(src16_ptr + 8);
r1 = vld1q_u16(ref16_ptr + 8);
diff1 = vabdq_u16(s1, r1);
sum[1] = vpadalq_u16(sum[1], diff1);
src16_ptr += src_stride;
ref16_ptr += ref_stride;
pred_ptr += width;
}
} while (--i != 0);
return horizontal_add_uint32x4(sum_abs_diff);
sum[0] = vaddq_u32(sum[0], sum[1]);
return horizontal_add_uint32x4(sum[0]);
}
static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon(
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
int ref_stride, const uint8_t *second_pred, int width, int height) {
int i, j;
uint32x4_t sum_abs_diff = vdupq_n_u32(0);
static INLINE uint32_t highbd_sadwxh_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int w, int h) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j += 8) {
const uint16x8_t a_u16 = vld1q_u16(src16_ptr + j);
const uint16x8_t b_u16 = vld1q_u16(ref16_ptr + j);
const uint16x8_t c_u16 = vld1q_u16(pred_ptr + j);
const uint16x8_t avg = vrhaddq_u16(b_u16, c_u16);
sum_abs_diff =
vabal_u16(sum_abs_diff, vget_low_u16(a_u16), vget_low_u16(avg));
sum_abs_diff =
vabal_u16(sum_abs_diff, vget_high_u16(a_u16), vget_high_u16(avg));
}
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
int i = h;
do {
int j = 0;
do {
uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3;
uint16x8_t diff0, diff1, diff2, diff3;
s0 = vld1q_u16(src16_ptr + j);
r0 = vld1q_u16(ref16_ptr + j);
diff0 = vabdq_u16(s0, r0);
sum[0] = vpadalq_u16(sum[0], diff0);
s1 = vld1q_u16(src16_ptr + j + 8);
r1 = vld1q_u16(ref16_ptr + j + 8);
diff1 = vabdq_u16(s1, r1);
sum[1] = vpadalq_u16(sum[1], diff1);
s2 = vld1q_u16(src16_ptr + j + 16);
r2 = vld1q_u16(ref16_ptr + j + 16);
diff2 = vabdq_u16(s2, r2);
sum[2] = vpadalq_u16(sum[2], diff2);
s3 = vld1q_u16(src16_ptr + j + 24);
r3 = vld1q_u16(ref16_ptr + j + 24);
diff3 = vabdq_u16(s3, r3);
sum[3] = vpadalq_u16(sum[3], diff3);
j += 32;
} while (j < w);
src16_ptr += src_stride;
ref16_ptr += ref_stride;
pred_ptr += width;
}
} while (--i != 0);
return horizontal_add_uint32x4(sum_abs_diff);
sum[0] = vaddq_u32(sum[0], sum[1]);
sum[2] = vaddq_u32(sum[2], sum[3]);
sum[0] = vaddq_u32(sum[0], sum[2]);
return horizontal_add_uint32x4(sum[0]);
}
#define highbd_sad4MxN(m, n) \
unsigned int vpx_highbd_sad##m##x##n##_neon( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride) { \
return highbd_sad4_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
static INLINE unsigned int highbd_sad64xh_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h) {
return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
}
static INLINE unsigned int highbd_sad32xh_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h) {
return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
}
#define HBD_SAD_WXH_NEON(w, h) \
unsigned int vpx_highbd_sad##w##x##h##_neon( \
const uint8_t *src, int src_stride, const uint8_t *ref, \
int ref_stride) { \
return highbd_sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \
}
#define highbd_sadMxN(m, n) \
unsigned int vpx_highbd_sad##m##x##n##_neon( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride) { \
return highbd_sad8_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
HBD_SAD_WXH_NEON(4, 4)
HBD_SAD_WXH_NEON(4, 8)
HBD_SAD_WXH_NEON(8, 4)
HBD_SAD_WXH_NEON(8, 8)
HBD_SAD_WXH_NEON(8, 16)
HBD_SAD_WXH_NEON(16, 8)
HBD_SAD_WXH_NEON(16, 16)
HBD_SAD_WXH_NEON(16, 32)
HBD_SAD_WXH_NEON(32, 16)
HBD_SAD_WXH_NEON(32, 32)
HBD_SAD_WXH_NEON(32, 64)
HBD_SAD_WXH_NEON(64, 32)
HBD_SAD_WXH_NEON(64, 64)
static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h,
const uint8_t *second_pred) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
uint32x4_t sum = vdupq_n_u32(0);
int i = h;
do {
uint16x4_t s = vld1_u16(src16_ptr);
uint16x4_t r = vld1_u16(ref16_ptr);
uint16x4_t p = vld1_u16(pred16_ptr);
uint16x4_t avg = vrhadd_u16(r, p);
sum = vabal_u16(sum, s, avg);
src16_ptr += src_stride;
ref16_ptr += ref_stride;
pred16_ptr += 4;
} while (--i != 0);
return horizontal_add_uint32x4(sum);
}
static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h,
const uint8_t *second_pred) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
uint32x4_t sum = vdupq_n_u32(0);
int i = h;
do {
uint16x8_t s = vld1q_u16(src16_ptr);
uint16x8_t r = vld1q_u16(ref16_ptr);
uint16x8_t p = vld1q_u16(pred16_ptr);
uint16x8_t avg = vrhaddq_u16(r, p);
uint16x8_t diff = vabdq_u16(s, avg);
sum = vpadalq_u16(sum, diff);
src16_ptr += src_stride;
ref16_ptr += ref_stride;
pred16_ptr += 8;
} while (--i != 0);
return horizontal_add_uint32x4(sum);
}
static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h,
const uint8_t *second_pred) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
int i = h;
do {
uint16x8_t s0, s1, r0, r1, p0, p1;
uint16x8_t avg0, avg1, diff0, diff1;
s0 = vld1q_u16(src16_ptr);
r0 = vld1q_u16(ref16_ptr);
p0 = vld1q_u16(pred16_ptr);
avg0 = vrhaddq_u16(r0, p0);
diff0 = vabdq_u16(s0, avg0);
sum[0] = vpadalq_u16(sum[0], diff0);
s1 = vld1q_u16(src16_ptr + 8);
r1 = vld1q_u16(ref16_ptr + 8);
p1 = vld1q_u16(pred16_ptr + 8);
avg1 = vrhaddq_u16(r1, p1);
diff1 = vabdq_u16(s1, avg1);
sum[1] = vpadalq_u16(sum[1], diff1);
src16_ptr += src_stride;
ref16_ptr += ref_stride;
pred16_ptr += 16;
} while (--i != 0);
sum[0] = vaddq_u32(sum[0], sum[1]);
return horizontal_add_uint32x4(sum[0]);
}
static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int w, int h,
const uint8_t *second_pred) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
int i = h;
do {
int j = 0;
do {
uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
s0 = vld1q_u16(src16_ptr + j);
r0 = vld1q_u16(ref16_ptr + j);
p0 = vld1q_u16(pred16_ptr + j);
avg0 = vrhaddq_u16(r0, p0);
diff0 = vabdq_u16(s0, avg0);
sum[0] = vpadalq_u16(sum[0], diff0);
s1 = vld1q_u16(src16_ptr + j + 8);
r1 = vld1q_u16(ref16_ptr + j + 8);
p1 = vld1q_u16(pred16_ptr + j + 8);
avg1 = vrhaddq_u16(r1, p1);
diff1 = vabdq_u16(s1, avg1);
sum[1] = vpadalq_u16(sum[1], diff1);
s2 = vld1q_u16(src16_ptr + j + 16);
r2 = vld1q_u16(ref16_ptr + j + 16);
p2 = vld1q_u16(pred16_ptr + j + 16);
avg2 = vrhaddq_u16(r2, p2);
diff2 = vabdq_u16(s2, avg2);
sum[2] = vpadalq_u16(sum[2], diff2);
s3 = vld1q_u16(src16_ptr + j + 24);
r3 = vld1q_u16(ref16_ptr + j + 24);
p3 = vld1q_u16(pred16_ptr + j + 24);
avg3 = vrhaddq_u16(r3, p3);
diff3 = vabdq_u16(s3, avg3);
sum[3] = vpadalq_u16(sum[3], diff3);
j += 32;
} while (j < w);
src16_ptr += src_stride;
ref16_ptr += ref_stride;
pred16_ptr += w;
} while (--i != 0);
sum[0] = vaddq_u32(sum[0], sum[1]);
sum[2] = vaddq_u32(sum[2], sum[3]);
sum[0] = vaddq_u32(sum[0], sum[2]);
return horizontal_add_uint32x4(sum[0]);
}
static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h,
const uint8_t *second_pred) {
return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
second_pred);
}
static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h,
const uint8_t *second_pred) {
return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
second_pred);
}
#define HBD_SAD_WXH_AVG_NEON(w, h) \
uint32_t vpx_highbd_sad##w##x##h##_avg_neon( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
const uint8_t *second_pred) { \
return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
second_pred); \
}
#define highbd_sad4MxN_avg(m, n) \
unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride, const uint8_t *second_pred) { \
return highbd_sad4_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \
second_pred, m, n); \
}
HBD_SAD_WXH_AVG_NEON(4, 4)
HBD_SAD_WXH_AVG_NEON(4, 8)
#define highbd_sadMxN_avg(m, n) \
unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride, const uint8_t *second_pred) { \
return highbd_sad8_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \
second_pred, m, n); \
}
HBD_SAD_WXH_AVG_NEON(8, 4)
HBD_SAD_WXH_AVG_NEON(8, 8)
HBD_SAD_WXH_AVG_NEON(8, 16)
#define highbd_sadMxNx4D(m, n) \
void vpx_highbd_sad##m##x##n##x4d_neon( \
const uint8_t *src_ptr, int src_stride, \
const uint8_t *const ref_array[4], int ref_stride, \
uint32_t sad_array[4]) { \
int i; \
for (i = 0; i < 4; ++i) { \
sad_array[i] = vpx_highbd_sad##m##x##n##_neon(src_ptr, src_stride, \
ref_array[i], ref_stride); \
} \
}
HBD_SAD_WXH_AVG_NEON(16, 8)
HBD_SAD_WXH_AVG_NEON(16, 16)
HBD_SAD_WXH_AVG_NEON(16, 32)
/* clang-format off */
// 4x4
highbd_sad4MxN(4, 4)
highbd_sad4MxN_avg(4, 4)
highbd_sadMxNx4D(4, 4)
HBD_SAD_WXH_AVG_NEON(32, 16)
HBD_SAD_WXH_AVG_NEON(32, 32)
HBD_SAD_WXH_AVG_NEON(32, 64)
// 4x8
highbd_sad4MxN(4, 8)
highbd_sad4MxN_avg(4, 8)
highbd_sadMxNx4D(4, 8)
// 8x4
highbd_sadMxN(8, 4)
highbd_sadMxN_avg(8, 4)
highbd_sadMxNx4D(8, 4)
// 8x8
highbd_sadMxN(8, 8)
highbd_sadMxN_avg(8, 8)
highbd_sadMxNx4D(8, 8)
// 8x16
highbd_sadMxN(8, 16)
highbd_sadMxN_avg(8, 16)
highbd_sadMxNx4D(8, 16)
// 16x8
highbd_sadMxN(16, 8)
highbd_sadMxN_avg(16, 8)
highbd_sadMxNx4D(16, 8)
// 16x16
highbd_sadMxN(16, 16)
highbd_sadMxN_avg(16, 16)
highbd_sadMxNx4D(16, 16)
// 16x32
highbd_sadMxN(16, 32)
highbd_sadMxN_avg(16, 32)
highbd_sadMxNx4D(16, 32)
// 32x16
highbd_sadMxN(32, 16)
highbd_sadMxN_avg(32, 16)
highbd_sadMxNx4D(32, 16)
// 32x32
highbd_sadMxN(32, 32)
highbd_sadMxN_avg(32, 32)
highbd_sadMxNx4D(32, 32)
// 32x64
highbd_sadMxN(32, 64)
highbd_sadMxN_avg(32, 64)
highbd_sadMxNx4D(32, 64)
// 64x32
highbd_sadMxN(64, 32)
highbd_sadMxN_avg(64, 32)
highbd_sadMxNx4D(64, 32)
// 64x64
highbd_sadMxN(64, 64)
highbd_sadMxN_avg(64, 64)
highbd_sadMxNx4D(64, 64)
/* clang-format on */
HBD_SAD_WXH_AVG_NEON(64, 32)
HBD_SAD_WXH_AVG_NEON(64, 64)


@ -0,0 +1,594 @@
/*
* Copyright (c) 2023 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
// The bilinear filters look like this:
//
// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
// { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }}
//
// We can factor out the highest common factor, such that the sum of both
// weights will be 8 instead of 128. The benefits of this are two-fold:
//
// 1) We can infer the filter values from the filter_offset parameter in the
// bilinear filter functions below - we don't have to actually load the values
// from memory:
// f0 = 8 - filter_offset
// f1 = filter_offset
//
// 2) Scaling the pixel values by 8 instead of 128 enables us to operate on
// 16-bit data types at all times, rather than widening out to 32-bit and
// requiring double the number of data processing instructions. (12-bit * 8 =
// 15-bit.)
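//
// For instance, with filter_offset == 2 the original filter pair would be
// { 96, 32 }; after dividing through by 16 the functions below use
// f0 = 8 - 2 = 6 and f1 = 2, and each output pixel is
// (6 * s0 + 2 * s1 + 4) >> 3, which is exactly the
// vmulq_u16/vmlaq_u16/vrshrq_n_u16(..., 3) sequence used throughout this
// file.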
// Process a block exactly 4 wide and a multiple of 2 high.
static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
uint16_t *dst_ptr, int src_stride,
int pixel_step, int dst_height,
int filter_offset) {
const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
const uint16x8_t f1 = vdupq_n_u16(filter_offset);
int i = dst_height;
do {
uint16x8_t s0 = load_unaligned_u16q(src_ptr, src_stride);
uint16x8_t s1 = load_unaligned_u16q(src_ptr + pixel_step, src_stride);
uint16x8_t blend = vmulq_u16(s0, f0);
blend = vmlaq_u16(blend, s1, f1);
blend = vrshrq_n_u16(blend, 3);
vst1q_u16(dst_ptr, blend);
src_ptr += 2 * src_stride;
dst_ptr += 8;
i -= 2;
} while (i != 0);
}
// Process a block whose width is a multiple of 8, with any height.
static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
uint16_t *dst_ptr,
int src_stride, int pixel_step,
int dst_width, int dst_height,
int filter_offset) {
const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
const uint16x8_t f1 = vdupq_n_u16(filter_offset);
int i = dst_height;
do {
int j = 0;
do {
uint16x8_t s0 = vld1q_u16(src_ptr + j);
uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
uint16x8_t blend = vmulq_u16(s0, f0);
blend = vmlaq_u16(blend, s1, f1);
blend = vrshrq_n_u16(blend, 3);
vst1q_u16(dst_ptr + j, blend);
j += 8;
} while (j < dst_width);
src_ptr += src_stride;
dst_ptr += dst_width;
} while (--i != 0);
}
static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
uint16_t *dst_ptr, int src_stride,
int pixel_step, int dst_height,
int filter_offset) {
highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
8, dst_height, filter_offset);
}
static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
uint16_t *dst_ptr, int src_stride,
int pixel_step, int dst_height,
int filter_offset) {
highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
16, dst_height, filter_offset);
}
static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
uint16_t *dst_ptr, int src_stride,
int pixel_step, int dst_height,
int filter_offset) {
highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
32, dst_height, filter_offset);
}
static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
uint16_t *dst_ptr, int src_stride,
int pixel_step, int dst_height,
int filter_offset) {
highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
64, dst_height, filter_offset);
}
static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
uint16_t *dst_ptr, int src_stride,
int pixel_step, int dst_width,
int dst_height) {
int i = dst_height;
// We only specialize on the filter values for large block sizes (>= 16x16.)
assert(dst_width >= 16 && dst_width % 16 == 0);
do {
int j = 0;
do {
uint16x8_t s0 = vld1q_u16(src_ptr + j);
uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
uint16x8_t avg = vrhaddq_u16(s0, s1);
vst1q_u16(dst_ptr + j, avg);
j += 8;
} while (j < dst_width);
src_ptr += src_stride;
dst_ptr += dst_width;
} while (--i != 0);
}
#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \
unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
const uint8_t *src, int src_stride, int xoffset, int yoffset, \
const uint8_t *ref, int ref_stride, uint32_t *sse) { \
uint16_t tmp0[w * (h + padding)]; \
uint16_t tmp1[w * h]; \
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
\
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
(h + padding), xoffset); \
highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
\
return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
w, ref, ref_stride, sse); \
}
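// The specialized variant below uses the observation that, after the
// factoring described above, a sub-pel offset of 0 needs no filtering at all
// and an offset of 4 corresponds to the equal-weight pair { 4, 4 }, i.e. a
// simple average of two pixels, so those cases can be served by a straight
// variance call or by highbd_var_filter_block2d_avg() instead of the general
// bilinear filter.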
#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \
unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
const uint8_t *src, int src_stride, int xoffset, int yoffset, \
const uint8_t *ref, int ref_stride, unsigned int *sse) { \
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
\
if (xoffset == 0) { \
if (yoffset == 0) { \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp[w * h]; \
highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
h); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \
src_stride, h, yoffset); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} \
} else if (xoffset == 4) { \
uint16_t tmp0[w * (h + padding)]; \
if (yoffset == 0) { \
highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * (h + padding)]; \
highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
(h + padding)); \
highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * (h + padding)]; \
highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
(h + padding)); \
highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} else { \
uint16_t tmp0[w * (h + padding)]; \
if (yoffset == 0) { \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
xoffset); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
(h + padding), xoffset); \
highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
(h + padding), xoffset); \
highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} \
}
// 4x<h> blocks are processed two rows at a time, so require an extra row of
// padding.
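// For example, HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8, 2) below allocates
// tmp0[4 * (8 + 2)] so the horizontal pass can emit the even number of
// intermediate rows (h + 2) that the two-row-at-a-time 4-wide filter needs.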
// 8-bit
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4, 2)
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8, 2)
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4, 1)
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8, 1)
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64, 1)
// 10-bit
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4, 2)
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8, 2)
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4, 1)
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8, 1)
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64, 1)
// 12-bit
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4, 2)
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8, 2)
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4, 1)
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8, 1)
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64, 1)
// Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having
// width 4.
static void highbd_avg_pred_var_filter_block2d_bil_w4(
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
int dst_height, int filter_offset, const uint16_t *second_pred) {
const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
const uint16x8_t f1 = vdupq_n_u16(filter_offset);
int i = dst_height;
do {
uint16x8_t s0 = load_unaligned_u16q(src_ptr, src_stride);
uint16x8_t s1 = load_unaligned_u16q(src_ptr + pixel_step, src_stride);
uint16x8_t p = vld1q_u16(second_pred);
uint16x8_t blend = vmulq_u16(s0, f0);
blend = vmlaq_u16(blend, s1, f1);
blend = vrshrq_n_u16(blend, 3);
vst1q_u16(dst_ptr, vrhaddq_u16(blend, p));
src_ptr += 2 * src_stride;
dst_ptr += 2 * 4;
second_pred += 2 * 4;
i -= 2;
} while (i != 0);
}
// Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks.
static void highbd_avg_pred_var_filter_block2d_bil_large(
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
int dst_width, int dst_height, int filter_offset,
const uint16_t *second_pred) {
const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
const uint16x8_t f1 = vdupq_n_u16(filter_offset);
int i = dst_height;
do {
int j = 0;
do {
uint16x8_t s0 = vld1q_u16(src_ptr + j);
uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
uint16x8_t p = vld1q_u16(second_pred);
uint16x8_t blend = vmulq_u16(s0, f0);
blend = vmlaq_u16(blend, s1, f1);
blend = vrshrq_n_u16(blend, 3);
vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p));
j += 8;
second_pred += 8;
} while (j < dst_width);
src_ptr += src_stride;
dst_ptr += dst_width;
} while (--i != 0);
}
static void highbd_avg_pred_var_filter_block2d_bil_w8(
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
int dst_height, int filter_offset, const uint16_t *second_pred) {
highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
pixel_step, 8, dst_height,
filter_offset, second_pred);
}
static void highbd_avg_pred_var_filter_block2d_bil_w16(
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
int dst_height, int filter_offset, const uint16_t *second_pred) {
highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
pixel_step, 16, dst_height,
filter_offset, second_pred);
}
static void highbd_avg_pred_var_filter_block2d_bil_w32(
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
int dst_height, int filter_offset, const uint16_t *second_pred) {
highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
pixel_step, 32, dst_height,
filter_offset, second_pred);
}
static void highbd_avg_pred_var_filter_block2d_bil_w64(
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
int dst_height, int filter_offset, const uint16_t *second_pred) {
highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
pixel_step, 64, dst_height,
filter_offset, second_pred);
}
// Combine averaging subpel filter with vpx_highbd_comp_avg_pred.
static void highbd_avg_pred_var_filter_block2d_avg(
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
int dst_width, int dst_height, const uint16_t *second_pred) {
int i = dst_height;
// We only specialize on the filter values for large block sizes (>= 16x16.)
assert(dst_width >= 16 && dst_width % 16 == 0);
do {
int j = 0;
do {
uint16x8_t s0 = vld1q_u16(src_ptr + j);
uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
uint16x8_t avg = vrhaddq_u16(s0, s1);
uint16x8_t p = vld1q_u16(second_pred);
avg = vrhaddq_u16(avg, p);
vst1q_u16(dst_ptr + j, avg);
j += 8;
second_pred += 8;
} while (j < dst_width);
src_ptr += src_stride;
dst_ptr += dst_width;
} while (--i != 0);
}
// Implementation of vpx_highbd_comp_avg_pred for blocks having width >= 16.
static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
int src_stride, int dst_width, int dst_height,
const uint16_t *second_pred) {
int i = dst_height;
// We only specialize on the filter values for large block sizes (>= 16x16.)
assert(dst_width >= 16 && dst_width % 16 == 0);
do {
int j = 0;
do {
uint16x8_t s = vld1q_u16(src_ptr + j);
uint16x8_t p = vld1q_u16(second_pred);
uint16x8_t avg = vrhaddq_u16(s, p);
vst1q_u16(dst_ptr + j, avg);
j += 8;
second_pred += 8;
} while (j < dst_width);
src_ptr += src_stride;
dst_ptr += dst_width;
} while (--i != 0);
}
#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \
uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
const uint8_t *src, int src_stride, int xoffset, int yoffset, \
const uint8_t *ref, int ref_stride, uint32_t *sse, \
const uint8_t *second_pred) { \
uint16_t tmp0[w * (h + padding)]; \
uint16_t tmp1[w * h]; \
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
\
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
(h + padding), xoffset); \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
\
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
}
#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \
unsigned int vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
const uint8_t *src, int source_stride, int xoffset, int yoffset, \
const uint8_t *ref, int ref_stride, unsigned int *sse, \
const uint8_t *second_pred) { \
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
\
if (xoffset == 0) { \
uint16_t tmp[w * h]; \
if (yoffset == 0) { \
highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \
CONVERT_TO_SHORTPTR(second_pred)); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
highbd_avg_pred_var_filter_block2d_avg( \
src_ptr, tmp, source_stride, source_stride, w, h, \
CONVERT_TO_SHORTPTR(second_pred)); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} else { \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
src_ptr, tmp, source_stride, source_stride, h, yoffset, \
CONVERT_TO_SHORTPTR(second_pred)); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} \
} else if (xoffset == 4) { \
uint16_t tmp0[w * (h + padding)]; \
if (yoffset == 0) { \
highbd_avg_pred_var_filter_block2d_avg( \
src_ptr, tmp0, source_stride, 1, w, h, \
CONVERT_TO_SHORTPTR(second_pred)); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * (h + padding)]; \
highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
(h + padding)); \
highbd_avg_pred_var_filter_block2d_avg( \
tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * (h + padding)]; \
highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
(h + padding)); \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} else { \
uint16_t tmp0[w * (h + padding)]; \
if (yoffset == 0) { \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
src_ptr, tmp0, source_stride, 1, h, xoffset, \
CONVERT_TO_SHORTPTR(second_pred)); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
(h + padding), xoffset); \
highbd_avg_pred_var_filter_block2d_avg( \
tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
(h + padding), xoffset); \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} \
}
// 4x<h> blocks are processed two rows at a time, so require an extra row of
// padding.
// 8-bit
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4, 2)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8, 2)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4, 1)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8, 1)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64, 1)
// 10-bit
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4, 2)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8, 2)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4, 1)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8, 1)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64, 1)
// 12-bit
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4, 2)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8, 2)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4, 1)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8, 1)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64, 1)


@ -18,11 +18,6 @@
#include "vpx_dsp/arm/sum_neon.h"
#include "vpx_ports/mem.h"
static const uint8_t bilinear_filters[8][2] = {
{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
{ 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};
static INLINE void highbd_variance16(const uint16_t *src_ptr, int src_stride,
const uint16_t *ref_ptr, int ref_stride,
int w, int h, uint64_t *sse,
@ -136,7 +131,7 @@ static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride,
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
}
#define HIGHBD_VAR(W, H) \
#define HBD_VARIANCE_WXH_NEON(W, H) \
uint32_t vpx_highbd_8_variance##W##x##H##_neon( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride, uint32_t *sse) { \
@ -218,274 +213,19 @@ static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride,
return *sse; \
}
static INLINE void highbd_var_filter_block2d_bil_first_pass(
const uint8_t *src_ptr8, uint16_t *output_ptr,
unsigned int src_pixels_per_line, int pixel_step,
unsigned int output_height, unsigned int output_width,
const uint8_t *filter) {
uint32_t i, j;
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1);
uint16x4_t filter1_u16 = vdup_n_u16(filter[0]);
uint16x4_t filter2_u16 = vdup_n_u16(filter[1]);
if (output_width >= 8) {
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; j += 8) {
const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]);
const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]);
uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16));
uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16));
uint16x4_t out1_u16;
uint16x4_t out2_u16;
sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16));
sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16));
out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS);
out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS);
vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16));
}
// Next row...
src_ptr += src_pixels_per_line;
output_ptr += output_width;
}
} else {
assert(output_width >= 4);
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; j += 4) {
const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]);
const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]);
uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16);
uint16x4_t out_u16;
sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16);
out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS);
vst1_u16(&output_ptr[j], out_u16);
}
// Next row...
src_ptr += src_pixels_per_line;
output_ptr += output_width;
}
}
}
static INLINE void highbd_var_filter_block2d_bil_second_pass(
const uint16_t *src_ptr, uint16_t *output_ptr,
unsigned int src_pixels_per_line, unsigned int pixel_step,
unsigned int output_height, unsigned int output_width,
const uint8_t *filter) {
uint32_t i, j;
uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1);
uint16x4_t filter1_u16 = vdup_n_u16(filter[0]);
uint16x4_t filter2_u16 = vdup_n_u16(filter[1]);
if (output_width >= 8) {
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; j += 8) {
const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]);
const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]);
uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16));
uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16));
uint16x4_t out1_u16;
uint16x4_t out2_u16;
sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16));
sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16));
out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS);
out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS);
vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16));
}
// Next row...
src_ptr += src_pixels_per_line;
output_ptr += output_width;
}
} else {
assert(output_width >= 4);
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; j += 4) {
const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]);
const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]);
uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16);
uint16x4_t out_u16;
sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16);
out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS);
vst1_u16(&output_ptr[j], out_u16);
}
// Next row...
src_ptr += src_pixels_per_line;
output_ptr += output_width;
}
}
}
#define HIGHBD_SUBPIX_VAR(W, H) \
uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_neon( \
const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass( \
src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[y_offset]); \
\
return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp2), W, \
ref_ptr, ref_stride, sse); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_neon( \
const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass( \
src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[y_offset]); \
\
return vpx_highbd_10_variance##W##x##H##_neon( \
CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \
} \
\
uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_neon( \
const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass( \
src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[y_offset]); \
\
return vpx_highbd_12_variance##W##x##H##_neon( \
CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \
}
#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_neon( \
const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass( \
src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[y_offset]); \
\
vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \
H, temp2, W); \
\
return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp3), W, \
ref_ptr, ref_stride, sse); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_neon( \
const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass( \
src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[y_offset]); \
\
vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \
H, temp2, W); \
\
return vpx_highbd_10_variance##W##x##H##_neon( \
CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \
} \
\
uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_neon( \
const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass( \
src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[y_offset]); \
\
vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \
H, temp2, W); \
\
return vpx_highbd_12_variance##W##x##H##_neon( \
CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \
}
void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred,
int width, int height, const uint16_t *ref,
int ref_stride) {
int i, j;
uint32x4_t one_u32 = vdupq_n_u32(1);
if (width >= 8) {
for (i = 0; i < height; ++i) {
for (j = 0; j < width; j += 8) {
const uint16x8_t pred_u16 = vld1q_u16(&pred[j]);
const uint16x8_t ref_u16 = vld1q_u16(&ref[j]);
const uint32x4_t sum1_u32 =
vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16));
const uint32x4_t sum2_u32 =
vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16));
const uint16x4_t sum1_u16 =
vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1);
const uint16x4_t sum2_u16 =
vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1);
const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16);
vst1q_u16(&comp_pred[j], vcomp_pred);
}
comp_pred += width;
pred += width;
ref += ref_stride;
}
} else {
assert(width >= 4);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; j += 4) {
const uint16x4_t pred_u16 = vld1_u16(&pred[j]);
const uint16x4_t ref_u16 = vld1_u16(&ref[j]);
const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16);
const uint16x4_t vcomp_pred =
vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1);
vst1_u16(&comp_pred[j], vcomp_pred);
}
comp_pred += width;
pred += width;
ref += ref_stride;
}
}
}
/* All three forms of the variance are available in the same sizes. */
#define HIGHBD_VARIANCES(W, H) \
HIGHBD_VAR(W, H) \
HIGHBD_SUBPIX_VAR(W, H) \
HIGHBD_SUBPIX_AVG_VAR(W, H)
HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
HIGHBD_VARIANCES(32, 32)
HIGHBD_VARIANCES(32, 16)
HIGHBD_VARIANCES(16, 32)
HIGHBD_VARIANCES(16, 16)
HIGHBD_VARIANCES(16, 8)
HIGHBD_VARIANCES(8, 16)
HIGHBD_VARIANCES(8, 8)
HIGHBD_VARIANCES(8, 4)
HIGHBD_VARIANCES(4, 8)
HIGHBD_VARIANCES(4, 4)
HBD_VARIANCE_WXH_NEON(64, 64)
HBD_VARIANCE_WXH_NEON(64, 32)
HBD_VARIANCE_WXH_NEON(32, 64)
HBD_VARIANCE_WXH_NEON(32, 32)
HBD_VARIANCE_WXH_NEON(32, 16)
HBD_VARIANCE_WXH_NEON(16, 32)
HBD_VARIANCE_WXH_NEON(16, 16)
HBD_VARIANCE_WXH_NEON(16, 8)
HBD_VARIANCE_WXH_NEON(8, 16)
HBD_VARIANCE_WXH_NEON(8, 8)
HBD_VARIANCE_WXH_NEON(8, 4)
HBD_VARIANCE_WXH_NEON(4, 8)
HBD_VARIANCE_WXH_NEON(4, 4)
HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)


@ -26,76 +26,88 @@ void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
(void)bd;
if (w < 8) { // copy4
uint16x4_t s0, s1;
do {
vst1_u16(dst, vld1_u16(src));
s0 = vld1_u16(src);
src += src_stride;
s1 = vld1_u16(src);
src += src_stride;
vst1_u16(dst, s0);
dst += dst_stride;
vst1_u16(dst, vld1_u16(src));
src += src_stride;
vst1_u16(dst, s1);
dst += dst_stride;
h -= 2;
} while (h > 0);
} while (h != 0);
} else if (w == 8) { // copy8
uint16x8_t s0, s1;
do {
vst1q_u16(dst, vld1q_u16(src));
s0 = vld1q_u16(src);
src += src_stride;
s1 = vld1q_u16(src);
src += src_stride;
vst1q_u16(dst, s0);
dst += dst_stride;
vst1q_u16(dst, vld1q_u16(src));
src += src_stride;
vst1q_u16(dst, s1);
dst += dst_stride;
h -= 2;
} while (h > 0);
} while (h != 0);
} else if (w < 32) { // copy16
uint16x8_t s0, s1, s2, s3;
do {
vst2q_u16(dst, vld2q_u16(src));
s0 = vld1q_u16(src);
s1 = vld1q_u16(src + 8);
src += src_stride;
dst += dst_stride;
vst2q_u16(dst, vld2q_u16(src));
s2 = vld1q_u16(src);
s3 = vld1q_u16(src + 8);
src += src_stride;
vst1q_u16(dst, s0);
vst1q_u16(dst + 8, s1);
dst += dst_stride;
vst2q_u16(dst, vld2q_u16(src));
src += src_stride;
vst1q_u16(dst, s2);
vst1q_u16(dst + 8, s3);
dst += dst_stride;
vst2q_u16(dst, vld2q_u16(src));
src += src_stride;
dst += dst_stride;
h -= 4;
} while (h > 0);
h -= 2;
} while (h != 0);
} else if (w == 32) { // copy32
uint16x8_t s0, s1, s2, s3;
do {
vst4q_u16(dst, vld4q_u16(src));
s0 = vld1q_u16(src);
s1 = vld1q_u16(src + 8);
s2 = vld1q_u16(src + 16);
s3 = vld1q_u16(src + 24);
src += src_stride;
vst1q_u16(dst, s0);
vst1q_u16(dst + 8, s1);
vst1q_u16(dst + 16, s2);
vst1q_u16(dst + 24, s3);
dst += dst_stride;
vst4q_u16(dst, vld4q_u16(src));
src += src_stride;
dst += dst_stride;
vst4q_u16(dst, vld4q_u16(src));
src += src_stride;
dst += dst_stride;
vst4q_u16(dst, vld4q_u16(src));
src += src_stride;
dst += dst_stride;
h -= 4;
} while (h > 0);
} while (--h != 0);
} else { // copy64
uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
do {
vst4q_u16(dst, vld4q_u16(src));
vst4q_u16(dst + 32, vld4q_u16(src + 32));
s0 = vld1q_u16(src);
s1 = vld1q_u16(src + 8);
s2 = vld1q_u16(src + 16);
s3 = vld1q_u16(src + 24);
s4 = vld1q_u16(src + 32);
s5 = vld1q_u16(src + 40);
s6 = vld1q_u16(src + 48);
s7 = vld1q_u16(src + 56);
src += src_stride;
vst1q_u16(dst, s0);
vst1q_u16(dst + 8, s1);
vst1q_u16(dst + 16, s2);
vst1q_u16(dst + 24, s3);
vst1q_u16(dst + 32, s4);
vst1q_u16(dst + 40, s5);
vst1q_u16(dst + 48, s6);
vst1q_u16(dst + 56, s7);
dst += dst_stride;
vst4q_u16(dst, vld4q_u16(src));
vst4q_u16(dst + 32, vld4q_u16(src + 32));
src += src_stride;
dst += dst_stride;
vst4q_u16(dst, vld4q_u16(src));
vst4q_u16(dst + 32, vld4q_u16(src + 32));
src += src_stride;
dst += dst_stride;
vst4q_u16(dst, vld4q_u16(src));
vst4q_u16(dst + 32, vld4q_u16(src + 32));
src += src_stride;
dst += dst_stride;
h -= 4;
} while (h > 0);
} while (--h != 0);
}
}


@ -126,6 +126,20 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
return vreinterpret_u8_u32(a_u32);
}
// Load 2 sets of 8 bytes when alignment is not guaranteed.
static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf,
ptrdiff_t stride) {
uint64_t a;
uint64x2_t a_u64;
if (stride == 4) return vld1q_u16(buf);
memcpy(&a, buf, 8);
buf += stride;
a_u64 = vdupq_n_u64(a);
memcpy(&a, buf, 8);
a_u64 = vsetq_lane_u64(a, a_u64, 1);
return vreinterpretq_u16_u64(a_u64);
}
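// The helper above reads two rows of four 16-bit pixels via memcpy into a
// 64-bit scalar, keeping the loads well-defined when buf is not 8-byte
// aligned; when the stride is 4 the rows are contiguous and a single
// vld1q_u16 suffices.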
// Store 2 sets of 4 bytes when alignment is not guaranteed.
static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
const uint8x8_t a) {


@ -17,633 +17,316 @@
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/sum_neon.h"
static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0,
const void *const buf1) {
uint32_t a;
uint32x2_t aa;
memcpy(&a, buf0, 4);
aa = vdup_n_u32(a);
memcpy(&a, buf1, 4);
aa = vset_lane_u32(a, aa, 1);
return vreinterpret_u8_u32(aa);
#if defined(__ARM_FEATURE_DOTPROD)
static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
uint32x4_t *const sad_sum) {
uint8x16_t abs_diff = vabdq_u8(src, ref);
*sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
}
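// With the Arm dot-product extension (__ARM_FEATURE_DOTPROD), vdotq_u32
// against an all-ones vector folds each group of four byte absolute
// differences directly into a 32-bit accumulator lane, so no 16-bit
// intermediate accumulator can overflow regardless of block height.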
static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride,
const uint8_t *const ref_array[4],
const int ref_stride, const int height,
uint32_t sad_array[4]) {
int i;
uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
#if !defined(__aarch64__)
uint16x4_t a[2];
#endif
uint32x4_t r;
static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
uint32x4_t sum[4];
assert(!((intptr_t)src_ptr % sizeof(uint32_t)));
assert(!(src_stride % sizeof(uint32_t)));
int i = 0;
do {
uint8x16_t s0, s1, s2, s3;
for (i = 0; i < height; ++i) {
const uint8x8_t s = vreinterpret_u8_u32(
vld1_dup_u32((const uint32_t *)(src_ptr + i * src_stride)));
const uint8x8_t ref01 = load_unaligned_2_buffers(
ref_array[0] + i * ref_stride, ref_array[1] + i * ref_stride);
const uint8x8_t ref23 = load_unaligned_2_buffers(
ref_array[2] + i * ref_stride, ref_array[3] + i * ref_stride);
abs[0] = vabal_u8(abs[0], s, ref01);
abs[1] = vabal_u8(abs[1], s, ref23);
}
s0 = vld1q_u8(src + i * src_stride);
sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
#if defined(__aarch64__)
abs[0] = vpaddq_u16(abs[0], abs[1]);
r = vpaddlq_u16(abs[0]);
#else
a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0]));
a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1]));
r = vpaddlq_u16(vcombine_u16(a[0], a[1]));
#endif
vst1q_u32(sad_array, r);
s1 = vld1q_u8(src + i * src_stride + 16);
sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
s2 = vld1q_u8(src + i * src_stride + 32);
sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
s3 = vld1q_u8(src + i * src_stride + 48);
sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
i++;
} while (i < h);
sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
}
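// Columns 0-15 and 32-47 of each 64-wide row accumulate into sum_lo while
// columns 16-31 and 48-63 accumulate into sum_hi, keeping two independent
// accumulator chains in flight before the final reduction.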
void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, sad_array);
static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
uint32x4_t sum[4];
int i = 0;
do {
uint8x16_t s0, s1;
s0 = vld1q_u8(src + i * src_stride);
sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
s1 = vld1q_u8(src + i * src_stride + 16);
sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
i++;
} while (i < h);
sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
}
void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, sad_array);
}
////////////////////////////////////////////////////////////////////////////////
// Can handle 512 pixels' sad sum (such as 16x32 or 32x16)
static INLINE void sad_512_pel_final_neon(const uint16x8_t sum[4],
uint32_t sad_array[4]) {
#if defined(__aarch64__)
const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
const uint16x8_t b0 = vpaddq_u16(a0, a1);
const uint32x4_t r = vpaddlq_u16(b0);
#else
const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
const uint16x4_t b0 = vpadd_u16(a0, a1);
const uint16x4_t b1 = vpadd_u16(a2, a3);
const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1));
#endif
vst1q_u32(sad_array, r);
}
#if defined(__arm__) || !defined(__ARM_FEATURE_DOTPROD)
// Can handle 1024 pixels' sad sum (such as 32x32)
static INLINE void sad_1024_pel_final_neon(const uint16x8_t sum[4],
uint32_t sad_array[4]) {
#if defined(__aarch64__)
const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
const uint32x4_t b0 = vpaddlq_u16(a0);
const uint32x4_t b1 = vpaddlq_u16(a1);
const uint32x4_t r = vpaddq_u32(b0, b1);
vst1q_u32(sad_array, r);
#else
const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
const uint16x4_t a3 = vpadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
const uint32x4_t b0 = vpaddlq_u16(vcombine_u16(a0, a1));
const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3));
const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0));
const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1));
vst1q_u32(sad_array, vcombine_u32(c0, c1));
#endif
}
// Can handle 2048 pixels' sad sum (such as 32x64 or 64x32)
static INLINE void sad_2048_pel_final_neon(const uint16x8_t sum[4],
uint32_t sad_array[4]) {
#if defined(__aarch64__)
const uint32x4_t a0 = vpaddlq_u16(sum[0]);
const uint32x4_t a1 = vpaddlq_u16(sum[1]);
const uint32x4_t a2 = vpaddlq_u16(sum[2]);
const uint32x4_t a3 = vpaddlq_u16(sum[3]);
const uint32x4_t b0 = vpaddq_u32(a0, a1);
const uint32x4_t b1 = vpaddq_u32(a2, a3);
const uint32x4_t r = vpaddq_u32(b0, b1);
vst1q_u32(sad_array, r);
#else
const uint32x4_t a0 = vpaddlq_u16(sum[0]);
const uint32x4_t a1 = vpaddlq_u16(sum[1]);
const uint32x4_t a2 = vpaddlq_u16(sum[2]);
const uint32x4_t a3 = vpaddlq_u16(sum[3]);
const uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0));
const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1));
const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2));
const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3));
const uint32x2_t c0 = vpadd_u32(b0, b1);
const uint32x2_t c1 = vpadd_u32(b2, b3);
vst1q_u32(sad_array, vcombine_u32(c0, c1));
#endif
}
// Can handle 4096 pixels' sad sum (such as 64x64)
static INLINE void sad_4096_pel_final_neon(const uint16x8_t sum[8],
uint32_t sad_array[4]) {
#if defined(__aarch64__)
const uint32x4_t a0 = vpaddlq_u16(sum[0]);
const uint32x4_t a1 = vpaddlq_u16(sum[1]);
const uint32x4_t a2 = vpaddlq_u16(sum[2]);
const uint32x4_t a3 = vpaddlq_u16(sum[3]);
const uint32x4_t a4 = vpaddlq_u16(sum[4]);
const uint32x4_t a5 = vpaddlq_u16(sum[5]);
const uint32x4_t a6 = vpaddlq_u16(sum[6]);
const uint32x4_t a7 = vpaddlq_u16(sum[7]);
const uint32x4_t b0 = vaddq_u32(a0, a1);
const uint32x4_t b1 = vaddq_u32(a2, a3);
const uint32x4_t b2 = vaddq_u32(a4, a5);
const uint32x4_t b3 = vaddq_u32(a6, a7);
const uint32x4_t c0 = vpaddq_u32(b0, b1);
const uint32x4_t c1 = vpaddq_u32(b2, b3);
const uint32x4_t r = vpaddq_u32(c0, c1);
vst1q_u32(sad_array, r);
#else
const uint32x4_t a0 = vpaddlq_u16(sum[0]);
const uint32x4_t a1 = vpaddlq_u16(sum[1]);
const uint32x4_t a2 = vpaddlq_u16(sum[2]);
const uint32x4_t a3 = vpaddlq_u16(sum[3]);
const uint32x4_t a4 = vpaddlq_u16(sum[4]);
const uint32x4_t a5 = vpaddlq_u16(sum[5]);
const uint32x4_t a6 = vpaddlq_u16(sum[6]);
const uint32x4_t a7 = vpaddlq_u16(sum[7]);
const uint32x4_t b0 = vaddq_u32(a0, a1);
const uint32x4_t b1 = vaddq_u32(a2, a3);
const uint32x4_t b2 = vaddq_u32(a4, a5);
const uint32x4_t b3 = vaddq_u32(a6, a7);
const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));
const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));
const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));
const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));
const uint32x2_t d0 = vpadd_u32(c0, c1);
const uint32x2_t d1 = vpadd_u32(c2, c3);
vst1q_u32(sad_array, vcombine_u32(d0, d1));
#endif
}
#endif
static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4], const int height) {
int i, j;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
for (i = 0; i < height; ++i) {
const uint8x8_t s = vld1_u8(src_ptr);
src_ptr += src_stride;
for (j = 0; j < 4; ++j) {
const uint8x8_t b_u8 = vld1_u8(ref_loop[j]);
ref_loop[j] += ref_stride;
sum[j] = vabal_u8(sum[j], s, b_u8);
}
}
sad_512_pel_final_neon(sum, sad_array);
}
void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 4);
}
void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8);
}
void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16);
}
////////////////////////////////////////////////////////////////////////////////
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
uint32x4_t *const sum) {
const uint8x16_t r = vld1q_u8(ref_ptr);
const uint8x16_t diff = vabdq_u8(src_ptr, r);
*sum = vdotq_u32(*sum, diff, vdupq_n_u8(1));
}
static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4], const int height) {
int i;
uint32x4_t r0, r1;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
for (i = 0; i < height; ++i) {
const uint8x16_t s = vld1q_u8(src_ptr + i * src_stride);
sad16_neon(ref_loop[0] + i * ref_stride, s, &sum[0]);
sad16_neon(ref_loop[1] + i * ref_stride, s, &sum[1]);
sad16_neon(ref_loop[2] + i * ref_stride, s, &sum[2]);
sad16_neon(ref_loop[3] + i * ref_stride, s, &sum[3]);
}
int i = 0;
do {
const uint8x16_t s = vld1q_u8(src + i * src_stride);
sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
r0 = vpaddq_u32(sum[0], sum[1]);
r1 = vpaddq_u32(sum[2], sum[3]);
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
i++;
} while (i < h);
vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
}
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
#else // !defined(__ARM_FEATURE_DOTPROD)
static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
uint16x8_t *const sum) {
const uint8x16_t r = vld1q_u8(ref_ptr);
*sum = vabal_u8(*sum, vget_low_u8(src_ptr), vget_low_u8(r));
*sum = vabal_u8(*sum, vget_high_u8(src_ptr), vget_high_u8(r));
static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
uint16x8_t *const sad_sum) {
uint8x16_t abs_diff = vabdq_u8(src, ref);
*sad_sum = vpadalq_u8(*sad_sum, abs_diff);
}
static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4], const int height) {
int i;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
int h_tmp = h > 64 ? 64 : h;
int i = 0;
vst1q_u32(res, vdupq_n_u32(0));
do {
uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
do {
uint8x16_t s0, s1, s2, s3;
s0 = vld1q_u8(src + i * src_stride);
sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
s1 = vld1q_u8(src + i * src_stride + 16);
sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
s2 = vld1q_u8(src + i * src_stride + 32);
sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
s3 = vld1q_u8(src + i * src_stride + 48);
sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
i++;
} while (i < h_tmp);
res[0] += horizontal_long_add_uint16x8(sum_lo[0], sum_hi[0]);
res[1] += horizontal_long_add_uint16x8(sum_lo[1], sum_hi[1]);
res[2] += horizontal_long_add_uint16x8(sum_lo[2], sum_hi[2]);
res[3] += horizontal_long_add_uint16x8(sum_lo[3], sum_hi[3]);
h_tmp += 64;
} while (i < h);
}
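// Without the dot-product extension the absolute differences accumulate in
// 16-bit lanes (vpadalq_u8), so the 64-wide path above flushes the running
// totals into res[] every 64 rows (h_tmp) to keep the uint16 accumulators
// from overflowing.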
static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
int i = 0;
do {
uint8x16_t s0, s1;
s0 = vld1q_u8(src + i * src_stride);
sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
s1 = vld1q_u8(src + i * src_stride + 16);
sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
i++;
} while (i < h);
res[0] = horizontal_long_add_uint16x8(sum_lo[0], sum_hi[0]);
res[1] = horizontal_long_add_uint16x8(sum_lo[1], sum_hi[1]);
res[2] = horizontal_long_add_uint16x8(sum_lo[2], sum_hi[2]);
res[3] = horizontal_long_add_uint16x8(sum_lo[3], sum_hi[3]);
}
static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
for (i = 0; i < height; ++i) {
const uint8x16_t s = vld1q_u8(src_ptr);
src_ptr += src_stride;
/* Manual unrolling here stops the compiler from getting confused. */
sad16_neon(ref_loop[0], s, &sum[0]);
ref_loop[0] += ref_stride;
sad16_neon(ref_loop[1], s, &sum[1]);
ref_loop[1] += ref_stride;
sad16_neon(ref_loop[2], s, &sum[2]);
ref_loop[2] += ref_stride;
sad16_neon(ref_loop[3], s, &sum[3]);
ref_loop[3] += ref_stride;
}
int i = 0;
do {
const uint8x16_t s = vld1q_u8(src + i * src_stride);
sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
sad_512_pel_final_neon(sum, sad_array);
i++;
} while (i < h);
res[0] = horizontal_add_uint16x8(sum[0]);
res[1] = horizontal_add_uint16x8(sum[1]);
res[2] = horizontal_add_uint16x8(sum[2]);
res[3] = horizontal_add_uint16x8(sum[3]);
}
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
#endif // defined(__ARM_FEATURE_DOTPROD)
void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8);
static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref,
uint16x8_t *const sad_sum) {
uint8x8_t abs_diff = vabd_u8(src, ref);
*sad_sum = vaddw_u8(*sad_sum, abs_diff);
}
void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16);
}
void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32);
}
////////////////////////////////////////////////////////////////////////////////
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4], const int height) {
int i;
uint32x4_t r0, r1;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
for (i = 0; i < height; ++i) {
uint8x16_t s;
s = vld1q_u8(src_ptr + 0 * 16);
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
s = vld1q_u8(src_ptr + 1 * 16);
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
src_ptr += src_stride;
ref_loop[0] += ref_stride;
ref_loop[1] += ref_stride;
ref_loop[2] += ref_stride;
ref_loop[3] += ref_stride;
}
r0 = vpaddq_u32(sum[0], sum[1]);
r1 = vpaddq_u32(sum[2], sum[3]);
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
}
void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16);
}
void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32);
}
void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64);
}
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
const int height, uint16x8_t *const sum) {
int i;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0);
for (i = 0; i < height; ++i) {
uint8x16_t s;
s = vld1q_u8(src_ptr + 0 * 16);
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
s = vld1q_u8(src_ptr + 1 * 16);
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
src_ptr += src_stride;
ref_loop[0] += ref_stride;
ref_loop[1] += ref_stride;
ref_loop[2] += ref_stride;
ref_loop[3] += ref_stride;
}
}
void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
uint16x8_t sum[4];
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum);
sad_512_pel_final_neon(sum, sad_array);
}
void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
uint16x8_t sum[4];
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum);
sad_1024_pel_final_neon(sum, sad_array);
}
void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
uint16x8_t sum[4];
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum);
sad_2048_pel_final_neon(sum, sad_array);
}
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
////////////////////////////////////////////////////////////////////////////////
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
int i;
uint32x4_t r0, r1;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
for (i = 0; i < 32; ++i) {
uint8x16_t s;
s = vld1q_u8(src_ptr + 0 * 16);
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
s = vld1q_u8(src_ptr + 1 * 16);
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
s = vld1q_u8(src_ptr + 2 * 16);
sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]);
s = vld1q_u8(src_ptr + 3 * 16);
sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]);
src_ptr += src_stride;
ref_loop[0] += ref_stride;
ref_loop[1] += ref_stride;
ref_loop[2] += ref_stride;
ref_loop[3] += ref_stride;
}
r0 = vpaddq_u32(sum[0], sum[1]);
r1 = vpaddq_u32(sum[2], sum[3]);
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
}
void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
int i;
uint32x4_t r0, r1, r2, r3;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
uint32x4_t sum[8] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0), vdupq_n_u32(0) };
for (i = 0; i < 64; ++i) {
uint8x16_t s;
s = vld1q_u8(src_ptr + 0 * 16);
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]);
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]);
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]);
s = vld1q_u8(src_ptr + 1 * 16);
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]);
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]);
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]);
s = vld1q_u8(src_ptr + 2 * 16);
sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]);
sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]);
sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]);
sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]);
s = vld1q_u8(src_ptr + 3 * 16);
sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]);
sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]);
sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]);
sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]);
src_ptr += src_stride;
ref_loop[0] += ref_stride;
ref_loop[1] += ref_stride;
ref_loop[2] += ref_stride;
ref_loop[3] += ref_stride;
}
r0 = vpaddq_u32(sum[0], sum[1]);
r1 = vpaddq_u32(sum[2], sum[3]);
r2 = vpaddq_u32(sum[4], sum[5]);
r3 = vpaddq_u32(sum[6], sum[7]);
r0 = vpaddq_u32(r0, r1);
r1 = vpaddq_u32(r2, r3);
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
}
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
int i;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
for (i = 0; i < 32; ++i) {
uint8x16_t s;
int i = 0;
do {
const uint8x8_t s = vld1_u8(src + i * src_stride);
sad8_neon(s, vld1_u8(ref[0] + i * ref_stride), &sum[0]);
sad8_neon(s, vld1_u8(ref[1] + i * ref_stride), &sum[1]);
sad8_neon(s, vld1_u8(ref[2] + i * ref_stride), &sum[2]);
sad8_neon(s, vld1_u8(ref[3] + i * ref_stride), &sum[3]);
s = vld1q_u8(src_ptr + 0 * 16);
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
i++;
} while (i < h);
s = vld1q_u8(src_ptr + 1 * 16);
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
s = vld1q_u8(src_ptr + 2 * 16);
sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]);
s = vld1q_u8(src_ptr + 3 * 16);
sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]);
src_ptr += src_stride;
ref_loop[0] += ref_stride;
ref_loop[1] += ref_stride;
ref_loop[2] += ref_stride;
ref_loop[3] += ref_stride;
}
sad_2048_pel_final_neon(sum, sad_array);
res[0] = horizontal_add_uint16x8(sum[0]);
res[1] = horizontal_add_uint16x8(sum[1]);
res[2] = horizontal_add_uint16x8(sum[2]);
res[3] = horizontal_add_uint16x8(sum[3]);
}
void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
int i;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0), vdupq_n_u16(0) };
static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
for (i = 0; i < 64; ++i) {
uint8x16_t s;
int i = 0;
do {
uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride);
uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride);
uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride);
uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride);
uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride);
s = vld1q_u8(src_ptr + 0 * 16);
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]);
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]);
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]);
sad8_neon(s, r0, &sum[0]);
sad8_neon(s, r1, &sum[1]);
sad8_neon(s, r2, &sum[2]);
sad8_neon(s, r3, &sum[3]);
s = vld1q_u8(src_ptr + 1 * 16);
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]);
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]);
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]);
i += 2;
} while (i < h);
s = vld1q_u8(src_ptr + 2 * 16);
sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]);
sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]);
sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]);
sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]);
s = vld1q_u8(src_ptr + 3 * 16);
sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]);
sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]);
sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]);
sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]);
src_ptr += src_stride;
ref_loop[0] += ref_stride;
ref_loop[1] += ref_stride;
ref_loop[2] += ref_stride;
ref_loop[3] += ref_stride;
}
sad_4096_pel_final_neon(sum, sad_array);
res[0] = horizontal_add_uint16x8(sum[0]);
res[1] = horizontal_add_uint16x8(sum[1]);
res[2] = horizontal_add_uint16x8(sum[2]);
res[3] = horizontal_add_uint16x8(sum[3]);
}
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
#define SAD_WXH_4D_NEON(w, h) \
void vpx_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \
const uint8_t *const ref[4], int ref_stride, \
uint32_t res[4]) { \
sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \
}
SAD_WXH_4D_NEON(4, 4)
SAD_WXH_4D_NEON(4, 8)
SAD_WXH_4D_NEON(8, 4)
SAD_WXH_4D_NEON(8, 8)
SAD_WXH_4D_NEON(8, 16)
SAD_WXH_4D_NEON(16, 8)
SAD_WXH_4D_NEON(16, 16)
SAD_WXH_4D_NEON(16, 32)
SAD_WXH_4D_NEON(32, 16)
SAD_WXH_4D_NEON(32, 32)
SAD_WXH_4D_NEON(32, 64)
SAD_WXH_4D_NEON(64, 32)
SAD_WXH_4D_NEON(64, 64)
#undef SAD_WXH_4D_NEON
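// For illustration, SAD_WXH_4D_NEON(16, 16) above expands to:
//
//   void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
//                             const uint8_t *const ref[4], int ref_stride,
//                             uint32_t res[4]) {
//     sad16xhx4d_neon(src, src_stride, ref, ref_stride, res, (16));
//   }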


@ -214,24 +214,13 @@ static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
int i = h / 2;
do {
uint32x2_t s, r;
uint32_t s0, s1, r0, r1;
uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
memcpy(&s0, src_ptr, 4);
memcpy(&r0, ref_ptr, 4);
s = vdup_n_u32(s0);
r = vdup_n_u32(r0);
src_ptr += src_stride;
ref_ptr += ref_stride;
sum = vabal_u8(sum, s, r);
memcpy(&s1, src_ptr, 4);
memcpy(&r1, ref_ptr, 4);
s = vset_lane_u32(s1, s, 1);
r = vset_lane_u32(r1, r, 1);
src_ptr += src_stride;
ref_ptr += ref_stride;
sum = vabal_u8(sum, vreinterpret_u8_u32(s), vreinterpret_u8_u32(r));
src_ptr += 2 * src_stride;
ref_ptr += 2 * ref_stride;
} while (--i != 0);
return horizontal_add_uint16x8(sum);
@ -509,28 +498,15 @@ static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr,
int i = h / 2;
do {
uint32x2_t s, r;
uint32_t s0, s1, r0, r1;
uint8x8_t p, avg;
uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
uint8x8_t p = vld1_u8(second_pred);
memcpy(&s0, src_ptr, 4);
memcpy(&r0, ref_ptr, 4);
s = vdup_n_u32(s0);
r = vdup_n_u32(r0);
src_ptr += src_stride;
ref_ptr += ref_stride;
uint8x8_t avg = vrhadd_u8(r, p);
sum = vabal_u8(sum, s, avg);
memcpy(&s1, src_ptr, 4);
memcpy(&r1, ref_ptr, 4);
s = vset_lane_u32(s1, s, 1);
r = vset_lane_u32(r1, r, 1);
src_ptr += src_stride;
ref_ptr += ref_stride;
p = vld1_u8(second_pred);
avg = vrhadd_u8(vreinterpret_u8_u32(r), p);
sum = vabal_u8(sum, vreinterpret_u8_u32(s), avg);
src_ptr += 2 * src_stride;
ref_ptr += 2 * ref_stride;
second_pred += 8;
} while (--i != 0);


@ -40,6 +40,23 @@ static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) {
#endif
}
static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo,
const uint16x8_t vec_hi) {
#if defined(__aarch64__)
return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
#else
const uint32x4_t vec_l_lo =
vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
const uint32x4_t vec_l_hi =
vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
const uint64x2_t b = vpaddlq_u32(a);
const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
vreinterpret_u32_u64(vget_high_u64(b)));
return vget_lane_u32(c, 0);
#endif
}
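// horizontal_long_add_uint16x8 widens two uint16x8 accumulators and returns
// their combined scalar total; on AArch64 each reduction is a single
// vaddlvq_u16.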
static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) {
#if defined(__aarch64__)
return vaddv_s32(a);
@ -77,4 +94,20 @@ static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
return vget_lane_u32(c, 0);
#endif
}
static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) {
#if defined(__aarch64__)
uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]);
uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]);
return vpaddq_u32(res01, res23);
#else
uint32x4_t res = vdupq_n_u32(0);
res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0);
res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1);
res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2);
res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3);
return res;
#endif
}
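// horizontal_add_4d_uint32x4 reduces four independent vector accumulators to
// one uint32x4_t holding a total per lane: on AArch64 three pairwise adds
// (vpaddq_u32) perform the whole reduction, while the Armv7 fallback sums
// each vector separately and packs the scalars back into lanes.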
#endif // VPX_VPX_DSP_ARM_SUM_NEON_H_


@ -23,10 +23,17 @@
// b0.val[1]: 04 05 06 07 20 21 22 23
static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
int16x8x2_t b0;
#if defined(__aarch64__)
b0.val[0] = vreinterpretq_s16_s64(
vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
b0.val[1] = vreinterpretq_s16_s64(
vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
#else
b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
vreinterpret_s16_s32(vget_low_s32(a1)));
b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
vreinterpret_s16_s32(vget_high_s32(a1)));
#endif
return b0;
}
@ -57,10 +64,17 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
uint16x8x2_t b0;
#if defined(__aarch64__)
b0.val[0] = vreinterpretq_u16_u64(
vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
b0.val[1] = vreinterpretq_u16_u64(
vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
vreinterpret_u16_u32(vget_low_u32(a1)));
b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
vreinterpret_u16_u32(vget_high_u32(a1)));
#endif
return b0;
}
@ -569,37 +583,73 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
}
// Transpose 8x8 to a new location.
static INLINE void transpose_s16_8x8_new(const int16x8_t *a, int16x8_t *b) {
// Swap 16 bit elements.
const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) {
// Swap 16 bit elements. Goes from:
// a0: 00 01 02 03 04 05 06 07
// a1: 10 11 12 13 14 15 16 17
// a2: 20 21 22 23 24 25 26 27
// a3: 30 31 32 33 34 35 36 37
// a4: 40 41 42 43 44 45 46 47
// a5: 50 51 52 53 54 55 56 57
// a6: 60 61 62 63 64 65 66 67
// a7: 70 71 72 73 74 75 76 77
// to:
// b0.val[0]: 00 10 02 12 04 14 06 16
// b0.val[1]: 01 11 03 13 05 15 07 17
// b1.val[0]: 20 30 22 32 24 34 26 36
// b1.val[1]: 21 31 23 33 25 35 27 37
// b2.val[0]: 40 50 42 52 44 54 46 56
// b2.val[1]: 41 51 43 53 45 55 47 57
// b3.val[0]: 60 70 62 72 64 74 66 76
// b3.val[1]: 61 71 63 73 65 75 67 77
// Swap 32 bit elements.
const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
vreinterpretq_s32_s16(c1.val[0]));
const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
vreinterpretq_s32_s16(c1.val[1]));
const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
vreinterpretq_s32_s16(c3.val[0]));
const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
vreinterpretq_s32_s16(c3.val[1]));
const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);
// Swap 64 bit elements
const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
// Swap 32 bit elements resulting in:
// c0.val[0]: 00 10 20 30 04 14 24 34
// c0.val[1]: 02 12 22 32 06 16 26 36
// c1.val[0]: 01 11 21 31 05 15 25 35
// c1.val[1]: 03 13 23 33 07 17 27 37
// c2.val[0]: 40 50 60 70 44 54 64 74
// c2.val[1]: 42 52 62 72 46 56 66 76
// c3.val[0]: 41 51 61 71 45 55 65 75
// c3.val[1]: 43 53 63 73 47 57 67 77
b[0] = e0.val[0];
b[1] = e1.val[0];
b[2] = e2.val[0];
b[3] = e3.val[0];
b[4] = e0.val[1];
b[5] = e1.val[1];
b[6] = e2.val[1];
b[7] = e3.val[1];
const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
vreinterpretq_s32_s16(b1.val[0]));
const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
vreinterpretq_s32_s16(b1.val[1]));
const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
vreinterpretq_s32_s16(b3.val[0]));
const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
vreinterpretq_s32_s16(b3.val[1]));
// Swap 64 bit elements resulting in:
// d0.val[0]: 00 10 20 30 40 50 60 70
// d0.val[1]: 04 14 24 34 44 54 64 74
// d1.val[0]: 01 11 21 31 41 51 61 71
// d1.val[1]: 05 15 25 35 45 55 65 75
// d2.val[0]: 02 12 22 32 42 52 62 72
// d2.val[1]: 06 16 26 36 46 56 66 76
// d3.val[0]: 03 13 23 33 43 53 63 73
// d3.val[1]: 07 17 27 37 47 57 67 77
const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
out[0] = d0.val[0];
out[1] = d1.val[0];
out[2] = d2.val[0];
out[3] = d3.val[0];
out[4] = d0.val[1];
out[5] = d1.val[1];
out[6] = d2.val[1];
out[7] = d3.val[1];
}
static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
@ -658,6 +708,7 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
// d2.val[1]: 06 16 26 36 46 56 66 76
// d3.val[0]: 03 13 23 33 43 53 63 73
// d3.val[1]: 07 17 27 37 47 57 67 77
const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
@ -729,6 +780,7 @@ static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
// d2.val[1]: 06 16 26 36 46 56 66 76
// d3.val[0]: 03 13 23 33 43 53 63 73
// d3.val[1]: 07 17 27 37 47 57 67 77
const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);


@ -26,57 +26,44 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) {
/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
* and highbd_8_variance(). It should not.
*/
static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
int b_stride, int w, int h, unsigned int *sse,
int *sum) {
static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b,
int b_stride, int w, int h) {
int i, j;
*sum = 0;
*sse = 0;
int64_t sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
return sse;
}
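/* Plain C accumulation of the sum of squared differences over a w x h block;
 * the 64-bit accumulator avoids overflow for large block areas. */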
#if CONFIG_VP9_HIGHBITDEPTH
static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride, int w,
int h, uint64_t *sse, int64_t *sum) {
static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride, int w,
int h) {
int i, j;
int64_t sse = 0;
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
*sum = 0;
*sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
}
static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride, int w,
int h, unsigned int *sse, int *sum) {
uint64_t sse_long = 0;
int64_t sum_long = 0;
encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long,
&sum_long);
*sse = (unsigned int)sse_long;
*sum = (int)sum_long;
return sse;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
@ -85,26 +72,23 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
const int dw = width % 16;
const int dh = height % 16;
int64_t total_sse = 0;
unsigned int sse = 0;
int sum = 0;
int x, y;
if (dw > 0) {
encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw,
height, &sse, &sum);
total_sse += sse;
total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride,
dw, height);
}
if (dh > 0) {
encoder_variance(&a[(height - dh) * a_stride], a_stride,
&b[(height - dh) * b_stride], b_stride, width - dw, dh,
&sse, &sum);
total_sse += sse;
total_sse +=
encoder_sse(&a[(height - dh) * a_stride], a_stride,
&b[(height - dh) * b_stride], b_stride, width - dw, dh);
}
for (y = 0; y < height / 16; ++y) {
const uint8_t *pa = a;
const uint8_t *pb = b;
unsigned int sse;
for (x = 0; x < width / 16; ++x) {
vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
total_sse += sse;
@ -146,22 +130,19 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
int x, y;
const int dw = width % 16;
const int dh = height % 16;
unsigned int sse = 0;
int sum = 0;
if (dw > 0) {
encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw],
b_stride, dw, height, &sse, &sum);
total_sse += sse;
total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw],
b_stride, dw, height);
}
if (dh > 0) {
encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
&b[(height - dh) * b_stride], b_stride,
width - dw, dh, &sse, &sum);
total_sse += sse;
total_sse += encoder_highbd_8_sse(&a[(height - dh) * a_stride], a_stride,
&b[(height - dh) * b_stride], b_stride,
width - dw, dh);
}
for (y = 0; y < height / 16; ++y) {
const uint8_t *pa = a;
const uint8_t *pb = b;
unsigned int sse;
for (x = 0; x < width / 16; ++x) {
vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
total_sse += sse;


@ -392,6 +392,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad4d_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c
DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c
@ -432,7 +433,9 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC


@ -392,6 +392,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad4d_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c
DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c
@ -432,7 +433,9 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC


@ -1148,6 +1148,7 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *_img) {
_img->fmt = _y4m->vpx_fmt;
_img->w = _img->d_w = _y4m->pic_w;
_img->h = _img->d_h = _y4m->pic_h;
_img->bit_depth = _y4m->bit_depth;
_img->x_chroma_shift = _y4m->dst_c_dec_h >> 1;
_img->y_chroma_shift = _y4m->dst_c_dec_v >> 1;
_img->bps = _y4m->bps;


@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 5c38ffbfa3aba5ea4d8d0ae05a50cc76ec99bed9 (Thu Jan 26 21:31:14 2023).
release: bc2965ff72af7d7b21ffeab10549fcc67ed66ccf (Tue Feb 14 02:46:51 2023).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 5c38ffbfa3aba5ea4d8d0ae05a50cc76ec99bed9
revision: bc2965ff72af7d7b21ffeab10549fcc67ed66ccf
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/


@ -181,6 +181,7 @@ files = {
'libvpx/vp9/encoder/vp9_svc_layercontext.c',
'libvpx/vp9/encoder/vp9_temporal_filter.c',
'libvpx/vp9/encoder/vp9_tokenize.c',
'libvpx/vp9/encoder/vp9_tpl_model.c',
'libvpx/vp9/encoder/vp9_treewriter.c',
'libvpx/vp9/encoder/x86/temporal_filter_sse4.c',
'libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c',
@ -450,6 +451,7 @@ files = {
'libvpx/vp9/encoder/vp9_svc_layercontext.c',
'libvpx/vp9/encoder/vp9_temporal_filter.c',
'libvpx/vp9/encoder/vp9_tokenize.c',
'libvpx/vp9/encoder/vp9_tpl_model.c',
'libvpx/vp9/encoder/vp9_treewriter.c',
'libvpx/vp9/encoder/x86/temporal_filter_sse4.c',
'libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c',
@ -699,6 +701,7 @@ files = {
'libvpx/vp9/encoder/vp9_subexp.c',
'libvpx/vp9/encoder/vp9_svc_layercontext.c',
'libvpx/vp9/encoder/vp9_tokenize.c',
'libvpx/vp9/encoder/vp9_tpl_model.c',
'libvpx/vp9/encoder/vp9_treewriter.c',
'libvpx/vp9/vp9_cx_iface.c',
'libvpx/vp9/vp9_dx_iface.c',
@ -944,6 +947,7 @@ files = {
'libvpx/vp9/encoder/vp9_subexp.c',
'libvpx/vp9/encoder/vp9_svc_layercontext.c',
'libvpx/vp9/encoder/vp9_tokenize.c',
'libvpx/vp9/encoder/vp9_tpl_model.c',
'libvpx/vp9/encoder/vp9_treewriter.c',
'libvpx/vp9/vp9_cx_iface.c',
'libvpx/vp9/vp9_dx_iface.c',
@ -1158,6 +1162,7 @@ files = {
'libvpx/vp9/encoder/vp9_svc_layercontext.c',
'libvpx/vp9/encoder/vp9_temporal_filter.c',
'libvpx/vp9/encoder/vp9_tokenize.c',
'libvpx/vp9/encoder/vp9_tpl_model.c',
'libvpx/vp9/encoder/vp9_treewriter.c',
'libvpx/vp9/vp9_cx_iface.c',
'libvpx/vp9/vp9_dx_iface.c',