зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1816486 - Update libvpx to bc2965f r=webrtc-reviewers,ng
Run `./mach vendor media/libvpx/moz.yaml --patch-mode=none`, as instructed in media/libvpx/README_MOZILLA. The updated libvpx revision is bc2965ff72af7d7b21ffeab10549fcc67ed66ccf. Differential Revision: https://phabricator.services.mozilla.com/D169840
This commit is contained in:
Родитель
bef38fea84
Коммит
1b617be5fa
|
@ -77,7 +77,6 @@
|
|||
.equ CONFIG_MULTI_RES_ENCODING , 1
|
||||
.equ CONFIG_TEMPORAL_DENOISING , 1
|
||||
.equ CONFIG_VP9_TEMPORAL_DENOISING , 0
|
||||
.equ CONFIG_CONSISTENT_RECODE , 0
|
||||
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
|
||||
.equ CONFIG_VP9_HIGHBITDEPTH , 0
|
||||
.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
|
||||
|
@ -90,4 +89,5 @@
|
|||
.equ CONFIG_EMULATE_HARDWARE , 0
|
||||
.equ CONFIG_NON_GREEDY_MV , 0
|
||||
.equ CONFIG_RATE_CTRL , 0
|
||||
.equ CONFIG_COLLECT_COMPONENT_TIMING , 0
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
|
|
|
@ -86,7 +86,6 @@
|
|||
#define CONFIG_MULTI_RES_ENCODING 1
|
||||
#define CONFIG_TEMPORAL_DENOISING 1
|
||||
#define CONFIG_VP9_TEMPORAL_DENOISING 0
|
||||
#define CONFIG_CONSISTENT_RECODE 0
|
||||
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
|
||||
#define CONFIG_VP9_HIGHBITDEPTH 0
|
||||
#define CONFIG_BETTER_HW_COMPATIBILITY 0
|
||||
|
@ -99,6 +98,7 @@
|
|||
#define CONFIG_EMULATE_HARDWARE 0
|
||||
#define CONFIG_NON_GREEDY_MV 0
|
||||
#define CONFIG_RATE_CTRL 0
|
||||
#define CONFIG_COLLECT_COMPONENT_TIMING 0
|
||||
#define DECODE_WIDTH_LIMIT 8192
|
||||
#define DECODE_HEIGHT_LIMIT 4608
|
||||
#endif /* VPX_CONFIG_H */
|
||||
|
|
|
@ -77,7 +77,6 @@
|
|||
.equ CONFIG_MULTI_RES_ENCODING , 1
|
||||
.equ CONFIG_TEMPORAL_DENOISING , 1
|
||||
.equ CONFIG_VP9_TEMPORAL_DENOISING , 0
|
||||
.equ CONFIG_CONSISTENT_RECODE , 0
|
||||
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
|
||||
.equ CONFIG_VP9_HIGHBITDEPTH , 0
|
||||
.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
|
||||
|
@ -90,4 +89,5 @@
|
|||
.equ CONFIG_EMULATE_HARDWARE , 0
|
||||
.equ CONFIG_NON_GREEDY_MV , 0
|
||||
.equ CONFIG_RATE_CTRL , 0
|
||||
.equ CONFIG_COLLECT_COMPONENT_TIMING , 0
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
|
|
|
@ -86,7 +86,6 @@
|
|||
#define CONFIG_MULTI_RES_ENCODING 1
|
||||
#define CONFIG_TEMPORAL_DENOISING 1
|
||||
#define CONFIG_VP9_TEMPORAL_DENOISING 0
|
||||
#define CONFIG_CONSISTENT_RECODE 0
|
||||
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
|
||||
#define CONFIG_VP9_HIGHBITDEPTH 0
|
||||
#define CONFIG_BETTER_HW_COMPATIBILITY 0
|
||||
|
@ -99,6 +98,7 @@
|
|||
#define CONFIG_EMULATE_HARDWARE 0
|
||||
#define CONFIG_NON_GREEDY_MV 0
|
||||
#define CONFIG_RATE_CTRL 0
|
||||
#define CONFIG_COLLECT_COMPONENT_TIMING 0
|
||||
#define DECODE_WIDTH_LIMIT 8192
|
||||
#define DECODE_HEIGHT_LIMIT 4608
|
||||
#endif /* VPX_CONFIG_H */
|
||||
|
|
|
@ -77,7 +77,6 @@
|
|||
.equ CONFIG_MULTI_RES_ENCODING , 1
|
||||
.equ CONFIG_TEMPORAL_DENOISING , 1
|
||||
.equ CONFIG_VP9_TEMPORAL_DENOISING , 0
|
||||
.equ CONFIG_CONSISTENT_RECODE , 0
|
||||
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
|
||||
.equ CONFIG_VP9_HIGHBITDEPTH , 0
|
||||
.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
|
||||
|
@ -90,4 +89,5 @@
|
|||
.equ CONFIG_EMULATE_HARDWARE , 0
|
||||
.equ CONFIG_NON_GREEDY_MV , 0
|
||||
.equ CONFIG_RATE_CTRL , 0
|
||||
.equ CONFIG_COLLECT_COMPONENT_TIMING , 0
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
|
|
|
@ -86,7 +86,6 @@
|
|||
#define CONFIG_MULTI_RES_ENCODING 1
|
||||
#define CONFIG_TEMPORAL_DENOISING 1
|
||||
#define CONFIG_VP9_TEMPORAL_DENOISING 0
|
||||
#define CONFIG_CONSISTENT_RECODE 0
|
||||
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
|
||||
#define CONFIG_VP9_HIGHBITDEPTH 0
|
||||
#define CONFIG_BETTER_HW_COMPATIBILITY 0
|
||||
|
@ -99,6 +98,7 @@
|
|||
#define CONFIG_EMULATE_HARDWARE 0
|
||||
#define CONFIG_NON_GREEDY_MV 0
|
||||
#define CONFIG_RATE_CTRL 0
|
||||
#define CONFIG_COLLECT_COMPONENT_TIMING 0
|
||||
#define DECODE_WIDTH_LIMIT 8192
|
||||
#define DECODE_HEIGHT_LIMIT 4608
|
||||
#endif /* VPX_CONFIG_H */
|
||||
|
|
|
@ -74,7 +74,6 @@
|
|||
%define CONFIG_MULTI_RES_ENCODING 1
|
||||
%define CONFIG_TEMPORAL_DENOISING 1
|
||||
%define CONFIG_VP9_TEMPORAL_DENOISING 0
|
||||
%define CONFIG_CONSISTENT_RECODE 0
|
||||
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
|
||||
%define CONFIG_VP9_HIGHBITDEPTH 0
|
||||
%define CONFIG_BETTER_HW_COMPATIBILITY 0
|
||||
|
@ -87,3 +86,4 @@
|
|||
%define CONFIG_EMULATE_HARDWARE 0
|
||||
%define CONFIG_NON_GREEDY_MV 0
|
||||
%define CONFIG_RATE_CTRL 0
|
||||
%define CONFIG_COLLECT_COMPONENT_TIMING 0
|
||||
|
|
|
@ -86,7 +86,6 @@
|
|||
#define CONFIG_MULTI_RES_ENCODING 1
|
||||
#define CONFIG_TEMPORAL_DENOISING 1
|
||||
#define CONFIG_VP9_TEMPORAL_DENOISING 0
|
||||
#define CONFIG_CONSISTENT_RECODE 0
|
||||
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
|
||||
#define CONFIG_VP9_HIGHBITDEPTH 0
|
||||
#define CONFIG_BETTER_HW_COMPATIBILITY 0
|
||||
|
@ -99,6 +98,7 @@
|
|||
#define CONFIG_EMULATE_HARDWARE 0
|
||||
#define CONFIG_NON_GREEDY_MV 0
|
||||
#define CONFIG_RATE_CTRL 0
|
||||
#define CONFIG_COLLECT_COMPONENT_TIMING 0
|
||||
#define DECODE_WIDTH_LIMIT 8192
|
||||
#define DECODE_HEIGHT_LIMIT 4608
|
||||
#endif /* VPX_CONFIG_H */
|
||||
|
|
|
@ -74,7 +74,6 @@
|
|||
%define CONFIG_MULTI_RES_ENCODING 1
|
||||
%define CONFIG_TEMPORAL_DENOISING 1
|
||||
%define CONFIG_VP9_TEMPORAL_DENOISING 0
|
||||
%define CONFIG_CONSISTENT_RECODE 0
|
||||
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
|
||||
%define CONFIG_VP9_HIGHBITDEPTH 0
|
||||
%define CONFIG_BETTER_HW_COMPATIBILITY 0
|
||||
|
@ -87,3 +86,4 @@
|
|||
%define CONFIG_EMULATE_HARDWARE 0
|
||||
%define CONFIG_NON_GREEDY_MV 0
|
||||
%define CONFIG_RATE_CTRL 0
|
||||
%define CONFIG_COLLECT_COMPONENT_TIMING 0
|
||||
|
|
|
@ -86,7 +86,6 @@
|
|||
#define CONFIG_MULTI_RES_ENCODING 1
|
||||
#define CONFIG_TEMPORAL_DENOISING 1
|
||||
#define CONFIG_VP9_TEMPORAL_DENOISING 0
|
||||
#define CONFIG_CONSISTENT_RECODE 0
|
||||
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
|
||||
#define CONFIG_VP9_HIGHBITDEPTH 0
|
||||
#define CONFIG_BETTER_HW_COMPATIBILITY 0
|
||||
|
@ -99,6 +98,7 @@
|
|||
#define CONFIG_EMULATE_HARDWARE 0
|
||||
#define CONFIG_NON_GREEDY_MV 0
|
||||
#define CONFIG_RATE_CTRL 0
|
||||
#define CONFIG_COLLECT_COMPONENT_TIMING 0
|
||||
#define DECODE_WIDTH_LIMIT 8192
|
||||
#define DECODE_HEIGHT_LIMIT 4608
|
||||
#endif /* VPX_CONFIG_H */
|
||||
|
|
|
@ -74,7 +74,6 @@
|
|||
%define CONFIG_MULTI_RES_ENCODING 1
|
||||
%define CONFIG_TEMPORAL_DENOISING 1
|
||||
%define CONFIG_VP9_TEMPORAL_DENOISING 0
|
||||
%define CONFIG_CONSISTENT_RECODE 0
|
||||
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
|
||||
%define CONFIG_VP9_HIGHBITDEPTH 0
|
||||
%define CONFIG_BETTER_HW_COMPATIBILITY 0
|
||||
|
@ -87,3 +86,4 @@
|
|||
%define CONFIG_EMULATE_HARDWARE 0
|
||||
%define CONFIG_NON_GREEDY_MV 0
|
||||
%define CONFIG_RATE_CTRL 0
|
||||
%define CONFIG_COLLECT_COMPONENT_TIMING 0
|
||||
|
|
|
@ -86,7 +86,6 @@
|
|||
#define CONFIG_MULTI_RES_ENCODING 1
|
||||
#define CONFIG_TEMPORAL_DENOISING 1
|
||||
#define CONFIG_VP9_TEMPORAL_DENOISING 0
|
||||
#define CONFIG_CONSISTENT_RECODE 0
|
||||
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
|
||||
#define CONFIG_VP9_HIGHBITDEPTH 0
|
||||
#define CONFIG_BETTER_HW_COMPATIBILITY 0
|
||||
|
@ -99,6 +98,7 @@
|
|||
#define CONFIG_EMULATE_HARDWARE 0
|
||||
#define CONFIG_NON_GREEDY_MV 0
|
||||
#define CONFIG_RATE_CTRL 0
|
||||
#define CONFIG_COLLECT_COMPONENT_TIMING 0
|
||||
#define DECODE_WIDTH_LIMIT 8192
|
||||
#define DECODE_HEIGHT_LIMIT 4608
|
||||
#endif /* VPX_CONFIG_H */
|
||||
|
|
|
@ -74,7 +74,6 @@
|
|||
%define CONFIG_MULTI_RES_ENCODING 1
|
||||
%define CONFIG_TEMPORAL_DENOISING 1
|
||||
%define CONFIG_VP9_TEMPORAL_DENOISING 0
|
||||
%define CONFIG_CONSISTENT_RECODE 0
|
||||
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
|
||||
%define CONFIG_VP9_HIGHBITDEPTH 0
|
||||
%define CONFIG_BETTER_HW_COMPATIBILITY 0
|
||||
|
@ -87,3 +86,4 @@
|
|||
%define CONFIG_EMULATE_HARDWARE 0
|
||||
%define CONFIG_NON_GREEDY_MV 0
|
||||
%define CONFIG_RATE_CTRL 0
|
||||
%define CONFIG_COLLECT_COMPONENT_TIMING 0
|
||||
|
|
|
@ -86,7 +86,6 @@
|
|||
#define CONFIG_MULTI_RES_ENCODING 1
|
||||
#define CONFIG_TEMPORAL_DENOISING 1
|
||||
#define CONFIG_VP9_TEMPORAL_DENOISING 0
|
||||
#define CONFIG_CONSISTENT_RECODE 0
|
||||
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
|
||||
#define CONFIG_VP9_HIGHBITDEPTH 0
|
||||
#define CONFIG_BETTER_HW_COMPATIBILITY 0
|
||||
|
@ -99,6 +98,7 @@
|
|||
#define CONFIG_EMULATE_HARDWARE 0
|
||||
#define CONFIG_NON_GREEDY_MV 0
|
||||
#define CONFIG_RATE_CTRL 0
|
||||
#define CONFIG_COLLECT_COMPONENT_TIMING 0
|
||||
#define DECODE_WIDTH_LIMIT 8192
|
||||
#define DECODE_HEIGHT_LIMIT 4608
|
||||
#endif /* VPX_CONFIG_H */
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
// This file is generated. Do not edit.
|
||||
#define VERSION_MAJOR 1
|
||||
#define VERSION_MINOR 12
|
||||
#define VERSION_MINOR 13
|
||||
#define VERSION_PATCH 0
|
||||
#define VERSION_EXTRA ""
|
||||
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
|
||||
#define VERSION_STRING_NOSP "v1.12.0"
|
||||
#define VERSION_STRING " v1.12.0"
|
||||
#define VERSION_STRING_NOSP "v1.13.0"
|
||||
#define VERSION_STRING " v1.13.0"
|
||||
|
|
|
@ -77,7 +77,6 @@
|
|||
.equ CONFIG_MULTI_RES_ENCODING , 1
|
||||
.equ CONFIG_TEMPORAL_DENOISING , 1
|
||||
.equ CONFIG_VP9_TEMPORAL_DENOISING , 0
|
||||
.equ CONFIG_CONSISTENT_RECODE , 0
|
||||
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
|
||||
.equ CONFIG_VP9_HIGHBITDEPTH , 0
|
||||
.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
|
||||
|
@ -90,4 +89,5 @@
|
|||
.equ CONFIG_EMULATE_HARDWARE , 0
|
||||
.equ CONFIG_NON_GREEDY_MV , 0
|
||||
.equ CONFIG_RATE_CTRL , 0
|
||||
.equ CONFIG_COLLECT_COMPONENT_TIMING , 0
|
||||
.section .note.GNU-stack,"",%progbits
|
||||
|
|
|
@ -86,7 +86,6 @@
|
|||
#define CONFIG_MULTI_RES_ENCODING 1
|
||||
#define CONFIG_TEMPORAL_DENOISING 1
|
||||
#define CONFIG_VP9_TEMPORAL_DENOISING 0
|
||||
#define CONFIG_CONSISTENT_RECODE 0
|
||||
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
|
||||
#define CONFIG_VP9_HIGHBITDEPTH 0
|
||||
#define CONFIG_BETTER_HW_COMPATIBILITY 0
|
||||
|
@ -99,6 +98,7 @@
|
|||
#define CONFIG_EMULATE_HARDWARE 0
|
||||
#define CONFIG_NON_GREEDY_MV 0
|
||||
#define CONFIG_RATE_CTRL 0
|
||||
#define CONFIG_COLLECT_COMPONENT_TIMING 0
|
||||
#define DECODE_WIDTH_LIMIT 8192
|
||||
#define DECODE_HEIGHT_LIMIT 4608
|
||||
#endif /* VPX_CONFIG_H */
|
||||
|
|
|
@ -74,7 +74,6 @@
|
|||
%define CONFIG_MULTI_RES_ENCODING 1
|
||||
%define CONFIG_TEMPORAL_DENOISING 1
|
||||
%define CONFIG_VP9_TEMPORAL_DENOISING 0
|
||||
%define CONFIG_CONSISTENT_RECODE 0
|
||||
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
|
||||
%define CONFIG_VP9_HIGHBITDEPTH 0
|
||||
%define CONFIG_BETTER_HW_COMPATIBILITY 0
|
||||
|
@ -87,3 +86,4 @@
|
|||
%define CONFIG_EMULATE_HARDWARE 0
|
||||
%define CONFIG_NON_GREEDY_MV 0
|
||||
%define CONFIG_RATE_CTRL 0
|
||||
%define CONFIG_COLLECT_COMPONENT_TIMING 0
|
||||
|
|
|
@ -86,7 +86,6 @@
|
|||
#define CONFIG_MULTI_RES_ENCODING 1
|
||||
#define CONFIG_TEMPORAL_DENOISING 1
|
||||
#define CONFIG_VP9_TEMPORAL_DENOISING 0
|
||||
#define CONFIG_CONSISTENT_RECODE 0
|
||||
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
|
||||
#define CONFIG_VP9_HIGHBITDEPTH 0
|
||||
#define CONFIG_BETTER_HW_COMPATIBILITY 0
|
||||
|
@ -99,6 +98,7 @@
|
|||
#define CONFIG_EMULATE_HARDWARE 0
|
||||
#define CONFIG_NON_GREEDY_MV 0
|
||||
#define CONFIG_RATE_CTRL 0
|
||||
#define CONFIG_COLLECT_COMPONENT_TIMING 0
|
||||
#define DECODE_WIDTH_LIMIT 8192
|
||||
#define DECODE_HEIGHT_LIMIT 4608
|
||||
#endif /* VPX_CONFIG_H */
|
||||
|
|
|
@ -74,7 +74,6 @@
|
|||
%define CONFIG_MULTI_RES_ENCODING 1
|
||||
%define CONFIG_TEMPORAL_DENOISING 1
|
||||
%define CONFIG_VP9_TEMPORAL_DENOISING 0
|
||||
%define CONFIG_CONSISTENT_RECODE 0
|
||||
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
|
||||
%define CONFIG_VP9_HIGHBITDEPTH 0
|
||||
%define CONFIG_BETTER_HW_COMPATIBILITY 0
|
||||
|
@ -87,3 +86,4 @@
|
|||
%define CONFIG_EMULATE_HARDWARE 0
|
||||
%define CONFIG_NON_GREEDY_MV 0
|
||||
%define CONFIG_RATE_CTRL 0
|
||||
%define CONFIG_COLLECT_COMPONENT_TIMING 0
|
||||
|
|
|
@ -86,7 +86,6 @@
|
|||
#define CONFIG_MULTI_RES_ENCODING 1
|
||||
#define CONFIG_TEMPORAL_DENOISING 1
|
||||
#define CONFIG_VP9_TEMPORAL_DENOISING 0
|
||||
#define CONFIG_CONSISTENT_RECODE 0
|
||||
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
|
||||
#define CONFIG_VP9_HIGHBITDEPTH 0
|
||||
#define CONFIG_BETTER_HW_COMPATIBILITY 0
|
||||
|
@ -99,6 +98,7 @@
|
|||
#define CONFIG_EMULATE_HARDWARE 0
|
||||
#define CONFIG_NON_GREEDY_MV 0
|
||||
#define CONFIG_RATE_CTRL 0
|
||||
#define CONFIG_COLLECT_COMPONENT_TIMING 0
|
||||
#define DECODE_WIDTH_LIMIT 8192
|
||||
#define DECODE_HEIGHT_LIMIT 4608
|
||||
#endif /* VPX_CONFIG_H */
|
||||
|
|
|
@ -25,6 +25,7 @@ Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
|
|||
Johann <johann@duck.com> <johann.koenig@gmail.com>
|
||||
John Koleszar <jkoleszar@google.com>
|
||||
Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
|
||||
Konstantinos Margaritis <konma@vectorcamp.gr> <konstantinos@vectorcamp.gr>
|
||||
Marco Paniconi <marpan@google.com>
|
||||
Marco Paniconi <marpan@google.com> <marpan@chromium.org>
|
||||
Martin Storsjö <martin@martin.st>
|
||||
|
|
|
@ -21,6 +21,7 @@ Andoni Morales Alastruey <ylatuya@gmail.com>
|
|||
Andres Mejia <mcitadel@gmail.com>
|
||||
Andrew Lewis <andrewlewis@google.com>
|
||||
Andrew Russell <anrussell@google.com>
|
||||
Andrew Salkeld <andrew.salkeld@arm.com>
|
||||
Angie Chen <yunqi@google.com>
|
||||
Angie Chiang <angiebird@google.com>
|
||||
Anton Venema <anton.venema@liveswitch.com>
|
||||
|
@ -175,7 +176,9 @@ Rob Bradford <rob@linux.intel.com>
|
|||
Ronald S. Bultje <rsbultje@gmail.com>
|
||||
Rui Ueyama <ruiu@google.com>
|
||||
Sai Deng <sdeng@google.com>
|
||||
Salome Thirot <salome.thirot@arm.com>
|
||||
Sami Pietilä <samipietila@google.com>
|
||||
Sam James <sam@gentoo.org>
|
||||
Sarah Parker <sarahparker@google.com>
|
||||
Sasi Inguva <isasi@google.com>
|
||||
Scott Graham <scottmg@chromium.org>
|
||||
|
|
|
@ -1,3 +1,39 @@
|
|||
2023-01-31 v1.13.0 "Ugly Duckling"
|
||||
This release includes more Neon and AVX2 optimizations, adds a new codec
|
||||
control to set per frame QP, upgrades GoogleTest to v1.12.1, and includes
|
||||
numerous bug fixes.
|
||||
|
||||
- Upgrading:
|
||||
This release is ABI incompatible with the previous release.
|
||||
|
||||
New codec control VP9E_SET_QUANTIZER_ONE_PASS to set per frame QP.
|
||||
|
||||
GoogleTest is upgraded to v1.12.1.
|
||||
|
||||
.clang-format is upgraded to clang-format-11.
|
||||
|
||||
VPX_EXT_RATECTRL_ABI_VERSION was bumped due to incompatible changes to the
|
||||
feature of using external rate control models for vp9.
|
||||
|
||||
- Enhancement:
|
||||
Numerous improvements on Neon optimizations.
|
||||
Numerous improvements on AVX2 optimizations.
|
||||
Additional ARM targets added for Visual Studio.
|
||||
|
||||
- Bug fixes:
|
||||
Fix to calculating internal stats when frame dropped.
|
||||
Fix to segfault for external resize test in vp9.
|
||||
Fix to build system with replacing egrep with grep -E.
|
||||
Fix to a few bugs with external RTC rate control library.
|
||||
Fix to make SVC work with VBR.
|
||||
Fix to key frame setting in VP9 external RC.
|
||||
Fix to -Wimplicit-int (Clang 16).
|
||||
Fix to VP8 external RC for buffer levels.
|
||||
Fix to VP8 external RC for dynamic update of layers.
|
||||
Fix to VP9 auto level.
|
||||
Fix to off-by-one error of max w/h in validate_config.
|
||||
Fix to make SVC work for Profile 1.
|
||||
|
||||
2022-06-17 v1.12.0 "Torrent Duck"
|
||||
This release adds optimizations for Loongarch, adds support for vp8 in the
|
||||
real-time rate control library, upgrades GoogleTest to v1.11.0, updates
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
v1.12.0 Torrent Duck
|
||||
v1.13.0 Ugly Duckling
|
||||
|
||||
Welcome to the WebM VP8/VP9 Codec SDK!
|
||||
|
||||
|
|
|
@ -293,6 +293,7 @@ EXPERIMENT_LIST="
|
|||
emulate_hardware
|
||||
non_greedy_mv
|
||||
rate_ctrl
|
||||
collect_component_timing
|
||||
"
|
||||
CONFIG_LIST="
|
||||
dependency_tracking
|
||||
|
@ -342,7 +343,6 @@ CONFIG_LIST="
|
|||
multi_res_encoding
|
||||
temporal_denoising
|
||||
vp9_temporal_denoising
|
||||
consistent_recode
|
||||
coefficient_range_checking
|
||||
vp9_highbitdepth
|
||||
better_hw_compatibility
|
||||
|
@ -406,7 +406,6 @@ CMDLINE_SELECT="
|
|||
multi_res_encoding
|
||||
temporal_denoising
|
||||
vp9_temporal_denoising
|
||||
consistent_recode
|
||||
coefficient_range_checking
|
||||
better_hw_compatibility
|
||||
vp9_highbitdepth
|
||||
|
|
|
@ -312,8 +312,8 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
|
|||
# To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current
|
||||
# SO_VERSION_* then follow the rules in the link to determine the new version
|
||||
# (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1
|
||||
SO_VERSION_MAJOR := 7
|
||||
SO_VERSION_MINOR := 1
|
||||
SO_VERSION_MAJOR := 8
|
||||
SO_VERSION_MINOR := 0
|
||||
SO_VERSION_PATCH := 0
|
||||
ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
|
||||
LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib
|
||||
|
|
|
@ -260,5 +260,11 @@ INSTANTIATE_TEST_SUITE_P(
|
|||
::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_sse2>));
|
||||
#endif // HAVE_SSE2
|
||||
|
||||
#if HAVE_NEON
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
NEON, AvgPredTestHBD,
|
||||
::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_neon>));
|
||||
#endif // HAVE_NEON
|
||||
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
} // namespace
|
||||
|
|
|
@ -1572,6 +1572,10 @@ INSTANTIATE_TEST_SUITE_P(
|
|||
12),
|
||||
SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon,
|
||||
12),
|
||||
SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_neon,
|
||||
12),
|
||||
SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_neon,
|
||||
12),
|
||||
SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon,
|
||||
10),
|
||||
SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon,
|
||||
|
@ -1594,6 +1598,10 @@ INSTANTIATE_TEST_SUITE_P(
|
|||
10),
|
||||
SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon,
|
||||
10),
|
||||
SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_neon,
|
||||
10),
|
||||
SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_neon,
|
||||
10),
|
||||
SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon,
|
||||
8),
|
||||
SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon,
|
||||
|
@ -1613,7 +1621,9 @@ INSTANTIATE_TEST_SUITE_P(
|
|||
SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon,
|
||||
8),
|
||||
SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8),
|
||||
SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon,
|
||||
SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8),
|
||||
SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_neon, 8),
|
||||
SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_neon,
|
||||
8)));
|
||||
|
||||
INSTANTIATE_TEST_SUITE_P(
|
||||
|
@ -1652,6 +1662,12 @@ INSTANTIATE_TEST_SUITE_P(
|
|||
SubpelAvgVarianceParams(3, 2,
|
||||
&vpx_highbd_12_sub_pixel_avg_variance8x4_neon,
|
||||
12),
|
||||
SubpelAvgVarianceParams(2, 3,
|
||||
&vpx_highbd_12_sub_pixel_avg_variance4x8_neon,
|
||||
12),
|
||||
SubpelAvgVarianceParams(2, 2,
|
||||
&vpx_highbd_12_sub_pixel_avg_variance4x4_neon,
|
||||
12),
|
||||
SubpelAvgVarianceParams(6, 6,
|
||||
&vpx_highbd_10_sub_pixel_avg_variance64x64_neon,
|
||||
10),
|
||||
|
@ -1685,6 +1701,12 @@ INSTANTIATE_TEST_SUITE_P(
|
|||
SubpelAvgVarianceParams(3, 2,
|
||||
&vpx_highbd_10_sub_pixel_avg_variance8x4_neon,
|
||||
10),
|
||||
SubpelAvgVarianceParams(2, 3,
|
||||
&vpx_highbd_10_sub_pixel_avg_variance4x8_neon,
|
||||
10),
|
||||
SubpelAvgVarianceParams(2, 2,
|
||||
&vpx_highbd_10_sub_pixel_avg_variance4x4_neon,
|
||||
10),
|
||||
SubpelAvgVarianceParams(6, 6,
|
||||
&vpx_highbd_8_sub_pixel_avg_variance64x64_neon,
|
||||
8),
|
||||
|
@ -1717,6 +1739,12 @@ INSTANTIATE_TEST_SUITE_P(
|
|||
8),
|
||||
SubpelAvgVarianceParams(3, 2,
|
||||
&vpx_highbd_8_sub_pixel_avg_variance8x4_neon,
|
||||
8),
|
||||
SubpelAvgVarianceParams(2, 3,
|
||||
&vpx_highbd_8_sub_pixel_avg_variance4x8_neon,
|
||||
8),
|
||||
SubpelAvgVarianceParams(2, 2,
|
||||
&vpx_highbd_8_sub_pixel_avg_variance4x4_neon,
|
||||
8)));
|
||||
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
|
|
@ -127,14 +127,15 @@ class Vp8RcInterfaceTest
|
|||
encoder->Control(VP8E_SET_CPUUSED, -6);
|
||||
encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1);
|
||||
encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000);
|
||||
} else if (frame_params_.frame_type == INTER_FRAME) {
|
||||
} else if (frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) {
|
||||
// Disable golden frame update.
|
||||
frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
|
||||
frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
|
||||
}
|
||||
}
|
||||
frame_params_.frame_type =
|
||||
video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME;
|
||||
frame_params_.frame_type = video->frame() % key_interval_ == 0
|
||||
? libvpx::RcFrameType::kKeyFrame
|
||||
: libvpx::RcFrameType::kInterFrame;
|
||||
encoder_exit_ = video->frame() == test_video_.frames;
|
||||
}
|
||||
|
||||
|
|
|
@ -41,7 +41,7 @@ constexpr int kDefaultMaxGfInterval = 16;
|
|||
constexpr int kReadMinGfInterval = 5;
|
||||
constexpr int kReadMaxGfInterval = 13;
|
||||
const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv";
|
||||
const double kPsnrThreshold = 30.50;
|
||||
const double kPsnrThreshold = 30.4;
|
||||
|
||||
struct ToyRateCtrl {
|
||||
int magic_number;
|
||||
|
|
|
@ -57,9 +57,11 @@ class RcInterfaceTest
|
|||
encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000);
|
||||
encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1);
|
||||
}
|
||||
frame_params_.frame_type =
|
||||
video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME;
|
||||
if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) {
|
||||
frame_params_.frame_type = video->frame() % key_interval_ == 0
|
||||
? libvpx::RcFrameType::kKeyFrame
|
||||
: libvpx::RcFrameType::kInterFrame;
|
||||
if (rc_cfg_.rc_mode == VPX_CBR &&
|
||||
frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) {
|
||||
// Disable golden frame update.
|
||||
frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
|
||||
frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
|
||||
|
@ -183,8 +185,9 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
|
|||
encoder->Control(VP9E_SET_SVC, 1);
|
||||
encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
|
||||
}
|
||||
frame_params_.frame_type =
|
||||
video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME;
|
||||
frame_params_.frame_type = video->frame() % key_interval_ == 0
|
||||
? libvpx::RcFrameType::kKeyFrame
|
||||
: libvpx::RcFrameType::kInterFrame;
|
||||
encoder_exit_ = video->frame() == kNumFrames;
|
||||
current_superframe_ = video->frame();
|
||||
if (dynamic_spatial_layers_ == 1) {
|
||||
|
@ -247,7 +250,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
|
|||
else
|
||||
frame_params_.temporal_layer_id = 0;
|
||||
rc_api_->ComputeQP(frame_params_);
|
||||
frame_params_.frame_type = INTER_FRAME;
|
||||
frame_params_.frame_type = libvpx::RcFrameType::kInterFrame;
|
||||
rc_api_->PostEncodeUpdate(sizes_[sl]);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -26,13 +26,6 @@ struct VP8_COMP;
|
|||
|
||||
/* Create/destroy static data structures. */
|
||||
|
||||
typedef enum {
|
||||
NORMAL = 0,
|
||||
FOURFIVE = 1,
|
||||
THREEFIVE = 2,
|
||||
ONETWO = 3
|
||||
} VPX_SCALING;
|
||||
|
||||
typedef enum {
|
||||
USAGE_LOCAL_FILE_PLAYBACK = 0x0,
|
||||
USAGE_STREAM_FROM_SERVER = 0x1,
|
||||
|
@ -58,19 +51,19 @@ typedef enum {
|
|||
#include <assert.h>
|
||||
static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {
|
||||
switch (mode) {
|
||||
case NORMAL:
|
||||
case VP8E_NORMAL:
|
||||
*hr = 1;
|
||||
*hs = 1;
|
||||
break;
|
||||
case FOURFIVE:
|
||||
case VP8E_FOURFIVE:
|
||||
*hr = 4;
|
||||
*hs = 5;
|
||||
break;
|
||||
case THREEFIVE:
|
||||
case VP8E_THREEFIVE:
|
||||
*hr = 3;
|
||||
*hs = 5;
|
||||
break;
|
||||
case ONETWO:
|
||||
case VP8E_ONETWO:
|
||||
*hr = 1;
|
||||
*hs = 2;
|
||||
break;
|
||||
|
@ -273,8 +266,8 @@ int vp8_set_roimap(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows,
|
|||
unsigned int threshold[4]);
|
||||
int vp8_set_active_map(struct VP8_COMP *cpi, unsigned char *map,
|
||||
unsigned int rows, unsigned int cols);
|
||||
int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING horiz_mode,
|
||||
VPX_SCALING vert_mode);
|
||||
int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode,
|
||||
VPX_SCALING_MODE vert_mode);
|
||||
int vp8_get_quantizer(struct VP8_COMP *cpi);
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
@ -92,8 +92,7 @@ typedef struct macroblock {
|
|||
signed int last_act_zbin_adj;
|
||||
|
||||
int *mvcost[2];
|
||||
/* MSVC generates code that thinks this is 16-byte aligned */
|
||||
DECLARE_ALIGNED(16, int*, mvsadcost[2]);
|
||||
int *mvsadcost[2];
|
||||
int (*mbmode_cost)[MB_MODE_COUNT];
|
||||
int (*intra_uv_mode_cost)[MB_MODE_COUNT];
|
||||
int (*bmode_costs)[10][10];
|
||||
|
|
|
@ -2990,8 +2990,8 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
|
|||
}
|
||||
|
||||
/* Set back to unscaled by default */
|
||||
cpi->common.horiz_scale = NORMAL;
|
||||
cpi->common.vert_scale = NORMAL;
|
||||
cpi->common.horiz_scale = VP8E_NORMAL;
|
||||
cpi->common.vert_scale = VP8E_NORMAL;
|
||||
|
||||
/* Calculate Average bits per frame. */
|
||||
av_bits_per_frame = cpi->oxcf.target_bandwidth /
|
||||
|
|
|
@ -1667,7 +1667,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
|
|||
|
||||
cm->sharpness_level = cpi->oxcf.Sharpness;
|
||||
|
||||
if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) {
|
||||
if (cm->horiz_scale != VP8E_NORMAL || cm->vert_scale != VP8E_NORMAL) {
|
||||
int hr, hs, vr, vs;
|
||||
|
||||
Scale2Ratio(cm->horiz_scale, &hr, &hs);
|
||||
|
@ -2504,15 +2504,17 @@ static int resize_key_frame(VP8_COMP *cpi) {
|
|||
if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark *
|
||||
cpi->oxcf.optimal_buffer_level / 100)) {
|
||||
cm->horiz_scale =
|
||||
(cm->horiz_scale < ONETWO) ? cm->horiz_scale + 1 : ONETWO;
|
||||
cm->vert_scale = (cm->vert_scale < ONETWO) ? cm->vert_scale + 1 : ONETWO;
|
||||
(cm->horiz_scale < VP8E_ONETWO) ? cm->horiz_scale + 1 : VP8E_ONETWO;
|
||||
cm->vert_scale =
|
||||
(cm->vert_scale < VP8E_ONETWO) ? cm->vert_scale + 1 : VP8E_ONETWO;
|
||||
}
|
||||
/* Should we now start scaling back up */
|
||||
else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark *
|
||||
cpi->oxcf.optimal_buffer_level / 100)) {
|
||||
cm->horiz_scale =
|
||||
(cm->horiz_scale > NORMAL) ? cm->horiz_scale - 1 : NORMAL;
|
||||
cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL;
|
||||
(cm->horiz_scale > VP8E_NORMAL) ? cm->horiz_scale - 1 : VP8E_NORMAL;
|
||||
cm->vert_scale =
|
||||
(cm->vert_scale > VP8E_NORMAL) ? cm->vert_scale - 1 : VP8E_NORMAL;
|
||||
}
|
||||
|
||||
/* Get the new height and width */
|
||||
|
@ -5380,15 +5382,15 @@ int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows,
|
|||
}
|
||||
}
|
||||
|
||||
int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING horiz_mode,
|
||||
VPX_SCALING vert_mode) {
|
||||
if (horiz_mode <= ONETWO) {
|
||||
int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode,
|
||||
VPX_SCALING_MODE vert_mode) {
|
||||
if (horiz_mode <= VP8E_ONETWO) {
|
||||
cpi->common.horiz_scale = horiz_mode;
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (vert_mode <= ONETWO) {
|
||||
if (vert_mode <= VP8E_ONETWO) {
|
||||
cpi->common.vert_scale = vert_mode;
|
||||
} else {
|
||||
return -1;
|
||||
|
|
|
@ -947,19 +947,10 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
|
|||
if (img != NULL) {
|
||||
res = image2yuvconfig(img, &sd);
|
||||
|
||||
if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) {
|
||||
/* from vpx_encoder.h for g_w/g_h:
|
||||
"Note that the frames passed as input to the encoder must have this
|
||||
resolution"
|
||||
*/
|
||||
ctx->base.err_detail = "Invalid input frame resolution";
|
||||
res = VPX_CODEC_INVALID_PARAM;
|
||||
} else {
|
||||
if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags,
|
||||
&sd, dst_time_stamp, dst_end_time_stamp)) {
|
||||
VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
|
||||
res = update_error_state(ctx, &cpi->common.error);
|
||||
}
|
||||
if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, &sd,
|
||||
dst_time_stamp, dst_end_time_stamp)) {
|
||||
VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
|
||||
res = update_error_state(ctx, &cpi->common.error);
|
||||
}
|
||||
|
||||
/* reset for next frame */
|
||||
|
@ -1233,8 +1224,8 @@ static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
|
|||
if (data) {
|
||||
int res;
|
||||
vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
|
||||
res = vp8_set_internal_size(ctx->cpi, (VPX_SCALING)scalemode.h_scaling_mode,
|
||||
(VPX_SCALING)scalemode.v_scaling_mode);
|
||||
res = vp8_set_internal_size(ctx->cpi, scalemode.h_scaling_mode,
|
||||
scalemode.v_scaling_mode);
|
||||
|
||||
if (!res) {
|
||||
/*force next frame a key frame to effect scaling mode */
|
||||
|
|
|
@ -10,7 +10,9 @@
|
|||
|
||||
#include <math.h>
|
||||
#include <new>
|
||||
#include "vp8/common/common.h"
|
||||
#include "vp8/vp8_ratectrl_rtc.h"
|
||||
#include "vp8/encoder/onyx_int.h"
|
||||
#include "vp8/encoder/ratectrl.h"
|
||||
#include "vpx_ports/system_state.h"
|
||||
|
||||
|
@ -65,6 +67,13 @@ std::unique_ptr<VP8RateControlRTC> VP8RateControlRTC::Create(
|
|||
return rc_api;
|
||||
}
|
||||
|
||||
VP8RateControlRTC::~VP8RateControlRTC() {
|
||||
if (cpi_) {
|
||||
vpx_free(cpi_->gf_active_flags);
|
||||
vpx_free(cpi_);
|
||||
}
|
||||
}
|
||||
|
||||
void VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) {
|
||||
VP8_COMMON *cm = &cpi_->common;
|
||||
VP8_CONFIG *oxcf = &cpi_->oxcf;
|
||||
|
@ -203,7 +212,7 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) {
|
|||
vp8_restore_layer_context(cpi_, layer);
|
||||
vp8_new_framerate(cpi_, cpi_->layer_context[layer].framerate);
|
||||
}
|
||||
cm->frame_type = frame_params.frame_type;
|
||||
cm->frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type);
|
||||
cm->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
|
||||
cm->refresh_alt_ref_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
|
||||
if (cm->frame_type == KEY_FRAME && cpi_->common.current_video_frame > 0) {
|
||||
|
|
|
@ -12,23 +12,24 @@
|
|||
#define VPX_VP8_RATECTRL_RTC_H_
|
||||
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
|
||||
#include "vp8/encoder/onyx_int.h"
|
||||
#include "vp8/common/common.h"
|
||||
#include "vpx/internal/vpx_ratectrl_rtc.h"
|
||||
|
||||
struct VP8_COMP;
|
||||
|
||||
namespace libvpx {
|
||||
struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig {
|
||||
public:
|
||||
VP8RateControlRtcConfig() {
|
||||
vp8_zero(layer_target_bitrate);
|
||||
vp8_zero(ts_rate_decimator);
|
||||
memset(&layer_target_bitrate, 0, sizeof(layer_target_bitrate));
|
||||
memset(&ts_rate_decimator, 0, sizeof(ts_rate_decimator));
|
||||
}
|
||||
};
|
||||
|
||||
struct VP8FrameParamsQpRTC {
|
||||
FRAME_TYPE frame_type;
|
||||
RcFrameType frame_type;
|
||||
int temporal_layer_id;
|
||||
};
|
||||
|
||||
|
@ -36,12 +37,7 @@ class VP8RateControlRTC {
|
|||
public:
|
||||
static std::unique_ptr<VP8RateControlRTC> Create(
|
||||
const VP8RateControlRtcConfig &cfg);
|
||||
~VP8RateControlRTC() {
|
||||
if (cpi_) {
|
||||
vpx_free(cpi_->gf_active_flags);
|
||||
vpx_free(cpi_);
|
||||
}
|
||||
}
|
||||
~VP8RateControlRTC();
|
||||
|
||||
void UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg);
|
||||
// GetQP() needs to be called after ComputeQP() to get the latest QP
|
||||
|
@ -54,7 +50,7 @@ class VP8RateControlRTC {
|
|||
private:
|
||||
VP8RateControlRTC() {}
|
||||
void InitRateControl(const VP8RateControlRtcConfig &cfg);
|
||||
VP8_COMP *cpi_;
|
||||
struct VP8_COMP *cpi_;
|
||||
int q_;
|
||||
};
|
||||
|
||||
|
|
|
@ -220,7 +220,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
|
|||
// Look up the component cost of the residual motion vector
|
||||
{
|
||||
uint32_t cost[4];
|
||||
int16_t __attribute__((aligned(16))) rowcol[8];
|
||||
DECLARE_ALIGNED(16, int16_t, rowcol[8]);
|
||||
vst1q_s16(rowcol, v_diff_mv_w);
|
||||
|
||||
// Note: This is a use case for gather instruction
|
||||
|
|
|
@ -1980,6 +1980,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
|
|||
int64_t best_rd = INT64_MAX;
|
||||
|
||||
vpx_clear_system_state();
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
start_timing(cpi, rd_pick_sb_modes_time);
|
||||
#endif
|
||||
|
||||
// Use the lower precision, but faster, 32x32 fdct for mode selection.
|
||||
x->use_lp32x32fdct = 1;
|
||||
|
@ -2047,15 +2050,27 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
|
|||
vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
|
||||
} else {
|
||||
if (bsize >= BLOCK_8X8) {
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
start_timing(cpi, vp9_rd_pick_inter_mode_sb_time);
|
||||
#endif
|
||||
if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP))
|
||||
vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
|
||||
ctx, best_rd);
|
||||
else
|
||||
vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
|
||||
bsize, ctx, best_rd);
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
end_timing(cpi, vp9_rd_pick_inter_mode_sb_time);
|
||||
#endif
|
||||
} else {
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
start_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time);
|
||||
#endif
|
||||
vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost,
|
||||
bsize, ctx, best_rd);
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
end_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2078,6 +2093,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
|
|||
|
||||
ctx->rate = rd_cost->rate;
|
||||
ctx->dist = rd_cost->dist;
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
end_timing(cpi, rd_pick_sb_modes_time);
|
||||
#endif
|
||||
}
|
||||
#endif // !CONFIG_REALTIME_ONLY
|
||||
|
||||
|
@ -4411,8 +4429,14 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
|
|||
|
||||
if (should_encode_sb && pc_tree->index != 3) {
|
||||
int output_enabled = (bsize == BLOCK_64X64);
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
start_timing(cpi, encode_sb_time);
|
||||
#endif
|
||||
encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
|
||||
pc_tree);
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
end_timing(cpi, encode_sb_time);
|
||||
#endif
|
||||
#if CONFIG_RATE_CTRL
|
||||
if (oxcf->use_simple_encode_api) {
|
||||
// Store partition, motion vector of the superblock.
|
||||
|
@ -4539,8 +4563,15 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td,
|
|||
&x->min_partition_size, &x->max_partition_size);
|
||||
}
|
||||
td->pc_root->none.rdcost = 0;
|
||||
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
start_timing(cpi, rd_pick_partition_time);
|
||||
#endif
|
||||
rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64,
|
||||
&dummy_rdc, dummy_rdc, td->pc_root);
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
end_timing(cpi, rd_pick_partition_time);
|
||||
#endif
|
||||
}
|
||||
(*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row,
|
||||
sb_col_in_tile, num_sb_cols);
|
||||
|
@ -5810,14 +5841,7 @@ void vp9_init_tile_data(VP9_COMP *cpi) {
|
|||
for (i = 0; i < BLOCK_SIZES; ++i) {
|
||||
for (j = 0; j < MAX_MODES; ++j) {
|
||||
tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT;
|
||||
#if CONFIG_RATE_CTRL
|
||||
if (cpi->oxcf.use_simple_encode_api) {
|
||||
tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT;
|
||||
}
|
||||
#endif // CONFIG_RATE_CTRL
|
||||
#if CONFIG_CONSISTENT_RECODE
|
||||
tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT;
|
||||
#endif // CONFIG_CONSISTENT_RECODE
|
||||
tile_data->mode_map[i][j] = j;
|
||||
}
|
||||
}
|
||||
|
@ -6037,9 +6061,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
|
|||
x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
x->inv_txfm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
|
||||
#if CONFIG_CONSISTENT_RECODE
|
||||
x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1;
|
||||
#endif
|
||||
if (xd->lossless) x->optimize = 0;
|
||||
x->sharpness = cpi->oxcf.sharpness;
|
||||
x->adjust_rdmult_by_segment = (cpi->oxcf.aq_mode == VARIANCE_AQ);
|
||||
|
@ -6184,13 +6206,11 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) {
|
|||
return sum_delta / (cm->mi_rows * cm->mi_cols);
|
||||
}
|
||||
|
||||
#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
|
||||
static void restore_encode_params(VP9_COMP *cpi) {
|
||||
VP9_COMMON *const cm = &cpi->common;
|
||||
const int tile_cols = 1 << cm->log2_tile_cols;
|
||||
const int tile_rows = 1 << cm->log2_tile_rows;
|
||||
int tile_col, tile_row;
|
||||
int tile_idx;
|
||||
int i, j;
|
||||
TileDataEnc *tile_data;
|
||||
RD_OPT *rd_opt = &cpi->rd;
|
||||
for (i = 0; i < MAX_REF_FRAMES; i++) {
|
||||
for (j = 0; j < REFERENCE_MODES; j++)
|
||||
|
@ -6201,35 +6221,19 @@ static void restore_encode_params(VP9_COMP *cpi) {
|
|||
rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j];
|
||||
}
|
||||
|
||||
if (cpi->tile_data != NULL) {
|
||||
for (tile_row = 0; tile_row < tile_rows; ++tile_row)
|
||||
for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
|
||||
TileDataEnc *tile_data =
|
||||
&cpi->tile_data[tile_row * tile_cols + tile_col];
|
||||
for (i = 0; i < BLOCK_SIZES; ++i) {
|
||||
for (j = 0; j < MAX_MODES; ++j) {
|
||||
tile_data->thresh_freq_fact[i][j] =
|
||||
tile_data->thresh_freq_fact_prev[i][j];
|
||||
}
|
||||
}
|
||||
}
|
||||
for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) {
|
||||
assert(cpi->tile_data);
|
||||
tile_data = &cpi->tile_data[tile_idx];
|
||||
vp9_copy(tile_data->thresh_freq_fact, tile_data->thresh_freq_fact_prev);
|
||||
}
|
||||
|
||||
cm->interp_filter = cpi->sf.default_interp_filter;
|
||||
}
|
||||
#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
|
||||
|
||||
void vp9_encode_frame(VP9_COMP *cpi) {
|
||||
VP9_COMMON *const cm = &cpi->common;
|
||||
|
||||
#if CONFIG_RATE_CTRL
|
||||
if (cpi->oxcf.use_simple_encode_api) {
|
||||
restore_encode_params(cpi);
|
||||
}
|
||||
#endif // CONFIG_RATE_CTRL
|
||||
#if CONFIG_CONSISTENT_RECODE
|
||||
restore_encode_params(cpi);
|
||||
#endif
|
||||
|
||||
#if CONFIG_MISMATCH_DEBUG
|
||||
mismatch_reset_frame(MAX_MB_PLANE);
|
||||
|
@ -6283,7 +6287,13 @@ void vp9_encode_frame(VP9_COMP *cpi) {
|
|||
if (cm->interp_filter == SWITCHABLE)
|
||||
cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref);
|
||||
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
start_timing(cpi, encode_frame_internal_time);
|
||||
#endif
|
||||
encode_frame_internal(cpi);
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
end_timing(cpi, encode_frame_internal_time);
|
||||
#endif
|
||||
|
||||
for (i = 0; i < REFERENCE_MODES; ++i)
|
||||
mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -90,13 +90,6 @@ typedef enum {
|
|||
ENCODE_BREAKOUT_LIMITED = 2
|
||||
} ENCODE_BREAKOUT_TYPE;
|
||||
|
||||
typedef enum {
|
||||
NORMAL = 0,
|
||||
FOURFIVE = 1,
|
||||
THREEFIVE = 2,
|
||||
ONETWO = 3
|
||||
} VPX_SCALING;
|
||||
|
||||
typedef enum {
|
||||
// Good Quality Fast Encoding. The encoder balances quality with the amount of
|
||||
// time it takes to encode the output. Speed setting controls how fast.
|
||||
|
@ -336,9 +329,7 @@ typedef struct TplDepFrame {
|
|||
typedef struct TileDataEnc {
|
||||
TileInfo tile_info;
|
||||
int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
|
||||
#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
|
||||
int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES];
|
||||
#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
|
||||
int8_t mode_map[BLOCK_SIZES][MAX_MODES];
|
||||
FIRSTPASS_DATA fp_data;
|
||||
VP9RowMTSync row_mt_sync;
|
||||
|
@ -659,6 +650,72 @@ static INLINE int get_num_unit_4x4(int size) { return (size + 3) >> 2; }
|
|||
static INLINE int get_num_unit_16x16(int size) { return (size + 15) >> 4; }
|
||||
#endif // CONFIG_RATE_CTRL
|
||||
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
#include "vpx_ports/vpx_timer.h"
|
||||
// Adjust the following to add new components.
|
||||
typedef enum {
|
||||
vp9_get_compressed_data_time,
|
||||
vp9_temporal_filter_time,
|
||||
vp9_rc_get_second_pass_params_time,
|
||||
setup_tpl_stats_time,
|
||||
Pass2Encode_time,
|
||||
|
||||
encode_with_recode_loop_time,
|
||||
loopfilter_frame_time,
|
||||
vp9_pack_bitstream_time,
|
||||
|
||||
encode_frame_internal_time,
|
||||
rd_pick_partition_time,
|
||||
rd_pick_sb_modes_time,
|
||||
encode_sb_time,
|
||||
|
||||
vp9_rd_pick_inter_mode_sb_time,
|
||||
vp9_rd_pick_inter_mode_sub8x8_time,
|
||||
|
||||
intra_mode_search_time,
|
||||
handle_inter_mode_time,
|
||||
single_motion_search_time,
|
||||
joint_motion_search_time,
|
||||
interp_filter_time,
|
||||
|
||||
kTimingComponents,
|
||||
} TIMING_COMPONENT;
|
||||
|
||||
// Returns the printable name of a timing component: for each handled
// enumerator the returned string is the enumerator's own identifier.
// Any other index hits assert(0); if the assertion is compiled out the
// function falls through and returns "error".
static INLINE char const *get_component_name(int index) {
|
||||
switch (index) {
|
||||
case vp9_get_compressed_data_time: return "vp9_get_compressed_data_time";
|
||||
case vp9_temporal_filter_time: return "vp9_temporal_filter_time";
|
||||
case vp9_rc_get_second_pass_params_time:
|
||||
return "vp9_rc_get_second_pass_params_time";
|
||||
case setup_tpl_stats_time: return "setup_tpl_stats_time";
|
||||
case Pass2Encode_time: return "Pass2Encode_time";
|
||||
|
||||
case encode_with_recode_loop_time: return "encode_with_recode_loop_time";
|
||||
case loopfilter_frame_time: return "loopfilter_frame_time";
|
||||
case vp9_pack_bitstream_time: return "vp9_pack_bitstream_time";
|
||||
|
||||
case encode_frame_internal_time: return "encode_frame_internal_time";
|
||||
case rd_pick_partition_time: return "rd_pick_partition_time";
|
||||
case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time";
|
||||
case encode_sb_time: return "encode_sb_time";
|
||||
|
||||
case vp9_rd_pick_inter_mode_sb_time:
|
||||
return "vp9_rd_pick_inter_mode_sb_time";
|
||||
case vp9_rd_pick_inter_mode_sub8x8_time:
|
||||
return "vp9_rd_pick_inter_mode_sub8x8_time";
|
||||
|
||||
case intra_mode_search_time: return "intra_mode_search_time";
|
||||
case handle_inter_mode_time: return "handle_inter_mode_time";
|
||||
case single_motion_search_time: return "single_motion_search_time";
|
||||
case joint_motion_search_time: return "joint_motion_search_time";
|
||||
case interp_filter_time: return "interp_filter_time";
|
||||
|
||||
// Unhandled index (including kTimingComponents, which has no case here).
default: assert(0);
|
||||
}
|
||||
// Reached only via the default case when assert() does not abort.
return "error";
|
||||
}
|
||||
#endif
|
||||
|
||||
typedef struct VP9_COMP {
|
||||
FRAME_INFO frame_info;
|
||||
QUANTS quants;
|
||||
|
@ -973,6 +1030,22 @@ typedef struct VP9_COMP {
|
|||
EXT_RATECTRL ext_ratectrl;
|
||||
|
||||
int fixed_qp_onepass;
|
||||
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
/*!
|
||||
* component_time[] are initialized to zero when the encoder starts.
|
||||
*/
|
||||
uint64_t component_time[kTimingComponents];
|
||||
/*!
|
||||
* Stores timing for individual components between calls of start_timing()
|
||||
* and end_timing().
|
||||
*/
|
||||
struct vpx_usec_timer component_timer[kTimingComponents];
|
||||
/*!
|
||||
* frame_component_time[] are initialized to zero at the beginning of each frame.
|
||||
*/
|
||||
uint64_t frame_component_time[kTimingComponents];
|
||||
#endif
|
||||
} VP9_COMP;
|
||||
|
||||
#if CONFIG_RATE_CTRL
|
||||
|
@ -1154,8 +1227,8 @@ int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
|
|||
int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
|
||||
int cols);
|
||||
|
||||
int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode,
|
||||
VPX_SCALING vert_mode);
|
||||
int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode,
|
||||
VPX_SCALING_MODE vert_mode);
|
||||
|
||||
int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
|
||||
unsigned int height);
|
||||
|
@ -1392,6 +1465,38 @@ int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr);
|
|||
|
||||
#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
|
||||
|
||||
// Makes sure the frame buffer at buffer_idx in cm->buffer_pool has an mvs
// array sized for the current cm->mi_rows x cm->mi_cols. The array is
// (re)allocated only when it is missing or when the dimensions recorded on
// the buffer are smaller than the current ones; otherwise nothing changes.
static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) {
|
||||
RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
|
||||
// Reallocate on first use or when either recorded dimension is too small.
if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows ||
|
||||
new_fb_ptr->mi_cols < cm->mi_cols) {
|
||||
// The old array is dropped, not copied; the new one comes from
// vpx_calloc(), with CHECK_MEM_ERROR handling a failed allocation.
vpx_free(new_fb_ptr->mvs);
|
||||
CHECK_MEM_ERROR(cm, new_fb_ptr->mvs,
|
||||
(MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
|
||||
sizeof(*new_fb_ptr->mvs)));
|
||||
// Record the dimensions the new array was sized for.
new_fb_ptr->mi_rows = cm->mi_rows;
|
||||
new_fb_ptr->mi_cols = cm->mi_cols;
|
||||
}
|
||||
}
|
||||
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
// Starts the per-component timer cpi->component_timer[component].
// `component` is used directly as an array index; callers in this file pass
// TIMING_COMPONENT enumerators. Pair each call with end_timing().
static INLINE void start_timing(VP9_COMP *cpi, int component) {
|
||||
vpx_usec_timer_start(&cpi->component_timer[component]);
|
||||
}
|
||||
// Marks the timer for `component` and adds the elapsed value reported by
// vpx_usec_timer_elapsed() to cpi->frame_component_time[component].
// The value accumulates (+=), so repeated start/end pairs within a frame sum.
// NOTE(review): units are presumably microseconds, going by the vpx_usec_
// prefix -- confirm against vpx_ports/vpx_timer.h.
static INLINE void end_timing(VP9_COMP *cpi, int component) {
|
||||
vpx_usec_timer_mark(&cpi->component_timer[component]);
|
||||
cpi->frame_component_time[component] +=
|
||||
vpx_usec_timer_elapsed(&cpi->component_timer[component]);
|
||||
}
|
||||
// Returns a printable name for a frame type value: 0 -> "KEY_FRAME",
// 1 -> "INTER_FRAME". Any other value hits assert(0); if the assertion is
// compiled out the function returns "error".
static INLINE char const *get_frame_type_enum(int type) {
|
||||
switch (type) {
|
||||
case 0: return "KEY_FRAME";
|
||||
case 1: return "INTER_FRAME";
|
||||
default: assert(0);
|
||||
}
|
||||
return "error";
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
|
|
@ -121,11 +121,9 @@ typedef struct RD_OPT {
|
|||
int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
|
||||
|
||||
int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
|
||||
#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
|
||||
int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES];
|
||||
|
||||
int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
|
||||
#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
|
||||
int RDMULT;
|
||||
int RDDIV;
|
||||
double r0;
|
||||
|
|
|
@ -2832,8 +2832,14 @@ static int64_t handle_inter_mode(
|
|||
frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
|
||||
|
||||
if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
start_timing(cpi, joint_motion_search_time);
|
||||
#endif
|
||||
joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col,
|
||||
single_newmv, &rate_mv);
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
end_timing(cpi, joint_motion_search_time);
|
||||
#endif
|
||||
} else {
|
||||
rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
|
||||
&x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
|
||||
|
@ -2845,7 +2851,13 @@ static int64_t handle_inter_mode(
|
|||
*rate2 += rate_mv;
|
||||
} else {
|
||||
int_mv tmp_mv;
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
start_timing(cpi, single_motion_search_time);
|
||||
#endif
|
||||
single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv);
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
end_timing(cpi, single_motion_search_time);
|
||||
#endif
|
||||
if (tmp_mv.as_int == INVALID_MV) return INT64_MAX;
|
||||
|
||||
frame_mv[refs[0]].as_int = xd->mi[0]->bmi[0].as_mv[0].as_int =
|
||||
|
@ -2908,6 +2920,9 @@ static int64_t handle_inter_mode(
|
|||
intpel_mv = !mv_has_subpel(&mi->mv[0].as_mv);
|
||||
if (is_comp_pred) intpel_mv &= !mv_has_subpel(&mi->mv[1].as_mv);
|
||||
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
start_timing(cpi, interp_filter_time);
|
||||
#endif
|
||||
// Search for best switchable filter by checking the variance of
|
||||
// pred error irrespective of whether the filter will be used
|
||||
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX;
|
||||
|
@ -3005,6 +3020,9 @@ static int64_t handle_inter_mode(
|
|||
restore_dst_buf(xd, orig_dst, orig_dst_stride);
|
||||
}
|
||||
}
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
end_timing(cpi, interp_filter_time);
|
||||
#endif
|
||||
// Set the appropriate filter
|
||||
mi->interp_filter =
|
||||
cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter;
|
||||
|
@ -3707,19 +3725,30 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
|
|||
if (ref_frame == INTRA_FRAME) {
|
||||
TX_SIZE uv_tx;
|
||||
struct macroblockd_plane *const pd = &xd->plane[1];
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
start_timing(cpi, intra_mode_search_time);
|
||||
#endif
|
||||
memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
|
||||
super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize,
|
||||
best_rd, recon);
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
end_timing(cpi, intra_mode_search_time);
|
||||
#endif
|
||||
if (rate_y == INT_MAX) continue;
|
||||
|
||||
uv_tx = uv_txsize_lookup[bsize][mi->tx_size][pd->subsampling_x]
|
||||
[pd->subsampling_y];
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
start_timing(cpi, intra_mode_search_time);
|
||||
#endif
|
||||
if (rate_uv_intra[uv_tx] == INT_MAX) {
|
||||
choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
|
||||
&rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx],
|
||||
&skip_uv[uv_tx], &mode_uv[uv_tx]);
|
||||
}
|
||||
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
end_timing(cpi, intra_mode_search_time);
|
||||
#endif
|
||||
rate_uv = rate_uv_tokenonly[uv_tx];
|
||||
distortion_uv = dist_uv[uv_tx];
|
||||
skippable = skippable && skip_uv[uv_tx];
|
||||
|
@ -3730,11 +3759,17 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
|
|||
rate2 += intra_cost_penalty;
|
||||
distortion2 = distortion_y + distortion_uv;
|
||||
} else {
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
start_timing(cpi, handle_inter_mode_time);
|
||||
#endif
|
||||
this_rd = handle_inter_mode(
|
||||
cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv,
|
||||
recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv,
|
||||
single_inter_filter, single_skippable, &total_sse, best_rd,
|
||||
&mask_filter, filter_cache);
|
||||
#if CONFIG_COLLECT_COMPONENT_TIMING
|
||||
end_timing(cpi, handle_inter_mode_time);
|
||||
#endif
|
||||
if (this_rd == INT64_MAX) continue;
|
||||
|
||||
compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
|
||||
|
@ -3970,13 +4005,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
|
|||
}
|
||||
|
||||
if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
|
||||
// If adaptive interp filter is enabled, then the current leaf node of 8x8
|
||||
// data is needed for sub8x8. Hence preserve the context.
|
||||
#if CONFIG_CONSISTENT_RECODE
|
||||
// If adaptive interp filter is enabled, then the current leaf node of 8x8
|
||||
// data is needed for sub8x8. Hence preserve the context.
|
||||
if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0];
|
||||
#else
|
||||
if (cpi->row_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0];
|
||||
#endif
|
||||
rd_cost->rate = INT_MAX;
|
||||
rd_cost->rdcost = INT64_MAX;
|
||||
return;
|
||||
|
|
|
@ -16,8 +16,11 @@
|
|||
#include "vpx_dsp/vpx_dsp_common.h"
|
||||
|
||||
// Mesh search patterns for various speed settings
|
||||
static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = {
|
||||
{ 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 }
|
||||
// Define 2 mesh density levels for FC_GRAPHICS_ANIMATION content type and non
|
||||
// FC_GRAPHICS_ANIMATION content type.
|
||||
static MESH_PATTERN best_quality_mesh_pattern[2][MAX_MESH_STEP] = {
|
||||
{ { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } },
|
||||
{ { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
|
||||
};
|
||||
|
||||
#if !CONFIG_REALTIME_ONLY
|
||||
|
@ -209,15 +212,18 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
|
|||
const int boosted = frame_is_boosted(cpi);
|
||||
int i;
|
||||
|
||||
sf->tx_size_search_breakout = 1;
|
||||
sf->adaptive_pred_interp_filter = 1;
|
||||
sf->adaptive_rd_thresh = 1;
|
||||
sf->adaptive_rd_thresh_row_mt = 0;
|
||||
sf->allow_skip_recode = 1;
|
||||
sf->less_rectangular_check = 1;
|
||||
sf->use_square_partition_only = !boosted;
|
||||
sf->mv.auto_mv_step_size = 1;
|
||||
sf->prune_ref_frame_for_rect_partitions = 1;
|
||||
sf->rd_ml_partition.var_pruning = 1;
|
||||
sf->temporal_filter_search_method = NSTEP;
|
||||
sf->tx_size_search_breakout = 1;
|
||||
sf->use_square_partition_only = !boosted;
|
||||
|
||||
sf->rd_ml_partition.var_pruning = 1;
|
||||
sf->rd_ml_partition.prune_rect_thresh[0] = -1;
|
||||
sf->rd_ml_partition.prune_rect_thresh[1] = 350;
|
||||
sf->rd_ml_partition.prune_rect_thresh[2] = 325;
|
||||
|
@ -238,7 +244,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
|
|||
}
|
||||
|
||||
if (speed >= 1) {
|
||||
sf->temporal_filter_search_method = NSTEP;
|
||||
sf->rd_ml_partition.var_pruning = !boosted;
|
||||
sf->rd_ml_partition.prune_rect_thresh[1] = 225;
|
||||
sf->rd_ml_partition.prune_rect_thresh[2] = 225;
|
||||
|
@ -263,11 +268,9 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
|
|||
sf->less_rectangular_check = 1;
|
||||
sf->use_rd_breakout = 1;
|
||||
sf->adaptive_motion_search = 1;
|
||||
sf->mv.auto_mv_step_size = 1;
|
||||
sf->adaptive_rd_thresh = 2;
|
||||
sf->mv.subpel_search_level = 1;
|
||||
if (cpi->oxcf.content != VP9E_CONTENT_FILM) sf->mode_skip_start = 10;
|
||||
sf->adaptive_pred_interp_filter = 1;
|
||||
sf->allow_acl = 0;
|
||||
|
||||
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
|
||||
|
@ -991,10 +994,14 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) {
|
|||
sf->exhaustive_searches_thresh =
|
||||
(cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20)
|
||||
: INT_MAX;
|
||||
if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
|
||||
{
|
||||
const int mesh_density_level =
|
||||
(cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? 0 : 1;
|
||||
for (i = 0; i < MAX_MESH_STEP; ++i) {
|
||||
sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range;
|
||||
sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval;
|
||||
sf->mesh_patterns[i].range =
|
||||
best_quality_mesh_pattern[mesh_density_level][i].range;
|
||||
sf->mesh_patterns[i].interval =
|
||||
best_quality_mesh_pattern[mesh_density_level][i].interval;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,44 @@
|
|||
/*
|
||||
* Copyright (c) 2023 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#ifndef VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
|
||||
#define VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// NOTE: despite its name, the fallback value below is ln(2) (~0.6931), not
// log2(e) (~1.4427). The log2f() macro relies on that: dividing the natural
// logarithm log(x) by ln(2) yields the base-2 logarithm of x.
// NOTE(review): the #ifndef guard means a pre-existing M_LOG2_E with a
// different value would change the result of log2f() -- confirm none of the
// targeted platforms define it.
#ifndef M_LOG2_E
|
||||
#define M_LOG2_E 0.693147180559945309417
|
||||
#endif
|
||||
// Function-like macro named log2f: expands to log(x) divided by the constant
// above (cast to float).
#define log2f(x) (log(x) / (float)M_LOG2_E)
|
||||
|
||||
typedef struct GF_PICTURE {
|
||||
YV12_BUFFER_CONFIG *frame;
|
||||
int ref_frame[3];
|
||||
FRAME_UPDATE_TYPE update_type;
|
||||
} GF_PICTURE;
|
||||
|
||||
void vp9_init_tpl_buffer(VP9_COMP *cpi);
|
||||
void vp9_setup_tpl_stats(VP9_COMP *cpi);
|
||||
void vp9_free_tpl_buffer(VP9_COMP *cpi);
|
||||
|
||||
void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
|
||||
TX_SIZE tx_size);
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
|
||||
TX_SIZE tx_size);
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
#endif // VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
|
|
@ -48,6 +48,29 @@ std::unique_ptr<VP9RateControlRTC> VP9RateControlRTC::Create(
|
|||
return rc_api;
|
||||
}
|
||||
|
||||
// Destructor. When cpi_ is non-null, frees the per-layer buffers (only if
// more than one spatial or temporal layer is configured), the cyclic-refresh
// state (only in CYCLIC_REFRESH_AQ mode), and finally cpi_ itself.
VP9RateControlRTC::~VP9RateControlRTC() {
|
||||
if (cpi_) {
|
||||
// Per-layer buffers are released only for multi-layer configurations.
if (cpi_->svc.number_spatial_layers > 1 ||
|
||||
cpi_->svc.number_temporal_layers > 1) {
|
||||
for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) {
|
||||
for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) {
|
||||
// NOTE(review): the index stride is oxcf.ts_number_layers while the loop
// bound is svc.number_temporal_layers -- presumably kept equal; confirm.
int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers);
|
||||
LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer];
|
||||
vpx_free(lc->map);
|
||||
vpx_free(lc->last_coded_q_map);
|
||||
vpx_free(lc->consec_zero_mv);
|
||||
}
|
||||
}
|
||||
}
|
||||
// Segmentation map and cyclic-refresh state are released only in
// CYCLIC_REFRESH_AQ mode.
if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
|
||||
vpx_free(cpi_->segmentation_map);
|
||||
cpi_->segmentation_map = NULL;
|
||||
vp9_cyclic_refresh_free(cpi_->cyclic_refresh);
|
||||
}
|
||||
// Free the encoder instance last; everything above is reached through it.
vpx_free(cpi_);
|
||||
}
|
||||
}
|
||||
|
||||
void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) {
|
||||
VP9_COMMON *cm = &cpi_->common;
|
||||
VP9EncoderConfig *oxcf = &cpi_->oxcf;
|
||||
|
@ -157,7 +180,7 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) {
|
|||
cm->height = height;
|
||||
}
|
||||
vp9_set_mb_mi(cm, cm->width, cm->height);
|
||||
cm->frame_type = frame_params.frame_type;
|
||||
cm->frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type);
|
||||
// This is needed to ensure key frame does not get unset in rc_get_svc_params.
|
||||
cpi_->frame_flags = (cm->frame_type == KEY_FRAME) ? FRAMEFLAGS_KEY : 0;
|
||||
cpi_->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
|
||||
|
|
|
@ -19,14 +19,14 @@
|
|||
#include "vp9/common/vp9_onyxc_int.h"
|
||||
#include "vp9/vp9_iface_common.h"
|
||||
#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
|
||||
#include "vp9/encoder/vp9_encoder.h"
|
||||
#include "vp9/encoder/vp9_firstpass.h"
|
||||
#include "vp9/vp9_cx_iface.h"
|
||||
#include "vpx/internal/vpx_ratectrl_rtc.h"
|
||||
#include "vpx_mem/vpx_mem.h"
|
||||
|
||||
namespace libvpx {
|
||||
struct VP9_COMP;
|
||||
|
||||
namespace libvpx {
|
||||
struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
|
||||
public:
|
||||
VP9RateControlRtcConfig() {
|
||||
|
@ -53,7 +53,7 @@ struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
|
|||
};
|
||||
|
||||
struct VP9FrameParamsQpRTC {
|
||||
FRAME_TYPE frame_type;
|
||||
RcFrameType frame_type;
|
||||
int spatial_layer_id;
|
||||
int temporal_layer_id;
|
||||
};
|
||||
|
@ -90,28 +90,7 @@ class VP9RateControlRTC {
|
|||
public:
|
||||
static std::unique_ptr<VP9RateControlRTC> Create(
|
||||
const VP9RateControlRtcConfig &cfg);
|
||||
~VP9RateControlRTC() {
|
||||
if (cpi_) {
|
||||
if (cpi_->svc.number_spatial_layers > 1 ||
|
||||
cpi_->svc.number_temporal_layers > 1) {
|
||||
for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) {
|
||||
for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) {
|
||||
int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers);
|
||||
LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer];
|
||||
vpx_free(lc->map);
|
||||
vpx_free(lc->last_coded_q_map);
|
||||
vpx_free(lc->consec_zero_mv);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
|
||||
vpx_free(cpi_->segmentation_map);
|
||||
cpi_->segmentation_map = NULL;
|
||||
vp9_cyclic_refresh_free(cpi_->cyclic_refresh);
|
||||
}
|
||||
vpx_free(cpi_);
|
||||
}
|
||||
}
|
||||
~VP9RateControlRTC();
|
||||
|
||||
void UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg);
|
||||
// GetQP() needs to be called after ComputeQP() to get the latest QP
|
||||
|
@ -125,7 +104,7 @@ class VP9RateControlRTC {
|
|||
private:
|
||||
VP9RateControlRTC() {}
|
||||
void InitRateControl(const VP9RateControlRtcConfig &cfg);
|
||||
VP9_COMP *cpi_;
|
||||
struct VP9_COMP *cpi_;
|
||||
};
|
||||
|
||||
} // namespace libvpx
|
||||
|
|
|
@ -1372,22 +1372,13 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
|
|||
timebase_units_to_ticks(timestamp_ratio, pts + duration);
|
||||
res = image2yuvconfig(img, &sd);
|
||||
|
||||
if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) {
|
||||
/* from vpx_encoder.h for g_w/g_h:
|
||||
"Note that the frames passed as input to the encoder must have this
|
||||
resolution"
|
||||
*/
|
||||
ctx->base.err_detail = "Invalid input frame resolution";
|
||||
res = VPX_CODEC_INVALID_PARAM;
|
||||
} else {
|
||||
// Store the original flags into the frame buffer. Will extract the
|
||||
// key frame flag when we actually encode this frame.
|
||||
if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
|
||||
// Store the original flags into the frame buffer. Will extract the
|
||||
// key frame flag when we actually encode this frame.
|
||||
if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
|
||||
dst_time_stamp, dst_end_time_stamp)) {
|
||||
res = update_error_state(ctx, &cpi->common.error);
|
||||
}
|
||||
ctx->next_frame_flags = 0;
|
||||
res = update_error_state(ctx, &cpi->common.error);
|
||||
}
|
||||
ctx->next_frame_flags = 0;
|
||||
}
|
||||
|
||||
cx_data = ctx->cx_data;
|
||||
|
@ -1684,9 +1675,8 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx,
|
|||
vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *);
|
||||
|
||||
if (mode) {
|
||||
const int res =
|
||||
vp9_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode,
|
||||
(VPX_SCALING)mode->v_scaling_mode);
|
||||
const int res = vp9_set_internal_size(ctx->cpi, mode->h_scaling_mode,
|
||||
mode->v_scaling_mode);
|
||||
return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM;
|
||||
}
|
||||
return VPX_CODEC_INVALID_PARAM;
|
||||
|
|
|
@ -104,6 +104,8 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c
|
|||
endif
|
||||
VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.c
|
||||
VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
|
||||
VP9_CX_SRCS-yes += encoder/vp9_tpl_model.c
|
||||
VP9_CX_SRCS-yes += encoder/vp9_tpl_model.h
|
||||
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
|
||||
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
|
||||
|
||||
|
|
|
@ -14,6 +14,9 @@
|
|||
#include "vpx/vpx_encoder.h"
|
||||
|
||||
namespace libvpx {
|
||||
|
||||
// Frame type carried in the RTC rate-control frame parameters
// (kKeyFrame = 0, kInterFrame = 1).
enum class RcFrameType { kKeyFrame = 0, kInterFrame = 1 };
|
||||
|
||||
struct VpxRateControlRtcConfig {
|
||||
public:
|
||||
VpxRateControlRtcConfig() {
|
||||
|
|
|
@ -165,8 +165,8 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
|
|||
// Transpose top left and top right quarters into one contiguous location to
|
||||
// process to the top half.
|
||||
|
||||
transpose_s16_8x8_new(&temp0[0], &temp2[0]);
|
||||
transpose_s16_8x8_new(&temp1[0], &temp2[8]);
|
||||
transpose_s16_8x8q(&temp0[0], &temp2[0]);
|
||||
transpose_s16_8x8q(&temp1[0], &temp2[8]);
|
||||
partial_round_shift(temp2);
|
||||
cross_input(temp2, temp3);
|
||||
vpx_fdct8x16_body(temp3, temp2);
|
||||
|
@ -180,7 +180,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
|
|||
|
||||
// Transpose bottom left and bottom right quarters into one contiguous
|
||||
// location to process to the bottom half.
|
||||
transpose_s16_8x8_new(&temp0[8], &temp1[0]);
|
||||
transpose_s16_8x8q(&temp0[8], &temp1[0]);
|
||||
|
||||
transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
|
||||
&temp1[13], &temp1[14], &temp1[15]);
|
||||
|
|
|
@ -60,10 +60,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
|
|||
dct_body_first_pass(temp5, temp4);
|
||||
|
||||
// Generate the top row by munging the first set of 8 from each one together.
|
||||
transpose_s16_8x8_new(&temp1[0], &temp0[0]);
|
||||
transpose_s16_8x8_new(&temp2[0], &temp0[8]);
|
||||
transpose_s16_8x8_new(&temp3[0], &temp0[16]);
|
||||
transpose_s16_8x8_new(&temp4[0], &temp0[24]);
|
||||
transpose_s16_8x8q(&temp1[0], &temp0[0]);
|
||||
transpose_s16_8x8q(&temp2[0], &temp0[8]);
|
||||
transpose_s16_8x8q(&temp3[0], &temp0[16]);
|
||||
transpose_s16_8x8q(&temp4[0], &temp0[24]);
|
||||
|
||||
dct_body_second_pass(temp0, temp5);
|
||||
|
||||
|
@ -78,10 +78,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
|
|||
store(output, temp5);
|
||||
|
||||
// Second row of 8x32.
|
||||
transpose_s16_8x8_new(&temp1[8], &temp0[0]);
|
||||
transpose_s16_8x8_new(&temp2[8], &temp0[8]);
|
||||
transpose_s16_8x8_new(&temp3[8], &temp0[16]);
|
||||
transpose_s16_8x8_new(&temp4[8], &temp0[24]);
|
||||
transpose_s16_8x8q(&temp1[8], &temp0[0]);
|
||||
transpose_s16_8x8q(&temp2[8], &temp0[8]);
|
||||
transpose_s16_8x8q(&temp3[8], &temp0[16]);
|
||||
transpose_s16_8x8q(&temp4[8], &temp0[24]);
|
||||
|
||||
dct_body_second_pass(temp0, temp5);
|
||||
|
||||
|
@ -96,10 +96,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
|
|||
store(output + 8 * 32, temp5);
|
||||
|
||||
// Third row of 8x32
|
||||
transpose_s16_8x8_new(&temp1[16], &temp0[0]);
|
||||
transpose_s16_8x8_new(&temp2[16], &temp0[8]);
|
||||
transpose_s16_8x8_new(&temp3[16], &temp0[16]);
|
||||
transpose_s16_8x8_new(&temp4[16], &temp0[24]);
|
||||
transpose_s16_8x8q(&temp1[16], &temp0[0]);
|
||||
transpose_s16_8x8q(&temp2[16], &temp0[8]);
|
||||
transpose_s16_8x8q(&temp3[16], &temp0[16]);
|
||||
transpose_s16_8x8q(&temp4[16], &temp0[24]);
|
||||
|
||||
dct_body_second_pass(temp0, temp5);
|
||||
|
||||
|
@ -114,10 +114,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
|
|||
store(output + 16 * 32, temp5);
|
||||
|
||||
// Final row of 8x32.
|
||||
transpose_s16_8x8_new(&temp1[24], &temp0[0]);
|
||||
transpose_s16_8x8_new(&temp2[24], &temp0[8]);
|
||||
transpose_s16_8x8_new(&temp3[24], &temp0[16]);
|
||||
transpose_s16_8x8_new(&temp4[24], &temp0[24]);
|
||||
transpose_s16_8x8q(&temp1[24], &temp0[0]);
|
||||
transpose_s16_8x8q(&temp2[24], &temp0[8]);
|
||||
transpose_s16_8x8q(&temp3[24], &temp0[16]);
|
||||
transpose_s16_8x8q(&temp4[24], &temp0[24]);
|
||||
|
||||
dct_body_second_pass(temp0, temp5);
|
||||
|
||||
|
@ -159,10 +159,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
|
|||
dct_body_first_pass(temp5, temp4);
|
||||
|
||||
// Generate the top row by munging the first set of 8 from each one together.
|
||||
transpose_s16_8x8_new(&temp1[0], &temp0[0]);
|
||||
transpose_s16_8x8_new(&temp2[0], &temp0[8]);
|
||||
transpose_s16_8x8_new(&temp3[0], &temp0[16]);
|
||||
transpose_s16_8x8_new(&temp4[0], &temp0[24]);
|
||||
transpose_s16_8x8q(&temp1[0], &temp0[0]);
|
||||
transpose_s16_8x8q(&temp2[0], &temp0[8]);
|
||||
transpose_s16_8x8q(&temp3[0], &temp0[16]);
|
||||
transpose_s16_8x8q(&temp4[0], &temp0[24]);
|
||||
|
||||
dct_body_second_pass_rd(temp0, temp5);
|
||||
|
||||
|
@ -177,10 +177,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
|
|||
store(output, temp5);
|
||||
|
||||
// Second row of 8x32.
|
||||
transpose_s16_8x8_new(&temp1[8], &temp0[0]);
|
||||
transpose_s16_8x8_new(&temp2[8], &temp0[8]);
|
||||
transpose_s16_8x8_new(&temp3[8], &temp0[16]);
|
||||
transpose_s16_8x8_new(&temp4[8], &temp0[24]);
|
||||
transpose_s16_8x8q(&temp1[8], &temp0[0]);
|
||||
transpose_s16_8x8q(&temp2[8], &temp0[8]);
|
||||
transpose_s16_8x8q(&temp3[8], &temp0[16]);
|
||||
transpose_s16_8x8q(&temp4[8], &temp0[24]);
|
||||
|
||||
dct_body_second_pass_rd(temp0, temp5);
|
||||
|
||||
|
@ -195,10 +195,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
|
|||
store(output + 8 * 32, temp5);
|
||||
|
||||
// Third row of 8x32
|
||||
transpose_s16_8x8_new(&temp1[16], &temp0[0]);
|
||||
transpose_s16_8x8_new(&temp2[16], &temp0[8]);
|
||||
transpose_s16_8x8_new(&temp3[16], &temp0[16]);
|
||||
transpose_s16_8x8_new(&temp4[16], &temp0[24]);
|
||||
transpose_s16_8x8q(&temp1[16], &temp0[0]);
|
||||
transpose_s16_8x8q(&temp2[16], &temp0[8]);
|
||||
transpose_s16_8x8q(&temp3[16], &temp0[16]);
|
||||
transpose_s16_8x8q(&temp4[16], &temp0[24]);
|
||||
|
||||
dct_body_second_pass_rd(temp0, temp5);
|
||||
|
||||
|
@ -213,10 +213,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
|
|||
store(output + 16 * 32, temp5);
|
||||
|
||||
// Final row of 8x32.
|
||||
transpose_s16_8x8_new(&temp1[24], &temp0[0]);
|
||||
transpose_s16_8x8_new(&temp2[24], &temp0[8]);
|
||||
transpose_s16_8x8_new(&temp3[24], &temp0[16]);
|
||||
transpose_s16_8x8_new(&temp4[24], &temp0[24]);
|
||||
transpose_s16_8x8q(&temp1[24], &temp0[0]);
|
||||
transpose_s16_8x8q(&temp2[24], &temp0[8]);
|
||||
transpose_s16_8x8q(&temp3[24], &temp0[16]);
|
||||
transpose_s16_8x8q(&temp4[24], &temp0[24]);
|
||||
|
||||
dct_body_second_pass_rd(temp0, temp5);
|
||||
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
/*
|
||||
* Copyright (c) 2023 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
#include "./vpx_config.h"
|
||||
|
||||
/*
 * Writes the rounding-halving sum (vrhadd) of `pred` and `ref` into
 * `comp_pred`, one row at a time.
 *
 * `comp_pred` and `pred` advance by `width` elements per row, `ref` advances
 * by `ref_stride`. Widths of 8 and above are handled 8 lanes at a time; the
 * remaining path handles 4 lanes and asserts width == 4. Every path is a
 * do/while loop counting `height` down to zero, so at least one row is always
 * processed.
 */
void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred,
                                   int width, int height, const uint16_t *ref,
                                   int ref_stride) {
  int rows_left = height;

  if (width == 8) {
    /* A single 8-lane vector covers the whole row. */
    do {
      vst1q_u16(comp_pred, vrhaddq_u16(vld1q_u16(pred), vld1q_u16(ref)));

      comp_pred += width;
      pred += width;
      ref += ref_stride;
    } while (--rows_left != 0);
    return;
  }

  if (width > 8) {
    /* Walk each row in steps of 8 lanes until `width` is reached. */
    do {
      int col = 0;
      do {
        const uint16x8_t avg =
            vrhaddq_u16(vld1q_u16(pred + col), vld1q_u16(ref + col));
        vst1q_u16(comp_pred + col, avg);
        col += 8;
      } while (col < width);

      comp_pred += width;
      pred += width;
      ref += ref_stride;
    } while (--rows_left != 0);
    return;
  }

  /* Only the 4-wide case is expected to reach this point. */
  assert(width == 4);
  do {
    vst1_u16(comp_pred, vrhadd_u16(vld1_u16(pred), vld1_u16(ref)));

    comp_pred += width;
    pred += width;
    ref += ref_stride;
  } while (--rows_left != 0);
}
|
|
@ -0,0 +1,233 @@
|
|||
/*
|
||||
* Copyright (c) 2023 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
#include "./vpx_config.h"
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
|
||||
#include "vpx/vpx_integer.h"
|
||||
#include "vpx_dsp/arm/mem_neon.h"
|
||||
#include "vpx_dsp/arm/sum_neon.h"
|
||||
|
||||
/*
 * Accumulates absolute differences between a 4-element-wide, h-row source
 * block and each of four reference blocks, then stores the four totals in
 * res[0..3].
 *
 * All pointers go through CONVERT_TO_SHORTPTR before being read as uint16_t
 * (presumably the usual high-bitdepth pointer encoding - confirm against the
 * macro definition). Row `row` is read at offset row * stride. The loop is a
 * do/while, so the first row is always processed.
 */
static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *const ref_ptr[4],
                                         int ref_stride, uint32_t res[4],
                                         int h) {
  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16[4];
  uint32x4_t acc[4];
  int k;
  int row = 0;

  for (k = 0; k < 4; ++k) {
    ref16[k] = CONVERT_TO_SHORTPTR(ref_ptr[k]);
    acc[k] = vdupq_n_u32(0);
  }

  do {
    const uint16x4_t s = vld1_u16(src16 + row * src_stride);
    const int ref_off = row * ref_stride;

    /* Widening absolute-difference accumulate, one accumulator per ref. */
    for (k = 0; k < 4; ++k) {
      acc[k] = vabal_u16(acc[k], s, vld1_u16(ref16[k] + ref_off));
    }
  } while (++row < h);

  vst1q_u32(res, horizontal_add_4d_uint32x4(acc));
}
|
||||
|
||||
/*
 * Adds the absolute differences of eight uint16 lanes (src vs ref) into the
 * four 32-bit lanes of *sad_sum, using a pairwise widening accumulate.
 */
static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref,
                             uint32x4_t *const sad_sum) {
  *sad_sum = vpadalq_u16(*sad_sum, vabdq_u16(src, ref));
}
|
||||
|
||||
/*
 * Accumulates absolute differences between an 8-element-wide, h-row source
 * block and each of four reference blocks, then stores the four totals in
 * res[0..3].
 *
 * Each source row is loaded once and compared against the same row of all
 * four references via sad8_neon(). The loop is a do/while, so the first row
 * is always processed.
 */
static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *const ref_ptr[4],
                                         int ref_stride, uint32_t res[4],
                                         int h) {
  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16[4];
  uint32x4_t acc[4];
  int k;
  int row = 0;

  for (k = 0; k < 4; ++k) {
    ref16[k] = CONVERT_TO_SHORTPTR(ref_ptr[k]);
    acc[k] = vdupq_n_u32(0);
  }

  do {
    const uint16x8_t s = vld1q_u16(src16 + row * src_stride);
    const int ref_off = row * ref_stride;

    for (k = 0; k < 4; ++k) {
      sad8_neon(s, vld1q_u16(ref16[k] + ref_off), &acc[k]);
    }
  } while (++row < h);

  vst1q_u32(res, horizontal_add_4d_uint32x4(acc));
}
|
||||
|
||||
/*
 * Accumulates absolute differences between a 16-element-wide, h-row source
 * block and each of four reference blocks, then stores the four totals in
 * res[0..3].
 *
 * The low and high 8-lane halves of every row feed separate accumulators,
 * which are added together only after the row loop. The loop is a do/while,
 * so the first row is always processed.
 */
static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *const ref_ptr[4],
                                          int ref_stride, uint32_t res[4],
                                          int h) {
  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16[4];
  uint32x4_t acc_lo[4];
  uint32x4_t acc_hi[4];
  int k;
  int row = 0;

  for (k = 0; k < 4; ++k) {
    ref16[k] = CONVERT_TO_SHORTPTR(ref_ptr[k]);
    acc_lo[k] = vdupq_n_u32(0);
    acc_hi[k] = vdupq_n_u32(0);
  }

  do {
    const uint16_t *const s = src16 + row * src_stride;
    const int ref_off = row * ref_stride;
    const uint16x8_t s_lo = vld1q_u16(s);
    const uint16x8_t s_hi = vld1q_u16(s + 8);

    for (k = 0; k < 4; ++k) {
      const uint16_t *const r = ref16[k] + ref_off;
      sad8_neon(s_lo, vld1q_u16(r), &acc_lo[k]);
      sad8_neon(s_hi, vld1q_u16(r + 8), &acc_hi[k]);
    }
  } while (++row < h);

  /* Fold the two half-row accumulators before the final reduction. */
  for (k = 0; k < 4; ++k) {
    acc_lo[k] = vaddq_u32(acc_lo[k], acc_hi[k]);
  }

  vst1q_u32(res, horizontal_add_4d_uint32x4(acc_lo));
}
|
||||
|
||||
/*
 * Generic wide kernel: accumulates absolute differences between a w-by-h
 * source block and each of four reference blocks, then stores the four
 * totals in res[0..3].
 *
 * Each row is consumed in chunks of 32 elements (four 8-lane vectors). The
 * vectors at offsets +0 and +16 feed one accumulator set, those at +8 and
 * +24 feed a second set; the two sets are added after the loops. Both loops
 * are do/while, so at least one 32-element chunk of the first row is always
 * read.
 */
static INLINE void highbd_sadwxhx4d_neon(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *const ref_ptr[4],
                                         int ref_stride, uint32_t res[4], int w,
                                         int h) {
  const uint16_t *const src16 = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref16[4];
  uint32x4_t acc_even[4];
  uint32x4_t acc_odd[4];
  int k;
  int row = 0;

  for (k = 0; k < 4; ++k) {
    ref16[k] = CONVERT_TO_SHORTPTR(ref_ptr[k]);
    acc_even[k] = vdupq_n_u32(0);
    acc_odd[k] = vdupq_n_u32(0);
  }

  do {
    int col = 0;
    do {
      const uint16_t *const s = src16 + row * src_stride + col;
      const uint16x8_t s0 = vld1q_u16(s);
      const uint16x8_t s1 = vld1q_u16(s + 8);
      const uint16x8_t s2 = vld1q_u16(s + 16);
      const uint16x8_t s3 = vld1q_u16(s + 24);

      for (k = 0; k < 4; ++k) {
        const uint16_t *const r = ref16[k] + row * ref_stride + col;
        sad8_neon(s0, vld1q_u16(r), &acc_even[k]);
        sad8_neon(s1, vld1q_u16(r + 8), &acc_odd[k]);
        sad8_neon(s2, vld1q_u16(r + 16), &acc_even[k]);
        sad8_neon(s3, vld1q_u16(r + 24), &acc_odd[k]);
      }

      col += 32;
    } while (col < w);
  } while (++row < h);

  /* Merge the two accumulator sets before the final reduction. */
  for (k = 0; k < 4; ++k) {
    acc_even[k] = vaddq_u32(acc_even[k], acc_odd[k]);
  }

  vst1q_u32(res, horizontal_add_4d_uint32x4(acc_even));
}
|
||||
|
||||
/* 64-wide variant: forwards to the generic wide kernel with w fixed at 64. */
static INLINE void highbd_sad64xhx4d_neon(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *const ref_ptr[4],
                                          int ref_stride, uint32_t res[4],
                                          int h) {
  enum { kBlockWidth = 64 };
  highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
                        kBlockWidth, h);
}
|
||||
|
||||
/* 32-wide variant: forwards to the generic wide kernel with w fixed at 32. */
static INLINE void highbd_sad32xhx4d_neon(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *const ref_ptr[4],
                                          int ref_stride, uint32_t res[4],
                                          int h) {
  enum { kBlockWidth = 32 };
  highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
                        kBlockWidth, h);
}
|
||||
|
||||
/*
 * Emits the public entry point vpx_highbd_sad<bw>x<bh>x4d_neon(), which
 * forwards to the static highbd_sad<bw>xhx4d_neon() kernel with the row
 * count fixed at bh.
 */
#define HBD_SAD_WXH_4D_NEON(bw, bh)                                          \
  void vpx_highbd_sad##bw##x##bh##x4d_neon(                                  \
      const uint8_t *src, int src_stride, const uint8_t *const ref[4],       \
      int ref_stride, uint32_t res[4]) {                                     \
    highbd_sad##bw##xhx4d_neon(src, src_stride, ref, ref_stride, res, (bh)); \
  }

/* 4-wide blocks. */
HBD_SAD_WXH_4D_NEON(4, 4)
HBD_SAD_WXH_4D_NEON(4, 8)

/* 8-wide blocks. */
HBD_SAD_WXH_4D_NEON(8, 4)
HBD_SAD_WXH_4D_NEON(8, 8)
HBD_SAD_WXH_4D_NEON(8, 16)

/* 16-wide blocks. */
HBD_SAD_WXH_4D_NEON(16, 8)
HBD_SAD_WXH_4D_NEON(16, 16)
HBD_SAD_WXH_4D_NEON(16, 32)

/* 32-wide blocks. */
HBD_SAD_WXH_4D_NEON(32, 16)
HBD_SAD_WXH_4D_NEON(32, 32)
HBD_SAD_WXH_4D_NEON(32, 64)

/* 64-wide blocks. */
HBD_SAD_WXH_4D_NEON(64, 32)
HBD_SAD_WXH_4D_NEON(64, 64)
|
|
@ -17,209 +17,363 @@
|
|||
#include "vpx_dsp/arm/mem_neon.h"
|
||||
#include "vpx_dsp/arm/sum_neon.h"
|
||||
|
||||
static VPX_FORCE_INLINE uint32_t highbd_sad4_neon(const uint8_t *src_ptr,
|
||||
int src_stride,
|
||||
const uint8_t *ref_ptr,
|
||||
int ref_stride, int width,
|
||||
int height) {
|
||||
int i, j;
|
||||
uint32x4_t sum_abs_diff = vdupq_n_u32(0);
|
||||
static INLINE uint32_t highbd_sad4xh_neon(const uint8_t *src_ptr,
|
||||
int src_stride,
|
||||
const uint8_t *ref_ptr,
|
||||
int ref_stride, int h) {
|
||||
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
|
||||
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
|
||||
for (i = 0; i < height; i++) {
|
||||
for (j = 0; j < width; j += 4) {
|
||||
const uint16x4_t src_u16 = vld1_u16(src16_ptr + j);
|
||||
const uint16x4_t ref_u16 = vld1_u16(ref16_ptr + j);
|
||||
sum_abs_diff = vabal_u16(sum_abs_diff, src_u16, ref_u16);
|
||||
}
|
||||
uint32x4_t sum = vdupq_n_u32(0);
|
||||
|
||||
int i = h;
|
||||
do {
|
||||
uint16x4_t s = vld1_u16(src16_ptr);
|
||||
uint16x4_t r = vld1_u16(ref16_ptr);
|
||||
sum = vabal_u16(sum, s, r);
|
||||
|
||||
src16_ptr += src_stride;
|
||||
ref16_ptr += ref_stride;
|
||||
}
|
||||
} while (--i != 0);
|
||||
|
||||
return horizontal_add_uint32x4(sum_abs_diff);
|
||||
return horizontal_add_uint32x4(sum);
|
||||
}
|
||||
|
||||
static VPX_FORCE_INLINE uint32_t highbd_sad8_neon(const uint8_t *src_ptr,
|
||||
int src_stride,
|
||||
const uint8_t *ref_ptr,
|
||||
int ref_stride, int width,
|
||||
int height) {
|
||||
int i, j;
|
||||
uint32x4_t sum_abs_diff = vdupq_n_u32(0);
|
||||
static INLINE uint32_t highbd_sad8xh_neon(const uint8_t *src_ptr,
|
||||
int src_stride,
|
||||
const uint8_t *ref_ptr,
|
||||
int ref_stride, int h) {
|
||||
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
|
||||
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
|
||||
for (i = 0; i < height; i++) {
|
||||
for (j = 0; j < width; j += 8) {
|
||||
const uint16x8_t src_u16 = vld1q_u16(src16_ptr + j);
|
||||
const uint16x8_t ref_u16 = vld1q_u16(ref16_ptr + j);
|
||||
sum_abs_diff =
|
||||
vabal_u16(sum_abs_diff, vget_low_u16(src_u16), vget_low_u16(ref_u16));
|
||||
sum_abs_diff = vabal_u16(sum_abs_diff, vget_high_u16(src_u16),
|
||||
vget_high_u16(ref_u16));
|
||||
}
|
||||
uint32x4_t sum = vdupq_n_u32(0);
|
||||
|
||||
int i = h;
|
||||
do {
|
||||
uint16x8_t s = vld1q_u16(src16_ptr);
|
||||
uint16x8_t r = vld1q_u16(ref16_ptr);
|
||||
uint16x8_t diff = vabdq_u16(s, r);
|
||||
sum = vpadalq_u16(sum, diff);
|
||||
|
||||
src16_ptr += src_stride;
|
||||
ref16_ptr += ref_stride;
|
||||
}
|
||||
} while (--i != 0);
|
||||
|
||||
return horizontal_add_uint32x4(sum_abs_diff);
|
||||
return horizontal_add_uint32x4(sum);
|
||||
}
|
||||
|
||||
static VPX_FORCE_INLINE uint32_t highbd_sad4_avg_neon(
|
||||
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
|
||||
int ref_stride, const uint8_t *second_pred, int width, int height) {
|
||||
int i, j;
|
||||
uint32x4_t sum_abs_diff = vdupq_n_u32(0);
|
||||
static INLINE uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr,
|
||||
int src_stride,
|
||||
const uint8_t *ref_ptr,
|
||||
int ref_stride, int h) {
|
||||
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
|
||||
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
|
||||
const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred);
|
||||
for (i = 0; i < height; i++) {
|
||||
for (j = 0; j < width; j += 4) {
|
||||
const uint16x4_t a_u16 = vld1_u16(src16_ptr + j);
|
||||
const uint16x4_t b_u16 = vld1_u16(ref16_ptr + j);
|
||||
const uint16x4_t c_u16 = vld1_u16(pred_ptr + j);
|
||||
const uint16x4_t avg = vrhadd_u16(b_u16, c_u16);
|
||||
sum_abs_diff = vabal_u16(sum_abs_diff, a_u16, avg);
|
||||
}
|
||||
uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
|
||||
|
||||
int i = h;
|
||||
do {
|
||||
uint16x8_t s0, s1, r0, r1;
|
||||
uint16x8_t diff0, diff1;
|
||||
|
||||
s0 = vld1q_u16(src16_ptr);
|
||||
r0 = vld1q_u16(ref16_ptr);
|
||||
diff0 = vabdq_u16(s0, r0);
|
||||
sum[0] = vpadalq_u16(sum[0], diff0);
|
||||
|
||||
s1 = vld1q_u16(src16_ptr + 8);
|
||||
r1 = vld1q_u16(ref16_ptr + 8);
|
||||
diff1 = vabdq_u16(s1, r1);
|
||||
sum[1] = vpadalq_u16(sum[1], diff1);
|
||||
|
||||
src16_ptr += src_stride;
|
||||
ref16_ptr += ref_stride;
|
||||
pred_ptr += width;
|
||||
}
|
||||
} while (--i != 0);
|
||||
|
||||
return horizontal_add_uint32x4(sum_abs_diff);
|
||||
sum[0] = vaddq_u32(sum[0], sum[1]);
|
||||
return horizontal_add_uint32x4(sum[0]);
|
||||
}
|
||||
|
||||
static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon(
|
||||
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
|
||||
int ref_stride, const uint8_t *second_pred, int width, int height) {
|
||||
int i, j;
|
||||
uint32x4_t sum_abs_diff = vdupq_n_u32(0);
|
||||
static INLINE uint32_t highbd_sadwxh_neon(const uint8_t *src_ptr,
|
||||
int src_stride,
|
||||
const uint8_t *ref_ptr,
|
||||
int ref_stride, int w, int h) {
|
||||
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
|
||||
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
|
||||
const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred);
|
||||
for (i = 0; i < height; i++) {
|
||||
for (j = 0; j < width; j += 8) {
|
||||
const uint16x8_t a_u16 = vld1q_u16(src16_ptr + j);
|
||||
const uint16x8_t b_u16 = vld1q_u16(ref16_ptr + j);
|
||||
const uint16x8_t c_u16 = vld1q_u16(pred_ptr + j);
|
||||
const uint16x8_t avg = vrhaddq_u16(b_u16, c_u16);
|
||||
sum_abs_diff =
|
||||
vabal_u16(sum_abs_diff, vget_low_u16(a_u16), vget_low_u16(avg));
|
||||
sum_abs_diff =
|
||||
vabal_u16(sum_abs_diff, vget_high_u16(a_u16), vget_high_u16(avg));
|
||||
}
|
||||
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
|
||||
vdupq_n_u32(0) };
|
||||
|
||||
int i = h;
|
||||
do {
|
||||
int j = 0;
|
||||
do {
|
||||
uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3;
|
||||
uint16x8_t diff0, diff1, diff2, diff3;
|
||||
|
||||
s0 = vld1q_u16(src16_ptr + j);
|
||||
r0 = vld1q_u16(ref16_ptr + j);
|
||||
diff0 = vabdq_u16(s0, r0);
|
||||
sum[0] = vpadalq_u16(sum[0], diff0);
|
||||
|
||||
s1 = vld1q_u16(src16_ptr + j + 8);
|
||||
r1 = vld1q_u16(ref16_ptr + j + 8);
|
||||
diff1 = vabdq_u16(s1, r1);
|
||||
sum[1] = vpadalq_u16(sum[1], diff1);
|
||||
|
||||
s2 = vld1q_u16(src16_ptr + j + 16);
|
||||
r2 = vld1q_u16(ref16_ptr + j + 16);
|
||||
diff2 = vabdq_u16(s2, r2);
|
||||
sum[2] = vpadalq_u16(sum[2], diff2);
|
||||
|
||||
s3 = vld1q_u16(src16_ptr + j + 24);
|
||||
r3 = vld1q_u16(ref16_ptr + j + 24);
|
||||
diff3 = vabdq_u16(s3, r3);
|
||||
sum[3] = vpadalq_u16(sum[3], diff3);
|
||||
|
||||
j += 32;
|
||||
} while (j < w);
|
||||
|
||||
src16_ptr += src_stride;
|
||||
ref16_ptr += ref_stride;
|
||||
pred_ptr += width;
|
||||
}
|
||||
} while (--i != 0);
|
||||
|
||||
return horizontal_add_uint32x4(sum_abs_diff);
|
||||
sum[0] = vaddq_u32(sum[0], sum[1]);
|
||||
sum[2] = vaddq_u32(sum[2], sum[3]);
|
||||
sum[0] = vaddq_u32(sum[0], sum[2]);
|
||||
|
||||
return horizontal_add_uint32x4(sum[0]);
|
||||
}
|
||||
|
||||
#define highbd_sad4MxN(m, n) \
|
||||
unsigned int vpx_highbd_sad##m##x##n##_neon( \
|
||||
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
|
||||
int ref_stride) { \
|
||||
return highbd_sad4_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
|
||||
/* 64-wide SAD: forwards to the generic wide kernel with w fixed at 64. */
static INLINE unsigned int highbd_sad64xh_neon(const uint8_t *src_ptr,
                                               int src_stride,
                                               const uint8_t *ref_ptr,
                                               int ref_stride, int h) {
  enum { kBlockWidth = 64 };
  return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride,
                            kBlockWidth, h);
}
|
||||
|
||||
/* 32-wide SAD: forwards to the generic wide kernel with w fixed at 32. */
static INLINE unsigned int highbd_sad32xh_neon(const uint8_t *src_ptr,
                                               int src_stride,
                                               const uint8_t *ref_ptr,
                                               int ref_stride, int h) {
  enum { kBlockWidth = 32 };
  return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride,
                            kBlockWidth, h);
}
|
||||
|
||||
/*
 * Emits the public entry point vpx_highbd_sad<bw>x<bh>_neon(), which
 * forwards to the static highbd_sad<bw>xh_neon() kernel with the row count
 * fixed at bh.
 */
#define HBD_SAD_WXH_NEON(bw, bh)                                            \
  unsigned int vpx_highbd_sad##bw##x##bh##_neon(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref,               \
      int ref_stride) {                                                     \
    return highbd_sad##bw##xh_neon(src, src_stride, ref, ref_stride, (bh)); \
  }
|
||||
|
||||
#define highbd_sadMxN(m, n) \
|
||||
unsigned int vpx_highbd_sad##m##x##n##_neon( \
|
||||
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
|
||||
int ref_stride) { \
|
||||
return highbd_sad8_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
|
||||
HBD_SAD_WXH_NEON(4, 4)
|
||||
HBD_SAD_WXH_NEON(4, 8)
|
||||
|
||||
HBD_SAD_WXH_NEON(8, 4)
|
||||
HBD_SAD_WXH_NEON(8, 8)
|
||||
HBD_SAD_WXH_NEON(8, 16)
|
||||
|
||||
HBD_SAD_WXH_NEON(16, 8)
|
||||
HBD_SAD_WXH_NEON(16, 16)
|
||||
HBD_SAD_WXH_NEON(16, 32)
|
||||
|
||||
HBD_SAD_WXH_NEON(32, 16)
|
||||
HBD_SAD_WXH_NEON(32, 32)
|
||||
HBD_SAD_WXH_NEON(32, 64)
|
||||
|
||||
HBD_SAD_WXH_NEON(64, 32)
|
||||
HBD_SAD_WXH_NEON(64, 64)
|
||||
|
||||
/*
 * SAD between a 4-element-wide, h-row source block and the rounding-halving
 * sum (vrhadd) of the reference block and `second_pred`.
 *
 * `second_pred` is read as a packed buffer advancing 4 elements per row; the
 * source and reference advance by their own strides. The loop body runs
 * before the row counter is tested, so the first row is always processed.
 */
static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr,
                                              int src_stride,
                                              const uint8_t *ref_ptr,
                                              int ref_stride, int h,
                                              const uint8_t *second_pred) {
  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref_ptr);
  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(second_pred);
  uint32x4_t acc = vdupq_n_u32(0);
  int rows_left = h;

  for (;;) {
    const uint16x4_t blended =
        vrhadd_u16(vld1_u16(ref_row), vld1_u16(pred_row));
    acc = vabal_u16(acc, vld1_u16(src_row), blended);

    src_row += src_stride;
    ref_row += ref_stride;
    pred_row += 4;

    if (--rows_left == 0) break;
  }

  return horizontal_add_uint32x4(acc);
}
|
||||
|
||||
/*
 * SAD between an 8-element-wide, h-row source block and the rounding-halving
 * sum (vrhaddq) of the reference block and `second_pred`.
 *
 * `second_pred` is read as a packed buffer advancing 8 elements per row; the
 * source and reference advance by their own strides. The loop body runs
 * before the row counter is tested, so the first row is always processed.
 */
static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr,
                                              int src_stride,
                                              const uint8_t *ref_ptr,
                                              int ref_stride, int h,
                                              const uint8_t *second_pred) {
  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref_ptr);
  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(second_pred);
  uint32x4_t acc = vdupq_n_u32(0);
  int rows_left = h;

  for (;;) {
    const uint16x8_t blended =
        vrhaddq_u16(vld1q_u16(ref_row), vld1q_u16(pred_row));
    /* Pairwise-widen the eight 16-bit differences into four 32-bit lanes. */
    acc = vpadalq_u16(acc, vabdq_u16(vld1q_u16(src_row), blended));

    src_row += src_stride;
    ref_row += ref_stride;
    pred_row += 8;

    if (--rows_left == 0) break;
  }

  return horizontal_add_uint32x4(acc);
}
|
||||
|
||||
/*
 * SAD between a 16-element-wide, h-row source block and the rounding-halving
 * sum (vrhaddq) of the reference block and `second_pred`.
 *
 * The two 8-lane halves of each row feed separate accumulators that are
 * added together after the row loop. `second_pred` advances 16 elements per
 * row. The loop is a do/while, so the first row is always processed.
 */
static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr,
                                               int src_stride,
                                               const uint8_t *ref_ptr,
                                               int ref_stride, int h,
                                               const uint8_t *second_pred) {
  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref_ptr);
  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(second_pred);
  uint32x4_t acc[2];
  int rows_left = h;

  acc[0] = vdupq_n_u32(0);
  acc[1] = vdupq_n_u32(0);

  do {
    int half;
    for (half = 0; half < 2; ++half) {
      const int off = 8 * half;
      const uint16x8_t blended =
          vrhaddq_u16(vld1q_u16(ref_row + off), vld1q_u16(pred_row + off));
      acc[half] = vpadalq_u16(
          acc[half], vabdq_u16(vld1q_u16(src_row + off), blended));
    }

    src_row += src_stride;
    ref_row += ref_stride;
    pred_row += 16;
  } while (--rows_left != 0);

  return horizontal_add_uint32x4(vaddq_u32(acc[0], acc[1]));
}
|
||||
|
||||
/*
 * Generic wide kernel: SAD between a w-by-h source block and the
 * rounding-halving sum (vrhaddq) of the reference block and `second_pred`.
 *
 * Each row is consumed in chunks of 32 elements; the four 8-lane vectors of
 * a chunk feed four separate accumulators that are summed after the loops.
 * `second_pred` advances w elements per row. Both loops are do/while, so at
 * least one 32-element chunk of the first row is always read.
 */
static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr,
                                              int src_stride,
                                              const uint8_t *ref_ptr,
                                              int ref_stride, int w, int h,
                                              const uint8_t *second_pred) {
  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src_ptr);
  const uint16_t *ref_row = CONVERT_TO_SHORTPTR(ref_ptr);
  const uint16_t *pred_row = CONVERT_TO_SHORTPTR(second_pred);
  uint32x4_t acc[4];
  int rows_left = h;
  int q;

  for (q = 0; q < 4; ++q) acc[q] = vdupq_n_u32(0);

  do {
    int col = 0;
    do {
      /* One 32-element chunk: vector q covers elements col+8q .. col+8q+7. */
      for (q = 0; q < 4; ++q) {
        const int off = col + 8 * q;
        const uint16x8_t blended =
            vrhaddq_u16(vld1q_u16(ref_row + off), vld1q_u16(pred_row + off));
        acc[q] =
            vpadalq_u16(acc[q], vabdq_u16(vld1q_u16(src_row + off), blended));
      }

      col += 32;
    } while (col < w);

    src_row += src_stride;
    ref_row += ref_stride;
    pred_row += w;
  } while (--rows_left != 0);

  return horizontal_add_uint32x4(
      vaddq_u32(vaddq_u32(acc[0], acc[1]), vaddq_u32(acc[2], acc[3])));
}
|
||||
|
||||
/* 64-wide averaging SAD: forwards to the generic wide kernel with w = 64. */
static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr,
                                                   int src_stride,
                                                   const uint8_t *ref_ptr,
                                                   int ref_stride, int h,
                                                   const uint8_t *second_pred) {
  enum { kBlockWidth = 64 };
  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride,
                                kBlockWidth, h, second_pred);
}
|
||||
|
||||
/* 32-wide averaging SAD: forwards to the generic wide kernel with w = 32. */
static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr,
                                                   int src_stride,
                                                   const uint8_t *ref_ptr,
                                                   int ref_stride, int h,
                                                   const uint8_t *second_pred) {
  enum { kBlockWidth = 32 };
  return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride,
                                kBlockWidth, h, second_pred);
}
|
||||
|
||||
/*
 * Emits the public entry point vpx_highbd_sad<bw>x<bh>_avg_neon(), which
 * forwards to the static highbd_sad<bw>xh_avg_neon() kernel with the row
 * count fixed at bh.
 */
#define HBD_SAD_WXH_AVG_NEON(bw, bh)                                          \
  uint32_t vpx_highbd_sad##bw##x##bh##_avg_neon(                              \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred) {                                           \
    return highbd_sad##bw##xh_avg_neon(src, src_stride, ref, ref_stride,      \
                                       (bh), second_pred);                    \
  }
|
||||
|
||||
// Defines vpx_highbd_sad<m>x<n>_avg_neon() by forwarding to
// highbd_sad4_avg_neon() with the block dimensions m and n appended as the
// last two arguments.
#define highbd_sad4MxN_avg(m, n) \
|
||||
unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \
|
||||
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
|
||||
int ref_stride, const uint8_t *second_pred) { \
|
||||
return highbd_sad4_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \
|
||||
second_pred, m, n); \
|
||||
}
|
||||
HBD_SAD_WXH_AVG_NEON(4, 4)
|
||||
HBD_SAD_WXH_AVG_NEON(4, 8)
|
||||
|
||||
// Defines vpx_highbd_sad<m>x<n>_avg_neon() by forwarding to
// highbd_sad8_avg_neon() with the block dimensions m and n appended as the
// last two arguments.
#define highbd_sadMxN_avg(m, n) \
|
||||
unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \
|
||||
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
|
||||
int ref_stride, const uint8_t *second_pred) { \
|
||||
return highbd_sad8_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \
|
||||
second_pred, m, n); \
|
||||
}
|
||||
HBD_SAD_WXH_AVG_NEON(8, 4)
|
||||
HBD_SAD_WXH_AVG_NEON(8, 8)
|
||||
HBD_SAD_WXH_AVG_NEON(8, 16)
|
||||
|
||||
// Defines vpx_highbd_sad<m>x<n>x4d_neon(), which fills sad_array[0..3] by
// calling vpx_highbd_sad<m>x<n>_neon() once per entry of ref_array, always
// against the same source block.
#define highbd_sadMxNx4D(m, n) \
|
||||
void vpx_highbd_sad##m##x##n##x4d_neon( \
|
||||
const uint8_t *src_ptr, int src_stride, \
|
||||
const uint8_t *const ref_array[4], int ref_stride, \
|
||||
uint32_t sad_array[4]) { \
|
||||
int i; \
|
||||
for (i = 0; i < 4; ++i) { \
|
||||
sad_array[i] = vpx_highbd_sad##m##x##n##_neon(src_ptr, src_stride, \
|
||||
ref_array[i], ref_stride); \
|
||||
} \
|
||||
}
|
||||
HBD_SAD_WXH_AVG_NEON(16, 8)
|
||||
HBD_SAD_WXH_AVG_NEON(16, 16)
|
||||
HBD_SAD_WXH_AVG_NEON(16, 32)
|
||||
|
||||
/* clang-format off */
|
||||
// 4x4
|
||||
highbd_sad4MxN(4, 4)
|
||||
highbd_sad4MxN_avg(4, 4)
|
||||
highbd_sadMxNx4D(4, 4)
|
||||
HBD_SAD_WXH_AVG_NEON(32, 16)
|
||||
HBD_SAD_WXH_AVG_NEON(32, 32)
|
||||
HBD_SAD_WXH_AVG_NEON(32, 64)
|
||||
|
||||
// 4x8
|
||||
highbd_sad4MxN(4, 8)
|
||||
highbd_sad4MxN_avg(4, 8)
|
||||
highbd_sadMxNx4D(4, 8)
|
||||
|
||||
// 8x4
|
||||
highbd_sadMxN(8, 4)
|
||||
highbd_sadMxN_avg(8, 4)
|
||||
highbd_sadMxNx4D(8, 4)
|
||||
|
||||
// 8x8
|
||||
highbd_sadMxN(8, 8)
|
||||
highbd_sadMxN_avg(8, 8)
|
||||
highbd_sadMxNx4D(8, 8)
|
||||
|
||||
// 8x16
|
||||
highbd_sadMxN(8, 16)
|
||||
highbd_sadMxN_avg(8, 16)
|
||||
highbd_sadMxNx4D(8, 16)
|
||||
|
||||
// 16x8
|
||||
highbd_sadMxN(16, 8)
|
||||
highbd_sadMxN_avg(16, 8)
|
||||
highbd_sadMxNx4D(16, 8)
|
||||
|
||||
// 16x16
|
||||
highbd_sadMxN(16, 16)
|
||||
highbd_sadMxN_avg(16, 16)
|
||||
highbd_sadMxNx4D(16, 16)
|
||||
|
||||
// 16x32
|
||||
highbd_sadMxN(16, 32)
|
||||
highbd_sadMxN_avg(16, 32)
|
||||
highbd_sadMxNx4D(16, 32)
|
||||
|
||||
// 32x16
|
||||
highbd_sadMxN(32, 16)
|
||||
highbd_sadMxN_avg(32, 16)
|
||||
highbd_sadMxNx4D(32, 16)
|
||||
|
||||
// 32x32
|
||||
highbd_sadMxN(32, 32)
|
||||
highbd_sadMxN_avg(32, 32)
|
||||
highbd_sadMxNx4D(32, 32)
|
||||
|
||||
// 32x64
|
||||
highbd_sadMxN(32, 64)
|
||||
highbd_sadMxN_avg(32, 64)
|
||||
highbd_sadMxNx4D(32, 64)
|
||||
|
||||
// 64x32
|
||||
highbd_sadMxN(64, 32)
|
||||
highbd_sadMxN_avg(64, 32)
|
||||
highbd_sadMxNx4D(64, 32)
|
||||
|
||||
// 64x64
|
||||
highbd_sadMxN(64, 64)
|
||||
highbd_sadMxN_avg(64, 64)
|
||||
highbd_sadMxNx4D(64, 64)
|
||||
/* clang-format on */
|
||||
HBD_SAD_WXH_AVG_NEON(64, 32)
|
||||
HBD_SAD_WXH_AVG_NEON(64, 64)
|
||||
|
|
|
@ -0,0 +1,594 @@
|
|||
/*
|
||||
* Copyright (c) 2023 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
#include "./vpx_config.h"
|
||||
|
||||
#include "vpx/vpx_integer.h"
|
||||
#include "vpx_dsp/arm/mem_neon.h"
|
||||
|
||||
// The bilinear filters look like this:
|
||||
//
|
||||
// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
|
||||
// { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }}
|
||||
//
|
||||
// We can factor out the highest common factor (16), such that the sum of both
|
||||
// weights will be 8 instead of 128. The benefits of this are two-fold:
|
||||
//
|
||||
// 1) We can infer the filter values from the filter_offset parameter in the
|
||||
// bilinear filter functions below - we don't have to actually load the values
|
||||
// from memory:
|
||||
// f0 = 8 - filter_offset
|
||||
// f1 = filter_offset
|
||||
//
|
||||
// 2) Scaling the pixel values by 8, instead of 128 enables us to operate on
|
||||
// 16-bit data types at all times, rather than widening out to 32-bit and
|
||||
// requiring double the number of data processing instructions. (12-bit * 8 =
|
||||
// 15-bit.)
|
||||
|
||||
// Process a block exactly 4 wide and a multiple of 2 high.
|
||||
// Each iteration filters 8 pixels and stores them contiguously in dst_ptr, then
// advances the source by two rows. Every output value is
//   (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3
// where s1 is the sample pixel_step elements after s0.
// dst_height must be a positive even number: the row counter drops by 2 per
// iteration and the loop only terminates when it reaches exactly 0.
static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
|
||||
uint16_t *dst_ptr, int src_stride,
|
||||
int pixel_step, int dst_height,
|
||||
int filter_offset) {
|
||||
// The two filter taps, broadcast to every lane; they always sum to 8.
const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
|
||||
const uint16x8_t f1 = vdupq_n_u16(filter_offset);
|
||||
|
||||
int i = dst_height;
|
||||
do {
|
||||
// NOTE(review): load_unaligned_u16q() is defined elsewhere; presumably it
// packs 4 pixels from each of two rows src_stride apart - confirm.
uint16x8_t s0 = load_unaligned_u16q(src_ptr, src_stride);
|
||||
uint16x8_t s1 = load_unaligned_u16q(src_ptr + pixel_step, src_stride);
|
||||
|
||||
uint16x8_t blend = vmulq_u16(s0, f0);
|
||||
blend = vmlaq_u16(blend, s1, f1);
|
||||
// Rounding shift right by 3 divides by the tap sum (8).
blend = vrshrq_n_u16(blend, 3);
|
||||
|
||||
vst1q_u16(dst_ptr, blend);
|
||||
|
||||
src_ptr += 2 * src_stride;
|
||||
dst_ptr += 8;
|
||||
i -= 2;
|
||||
} while (i != 0);
|
||||
}
|
||||
|
||||
// Process a block whose width is a multiple of 8 and whose height is arbitrary.
|
||||
// Bilinear filter over a dst_width x dst_height block, 8 pixels per step.
// Every output value is
//   (src[j] * (8 - filter_offset) + src[j + pixel_step] * filter_offset + 4) >> 3
// Source rows are src_stride elements apart; output rows are packed
// dst_width elements apart. dst_height must be at least 1 because the row
// loop is a do/while that decrements before testing.
static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
|
||||
uint16_t *dst_ptr,
|
||||
int src_stride, int pixel_step,
|
||||
int dst_width, int dst_height,
|
||||
int filter_offset) {
|
||||
// The two filter taps, broadcast to every lane; they always sum to 8.
const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
|
||||
const uint16x8_t f1 = vdupq_n_u16(filter_offset);
|
||||
|
||||
int i = dst_height;
|
||||
do {
|
||||
int j = 0;
|
||||
do {
|
||||
uint16x8_t s0 = vld1q_u16(src_ptr + j);
|
||||
uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
|
||||
|
||||
uint16x8_t blend = vmulq_u16(s0, f0);
|
||||
blend = vmlaq_u16(blend, s1, f1);
|
||||
// Rounding shift right by 3 divides by the tap sum (8).
blend = vrshrq_n_u16(blend, 3);
|
||||
|
||||
vst1q_u16(dst_ptr + j, blend);
|
||||
|
||||
j += 8;
|
||||
} while (j < dst_width);
|
||||
|
||||
src_ptr += src_stride;
|
||||
dst_ptr += dst_width;
|
||||
} while (--i != 0);
|
||||
}
|
||||
|
||||
// Width-8 entry point for highbd_var_filter_block2d_bil_large(). It exists so
// the variance macros can pick a filter by token-pasting the block width
// (highbd_var_filter_block2d_bil_w##w).
static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
|
||||
uint16_t *dst_ptr, int src_stride,
|
||||
int pixel_step, int dst_height,
|
||||
int filter_offset) {
|
||||
highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
|
||||
8, dst_height, filter_offset);
|
||||
}
|
||||
// Width-16 entry point for highbd_var_filter_block2d_bil_large(). It exists so
// the variance macros can pick a filter by token-pasting the block width
// (highbd_var_filter_block2d_bil_w##w).
static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
|
||||
uint16_t *dst_ptr, int src_stride,
|
||||
int pixel_step, int dst_height,
|
||||
int filter_offset) {
|
||||
highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
|
||||
16, dst_height, filter_offset);
|
||||
}
|
||||
// Width-32 entry point for highbd_var_filter_block2d_bil_large(). It exists so
// the variance macros can pick a filter by token-pasting the block width
// (highbd_var_filter_block2d_bil_w##w).
static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
|
||||
uint16_t *dst_ptr, int src_stride,
|
||||
int pixel_step, int dst_height,
|
||||
int filter_offset) {
|
||||
highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
|
||||
32, dst_height, filter_offset);
|
||||
}
|
||||
// Width-64 entry point for highbd_var_filter_block2d_bil_large(). It exists so
// the variance macros can pick a filter by token-pasting the block width
// (highbd_var_filter_block2d_bil_w##w).
static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
|
||||
uint16_t *dst_ptr, int src_stride,
|
||||
int pixel_step, int dst_height,
|
||||
int filter_offset) {
|
||||
highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
|
||||
64, dst_height, filter_offset);
|
||||
}
|
||||
|
||||
// Equal-weight special case of the bilinear filter: each output value is the
// rounding average (a + b + 1) >> 1 of src[j] and src[j + pixel_step]. The
// specialized variance macros call it when an offset equals 4.
// Source rows are src_stride elements apart; output rows are packed
// dst_width elements apart. dst_height must be at least 1.
static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
|
||||
uint16_t *dst_ptr, int src_stride,
|
||||
int pixel_step, int dst_width,
|
||||
int dst_height) {
|
||||
int i = dst_height;
|
||||
|
||||
// This specialization is only used for block widths that are a multiple of 16.
|
||||
assert(dst_width >= 16 && dst_width % 16 == 0);
|
||||
|
||||
do {
|
||||
int j = 0;
|
||||
do {
|
||||
uint16x8_t s0 = vld1q_u16(src_ptr + j);
|
||||
uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
|
||||
uint16x8_t avg = vrhaddq_u16(s0, s1);
|
||||
vst1q_u16(dst_ptr + j, avg);
|
||||
|
||||
j += 8;
|
||||
} while (j < dst_width);
|
||||
|
||||
src_ptr += src_stride;
|
||||
dst_ptr += dst_width;
|
||||
} while (--i != 0);
|
||||
}
|
||||
|
||||
// Defines vpx_highbd_<bitdepth>_sub_pixel_variance<w>x<h>_neon() without any
// per-offset fast paths. The generated function always runs two bilinear
// passes: the first (pixel_step 1, weight xoffset) writes h + padding rows to
// tmp0, the second (pixel_step w, weight yoffset) reduces them to the w x h
// block in tmp1. It returns the variance of tmp1 against ref, obtained from
// vpx_highbd_<bitdepth>_variance<w>x<h>().
#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \
|
||||
unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
|
||||
const uint8_t *src, int src_stride, int xoffset, int yoffset, \
|
||||
const uint8_t *ref, int ref_stride, uint32_t *sse) { \
|
||||
uint16_t tmp0[w * (h + padding)]; \
|
||||
uint16_t tmp1[w * h]; \
|
||||
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
|
||||
\
|
||||
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
|
||||
(h + padding), xoffset); \
|
||||
highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
|
||||
\
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
|
||||
w, ref, ref_stride, sse); \
|
||||
}
|
||||
|
||||
// Defines vpx_highbd_<bitdepth>_sub_pixel_variance<w>x<h>_neon() with a fast
// path chosen independently for each of xoffset and yoffset:
//   0     -> the pass for that offset is skipped entirely;
//   4     -> equal weights, so highbd_var_filter_block2d_avg() is used;
//   other -> the general filter highbd_var_filter_block2d_bil_w<w>().
// The xoffset pass uses pixel_step 1 and the yoffset pass uses a pixel_step of
// one row. When both passes run, the first produces h + padding rows so that
// the second has the extra row it reads. Every branch finishes by returning
// vpx_highbd_<bitdepth>_variance<w>x<h>_neon() of the filtered block vs. ref.
#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \
|
||||
unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
|
||||
const uint8_t *src, int src_stride, int xoffset, int yoffset, \
|
||||
const uint8_t *ref, int ref_stride, unsigned int *sse) { \
|
||||
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
|
||||
\
|
||||
if (xoffset == 0) { \
|
||||
if (yoffset == 0) { \
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \
|
||||
} else if (yoffset == 4) { \
|
||||
uint16_t tmp[w * h]; \
|
||||
highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
|
||||
h); \
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
|
||||
} else { \
|
||||
uint16_t tmp[w * h]; \
|
||||
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \
|
||||
src_stride, h, yoffset); \
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
|
||||
} \
|
||||
} else if (xoffset == 4) { \
|
||||
uint16_t tmp0[w * (h + padding)]; \
|
||||
if (yoffset == 0) { \
|
||||
highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
|
||||
} else if (yoffset == 4) { \
|
||||
uint16_t tmp1[w * (h + padding)]; \
|
||||
highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
|
||||
(h + padding)); \
|
||||
highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
|
||||
} else { \
|
||||
uint16_t tmp1[w * (h + padding)]; \
|
||||
highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
|
||||
(h + padding)); \
|
||||
highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
|
||||
} \
|
||||
} else { \
|
||||
uint16_t tmp0[w * (h + padding)]; \
|
||||
if (yoffset == 0) { \
|
||||
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
|
||||
xoffset); \
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
|
||||
} else if (yoffset == 4) { \
|
||||
uint16_t tmp1[w * h]; \
|
||||
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
|
||||
(h + padding), xoffset); \
|
||||
highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
|
||||
} else { \
|
||||
uint16_t tmp1[w * h]; \
|
||||
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
|
||||
(h + padding), xoffset); \
|
||||
highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
// 4x<h> blocks are processed two rows at a time, so they require an extra row
|
||||
// of padding (padding == 2 instead of 1 in the instantiations below).
|
||||
|
||||
// 8-bit
|
||||
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4, 2)
|
||||
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8, 2)
|
||||
|
||||
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4, 1)
|
||||
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8, 1)
|
||||
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16, 1)
|
||||
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32, 1)
|
||||
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64, 1)
|
||||
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64, 1)
|
||||
|
||||
// 10-bit
|
||||
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4, 2)
|
||||
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8, 2)
|
||||
|
||||
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4, 1)
|
||||
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8, 1)
|
||||
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16, 1)
|
||||
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32, 1)
|
||||
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64, 1)
|
||||
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64, 1)
|
||||
|
||||
// 12-bit
|
||||
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4, 2)
|
||||
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8, 2)
|
||||
|
||||
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4, 1)
|
||||
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8, 1)
|
||||
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16, 1)
|
||||
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32, 1)
|
||||
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64, 1)
|
||||
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64, 1)
|
||||
|
||||
// Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having
|
||||
// width 4.
|
||||
// Each iteration computes 8 bilinear-filtered pixels,
//   (s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3,
// then stores their rounding average with 8 consecutive values of second_pred.
// dst_ptr and second_pred both advance by 8 elements (2 rows of 4) per
// iteration, so second_pred is read as a packed buffer.
// dst_height must be a positive even number: the row counter drops by 2 per
// iteration and the loop only terminates when it reaches exactly 0.
static void highbd_avg_pred_var_filter_block2d_bil_w4(
|
||||
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
|
||||
int dst_height, int filter_offset, const uint16_t *second_pred) {
|
||||
// The two filter taps, broadcast to every lane; they always sum to 8.
const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
|
||||
const uint16x8_t f1 = vdupq_n_u16(filter_offset);
|
||||
|
||||
int i = dst_height;
|
||||
do {
|
||||
// NOTE(review): load_unaligned_u16q() is defined elsewhere; presumably it
// packs 4 pixels from each of two rows src_stride apart - confirm.
uint16x8_t s0 = load_unaligned_u16q(src_ptr, src_stride);
|
||||
uint16x8_t s1 = load_unaligned_u16q(src_ptr + pixel_step, src_stride);
|
||||
uint16x8_t p = vld1q_u16(second_pred);
|
||||
|
||||
uint16x8_t blend = vmulq_u16(s0, f0);
|
||||
blend = vmlaq_u16(blend, s1, f1);
|
||||
blend = vrshrq_n_u16(blend, 3);
|
||||
|
||||
vst1q_u16(dst_ptr, vrhaddq_u16(blend, p));
|
||||
|
||||
src_ptr += 2 * src_stride;
|
||||
dst_ptr += 2 * 4;
|
||||
second_pred += 2 * 4;
|
||||
i -= 2;
|
||||
} while (i != 0);
|
||||
}
|
||||
|
||||
// Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks.
|
||||
// Bilinear filter fused with averaging against second_pred, 8 pixels per step.
// Each output value is the rounding average of the filtered pixel
//   (src[j] * (8 - filter_offset) + src[j + pixel_step] * filter_offset + 4) >> 3
// and the next value of second_pred. second_pred is advanced by 8 after every
// vector and never rewound, so it is read as a packed dst_width x dst_height
// buffer. Source rows are src_stride apart; output rows are dst_width apart.
static void highbd_avg_pred_var_filter_block2d_bil_large(
|
||||
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
|
||||
int dst_width, int dst_height, int filter_offset,
|
||||
const uint16_t *second_pred) {
|
||||
// The two filter taps, broadcast to every lane; they always sum to 8.
const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
|
||||
const uint16x8_t f1 = vdupq_n_u16(filter_offset);
|
||||
|
||||
int i = dst_height;
|
||||
do {
|
||||
int j = 0;
|
||||
do {
|
||||
uint16x8_t s0 = vld1q_u16(src_ptr + j);
|
||||
uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
|
||||
uint16x8_t p = vld1q_u16(second_pred);
|
||||
|
||||
uint16x8_t blend = vmulq_u16(s0, f0);
|
||||
blend = vmlaq_u16(blend, s1, f1);
|
||||
blend = vrshrq_n_u16(blend, 3);
|
||||
|
||||
vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p));
|
||||
|
||||
j += 8;
|
||||
second_pred += 8;
|
||||
} while (j < dst_width);
|
||||
|
||||
src_ptr += src_stride;
|
||||
dst_ptr += dst_width;
|
||||
} while (--i != 0);
|
||||
}
|
||||
|
||||
// Width-8 entry point for highbd_avg_pred_var_filter_block2d_bil_large(). It
// exists so the variance macros can pick a filter by token-pasting the block
// width (..._bil_w##w).
static void highbd_avg_pred_var_filter_block2d_bil_w8(
|
||||
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
|
||||
int dst_height, int filter_offset, const uint16_t *second_pred) {
|
||||
highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
|
||||
pixel_step, 8, dst_height,
|
||||
filter_offset, second_pred);
|
||||
}
|
||||
// Width-16 entry point for highbd_avg_pred_var_filter_block2d_bil_large(). It
// exists so the variance macros can pick a filter by token-pasting the block
// width (..._bil_w##w).
static void highbd_avg_pred_var_filter_block2d_bil_w16(
|
||||
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
|
||||
int dst_height, int filter_offset, const uint16_t *second_pred) {
|
||||
highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
|
||||
pixel_step, 16, dst_height,
|
||||
filter_offset, second_pred);
|
||||
}
|
||||
// Width-32 entry point for highbd_avg_pred_var_filter_block2d_bil_large(). It
// exists so the variance macros can pick a filter by token-pasting the block
// width (..._bil_w##w).
static void highbd_avg_pred_var_filter_block2d_bil_w32(
|
||||
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
|
||||
int dst_height, int filter_offset, const uint16_t *second_pred) {
|
||||
highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
|
||||
pixel_step, 32, dst_height,
|
||||
filter_offset, second_pred);
|
||||
}
|
||||
// Width-64 entry point for highbd_avg_pred_var_filter_block2d_bil_large(). It
// exists so the variance macros can pick a filter by token-pasting the block
// width (..._bil_w##w).
static void highbd_avg_pred_var_filter_block2d_bil_w64(
|
||||
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
|
||||
int dst_height, int filter_offset, const uint16_t *second_pred) {
|
||||
highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
|
||||
pixel_step, 64, dst_height,
|
||||
filter_offset, second_pred);
|
||||
}
|
||||
|
||||
// Combine averaging subpel filter with vpx_highbd_comp_avg_pred.
|
||||
// Two chained rounding averages per output value: first src[j] with
// src[j + pixel_step], then that result with the next value of second_pred.
// second_pred is advanced by 8 after every vector and never rewound, so it is
// read as a packed dst_width x dst_height buffer. Source rows are src_stride
// apart; output rows are dst_width apart. dst_height must be at least 1.
static void highbd_avg_pred_var_filter_block2d_avg(
|
||||
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
|
||||
int dst_width, int dst_height, const uint16_t *second_pred) {
|
||||
int i = dst_height;
|
||||
|
||||
// This specialization is only used for block widths that are a multiple of 16.
|
||||
assert(dst_width >= 16 && dst_width % 16 == 0);
|
||||
|
||||
do {
|
||||
int j = 0;
|
||||
do {
|
||||
uint16x8_t s0 = vld1q_u16(src_ptr + j);
|
||||
uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
|
||||
uint16x8_t avg = vrhaddq_u16(s0, s1);
|
||||
|
||||
uint16x8_t p = vld1q_u16(second_pred);
|
||||
avg = vrhaddq_u16(avg, p);
|
||||
|
||||
vst1q_u16(dst_ptr + j, avg);
|
||||
|
||||
j += 8;
|
||||
second_pred += 8;
|
||||
} while (j < dst_width);
|
||||
|
||||
src_ptr += src_stride;
|
||||
dst_ptr += dst_width;
|
||||
} while (--i != 0);
|
||||
}
|
||||
|
||||
// Implementation of vpx_highbd_comp_avg_pred for blocks having width >= 16.
|
||||
// Writes the rounding average (a + b + 1) >> 1 of the source block and
// second_pred to dst_ptr, with no sub-pixel filtering. second_pred is advanced
// by 8 after every vector and never rewound, so it is read as a packed
// dst_width x dst_height buffer. Source rows are src_stride apart; output rows
// are dst_width apart. dst_height must be at least 1.
static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
|
||||
int src_stride, int dst_width, int dst_height,
|
||||
const uint16_t *second_pred) {
|
||||
int i = dst_height;
|
||||
|
||||
// This helper is only used for block widths that are a multiple of 16.
|
||||
assert(dst_width >= 16 && dst_width % 16 == 0);
|
||||
|
||||
do {
|
||||
int j = 0;
|
||||
do {
|
||||
uint16x8_t s = vld1q_u16(src_ptr + j);
|
||||
uint16x8_t p = vld1q_u16(second_pred);
|
||||
|
||||
uint16x8_t avg = vrhaddq_u16(s, p);
|
||||
|
||||
vst1q_u16(dst_ptr + j, avg);
|
||||
|
||||
j += 8;
|
||||
second_pred += 8;
|
||||
} while (j < dst_width);
|
||||
|
||||
src_ptr += src_stride;
|
||||
dst_ptr += dst_width;
|
||||
} while (--i != 0);
|
||||
}
|
||||
|
||||
// Defines vpx_highbd_<bitdepth>_sub_pixel_avg_variance<w>x<h>_neon() without
// any per-offset fast paths. The first pass (pixel_step 1, weight xoffset)
// writes h + padding rows to tmp0. The second pass (pixel_step w, weight
// yoffset) also averages with second_pred while producing the w x h block in
// tmp1. It returns vpx_highbd_<bitdepth>_variance<w>x<h>_neon() of tmp1
// against ref.
#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \
|
||||
uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
|
||||
const uint8_t *src, int src_stride, int xoffset, int yoffset, \
|
||||
const uint8_t *ref, int ref_stride, uint32_t *sse, \
|
||||
const uint8_t *second_pred) { \
|
||||
uint16_t tmp0[w * (h + padding)]; \
|
||||
uint16_t tmp1[w * h]; \
|
||||
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
|
||||
\
|
||||
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
|
||||
(h + padding), xoffset); \
|
||||
highbd_avg_pred_var_filter_block2d_bil_w##w( \
|
||||
tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
|
||||
\
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
|
||||
}
|
||||
|
||||
// Defines vpx_highbd_<bitdepth>_sub_pixel_avg_variance<w>x<h>_neon() with a
// fast path chosen independently for each of xoffset and yoffset:
//   0     -> the pass for that offset is skipped;
//   4     -> equal weights, so an averaging helper is used;
//   other -> the general bilinear filter for width <w>.
// In every branch the averaging with second_pred is fused into the last pass
// that runs (or done alone by highbd_avg_pred() when both offsets are 0).
// When both passes run, the first produces h + padding rows so that the second
// has the extra row it reads. Every branch finishes by returning
// vpx_highbd_<bitdepth>_variance<w>x<h>_neon() of the result against ref.
#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \
|
||||
unsigned int vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
|
||||
const uint8_t *src, int source_stride, int xoffset, int yoffset, \
|
||||
const uint8_t *ref, int ref_stride, unsigned int *sse, \
|
||||
const uint8_t *second_pred) { \
|
||||
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
|
||||
\
|
||||
if (xoffset == 0) { \
|
||||
uint16_t tmp[w * h]; \
|
||||
if (yoffset == 0) { \
|
||||
highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \
|
||||
CONVERT_TO_SHORTPTR(second_pred)); \
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
|
||||
} else if (yoffset == 4) { \
|
||||
highbd_avg_pred_var_filter_block2d_avg( \
|
||||
src_ptr, tmp, source_stride, source_stride, w, h, \
|
||||
CONVERT_TO_SHORTPTR(second_pred)); \
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
|
||||
} else { \
|
||||
highbd_avg_pred_var_filter_block2d_bil_w##w( \
|
||||
src_ptr, tmp, source_stride, source_stride, h, yoffset, \
|
||||
CONVERT_TO_SHORTPTR(second_pred)); \
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
|
||||
} \
|
||||
} else if (xoffset == 4) { \
|
||||
uint16_t tmp0[w * (h + padding)]; \
|
||||
if (yoffset == 0) { \
|
||||
highbd_avg_pred_var_filter_block2d_avg( \
|
||||
src_ptr, tmp0, source_stride, 1, w, h, \
|
||||
CONVERT_TO_SHORTPTR(second_pred)); \
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
|
||||
} else if (yoffset == 4) { \
|
||||
uint16_t tmp1[w * (h + padding)]; \
|
||||
highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
|
||||
(h + padding)); \
|
||||
highbd_avg_pred_var_filter_block2d_avg( \
|
||||
tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
|
||||
} else { \
|
||||
uint16_t tmp1[w * (h + padding)]; \
|
||||
highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
|
||||
(h + padding)); \
|
||||
highbd_avg_pred_var_filter_block2d_bil_w##w( \
|
||||
tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
|
||||
} \
|
||||
} else { \
|
||||
uint16_t tmp0[w * (h + padding)]; \
|
||||
if (yoffset == 0) { \
|
||||
highbd_avg_pred_var_filter_block2d_bil_w##w( \
|
||||
src_ptr, tmp0, source_stride, 1, h, xoffset, \
|
||||
CONVERT_TO_SHORTPTR(second_pred)); \
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
|
||||
} else if (yoffset == 4) { \
|
||||
uint16_t tmp1[w * h]; \
|
||||
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
|
||||
(h + padding), xoffset); \
|
||||
highbd_avg_pred_var_filter_block2d_avg( \
|
||||
tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
|
||||
} else { \
|
||||
uint16_t tmp1[w * h]; \
|
||||
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
|
||||
(h + padding), xoffset); \
|
||||
highbd_avg_pred_var_filter_block2d_bil_w##w( \
|
||||
tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
|
||||
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
|
||||
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
// 4x<h> blocks are processed two rows at a time, so they require an extra row
|
||||
// of padding (padding == 2 instead of 1 in the instantiations below).
|
||||
|
||||
// 8-bit
|
||||
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4, 2)
|
||||
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8, 2)
|
||||
|
||||
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4, 1)
|
||||
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8, 1)
|
||||
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16, 1)
|
||||
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32, 1)
|
||||
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64, 1)
|
||||
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64, 1)
|
||||
|
||||
// 10-bit
|
||||
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4, 2)
|
||||
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8, 2)
|
||||
|
||||
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4, 1)
|
||||
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8, 1)
|
||||
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16, 1)
|
||||
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32, 1)
|
||||
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64, 1)
|
||||
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64, 1)
|
||||
|
||||
// 12-bit
|
||||
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4, 2)
|
||||
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8, 2)
|
||||
|
||||
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4, 1)
|
||||
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8, 1)
|
||||
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16, 1)
|
||||
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32, 1)
|
||||
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64, 1)
|
||||
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32, 1)
|
||||
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64, 1)
|
|
@ -18,11 +18,6 @@
|
|||
#include "vpx_dsp/arm/sum_neon.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
static const uint8_t bilinear_filters[8][2] = {
|
||||
{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
|
||||
{ 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
|
||||
};
|
||||
|
||||
static INLINE void highbd_variance16(const uint16_t *src_ptr, int src_stride,
|
||||
const uint16_t *ref_ptr, int ref_stride,
|
||||
int w, int h, uint64_t *sse,
|
||||
|
@ -136,7 +131,7 @@ static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride,
|
|||
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
|
||||
}
|
||||
|
||||
#define HIGHBD_VAR(W, H) \
|
||||
#define HBD_VARIANCE_WXH_NEON(W, H) \
|
||||
uint32_t vpx_highbd_8_variance##W##x##H##_neon( \
|
||||
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
|
||||
int ref_stride, uint32_t *sse) { \
|
||||
|
@ -218,274 +213,19 @@ static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride,
|
|||
return *sse; \
|
||||
}
|
||||
|
||||
// Bilinear filter pass that reads the source through CONVERT_TO_SHORTPTR().
// Each output value is
//   (src[j] * filter[0] + src[j + pixel_step] * filter[1]
//    + (1 << (FILTER_BITS - 1))) >> FILTER_BITS
// computed with 32-bit intermediates and narrowed back to 16 bits.
// Source rows are src_pixels_per_line elements apart; output rows are packed
// output_width elements apart. Widths of 8 or more are handled 8 pixels at a
// time, narrower blocks 4 at a time (output_width must be at least 4).
static INLINE void highbd_var_filter_block2d_bil_first_pass(
|
||||
const uint8_t *src_ptr8, uint16_t *output_ptr,
|
||||
unsigned int src_pixels_per_line, int pixel_step,
|
||||
unsigned int output_height, unsigned int output_width,
|
||||
const uint8_t *filter) {
|
||||
uint32_t i, j;
|
||||
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
|
||||
|
||||
// Rounding term added before the shift: 1 << (FILTER_BITS - 1) in every lane.
uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1);
|
||||
uint16x4_t filter1_u16 = vdup_n_u16(filter[0]);
|
||||
uint16x4_t filter2_u16 = vdup_n_u16(filter[1]);
|
||||
|
||||
if (output_width >= 8) {
|
||||
for (i = 0; i < output_height; ++i) {
|
||||
for (j = 0; j < output_width; j += 8) {
|
||||
const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]);
|
||||
const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]);
|
||||
// Widening multiplies: the low and high halves are accumulated separately.
uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16));
|
||||
uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16));
|
||||
uint16x4_t out1_u16;
|
||||
uint16x4_t out2_u16;
|
||||
sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16));
|
||||
sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16));
|
||||
out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS);
|
||||
out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS);
|
||||
vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16));
|
||||
}
|
||||
// Next row...
|
||||
src_ptr += src_pixels_per_line;
|
||||
output_ptr += output_width;
|
||||
}
|
||||
} else {
|
||||
assert(output_width >= 4);
|
||||
for (i = 0; i < output_height; ++i) {
|
||||
for (j = 0; j < output_width; j += 4) {
|
||||
const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]);
|
||||
const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]);
|
||||
uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16);
|
||||
uint16x4_t out_u16;
|
||||
sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16);
|
||||
out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS);
|
||||
vst1_u16(&output_ptr[j], out_u16);
|
||||
}
|
||||
// Next row...
|
||||
src_ptr += src_pixels_per_line;
|
||||
output_ptr += output_width;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Bilinear filter pass that reads 16-bit samples directly from src_ptr.
// Each output value is
//   (src[j] * filter[0] + src[j + pixel_step] * filter[1]
//    + (1 << (FILTER_BITS - 1))) >> FILTER_BITS
// computed with 32-bit intermediates and narrowed back to 16 bits.
// Source rows are src_pixels_per_line elements apart; output rows are packed
// output_width elements apart. Widths of 8 or more are handled 8 pixels at a
// time, narrower blocks 4 at a time (output_width must be at least 4).
static INLINE void highbd_var_filter_block2d_bil_second_pass(
|
||||
const uint16_t *src_ptr, uint16_t *output_ptr,
|
||||
unsigned int src_pixels_per_line, unsigned int pixel_step,
|
||||
unsigned int output_height, unsigned int output_width,
|
||||
const uint8_t *filter) {
|
||||
uint32_t i, j;
|
||||
|
||||
// Rounding term added before the shift: 1 << (FILTER_BITS - 1) in every lane.
uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1);
|
||||
uint16x4_t filter1_u16 = vdup_n_u16(filter[0]);
|
||||
uint16x4_t filter2_u16 = vdup_n_u16(filter[1]);
|
||||
|
||||
if (output_width >= 8) {
|
||||
for (i = 0; i < output_height; ++i) {
|
||||
for (j = 0; j < output_width; j += 8) {
|
||||
const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]);
|
||||
const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]);
|
||||
// Widening multiplies: the low and high halves are accumulated separately.
uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16));
|
||||
uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16));
|
||||
uint16x4_t out1_u16;
|
||||
uint16x4_t out2_u16;
|
||||
sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16));
|
||||
sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16));
|
||||
out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS);
|
||||
out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS);
|
||||
vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16));
|
||||
}
|
||||
// Next row...
|
||||
src_ptr += src_pixels_per_line;
|
||||
output_ptr += output_width;
|
||||
}
|
||||
} else {
|
||||
assert(output_width >= 4);
|
||||
for (i = 0; i < output_height; ++i) {
|
||||
for (j = 0; j < output_width; j += 4) {
|
||||
const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]);
|
||||
const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]);
|
||||
uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16);
|
||||
uint16x4_t out_u16;
|
||||
sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16);
|
||||
out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS);
|
||||
vst1_u16(&output_ptr[j], out_u16);
|
||||
}
|
||||
// Next row...
|
||||
src_ptr += src_pixels_per_line;
|
||||
output_ptr += output_width;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Emits vpx_highbd_<BD>_sub_pixel_variance<W>x<H>_neon for one bit depth.
//
// The generated function runs the source block through the first-pass filter
// selected by x_offset (H + 1 rows, written to a W-wide scratch buffer), then
// through the second-pass filter selected by y_offset (H rows, with a pixel
// step of W), and finally hands the filtered block, with stride W, to the
// matching vpx_highbd_<BD>_variance<W>x<H>_neon together with the reference
// block. Its return value and *sse come straight from that variance call.
#define HIGHBD_SUBPIX_VAR_BD(BD, W, H)                                       \
  uint32_t vpx_highbd_##BD##_sub_pixel_variance##W##x##H##_neon(             \
      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,    \
      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) {               \
    uint16_t first_pass_out[(H + 1) * W];                                    \
    uint16_t filtered[H * W];                                                \
                                                                             \
    highbd_var_filter_block2d_bil_first_pass(src_ptr, first_pass_out,        \
                                             src_stride, 1, H + 1, W,        \
                                             bilinear_filters[x_offset]);    \
    highbd_var_filter_block2d_bil_second_pass(first_pass_out, filtered, W,   \
                                              W, H, W,                       \
                                              bilinear_filters[y_offset]);   \
                                                                             \
    return vpx_highbd_##BD##_variance##W##x##H##_neon(                       \
        CONVERT_TO_BYTEPTR(filtered), W, ref_ptr, ref_stride, sse);          \
  }

// Sub-pixel variance entry points for a WxH block at 8, 10 and 12 bits.
#define HIGHBD_SUBPIX_VAR(W, H)  \
  HIGHBD_SUBPIX_VAR_BD(8, W, H)  \
  HIGHBD_SUBPIX_VAR_BD(10, W, H) \
  HIGHBD_SUBPIX_VAR_BD(12, W, H)
|
||||
|
||||
// Emits vpx_highbd_<BD>_sub_pixel_avg_variance<W>x<H>_neon for one bit depth.
//
// The generated function filters the source block exactly like the plain
// sub-pixel variance (first pass selected by x_offset over H + 1 rows, second
// pass selected by y_offset over H rows), then averages the filtered block
// with second_pred via vpx_highbd_comp_avg_pred_neon into a 16-byte aligned
// scratch buffer, and passes that buffer, with stride W, to the matching
// vpx_highbd_<BD>_variance<W>x<H>_neon together with the reference block.
#define HIGHBD_SUBPIX_AVG_VAR_BD(BD, W, H)                                   \
  uint32_t vpx_highbd_##BD##_sub_pixel_avg_variance##W##x##H##_neon(         \
      const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,    \
      const uint8_t *ref_ptr, int ref_stride, uint32_t *sse,                 \
      const uint8_t *second_pred) {                                          \
    uint16_t first_pass_out[(H + 1) * W];                                    \
    uint16_t filtered[H * W];                                                \
    DECLARE_ALIGNED(16, uint16_t, averaged[H * W]);                          \
                                                                             \
    highbd_var_filter_block2d_bil_first_pass(src_ptr, first_pass_out,        \
                                             src_stride, 1, H + 1, W,        \
                                             bilinear_filters[x_offset]);    \
    highbd_var_filter_block2d_bil_second_pass(first_pass_out, filtered, W,   \
                                              W, H, W,                       \
                                              bilinear_filters[y_offset]);   \
                                                                             \
    vpx_highbd_comp_avg_pred_neon(averaged,                                  \
                                  CONVERT_TO_SHORTPTR(second_pred), W, H,    \
                                  filtered, W);                              \
                                                                             \
    return vpx_highbd_##BD##_variance##W##x##H##_neon(                       \
        CONVERT_TO_BYTEPTR(averaged), W, ref_ptr, ref_stride, sse);          \
  }

// Sub-pixel averaging variance entry points for a WxH block at 8, 10 and 12
// bits.
#define HIGHBD_SUBPIX_AVG_VAR(W, H)  \
  HIGHBD_SUBPIX_AVG_VAR_BD(8, W, H)  \
  HIGHBD_SUBPIX_AVG_VAR_BD(10, W, H) \
  HIGHBD_SUBPIX_AVG_VAR_BD(12, W, H)
|
||||
|
||||
// Rounded average of two 16-bit prediction blocks:
//
//   comp_pred[x] = (pred[x] + ref[x] + 1) >> 1
//
// comp_pred and pred are densely packed (both advance by `width` per row);
// ref advances by ref_stride per row.
//
// Rows are processed 8 pixels at a time when width >= 8 and 4 at a time
// otherwise. Neither loop has tail handling, so whole vectors are always read
// and written. Widths below 4 trip the assert.
//
// The average uses the NEON rounding halving add, which evaluates
// (a + b + 1) >> 1 without intermediate overflow. It therefore yields the
// same values as widening to 32 bits, adding 1, shifting and narrowing.
void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred,
                                   int width, int height, const uint16_t *ref,
                                   int ref_stride) {
  int i, j;

  if (width >= 8) {
    for (i = 0; i < height; ++i) {
      for (j = 0; j < width; j += 8) {
        const uint16x8_t pred_u16 = vld1q_u16(&pred[j]);
        const uint16x8_t ref_u16 = vld1q_u16(&ref[j]);
        vst1q_u16(&comp_pred[j], vrhaddq_u16(pred_u16, ref_u16));
      }
      comp_pred += width;
      pred += width;
      ref += ref_stride;
    }
  } else {
    assert(width >= 4);
    for (i = 0; i < height; ++i) {
      for (j = 0; j < width; j += 4) {
        const uint16x4_t pred_u16 = vld1_u16(&pred[j]);
        const uint16x4_t ref_u16 = vld1_u16(&ref[j]);
        vst1_u16(&comp_pred[j], vrhadd_u16(pred_u16, ref_u16));
      }
      comp_pred += width;
      pred += width;
      ref += ref_stride;
    }
  }
}
|
||||
|
||||
/* All three forms of the variance are available in the same sizes. */
|
||||
#define HIGHBD_VARIANCES(W, H) \
|
||||
HIGHBD_VAR(W, H) \
|
||||
HIGHBD_SUBPIX_VAR(W, H) \
|
||||
HIGHBD_SUBPIX_AVG_VAR(W, H)
|
||||
|
||||
HIGHBD_VARIANCES(64, 64)
|
||||
HIGHBD_VARIANCES(64, 32)
|
||||
HIGHBD_VARIANCES(32, 64)
|
||||
HIGHBD_VARIANCES(32, 32)
|
||||
HIGHBD_VARIANCES(32, 16)
|
||||
HIGHBD_VARIANCES(16, 32)
|
||||
HIGHBD_VARIANCES(16, 16)
|
||||
HIGHBD_VARIANCES(16, 8)
|
||||
HIGHBD_VARIANCES(8, 16)
|
||||
HIGHBD_VARIANCES(8, 8)
|
||||
HIGHBD_VARIANCES(8, 4)
|
||||
HIGHBD_VARIANCES(4, 8)
|
||||
HIGHBD_VARIANCES(4, 4)
|
||||
HBD_VARIANCE_WXH_NEON(64, 64)
|
||||
HBD_VARIANCE_WXH_NEON(64, 32)
|
||||
HBD_VARIANCE_WXH_NEON(32, 64)
|
||||
HBD_VARIANCE_WXH_NEON(32, 32)
|
||||
HBD_VARIANCE_WXH_NEON(32, 16)
|
||||
HBD_VARIANCE_WXH_NEON(16, 32)
|
||||
HBD_VARIANCE_WXH_NEON(16, 16)
|
||||
HBD_VARIANCE_WXH_NEON(16, 8)
|
||||
HBD_VARIANCE_WXH_NEON(8, 16)
|
||||
HBD_VARIANCE_WXH_NEON(8, 8)
|
||||
HBD_VARIANCE_WXH_NEON(8, 4)
|
||||
HBD_VARIANCE_WXH_NEON(4, 8)
|
||||
HBD_VARIANCE_WXH_NEON(4, 4)
|
||||
|
||||
HIGHBD_GET_VAR(8)
|
||||
HIGHBD_GET_VAR(16)
|
||||
|
|
|
@ -26,76 +26,88 @@ void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
|
|||
(void)bd;
|
||||
|
||||
if (w < 8) { // copy4
|
||||
uint16x4_t s0, s1;
|
||||
do {
|
||||
vst1_u16(dst, vld1_u16(src));
|
||||
s0 = vld1_u16(src);
|
||||
src += src_stride;
|
||||
s1 = vld1_u16(src);
|
||||
src += src_stride;
|
||||
|
||||
vst1_u16(dst, s0);
|
||||
dst += dst_stride;
|
||||
vst1_u16(dst, vld1_u16(src));
|
||||
src += src_stride;
|
||||
vst1_u16(dst, s1);
|
||||
dst += dst_stride;
|
||||
h -= 2;
|
||||
} while (h > 0);
|
||||
} while (h != 0);
|
||||
} else if (w == 8) { // copy8
|
||||
uint16x8_t s0, s1;
|
||||
do {
|
||||
vst1q_u16(dst, vld1q_u16(src));
|
||||
s0 = vld1q_u16(src);
|
||||
src += src_stride;
|
||||
s1 = vld1q_u16(src);
|
||||
src += src_stride;
|
||||
|
||||
vst1q_u16(dst, s0);
|
||||
dst += dst_stride;
|
||||
vst1q_u16(dst, vld1q_u16(src));
|
||||
src += src_stride;
|
||||
vst1q_u16(dst, s1);
|
||||
dst += dst_stride;
|
||||
h -= 2;
|
||||
} while (h > 0);
|
||||
} while (h != 0);
|
||||
} else if (w < 32) { // copy16
|
||||
uint16x8_t s0, s1, s2, s3;
|
||||
do {
|
||||
vst2q_u16(dst, vld2q_u16(src));
|
||||
s0 = vld1q_u16(src);
|
||||
s1 = vld1q_u16(src + 8);
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
vst2q_u16(dst, vld2q_u16(src));
|
||||
s2 = vld1q_u16(src);
|
||||
s3 = vld1q_u16(src + 8);
|
||||
src += src_stride;
|
||||
|
||||
vst1q_u16(dst, s0);
|
||||
vst1q_u16(dst + 8, s1);
|
||||
dst += dst_stride;
|
||||
vst2q_u16(dst, vld2q_u16(src));
|
||||
src += src_stride;
|
||||
vst1q_u16(dst, s2);
|
||||
vst1q_u16(dst + 8, s3);
|
||||
dst += dst_stride;
|
||||
vst2q_u16(dst, vld2q_u16(src));
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
h -= 4;
|
||||
} while (h > 0);
|
||||
h -= 2;
|
||||
} while (h != 0);
|
||||
} else if (w == 32) { // copy32
|
||||
uint16x8_t s0, s1, s2, s3;
|
||||
do {
|
||||
vst4q_u16(dst, vld4q_u16(src));
|
||||
s0 = vld1q_u16(src);
|
||||
s1 = vld1q_u16(src + 8);
|
||||
s2 = vld1q_u16(src + 16);
|
||||
s3 = vld1q_u16(src + 24);
|
||||
src += src_stride;
|
||||
|
||||
vst1q_u16(dst, s0);
|
||||
vst1q_u16(dst + 8, s1);
|
||||
vst1q_u16(dst + 16, s2);
|
||||
vst1q_u16(dst + 24, s3);
|
||||
dst += dst_stride;
|
||||
vst4q_u16(dst, vld4q_u16(src));
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
vst4q_u16(dst, vld4q_u16(src));
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
vst4q_u16(dst, vld4q_u16(src));
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
h -= 4;
|
||||
} while (h > 0);
|
||||
} while (--h != 0);
|
||||
} else { // copy64
|
||||
uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
|
||||
do {
|
||||
vst4q_u16(dst, vld4q_u16(src));
|
||||
vst4q_u16(dst + 32, vld4q_u16(src + 32));
|
||||
s0 = vld1q_u16(src);
|
||||
s1 = vld1q_u16(src + 8);
|
||||
s2 = vld1q_u16(src + 16);
|
||||
s3 = vld1q_u16(src + 24);
|
||||
s4 = vld1q_u16(src + 32);
|
||||
s5 = vld1q_u16(src + 40);
|
||||
s6 = vld1q_u16(src + 48);
|
||||
s7 = vld1q_u16(src + 56);
|
||||
src += src_stride;
|
||||
|
||||
vst1q_u16(dst, s0);
|
||||
vst1q_u16(dst + 8, s1);
|
||||
vst1q_u16(dst + 16, s2);
|
||||
vst1q_u16(dst + 24, s3);
|
||||
vst1q_u16(dst + 32, s4);
|
||||
vst1q_u16(dst + 40, s5);
|
||||
vst1q_u16(dst + 48, s6);
|
||||
vst1q_u16(dst + 56, s7);
|
||||
dst += dst_stride;
|
||||
vst4q_u16(dst, vld4q_u16(src));
|
||||
vst4q_u16(dst + 32, vld4q_u16(src + 32));
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
vst4q_u16(dst, vld4q_u16(src));
|
||||
vst4q_u16(dst + 32, vld4q_u16(src + 32));
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
vst4q_u16(dst, vld4q_u16(src));
|
||||
vst4q_u16(dst + 32, vld4q_u16(src + 32));
|
||||
src += src_stride;
|
||||
dst += dst_stride;
|
||||
h -= 4;
|
||||
} while (h > 0);
|
||||
} while (--h != 0);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -126,6 +126,20 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
|
|||
return vreinterpret_u8_u32(a_u32);
|
||||
}
|
||||
|
||||
// Load two rows of 8 bytes each (four uint16_t values per row) into a single
// 128-bit vector: the row at buf fills the low 64 bits and the row at
// buf + stride fills the high 64 bits. Each row is fetched with memcpy, so no
// alignment beyond what memcpy needs is assumed.
static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf,
                                             ptrdiff_t stride) {
  uint64_t row0, row1;
  // A stride of 4 elements means the two rows are contiguous in memory, so a
  // single 16-byte load covers both.
  if (stride == 4) return vld1q_u16(buf);
  memcpy(&row0, buf, 8);
  memcpy(&row1, buf + stride, 8);
  return vreinterpretq_u16_u64(
      vcombine_u64(vcreate_u64(row0), vcreate_u64(row1)));
}
|
||||
|
||||
// Store 2 sets of 4 bytes when alignment is not guaranteed.
|
||||
static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
|
||||
const uint8x8_t a) {
|
||||
|
|
|
@ -17,633 +17,316 @@
|
|||
#include "vpx_dsp/arm/mem_neon.h"
|
||||
#include "vpx_dsp/arm/sum_neon.h"
|
||||
|
||||
static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0,
|
||||
const void *const buf1) {
|
||||
uint32_t a;
|
||||
uint32x2_t aa;
|
||||
memcpy(&a, buf0, 4);
|
||||
aa = vdup_n_u32(a);
|
||||
memcpy(&a, buf1, 4);
|
||||
aa = vset_lane_u32(a, aa, 1);
|
||||
return vreinterpret_u8_u32(aa);
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
|
||||
uint32x4_t *const sad_sum) {
|
||||
uint8x16_t abs_diff = vabdq_u8(src, ref);
|
||||
*sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
|
||||
}
|
||||
|
||||
static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride,
|
||||
const uint8_t *const ref_array[4],
|
||||
const int ref_stride, const int height,
|
||||
uint32_t sad_array[4]) {
|
||||
int i;
|
||||
uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
|
||||
#if !defined(__aarch64__)
|
||||
uint16x4_t a[2];
|
||||
#endif
|
||||
uint32x4_t r;
|
||||
static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
|
||||
const uint8_t *const ref[4], int ref_stride,
|
||||
uint32_t res[4], int h) {
|
||||
uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
|
||||
vdupq_n_u32(0) };
|
||||
uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
|
||||
vdupq_n_u32(0) };
|
||||
uint32x4_t sum[4];
|
||||
|
||||
assert(!((intptr_t)src_ptr % sizeof(uint32_t)));
|
||||
assert(!(src_stride % sizeof(uint32_t)));
|
||||
int i = 0;
|
||||
do {
|
||||
uint8x16_t s0, s1, s2, s3;
|
||||
|
||||
for (i = 0; i < height; ++i) {
|
||||
const uint8x8_t s = vreinterpret_u8_u32(
|
||||
vld1_dup_u32((const uint32_t *)(src_ptr + i * src_stride)));
|
||||
const uint8x8_t ref01 = load_unaligned_2_buffers(
|
||||
ref_array[0] + i * ref_stride, ref_array[1] + i * ref_stride);
|
||||
const uint8x8_t ref23 = load_unaligned_2_buffers(
|
||||
ref_array[2] + i * ref_stride, ref_array[3] + i * ref_stride);
|
||||
abs[0] = vabal_u8(abs[0], s, ref01);
|
||||
abs[1] = vabal_u8(abs[1], s, ref23);
|
||||
}
|
||||
s0 = vld1q_u8(src + i * src_stride);
|
||||
sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
|
||||
sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
|
||||
sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
|
||||
sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
|
||||
|
||||
#if defined(__aarch64__)
|
||||
abs[0] = vpaddq_u16(abs[0], abs[1]);
|
||||
r = vpaddlq_u16(abs[0]);
|
||||
#else
|
||||
a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0]));
|
||||
a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1]));
|
||||
r = vpaddlq_u16(vcombine_u16(a[0], a[1]));
|
||||
#endif
|
||||
vst1q_u32(sad_array, r);
|
||||
s1 = vld1q_u8(src + i * src_stride + 16);
|
||||
sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
|
||||
sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
|
||||
sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
|
||||
sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
|
||||
|
||||
s2 = vld1q_u8(src + i * src_stride + 32);
|
||||
sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
|
||||
sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
|
||||
sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
|
||||
sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
|
||||
|
||||
s3 = vld1q_u8(src + i * src_stride + 48);
|
||||
sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
|
||||
sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
|
||||
sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
|
||||
sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
|
||||
|
||||
i++;
|
||||
} while (i < h);
|
||||
|
||||
sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
|
||||
sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
|
||||
sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
|
||||
sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
|
||||
|
||||
vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
|
||||
}
|
||||
|
||||
void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
uint32_t sad_array[4]) {
|
||||
sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, sad_array);
|
||||
static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
|
||||
const uint8_t *const ref[4], int ref_stride,
|
||||
uint32_t res[4], int h) {
|
||||
uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
|
||||
vdupq_n_u32(0) };
|
||||
uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
|
||||
vdupq_n_u32(0) };
|
||||
uint32x4_t sum[4];
|
||||
|
||||
int i = 0;
|
||||
do {
|
||||
uint8x16_t s0, s1;
|
||||
|
||||
s0 = vld1q_u8(src + i * src_stride);
|
||||
sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
|
||||
sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
|
||||
sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
|
||||
sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
|
||||
|
||||
s1 = vld1q_u8(src + i * src_stride + 16);
|
||||
sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
|
||||
sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
|
||||
sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
|
||||
sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
|
||||
|
||||
i++;
|
||||
} while (i < h);
|
||||
|
||||
sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
|
||||
sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
|
||||
sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
|
||||
sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
|
||||
|
||||
vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
|
||||
}
|
||||
|
||||
void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
uint32_t sad_array[4]) {
|
||||
sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, sad_array);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Can handle 512 pixels' sad sum (such as 16x32 or 32x16)
|
||||
static INLINE void sad_512_pel_final_neon(const uint16x8_t sum[4],
|
||||
uint32_t sad_array[4]) {
|
||||
#if defined(__aarch64__)
|
||||
const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
|
||||
const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
|
||||
const uint16x8_t b0 = vpaddq_u16(a0, a1);
|
||||
const uint32x4_t r = vpaddlq_u16(b0);
|
||||
#else
|
||||
const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
|
||||
const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
|
||||
const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
|
||||
const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
|
||||
const uint16x4_t b0 = vpadd_u16(a0, a1);
|
||||
const uint16x4_t b1 = vpadd_u16(a2, a3);
|
||||
const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1));
|
||||
#endif
|
||||
vst1q_u32(sad_array, r);
|
||||
}
|
||||
|
||||
#if defined(__arm__) || !defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
// Can handle 1024 pixels' sad sum (such as 32x32)
|
||||
static INLINE void sad_1024_pel_final_neon(const uint16x8_t sum[4],
|
||||
uint32_t sad_array[4]) {
|
||||
#if defined(__aarch64__)
|
||||
const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
|
||||
const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
|
||||
const uint32x4_t b0 = vpaddlq_u16(a0);
|
||||
const uint32x4_t b1 = vpaddlq_u16(a1);
|
||||
const uint32x4_t r = vpaddq_u32(b0, b1);
|
||||
vst1q_u32(sad_array, r);
|
||||
#else
|
||||
const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
|
||||
const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
|
||||
const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
|
||||
const uint16x4_t a3 = vpadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
|
||||
const uint32x4_t b0 = vpaddlq_u16(vcombine_u16(a0, a1));
|
||||
const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3));
|
||||
const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0));
|
||||
const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1));
|
||||
vst1q_u32(sad_array, vcombine_u32(c0, c1));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Can handle 2048 pixels' sad sum (such as 32x64 or 64x32)
|
||||
static INLINE void sad_2048_pel_final_neon(const uint16x8_t sum[4],
|
||||
uint32_t sad_array[4]) {
|
||||
#if defined(__aarch64__)
|
||||
const uint32x4_t a0 = vpaddlq_u16(sum[0]);
|
||||
const uint32x4_t a1 = vpaddlq_u16(sum[1]);
|
||||
const uint32x4_t a2 = vpaddlq_u16(sum[2]);
|
||||
const uint32x4_t a3 = vpaddlq_u16(sum[3]);
|
||||
const uint32x4_t b0 = vpaddq_u32(a0, a1);
|
||||
const uint32x4_t b1 = vpaddq_u32(a2, a3);
|
||||
const uint32x4_t r = vpaddq_u32(b0, b1);
|
||||
vst1q_u32(sad_array, r);
|
||||
#else
|
||||
const uint32x4_t a0 = vpaddlq_u16(sum[0]);
|
||||
const uint32x4_t a1 = vpaddlq_u16(sum[1]);
|
||||
const uint32x4_t a2 = vpaddlq_u16(sum[2]);
|
||||
const uint32x4_t a3 = vpaddlq_u16(sum[3]);
|
||||
const uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0));
|
||||
const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1));
|
||||
const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2));
|
||||
const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3));
|
||||
const uint32x2_t c0 = vpadd_u32(b0, b1);
|
||||
const uint32x2_t c1 = vpadd_u32(b2, b3);
|
||||
vst1q_u32(sad_array, vcombine_u32(c0, c1));
|
||||
#endif
|
||||
}
|
||||
|
||||
// Can handle 4096 pixels' sad sum (such as 64x64)
|
||||
static INLINE void sad_4096_pel_final_neon(const uint16x8_t sum[8],
|
||||
uint32_t sad_array[4]) {
|
||||
#if defined(__aarch64__)
|
||||
const uint32x4_t a0 = vpaddlq_u16(sum[0]);
|
||||
const uint32x4_t a1 = vpaddlq_u16(sum[1]);
|
||||
const uint32x4_t a2 = vpaddlq_u16(sum[2]);
|
||||
const uint32x4_t a3 = vpaddlq_u16(sum[3]);
|
||||
const uint32x4_t a4 = vpaddlq_u16(sum[4]);
|
||||
const uint32x4_t a5 = vpaddlq_u16(sum[5]);
|
||||
const uint32x4_t a6 = vpaddlq_u16(sum[6]);
|
||||
const uint32x4_t a7 = vpaddlq_u16(sum[7]);
|
||||
const uint32x4_t b0 = vaddq_u32(a0, a1);
|
||||
const uint32x4_t b1 = vaddq_u32(a2, a3);
|
||||
const uint32x4_t b2 = vaddq_u32(a4, a5);
|
||||
const uint32x4_t b3 = vaddq_u32(a6, a7);
|
||||
const uint32x4_t c0 = vpaddq_u32(b0, b1);
|
||||
const uint32x4_t c1 = vpaddq_u32(b2, b3);
|
||||
const uint32x4_t r = vpaddq_u32(c0, c1);
|
||||
vst1q_u32(sad_array, r);
|
||||
#else
|
||||
const uint32x4_t a0 = vpaddlq_u16(sum[0]);
|
||||
const uint32x4_t a1 = vpaddlq_u16(sum[1]);
|
||||
const uint32x4_t a2 = vpaddlq_u16(sum[2]);
|
||||
const uint32x4_t a3 = vpaddlq_u16(sum[3]);
|
||||
const uint32x4_t a4 = vpaddlq_u16(sum[4]);
|
||||
const uint32x4_t a5 = vpaddlq_u16(sum[5]);
|
||||
const uint32x4_t a6 = vpaddlq_u16(sum[6]);
|
||||
const uint32x4_t a7 = vpaddlq_u16(sum[7]);
|
||||
const uint32x4_t b0 = vaddq_u32(a0, a1);
|
||||
const uint32x4_t b1 = vaddq_u32(a2, a3);
|
||||
const uint32x4_t b2 = vaddq_u32(a4, a5);
|
||||
const uint32x4_t b3 = vaddq_u32(a6, a7);
|
||||
const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));
|
||||
const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));
|
||||
const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));
|
||||
const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));
|
||||
const uint32x2_t d0 = vpadd_u32(c0, c1);
|
||||
const uint32x2_t d1 = vpadd_u32(c2, c3);
|
||||
vst1q_u32(sad_array, vcombine_u32(d0, d1));
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
uint32_t sad_array[4], const int height) {
|
||||
int i, j;
|
||||
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
|
||||
ref_array[3] };
|
||||
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
|
||||
vdupq_n_u16(0) };
|
||||
|
||||
for (i = 0; i < height; ++i) {
|
||||
const uint8x8_t s = vld1_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
for (j = 0; j < 4; ++j) {
|
||||
const uint8x8_t b_u8 = vld1_u8(ref_loop[j]);
|
||||
ref_loop[j] += ref_stride;
|
||||
sum[j] = vabal_u8(sum[j], s, b_u8);
|
||||
}
|
||||
}
|
||||
|
||||
sad_512_pel_final_neon(sum, sad_array);
|
||||
}
|
||||
|
||||
void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
uint32_t sad_array[4]) {
|
||||
sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 4);
|
||||
}
|
||||
|
||||
void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
uint32_t sad_array[4]) {
|
||||
sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8);
|
||||
}
|
||||
|
||||
void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
uint32_t sad_array[4]) {
|
||||
sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
|
||||
uint32x4_t *const sum) {
|
||||
const uint8x16_t r = vld1q_u8(ref_ptr);
|
||||
const uint8x16_t diff = vabdq_u8(src_ptr, r);
|
||||
*sum = vdotq_u32(*sum, diff, vdupq_n_u8(1));
|
||||
}
|
||||
|
||||
static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
uint32_t sad_array[4], const int height) {
|
||||
int i;
|
||||
uint32x4_t r0, r1;
|
||||
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
|
||||
ref_array[3] };
|
||||
static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
|
||||
const uint8_t *const ref[4], int ref_stride,
|
||||
uint32_t res[4], int h) {
|
||||
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
|
||||
vdupq_n_u32(0) };
|
||||
|
||||
for (i = 0; i < height; ++i) {
|
||||
const uint8x16_t s = vld1q_u8(src_ptr + i * src_stride);
|
||||
sad16_neon(ref_loop[0] + i * ref_stride, s, &sum[0]);
|
||||
sad16_neon(ref_loop[1] + i * ref_stride, s, &sum[1]);
|
||||
sad16_neon(ref_loop[2] + i * ref_stride, s, &sum[2]);
|
||||
sad16_neon(ref_loop[3] + i * ref_stride, s, &sum[3]);
|
||||
}
|
||||
int i = 0;
|
||||
do {
|
||||
const uint8x16_t s = vld1q_u8(src + i * src_stride);
|
||||
sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
|
||||
sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
|
||||
sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
|
||||
sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
|
||||
|
||||
r0 = vpaddq_u32(sum[0], sum[1]);
|
||||
r1 = vpaddq_u32(sum[2], sum[3]);
|
||||
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
|
||||
i++;
|
||||
} while (i < h);
|
||||
|
||||
vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
|
||||
}
|
||||
|
||||
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
|
||||
#else // !defined(__ARM_FEATURE_DOTPROD))
|
||||
|
||||
static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
|
||||
uint16x8_t *const sum) {
|
||||
const uint8x16_t r = vld1q_u8(ref_ptr);
|
||||
*sum = vabal_u8(*sum, vget_low_u8(src_ptr), vget_low_u8(r));
|
||||
*sum = vabal_u8(*sum, vget_high_u8(src_ptr), vget_high_u8(r));
|
||||
static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
|
||||
uint16x8_t *const sad_sum) {
|
||||
uint8x16_t abs_diff = vabdq_u8(src, ref);
|
||||
*sad_sum = vpadalq_u8(*sad_sum, abs_diff);
|
||||
}
|
||||
|
||||
static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
uint32_t sad_array[4], const int height) {
|
||||
int i;
|
||||
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
|
||||
ref_array[3] };
|
||||
static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
|
||||
const uint8_t *const ref[4], int ref_stride,
|
||||
uint32_t res[4], int h) {
|
||||
int h_tmp = h > 64 ? 64 : h;
|
||||
int i = 0;
|
||||
vst1q_u32(res, vdupq_n_u32(0));
|
||||
|
||||
do {
|
||||
uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
|
||||
vdupq_n_u16(0) };
|
||||
uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
|
||||
vdupq_n_u16(0) };
|
||||
|
||||
do {
|
||||
uint8x16_t s0, s1, s2, s3;
|
||||
|
||||
s0 = vld1q_u8(src + i * src_stride);
|
||||
sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
|
||||
sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
|
||||
sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
|
||||
sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
|
||||
|
||||
s1 = vld1q_u8(src + i * src_stride + 16);
|
||||
sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
|
||||
sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
|
||||
sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
|
||||
sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
|
||||
|
||||
s2 = vld1q_u8(src + i * src_stride + 32);
|
||||
sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
|
||||
sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
|
||||
sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
|
||||
sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
|
||||
|
||||
s3 = vld1q_u8(src + i * src_stride + 48);
|
||||
sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
|
||||
sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
|
||||
sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
|
||||
sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
|
||||
|
||||
i++;
|
||||
} while (i < h_tmp);
|
||||
|
||||
res[0] += horizontal_long_add_uint16x8(sum_lo[0], sum_hi[0]);
|
||||
res[1] += horizontal_long_add_uint16x8(sum_lo[1], sum_hi[1]);
|
||||
res[2] += horizontal_long_add_uint16x8(sum_lo[2], sum_hi[2]);
|
||||
res[3] += horizontal_long_add_uint16x8(sum_lo[3], sum_hi[3]);
|
||||
|
||||
h_tmp += 64;
|
||||
} while (i < h);
|
||||
}
|
||||
|
||||
static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
|
||||
const uint8_t *const ref[4], int ref_stride,
|
||||
uint32_t res[4], int h) {
|
||||
uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
|
||||
vdupq_n_u16(0) };
|
||||
uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
|
||||
vdupq_n_u16(0) };
|
||||
|
||||
int i = 0;
|
||||
do {
|
||||
uint8x16_t s0, s1;
|
||||
|
||||
s0 = vld1q_u8(src + i * src_stride);
|
||||
sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
|
||||
sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
|
||||
sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
|
||||
sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
|
||||
|
||||
s1 = vld1q_u8(src + i * src_stride + 16);
|
||||
sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
|
||||
sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
|
||||
sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
|
||||
sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
|
||||
|
||||
i++;
|
||||
} while (i < h);
|
||||
|
||||
res[0] = horizontal_long_add_uint16x8(sum_lo[0], sum_hi[0]);
|
||||
res[1] = horizontal_long_add_uint16x8(sum_lo[1], sum_hi[1]);
|
||||
res[2] = horizontal_long_add_uint16x8(sum_lo[2], sum_hi[2]);
|
||||
res[3] = horizontal_long_add_uint16x8(sum_lo[3], sum_hi[3]);
|
||||
}
|
||||
|
||||
static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
|
||||
const uint8_t *const ref[4], int ref_stride,
|
||||
uint32_t res[4], int h) {
|
||||
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
|
||||
vdupq_n_u16(0) };
|
||||
|
||||
for (i = 0; i < height; ++i) {
|
||||
const uint8x16_t s = vld1q_u8(src_ptr);
|
||||
src_ptr += src_stride;
|
||||
/* Manual unrolling here stops the compiler from getting confused. */
|
||||
sad16_neon(ref_loop[0], s, &sum[0]);
|
||||
ref_loop[0] += ref_stride;
|
||||
sad16_neon(ref_loop[1], s, &sum[1]);
|
||||
ref_loop[1] += ref_stride;
|
||||
sad16_neon(ref_loop[2], s, &sum[2]);
|
||||
ref_loop[2] += ref_stride;
|
||||
sad16_neon(ref_loop[3], s, &sum[3]);
|
||||
ref_loop[3] += ref_stride;
|
||||
}
|
||||
int i = 0;
|
||||
do {
|
||||
const uint8x16_t s = vld1q_u8(src + i * src_stride);
|
||||
sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
|
||||
sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
|
||||
sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
|
||||
sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
|
||||
|
||||
sad_512_pel_final_neon(sum, sad_array);
|
||||
i++;
|
||||
} while (i < h);
|
||||
|
||||
res[0] = horizontal_add_uint16x8(sum[0]);
|
||||
res[1] = horizontal_add_uint16x8(sum[1]);
|
||||
res[2] = horizontal_add_uint16x8(sum[2]);
|
||||
res[3] = horizontal_add_uint16x8(sum[3]);
|
||||
}
|
||||
|
||||
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
#endif // defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
uint32_t sad_array[4]) {
|
||||
sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8);
|
||||
static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref,
|
||||
uint16x8_t *const sad_sum) {
|
||||
uint8x8_t abs_diff = vabd_u8(src, ref);
|
||||
*sad_sum = vaddw_u8(*sad_sum, abs_diff);
|
||||
}
|
||||
|
||||
// 16x16 entry point of the four-reference SAD: hands every argument to
// sad16x_4d() (defined elsewhere in this file) with 16 as the final argument,
// presumably the block height.
void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]) {
  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16);
}
|
||||
|
||||
// 16x32 entry point of the four-reference SAD: hands every argument to
// sad16x_4d() (defined elsewhere in this file) with 32 as the final argument,
// presumably the block height.
void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]) {
  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32);
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
uint32_t sad_array[4], const int height) {
|
||||
int i;
|
||||
uint32x4_t r0, r1;
|
||||
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
|
||||
ref_array[3] };
|
||||
|
||||
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
|
||||
vdupq_n_u32(0) };
|
||||
|
||||
for (i = 0; i < height; ++i) {
|
||||
uint8x16_t s;
|
||||
|
||||
s = vld1q_u8(src_ptr + 0 * 16);
|
||||
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
|
||||
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
|
||||
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
|
||||
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
|
||||
|
||||
s = vld1q_u8(src_ptr + 1 * 16);
|
||||
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
|
||||
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
|
||||
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
|
||||
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
|
||||
|
||||
src_ptr += src_stride;
|
||||
ref_loop[0] += ref_stride;
|
||||
ref_loop[1] += ref_stride;
|
||||
ref_loop[2] += ref_stride;
|
||||
ref_loop[3] += ref_stride;
|
||||
}
|
||||
|
||||
r0 = vpaddq_u32(sum[0], sum[1]);
|
||||
r1 = vpaddq_u32(sum[2], sum[3]);
|
||||
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
|
||||
}
|
||||
|
||||
// 32x16 entry point of the four-reference SAD (dot-product build): forwards
// to sad32x_4d() with a height of 16; the helper writes sad_array itself.
void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]) {
  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16);
}
|
||||
|
||||
// 32x32 entry point of the four-reference SAD (dot-product build): forwards
// to sad32x_4d() with a height of 32; the helper writes sad_array itself.
void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]) {
  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32);
}
|
||||
|
||||
// 32x64 entry point of the four-reference SAD (dot-product build): forwards
// to sad32x_4d() with a height of 64; the helper writes sad_array itself.
void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]) {
  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64);
}
|
||||
|
||||
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
|
||||
|
||||
// Four-reference SAD accumulation for a block 32 pixels wide and `height`
// rows tall (variant used when the dot-product path is not compiled in).
//
// Unlike the dot-product variant this helper does not produce final totals:
// it zeroes sum[0..3] and leaves one uint16x8_t accumulator per reference in
// them for the caller to reduce.
// NOTE(review): sad16_neon(ref_ptr, src_vec, &acc) is defined outside this
// block; it is assumed to add absolute differences into acc - confirm.
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *const ref_array[4], int ref_stride,
                             const int height, uint16x8_t *const sum) {
  const uint8_t *ref_row[4] = { ref_array[0], ref_array[1], ref_array[2],
                                ref_array[3] };
  int rows_left, col;

  sum[0] = vdupq_n_u16(0);
  sum[1] = vdupq_n_u16(0);
  sum[2] = vdupq_n_u16(0);
  sum[3] = vdupq_n_u16(0);

  for (rows_left = height; rows_left > 0; --rows_left) {
    // Walk the row in 16-byte columns: offsets 0 and 16.
    for (col = 0; col < 32; col += 16) {
      const uint8x16_t src_vec = vld1q_u8(src_ptr + col);
      sad16_neon(ref_row[0] + col, src_vec, &sum[0]);
      sad16_neon(ref_row[1] + col, src_vec, &sum[1]);
      sad16_neon(ref_row[2] + col, src_vec, &sum[2]);
      sad16_neon(ref_row[3] + col, src_vec, &sum[3]);
    }

    // Advance the source and all references to the next row.
    src_ptr += src_stride;
    ref_row[0] += ref_stride;
    ref_row[1] += ref_stride;
    ref_row[2] += ref_stride;
    ref_row[3] += ref_stride;
  }
}
|
||||
|
||||
void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
uint32_t sad_array[4]) {
|
||||
uint16x8_t sum[4];
|
||||
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum);
|
||||
sad_512_pel_final_neon(sum, sad_array);
|
||||
}
|
||||
|
||||
void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
uint32_t sad_array[4]) {
|
||||
uint16x8_t sum[4];
|
||||
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum);
|
||||
sad_1024_pel_final_neon(sum, sad_array);
|
||||
}
|
||||
|
||||
void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
uint32_t sad_array[4]) {
|
||||
uint16x8_t sum[4];
|
||||
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum);
|
||||
sad_2048_pel_final_neon(sum, sad_array);
|
||||
}
|
||||
|
||||
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
|
||||
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
uint32_t sad_array[4]) {
|
||||
int i;
|
||||
uint32x4_t r0, r1;
|
||||
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
|
||||
ref_array[3] };
|
||||
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
|
||||
vdupq_n_u32(0) };
|
||||
|
||||
for (i = 0; i < 32; ++i) {
|
||||
uint8x16_t s;
|
||||
|
||||
s = vld1q_u8(src_ptr + 0 * 16);
|
||||
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
|
||||
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
|
||||
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
|
||||
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
|
||||
|
||||
s = vld1q_u8(src_ptr + 1 * 16);
|
||||
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
|
||||
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
|
||||
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
|
||||
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
|
||||
|
||||
s = vld1q_u8(src_ptr + 2 * 16);
|
||||
sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]);
|
||||
sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]);
|
||||
sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]);
|
||||
sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]);
|
||||
|
||||
s = vld1q_u8(src_ptr + 3 * 16);
|
||||
sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]);
|
||||
sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]);
|
||||
sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]);
|
||||
sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]);
|
||||
|
||||
src_ptr += src_stride;
|
||||
ref_loop[0] += ref_stride;
|
||||
ref_loop[1] += ref_stride;
|
||||
ref_loop[2] += ref_stride;
|
||||
ref_loop[3] += ref_stride;
|
||||
}
|
||||
|
||||
r0 = vpaddq_u32(sum[0], sum[1]);
|
||||
r1 = vpaddq_u32(sum[2], sum[3]);
|
||||
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
|
||||
}
|
||||
|
||||
void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
uint32_t sad_array[4]) {
|
||||
int i;
|
||||
uint32x4_t r0, r1, r2, r3;
|
||||
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
|
||||
ref_array[3] };
|
||||
uint32x4_t sum[8] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
|
||||
vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
|
||||
vdupq_n_u32(0), vdupq_n_u32(0) };
|
||||
|
||||
for (i = 0; i < 64; ++i) {
|
||||
uint8x16_t s;
|
||||
|
||||
s = vld1q_u8(src_ptr + 0 * 16);
|
||||
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
|
||||
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]);
|
||||
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]);
|
||||
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]);
|
||||
|
||||
s = vld1q_u8(src_ptr + 1 * 16);
|
||||
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
|
||||
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]);
|
||||
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]);
|
||||
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]);
|
||||
|
||||
s = vld1q_u8(src_ptr + 2 * 16);
|
||||
sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]);
|
||||
sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]);
|
||||
sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]);
|
||||
sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]);
|
||||
|
||||
s = vld1q_u8(src_ptr + 3 * 16);
|
||||
sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]);
|
||||
sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]);
|
||||
sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]);
|
||||
sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]);
|
||||
|
||||
src_ptr += src_stride;
|
||||
ref_loop[0] += ref_stride;
|
||||
ref_loop[1] += ref_stride;
|
||||
ref_loop[2] += ref_stride;
|
||||
ref_loop[3] += ref_stride;
|
||||
}
|
||||
|
||||
r0 = vpaddq_u32(sum[0], sum[1]);
|
||||
r1 = vpaddq_u32(sum[2], sum[3]);
|
||||
r2 = vpaddq_u32(sum[4], sum[5]);
|
||||
r3 = vpaddq_u32(sum[6], sum[7]);
|
||||
r0 = vpaddq_u32(r0, r1);
|
||||
r1 = vpaddq_u32(r2, r3);
|
||||
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
|
||||
}
|
||||
|
||||
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
|
||||
|
||||
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
uint32_t sad_array[4]) {
|
||||
int i;
|
||||
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
|
||||
ref_array[3] };
|
||||
static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
|
||||
const uint8_t *const ref[4], int ref_stride,
|
||||
uint32_t res[4], int h) {
|
||||
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
|
||||
vdupq_n_u16(0) };
|
||||
|
||||
for (i = 0; i < 32; ++i) {
|
||||
uint8x16_t s;
|
||||
int i = 0;
|
||||
do {
|
||||
const uint8x8_t s = vld1_u8(src + i * src_stride);
|
||||
sad8_neon(s, vld1_u8(ref[0] + i * ref_stride), &sum[0]);
|
||||
sad8_neon(s, vld1_u8(ref[1] + i * ref_stride), &sum[1]);
|
||||
sad8_neon(s, vld1_u8(ref[2] + i * ref_stride), &sum[2]);
|
||||
sad8_neon(s, vld1_u8(ref[3] + i * ref_stride), &sum[3]);
|
||||
|
||||
s = vld1q_u8(src_ptr + 0 * 16);
|
||||
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
|
||||
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
|
||||
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
|
||||
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
|
||||
i++;
|
||||
} while (i < h);
|
||||
|
||||
s = vld1q_u8(src_ptr + 1 * 16);
|
||||
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
|
||||
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
|
||||
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
|
||||
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
|
||||
|
||||
s = vld1q_u8(src_ptr + 2 * 16);
|
||||
sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]);
|
||||
sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]);
|
||||
sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]);
|
||||
sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]);
|
||||
|
||||
s = vld1q_u8(src_ptr + 3 * 16);
|
||||
sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]);
|
||||
sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]);
|
||||
sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]);
|
||||
sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]);
|
||||
|
||||
src_ptr += src_stride;
|
||||
ref_loop[0] += ref_stride;
|
||||
ref_loop[1] += ref_stride;
|
||||
ref_loop[2] += ref_stride;
|
||||
ref_loop[3] += ref_stride;
|
||||
}
|
||||
|
||||
sad_2048_pel_final_neon(sum, sad_array);
|
||||
res[0] = horizontal_add_uint16x8(sum[0]);
|
||||
res[1] = horizontal_add_uint16x8(sum[1]);
|
||||
res[2] = horizontal_add_uint16x8(sum[2]);
|
||||
res[3] = horizontal_add_uint16x8(sum[3]);
|
||||
}
|
||||
|
||||
void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
|
||||
const uint8_t *const ref_array[4], int ref_stride,
|
||||
uint32_t sad_array[4]) {
|
||||
int i;
|
||||
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
|
||||
ref_array[3] };
|
||||
uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
|
||||
vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
|
||||
vdupq_n_u16(0), vdupq_n_u16(0) };
|
||||
static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
|
||||
const uint8_t *const ref[4], int ref_stride,
|
||||
uint32_t res[4], int h) {
|
||||
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
|
||||
vdupq_n_u16(0) };
|
||||
|
||||
for (i = 0; i < 64; ++i) {
|
||||
uint8x16_t s;
|
||||
int i = 0;
|
||||
do {
|
||||
uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride);
|
||||
uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride);
|
||||
uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride);
|
||||
uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride);
|
||||
uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride);
|
||||
|
||||
s = vld1q_u8(src_ptr + 0 * 16);
|
||||
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
|
||||
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]);
|
||||
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]);
|
||||
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]);
|
||||
sad8_neon(s, r0, &sum[0]);
|
||||
sad8_neon(s, r1, &sum[1]);
|
||||
sad8_neon(s, r2, &sum[2]);
|
||||
sad8_neon(s, r3, &sum[3]);
|
||||
|
||||
s = vld1q_u8(src_ptr + 1 * 16);
|
||||
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
|
||||
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]);
|
||||
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]);
|
||||
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]);
|
||||
i += 2;
|
||||
} while (i < h);
|
||||
|
||||
s = vld1q_u8(src_ptr + 2 * 16);
|
||||
sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]);
|
||||
sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]);
|
||||
sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]);
|
||||
sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]);
|
||||
|
||||
s = vld1q_u8(src_ptr + 3 * 16);
|
||||
sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]);
|
||||
sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]);
|
||||
sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]);
|
||||
sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]);
|
||||
|
||||
src_ptr += src_stride;
|
||||
ref_loop[0] += ref_stride;
|
||||
ref_loop[1] += ref_stride;
|
||||
ref_loop[2] += ref_stride;
|
||||
ref_loop[3] += ref_stride;
|
||||
}
|
||||
|
||||
sad_4096_pel_final_neon(sum, sad_array);
|
||||
res[0] = horizontal_add_uint16x8(sum[0]);
|
||||
res[1] = horizontal_add_uint16x8(sum[1]);
|
||||
res[2] = horizontal_add_uint16x8(sum[2]);
|
||||
res[3] = horizontal_add_uint16x8(sum[3]);
|
||||
}
|
||||
|
||||
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
|
||||
// Stamps out the public vpx_sad<w>x<h>x4d_neon() entry point for one block
// size. The generated function forwards its arguments to the width-specific
// sad<w>xhx4d_neon() helper and passes the height as the last argument.
#define SAD_WXH_4D_NEON(w, h)                                            \
  void vpx_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride,    \
                                  const uint8_t *const ref[4],           \
                                  int ref_stride, uint32_t res[4]) {     \
    sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h));      \
  }

// 4-pixel-wide blocks.
SAD_WXH_4D_NEON(4, 4)
SAD_WXH_4D_NEON(4, 8)

// 8-pixel-wide blocks.
SAD_WXH_4D_NEON(8, 4)
SAD_WXH_4D_NEON(8, 8)
SAD_WXH_4D_NEON(8, 16)

// 16-pixel-wide blocks.
SAD_WXH_4D_NEON(16, 8)
SAD_WXH_4D_NEON(16, 16)
SAD_WXH_4D_NEON(16, 32)

// 32-pixel-wide blocks.
SAD_WXH_4D_NEON(32, 16)
SAD_WXH_4D_NEON(32, 32)
SAD_WXH_4D_NEON(32, 64)

// 64-pixel-wide blocks.
SAD_WXH_4D_NEON(64, 32)
SAD_WXH_4D_NEON(64, 64)

#undef SAD_WXH_4D_NEON
|
||||
|
|
|
@ -214,24 +214,13 @@ static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
|
|||
|
||||
int i = h / 2;
|
||||
do {
|
||||
uint32x2_t s, r;
|
||||
uint32_t s0, s1, r0, r1;
|
||||
uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
|
||||
uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
|
||||
|
||||
memcpy(&s0, src_ptr, 4);
|
||||
memcpy(&r0, ref_ptr, 4);
|
||||
s = vdup_n_u32(s0);
|
||||
r = vdup_n_u32(r0);
|
||||
src_ptr += src_stride;
|
||||
ref_ptr += ref_stride;
|
||||
sum = vabal_u8(sum, s, r);
|
||||
|
||||
memcpy(&s1, src_ptr, 4);
|
||||
memcpy(&r1, ref_ptr, 4);
|
||||
s = vset_lane_u32(s1, s, 1);
|
||||
r = vset_lane_u32(r1, r, 1);
|
||||
src_ptr += src_stride;
|
||||
ref_ptr += ref_stride;
|
||||
|
||||
sum = vabal_u8(sum, vreinterpret_u8_u32(s), vreinterpret_u8_u32(r));
|
||||
src_ptr += 2 * src_stride;
|
||||
ref_ptr += 2 * ref_stride;
|
||||
} while (--i != 0);
|
||||
|
||||
return horizontal_add_uint16x8(sum);
|
||||
|
@ -509,28 +498,15 @@ static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr,
|
|||
|
||||
int i = h / 2;
|
||||
do {
|
||||
uint32x2_t s, r;
|
||||
uint32_t s0, s1, r0, r1;
|
||||
uint8x8_t p, avg;
|
||||
uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
|
||||
uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
|
||||
uint8x8_t p = vld1_u8(second_pred);
|
||||
|
||||
memcpy(&s0, src_ptr, 4);
|
||||
memcpy(&r0, ref_ptr, 4);
|
||||
s = vdup_n_u32(s0);
|
||||
r = vdup_n_u32(r0);
|
||||
src_ptr += src_stride;
|
||||
ref_ptr += ref_stride;
|
||||
uint8x8_t avg = vrhadd_u8(r, p);
|
||||
sum = vabal_u8(sum, s, avg);
|
||||
|
||||
memcpy(&s1, src_ptr, 4);
|
||||
memcpy(&r1, ref_ptr, 4);
|
||||
s = vset_lane_u32(s1, s, 1);
|
||||
r = vset_lane_u32(r1, r, 1);
|
||||
src_ptr += src_stride;
|
||||
ref_ptr += ref_stride;
|
||||
|
||||
p = vld1_u8(second_pred);
|
||||
avg = vrhadd_u8(vreinterpret_u8_u32(r), p);
|
||||
|
||||
sum = vabal_u8(sum, vreinterpret_u8_u32(s), avg);
|
||||
src_ptr += 2 * src_stride;
|
||||
ref_ptr += 2 * ref_stride;
|
||||
second_pred += 8;
|
||||
} while (--i != 0);
|
||||
|
||||
|
|
|
@ -40,6 +40,23 @@ static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) {
|
|||
#endif
|
||||
}
|
||||
|
||||
// Returns the sum of all sixteen 16-bit lanes of vec_lo and vec_hi as a
// 32-bit value. Lanes are widened before they are added, so the 16-bit
// inputs cannot wrap while being summed.
static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
#if defined(__aarch64__)
  // Widening across-vector add of each input, then a scalar add.
  const uint32_t total_lo = vaddlvq_u16(vec_lo);
  const uint32_t total_hi = vaddlvq_u16(vec_hi);
  return total_lo + total_hi;
#else
  // Widen each input by adding its low and high halves into 32-bit lanes.
  const uint32x4_t wide_lo =
      vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
  const uint32x4_t wide_hi =
      vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  // Four 32-bit partial sums -> two 64-bit sums -> one 32-bit result.
  const uint32x4_t quad = vaddq_u32(wide_lo, wide_hi);
  const uint64x2_t pair = vpaddlq_u32(quad);
  const uint32x2_t folded = vadd_u32(vreinterpret_u32_u64(vget_low_u64(pair)),
                                     vreinterpret_u32_u64(vget_high_u64(pair)));
  return vget_lane_u32(folded, 0);
#endif
}
|
||||
|
||||
static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) {
|
||||
#if defined(__aarch64__)
|
||||
return vaddv_s32(a);
|
||||
|
@ -77,4 +94,20 @@ static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
|
|||
return vget_lane_u32(c, 0);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Reduces four uint32x4_t vectors at once: lane k of the result is the sum of
// the four lanes of sum[k].
static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) {
#if defined(__aarch64__)
  // Two rounds of pairwise adds leave one total per input vector.
  const uint32x4_t pair01 = vpaddq_u32(sum[0], sum[1]);
  const uint32x4_t pair23 = vpaddq_u32(sum[2], sum[3]);
  return vpaddq_u32(pair01, pair23);
#else
  // No 128-bit pairwise add here: reduce each vector to a scalar and place it
  // in its lane. Lane 0 is seeded by the broadcast; lanes 1..3 are then
  // overwritten.
  uint32x4_t totals = vdupq_n_u32(horizontal_add_uint32x4(sum[0]));
  totals = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), totals, 1);
  totals = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), totals, 2);
  totals = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), totals, 3);
  return totals;
#endif
}
|
||||
|
||||
#endif // VPX_VPX_DSP_ARM_SUM_NEON_H_
|
||||
|
|
|
@ -23,10 +23,17 @@
|
|||
// b0.val[1]: 04 05 06 07 20 21 22 23
|
||||
// Transposes the 64-bit halves of a0 and a1 and returns them viewed as
// int16x8_t: val[0] holds the low half of a0 followed by the low half of a1,
// val[1] holds the two high halves in the same order.
static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  int16x8x2_t out;
#if defined(__aarch64__)
  // Native 64-bit transpose instructions are available.
  const int64x2_t first = vreinterpretq_s64_s32(a0);
  const int64x2_t second = vreinterpretq_s64_s32(a1);
  out.val[0] = vreinterpretq_s16_s64(vtrn1q_s64(first, second));
  out.val[1] = vreinterpretq_s16_s64(vtrn2q_s64(first, second));
#else
  // Build the same result by recombining the 64-bit halves.
  const int16x4_t a0_low = vreinterpret_s16_s32(vget_low_s32(a0));
  const int16x4_t a1_low = vreinterpret_s16_s32(vget_low_s32(a1));
  const int16x4_t a0_high = vreinterpret_s16_s32(vget_high_s32(a0));
  const int16x4_t a1_high = vreinterpret_s16_s32(vget_high_s32(a1));
  out.val[0] = vcombine_s16(a0_low, a1_low);
  out.val[1] = vcombine_s16(a0_high, a1_high);
#endif
  return out;
}
|
||||
|
||||
|
@ -57,10 +64,17 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
|
|||
|
||||
// Transposes the 64-bit halves of a0 and a1 and returns them viewed as
// uint16x8_t: val[0] holds the low half of a0 followed by the low half of a1,
// val[1] holds the two high halves in the same order.
static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
  uint16x8x2_t out;
#if defined(__aarch64__)
  // Native 64-bit transpose instructions are available.
  const uint64x2_t first = vreinterpretq_u64_u32(a0);
  const uint64x2_t second = vreinterpretq_u64_u32(a1);
  out.val[0] = vreinterpretq_u16_u64(vtrn1q_u64(first, second));
  out.val[1] = vreinterpretq_u16_u64(vtrn2q_u64(first, second));
#else
  // Build the same result by recombining the 64-bit halves.
  const uint16x4_t a0_low = vreinterpret_u16_u32(vget_low_u32(a0));
  const uint16x4_t a1_low = vreinterpret_u16_u32(vget_low_u32(a1));
  const uint16x4_t a0_high = vreinterpret_u16_u32(vget_high_u32(a0));
  const uint16x4_t a1_high = vreinterpret_u16_u32(vget_high_u32(a1));
  out.val[0] = vcombine_u16(a0_low, a1_low);
  out.val[1] = vcombine_u16(a0_high, a1_high);
#endif
  return out;
}
|
||||
|
||||
|
@ -569,37 +583,73 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
|
|||
}
|
||||
|
||||
// Transpose 8x8 to a new location.
|
||||
// Transposes the 8x8 matrix of int16 values held in a[0..7] (one row per
// vector) and writes the transposed rows to out[0..7]. The input vectors are
// only read.
static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) {
  // Swap 16 bit elements. Goes from:
  // a0: 00 01 02 03 04 05 06 07
  // a1: 10 11 12 13 14 15 16 17
  // a2: 20 21 22 23 24 25 26 27
  // a3: 30 31 32 33 34 35 36 37
  // a4: 40 41 42 43 44 45 46 47
  // a5: 50 51 52 53 54 55 56 57
  // a6: 60 61 62 63 64 65 66 67
  // a7: 70 71 72 73 74 75 76 77
  // to:
  // b0.val[0]: 00 10 02 12 04 14 06 16
  // b0.val[1]: 01 11 03 13 05 15 07 17
  // b1.val[0]: 20 30 22 32 24 34 26 36
  // b1.val[1]: 21 31 23 33 25 35 27 37
  // b2.val[0]: 40 50 42 52 44 54 46 56
  // b2.val[1]: 41 51 43 53 45 55 47 57
  // b3.val[0]: 60 70 62 72 64 74 66 76
  // b3.val[1]: 61 71 63 73 65 75 67 77

  const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
  const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
  const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
  const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);

  // Swap 32 bit elements resulting in:
  // c0.val[0]: 00 10 20 30 04 14 24 34
  // c0.val[1]: 02 12 22 32 06 16 26 36
  // c1.val[0]: 01 11 21 31 05 15 25 35
  // c1.val[1]: 03 13 23 33 07 17 27 37
  // c2.val[0]: 40 50 60 70 44 54 64 74
  // c2.val[1]: 42 52 62 72 46 56 66 76
  // c3.val[0]: 41 51 61 71 45 55 65 75
  // c3.val[1]: 43 53 63 73 47 57 67 77

  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
                                   vreinterpretq_s32_s16(b1.val[0]));
  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
                                   vreinterpretq_s32_s16(b1.val[1]));
  const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
                                   vreinterpretq_s32_s16(b3.val[0]));
  const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
                                   vreinterpretq_s32_s16(b3.val[1]));

  // Swap 64 bit elements resulting in:
  // d0.val[0]: 00 10 20 30 40 50 60 70
  // d0.val[1]: 04 14 24 34 44 54 64 74
  // d1.val[0]: 01 11 21 31 41 51 61 71
  // d1.val[1]: 05 15 25 35 45 55 65 75
  // d2.val[0]: 02 12 22 32 42 52 62 72
  // d2.val[1]: 06 16 26 36 46 56 66 76
  // d3.val[0]: 03 13 23 33 43 53 63 73
  // d3.val[1]: 07 17 27 37 47 57 67 77

  const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
  const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
  const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
  const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);

  // Output row n is column n of the input (see the d* layout above).
  out[0] = d0.val[0];
  out[1] = d1.val[0];
  out[2] = d2.val[0];
  out[3] = d3.val[0];
  out[4] = d0.val[1];
  out[5] = d1.val[1];
  out[6] = d2.val[1];
  out[7] = d3.val[1];
}
|
||||
|
||||
static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
|
||||
|
@ -658,6 +708,7 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
|
|||
// d2.val[1]: 06 16 26 36 46 56 66 76
|
||||
// d3.val[0]: 03 13 23 33 43 53 63 73
|
||||
// d3.val[1]: 07 17 27 37 47 57 67 77
|
||||
|
||||
const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
|
||||
const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
|
||||
const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
|
||||
|
@ -729,6 +780,7 @@ static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
|
|||
// d2.val[1]: 06 16 26 36 46 56 66 76
|
||||
// d3.val[0]: 03 13 23 33 43 53 63 73
|
||||
// d3.val[1]: 07 17 27 37 47 57 67 77
|
||||
|
||||
const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
|
||||
const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
|
||||
const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
|
||||
|
|
|
@ -26,57 +26,44 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) {
|
|||
/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
|
||||
* and highbd_8_variance(). It should not.
|
||||
*/
|
||||
/* Returns the sum of squared differences between two w x h regions of 8-bit
 * samples.
 *
 * a, b               top-left sample of each region
 * a_stride, b_stride distance in samples between the starts of successive
 *                    rows of a and b
 * w, h               region width and height in samples; a region with
 *                    w <= 0 or h <= 0 yields 0
 */
static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b,
                           int b_stride, int w, int h) {
  int i, j;
  int64_t sse = 0;

  for (i = 0; i < h; i++) {
    for (j = 0; j < w; j++) {
      /* 8-bit samples: |diff| <= 255, so the square always fits in an int. */
      const int diff = a[j] - b[j];
      sse += diff * diff;
    }

    a += a_stride;
    b += b_stride;
  }

  return sse;
}
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
/* High-bitdepth counterpart of the 8-bit SSE helper: returns the sum of
 * squared differences between two w x h regions of 16-bit samples.
 *
 * a8, b8             region pointers in the uint8_t form used by the
 *                    high-bitdepth API; CONVERT_TO_SHORTPTR() recovers the
 *                    uint16_t sample pointers
 * a_stride, b_stride distance in samples between the starts of successive
 *                    rows
 * w, h               region width and height in samples; a region with
 *                    w <= 0 or h <= 0 yields 0
 */
static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride,
                                    const uint8_t *b8, int b_stride, int w,
                                    int h) {
  int i, j;
  int64_t sse = 0;

  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  uint16_t *b = CONVERT_TO_SHORTPTR(b8);

  for (i = 0; i < h; i++) {
    for (j = 0; j < w; j++) {
      const int diff = a[j] - b[j];
      /* Samples live in 16-bit containers, so |diff| can reach 65535 and its
       * square would not fit in a 32-bit int; widen before multiplying. */
      sse += (int64_t)diff * diff;
    }
    a += a_stride;
    b += b_stride;
  }

  return sse;
}
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
|
@ -85,26 +72,23 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
|
|||
const int dw = width % 16;
|
||||
const int dh = height % 16;
|
||||
int64_t total_sse = 0;
|
||||
unsigned int sse = 0;
|
||||
int sum = 0;
|
||||
int x, y;
|
||||
|
||||
if (dw > 0) {
|
||||
encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw,
|
||||
height, &sse, &sum);
|
||||
total_sse += sse;
|
||||
total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride,
|
||||
dw, height);
|
||||
}
|
||||
|
||||
if (dh > 0) {
|
||||
encoder_variance(&a[(height - dh) * a_stride], a_stride,
|
||||
&b[(height - dh) * b_stride], b_stride, width - dw, dh,
|
||||
&sse, &sum);
|
||||
total_sse += sse;
|
||||
total_sse +=
|
||||
encoder_sse(&a[(height - dh) * a_stride], a_stride,
|
||||
&b[(height - dh) * b_stride], b_stride, width - dw, dh);
|
||||
}
|
||||
|
||||
for (y = 0; y < height / 16; ++y) {
|
||||
const uint8_t *pa = a;
|
||||
const uint8_t *pb = b;
|
||||
unsigned int sse;
|
||||
for (x = 0; x < width / 16; ++x) {
|
||||
vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
|
||||
total_sse += sse;
|
||||
|
@ -146,22 +130,19 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
|
|||
int x, y;
|
||||
const int dw = width % 16;
|
||||
const int dh = height % 16;
|
||||
unsigned int sse = 0;
|
||||
int sum = 0;
|
||||
if (dw > 0) {
|
||||
encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw],
|
||||
b_stride, dw, height, &sse, &sum);
|
||||
total_sse += sse;
|
||||
total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw],
|
||||
b_stride, dw, height);
|
||||
}
|
||||
if (dh > 0) {
|
||||
encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
|
||||
&b[(height - dh) * b_stride], b_stride,
|
||||
width - dw, dh, &sse, &sum);
|
||||
total_sse += sse;
|
||||
total_sse += encoder_highbd_8_sse(&a[(height - dh) * a_stride], a_stride,
|
||||
&b[(height - dh) * b_stride], b_stride,
|
||||
width - dw, dh);
|
||||
}
|
||||
for (y = 0; y < height / 16; ++y) {
|
||||
const uint8_t *pa = a;
|
||||
const uint8_t *pb = b;
|
||||
unsigned int sse;
|
||||
for (x = 0; x < width / 16; ++x) {
|
||||
vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
|
||||
total_sse += sse;
|
||||
|
|
|
@ -392,6 +392,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c
|
|||
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad4d_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c
|
||||
DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c
|
||||
DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c
|
||||
|
@ -432,7 +433,9 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
|
|||
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c
|
||||
endif # CONFIG_VP9_HIGHBITDEPTH
|
||||
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
|
||||
|
||||
|
|
|
@ -392,6 +392,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c
|
|||
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad4d_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c
|
||||
DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c
|
||||
DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c
|
||||
|
@ -432,7 +433,9 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
|
|||
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
|
||||
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c
|
||||
DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c
|
||||
endif # CONFIG_VP9_HIGHBITDEPTH
|
||||
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
|
||||
|
||||
|
|
|
@ -1148,6 +1148,7 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *_img) {
|
|||
_img->fmt = _y4m->vpx_fmt;
|
||||
_img->w = _img->d_w = _y4m->pic_w;
|
||||
_img->h = _img->d_h = _y4m->pic_h;
|
||||
_img->bit_depth = _y4m->bit_depth;
|
||||
_img->x_chroma_shift = _y4m->dst_c_dec_h >> 1;
|
||||
_img->y_chroma_shift = _y4m->dst_c_dec_v >> 1;
|
||||
_img->bps = _y4m->bps;
|
||||
|
|
|
@ -20,11 +20,11 @@ origin:
|
|||
|
||||
# Human-readable identifier for this version/release
|
||||
# Generally "version NNN", "tag SSS", "bookmark SSS"
|
||||
release: 5c38ffbfa3aba5ea4d8d0ae05a50cc76ec99bed9 (Thu Jan 26 21:31:14 2023).
|
||||
release: bc2965ff72af7d7b21ffeab10549fcc67ed66ccf (Tue Feb 14 02:46:51 2023).
|
||||
|
||||
# Revision to pull in
|
||||
# Must be a long or short commit SHA (long preferred)
|
||||
revision: 5c38ffbfa3aba5ea4d8d0ae05a50cc76ec99bed9
|
||||
revision: bc2965ff72af7d7b21ffeab10549fcc67ed66ccf
|
||||
|
||||
# The package's license, where possible using the mnemonic from
|
||||
# https://spdx.org/licenses/
|
||||
|
|
|
@ -181,6 +181,7 @@ files = {
|
|||
'libvpx/vp9/encoder/vp9_svc_layercontext.c',
|
||||
'libvpx/vp9/encoder/vp9_temporal_filter.c',
|
||||
'libvpx/vp9/encoder/vp9_tokenize.c',
|
||||
'libvpx/vp9/encoder/vp9_tpl_model.c',
|
||||
'libvpx/vp9/encoder/vp9_treewriter.c',
|
||||
'libvpx/vp9/encoder/x86/temporal_filter_sse4.c',
|
||||
'libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c',
|
||||
|
@ -450,6 +451,7 @@ files = {
|
|||
'libvpx/vp9/encoder/vp9_svc_layercontext.c',
|
||||
'libvpx/vp9/encoder/vp9_temporal_filter.c',
|
||||
'libvpx/vp9/encoder/vp9_tokenize.c',
|
||||
'libvpx/vp9/encoder/vp9_tpl_model.c',
|
||||
'libvpx/vp9/encoder/vp9_treewriter.c',
|
||||
'libvpx/vp9/encoder/x86/temporal_filter_sse4.c',
|
||||
'libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c',
|
||||
|
@ -699,6 +701,7 @@ files = {
|
|||
'libvpx/vp9/encoder/vp9_subexp.c',
|
||||
'libvpx/vp9/encoder/vp9_svc_layercontext.c',
|
||||
'libvpx/vp9/encoder/vp9_tokenize.c',
|
||||
'libvpx/vp9/encoder/vp9_tpl_model.c',
|
||||
'libvpx/vp9/encoder/vp9_treewriter.c',
|
||||
'libvpx/vp9/vp9_cx_iface.c',
|
||||
'libvpx/vp9/vp9_dx_iface.c',
|
||||
|
@ -944,6 +947,7 @@ files = {
|
|||
'libvpx/vp9/encoder/vp9_subexp.c',
|
||||
'libvpx/vp9/encoder/vp9_svc_layercontext.c',
|
||||
'libvpx/vp9/encoder/vp9_tokenize.c',
|
||||
'libvpx/vp9/encoder/vp9_tpl_model.c',
|
||||
'libvpx/vp9/encoder/vp9_treewriter.c',
|
||||
'libvpx/vp9/vp9_cx_iface.c',
|
||||
'libvpx/vp9/vp9_dx_iface.c',
|
||||
|
@ -1158,6 +1162,7 @@ files = {
|
|||
'libvpx/vp9/encoder/vp9_svc_layercontext.c',
|
||||
'libvpx/vp9/encoder/vp9_temporal_filter.c',
|
||||
'libvpx/vp9/encoder/vp9_tokenize.c',
|
||||
'libvpx/vp9/encoder/vp9_tpl_model.c',
|
||||
'libvpx/vp9/encoder/vp9_treewriter.c',
|
||||
'libvpx/vp9/vp9_cx_iface.c',
|
||||
'libvpx/vp9/vp9_dx_iface.c',
|
||||
|
|
Загрузка…
Ссылка в новой задаче