Bug 1816486 - Update libvpx to bc2965f r=webrtc-reviewers,ng

Run `./mach vendor media/libvpx/moz.yaml --patch-mode=none`, as described in
media/libvpx/README_MOZILLA.

The updated libvpx revision is bc2965ff72af7d7b21ffeab10549fcc67ed66ccf.

Differential Revision: https://phabricator.services.mozilla.com/D169840
Author: Chun-Min Chang, 2023-02-14 22:26:52 +00:00
Parent: bef38fea84
Commit: 1b617be5fa
72 changed files: 3743 additions, 2881 deletions


@ -77,7 +77,6 @@
.equ CONFIG_MULTI_RES_ENCODING , 1
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_VP9_TEMPORAL_DENOISING , 0
.equ CONFIG_CONSISTENT_RECODE , 0
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
.equ CONFIG_VP9_HIGHBITDEPTH , 0
.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
@ -90,4 +89,5 @@
.equ CONFIG_EMULATE_HARDWARE , 0
.equ CONFIG_NON_GREEDY_MV , 0
.equ CONFIG_RATE_CTRL , 0
.equ CONFIG_COLLECT_COMPONENT_TIMING , 0
.section .note.GNU-stack,"",%progbits


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -77,7 +77,6 @@
.equ CONFIG_MULTI_RES_ENCODING , 1
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_VP9_TEMPORAL_DENOISING , 0
.equ CONFIG_CONSISTENT_RECODE , 0
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
.equ CONFIG_VP9_HIGHBITDEPTH , 0
.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
@ -90,4 +89,5 @@
.equ CONFIG_EMULATE_HARDWARE , 0
.equ CONFIG_NON_GREEDY_MV , 0
.equ CONFIG_RATE_CTRL , 0
.equ CONFIG_COLLECT_COMPONENT_TIMING , 0
.section .note.GNU-stack,"",%progbits


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -77,7 +77,6 @@
.equ CONFIG_MULTI_RES_ENCODING , 1
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_VP9_TEMPORAL_DENOISING , 0
.equ CONFIG_CONSISTENT_RECODE , 0
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
.equ CONFIG_VP9_HIGHBITDEPTH , 0
.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
@ -90,4 +89,5 @@
.equ CONFIG_EMULATE_HARDWARE , 0
.equ CONFIG_NON_GREEDY_MV , 0
.equ CONFIG_RATE_CTRL , 0
.equ CONFIG_COLLECT_COMPONENT_TIMING , 0
.section .note.GNU-stack,"",%progbits


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -74,7 +74,6 @@
%define CONFIG_MULTI_RES_ENCODING 1
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_VP9_TEMPORAL_DENOISING 0
%define CONFIG_CONSISTENT_RECODE 0
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
%define CONFIG_VP9_HIGHBITDEPTH 0
%define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -87,3 +86,4 @@
%define CONFIG_EMULATE_HARDWARE 0
%define CONFIG_NON_GREEDY_MV 0
%define CONFIG_RATE_CTRL 0
%define CONFIG_COLLECT_COMPONENT_TIMING 0


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -74,7 +74,6 @@
%define CONFIG_MULTI_RES_ENCODING 1
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_VP9_TEMPORAL_DENOISING 0
%define CONFIG_CONSISTENT_RECODE 0
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
%define CONFIG_VP9_HIGHBITDEPTH 0
%define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -87,3 +86,4 @@
%define CONFIG_EMULATE_HARDWARE 0
%define CONFIG_NON_GREEDY_MV 0
%define CONFIG_RATE_CTRL 0
%define CONFIG_COLLECT_COMPONENT_TIMING 0


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -74,7 +74,6 @@
%define CONFIG_MULTI_RES_ENCODING 1
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_VP9_TEMPORAL_DENOISING 0
%define CONFIG_CONSISTENT_RECODE 0
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
%define CONFIG_VP9_HIGHBITDEPTH 0
%define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -87,3 +86,4 @@
%define CONFIG_EMULATE_HARDWARE 0
%define CONFIG_NON_GREEDY_MV 0
%define CONFIG_RATE_CTRL 0
%define CONFIG_COLLECT_COMPONENT_TIMING 0


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -74,7 +74,6 @@
%define CONFIG_MULTI_RES_ENCODING 1
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_VP9_TEMPORAL_DENOISING 0
%define CONFIG_CONSISTENT_RECODE 0
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
%define CONFIG_VP9_HIGHBITDEPTH 0
%define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -87,3 +86,4 @@
%define CONFIG_EMULATE_HARDWARE 0
%define CONFIG_NON_GREEDY_MV 0
%define CONFIG_RATE_CTRL 0
%define CONFIG_COLLECT_COMPONENT_TIMING 0


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -1,8 +1,8 @@
// This file is generated. Do not edit.
#define VERSION_MAJOR 1
#define VERSION_MINOR 12
#define VERSION_MINOR 13
#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
#define VERSION_STRING_NOSP "v1.12.0"
#define VERSION_STRING " v1.12.0"
#define VERSION_STRING_NOSP "v1.13.0"
#define VERSION_STRING " v1.13.0"


@ -77,7 +77,6 @@
.equ CONFIG_MULTI_RES_ENCODING , 1
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_VP9_TEMPORAL_DENOISING , 0
.equ CONFIG_CONSISTENT_RECODE , 0
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
.equ CONFIG_VP9_HIGHBITDEPTH , 0
.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
@ -90,4 +89,5 @@
.equ CONFIG_EMULATE_HARDWARE , 0
.equ CONFIG_NON_GREEDY_MV , 0
.equ CONFIG_RATE_CTRL , 0
.equ CONFIG_COLLECT_COMPONENT_TIMING , 0
.section .note.GNU-stack,"",%progbits


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -74,7 +74,6 @@
%define CONFIG_MULTI_RES_ENCODING 1
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_VP9_TEMPORAL_DENOISING 0
%define CONFIG_CONSISTENT_RECODE 0
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
%define CONFIG_VP9_HIGHBITDEPTH 0
%define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -87,3 +86,4 @@
%define CONFIG_EMULATE_HARDWARE 0
%define CONFIG_NON_GREEDY_MV 0
%define CONFIG_RATE_CTRL 0
%define CONFIG_COLLECT_COMPONENT_TIMING 0


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -74,7 +74,6 @@
%define CONFIG_MULTI_RES_ENCODING 1
%define CONFIG_TEMPORAL_DENOISING 1
%define CONFIG_VP9_TEMPORAL_DENOISING 0
%define CONFIG_CONSISTENT_RECODE 0
%define CONFIG_COEFFICIENT_RANGE_CHECKING 0
%define CONFIG_VP9_HIGHBITDEPTH 0
%define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -87,3 +86,4 @@
%define CONFIG_EMULATE_HARDWARE 0
%define CONFIG_NON_GREEDY_MV 0
%define CONFIG_RATE_CTRL 0
%define CONFIG_COLLECT_COMPONENT_TIMING 0


@ -86,7 +86,6 @@
#define CONFIG_MULTI_RES_ENCODING 1
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_CONSISTENT_RECODE 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
#define CONFIG_VP9_HIGHBITDEPTH 0
#define CONFIG_BETTER_HW_COMPATIBILITY 0
@ -99,6 +98,7 @@
#define CONFIG_EMULATE_HARDWARE 0
#define CONFIG_NON_GREEDY_MV 0
#define CONFIG_RATE_CTRL 0
#define CONFIG_COLLECT_COMPONENT_TIMING 0
#define DECODE_WIDTH_LIMIT 8192
#define DECODE_HEIGHT_LIMIT 4608
#endif /* VPX_CONFIG_H */


@ -25,6 +25,7 @@ Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
Johann <johann@duck.com> <johann.koenig@gmail.com>
John Koleszar <jkoleszar@google.com>
Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
Konstantinos Margaritis <konma@vectorcamp.gr> <konstantinos@vectorcamp.gr>
Marco Paniconi <marpan@google.com>
Marco Paniconi <marpan@google.com> <marpan@chromium.org>
Martin Storsjö <martin@martin.st>


@ -21,6 +21,7 @@ Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com>
Andrew Lewis <andrewlewis@google.com>
Andrew Russell <anrussell@google.com>
Andrew Salkeld <andrew.salkeld@arm.com>
Angie Chen <yunqi@google.com>
Angie Chiang <angiebird@google.com>
Anton Venema <anton.venema@liveswitch.com>
@ -175,7 +176,9 @@ Rob Bradford <rob@linux.intel.com>
Ronald S. Bultje <rsbultje@gmail.com>
Rui Ueyama <ruiu@google.com>
Sai Deng <sdeng@google.com>
Salome Thirot <salome.thirot@arm.com>
Sami Pietilä <samipietila@google.com>
Sam James <sam@gentoo.org>
Sarah Parker <sarahparker@google.com>
Sasi Inguva <isasi@google.com>
Scott Graham <scottmg@chromium.org>


@ -1,3 +1,39 @@
2023-01-31 v1.13.0 "Ugly Duckling"
This release includes more Neon and AVX2 optimizations, adds a new codec
control to set per frame QP, upgrades GoogleTest to v1.12.1, and includes
numerous bug fixes.
- Upgrading:
This release is ABI incompatible with the previous release.
New codec control VP9E_SET_QUANTIZER_ONE_PASS to set per frame QP.
GoogleTest is upgraded to v1.12.1.
.clang-format is upgraded to clang-format-11.
VPX_EXT_RATECTRL_ABI_VERSION was bumped due to incompatible changes to the
feature of using external rate control models for vp9.
- Enhancement:
Numerous improvements on Neon optimizations.
Numerous improvements on AVX2 optimizations.
Additional ARM targets added for Visual Studio.
- Bug fixes:
Fix to calculating internal stats when frame dropped.
Fix to segfault for external resize test in vp9.
Fix to build system with replacing egrep with grep -E.
Fix to a few bugs with external RTC rate control library.
Fix to make SVC work with VBR.
Fix to key frame setting in VP9 external RC.
Fix to -Wimplicit-int (Clang 16).
Fix to VP8 external RC for buffer levels.
Fix to VP8 external RC for dynamic update of layers.
Fix to VP9 auto level.
Fix to off-by-one error of max w/h in validate_config.
Fix to make SVC work for Profile 1.
2022-06-17 v1.12.0 "Torrent Duck"
This release adds optimizations for Loongarch, adds support for vp8 in the
real-time rate control library, upgrades GoogleTest to v1.11.0, updates
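As an illustration of the new per-frame QP control mentioned in the v1.13.0 notes above, here is a minimal sketch of how a caller might use it; `set_frame_qp` is an illustrative helper (not part of this change), and `encoder` is assumed to be an initialized VP9 `vpx_codec_ctx_t`:

```c
#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Sketch: request a fixed quantizer for the next frame of a one-pass
 * encode via the codec control added in libvpx v1.13.0. */
static vpx_codec_err_t set_frame_qp(vpx_codec_ctx_t *encoder, int qp) {
  return vpx_codec_control(encoder, VP9E_SET_QUANTIZER_ONE_PASS, qp);
}
```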


@ -1,4 +1,4 @@
v1.12.0 Torrent Duck
v1.13.0 Ugly Duckling
Welcome to the WebM VP8/VP9 Codec SDK!

media/libvpx/libvpx/configure (vendored)

@ -293,6 +293,7 @@ EXPERIMENT_LIST="
emulate_hardware
non_greedy_mv
rate_ctrl
collect_component_timing
"
CONFIG_LIST="
dependency_tracking
@ -342,7 +343,6 @@ CONFIG_LIST="
multi_res_encoding
temporal_denoising
vp9_temporal_denoising
consistent_recode
coefficient_range_checking
vp9_highbitdepth
better_hw_compatibility
@ -406,7 +406,6 @@ CMDLINE_SELECT="
multi_res_encoding
temporal_denoising
vp9_temporal_denoising
consistent_recode
coefficient_range_checking
better_hw_compatibility
vp9_highbitdepth


@ -312,8 +312,8 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
# To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current
# SO_VERSION_* then follow the rules in the link to detemine the new version
# (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1
SO_VERSION_MAJOR := 7
SO_VERSION_MINOR := 1
SO_VERSION_MAJOR := 8
SO_VERSION_MINOR := 0
SO_VERSION_PATCH := 0
ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib


@ -260,5 +260,11 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_sse2>));
#endif // HAVE_SSE2
#if HAVE_NEON
INSTANTIATE_TEST_SUITE_P(
NEON, AvgPredTestHBD,
::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_neon>));
#endif // HAVE_NEON
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace


@ -1572,6 +1572,10 @@ INSTANTIATE_TEST_SUITE_P(
12),
SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon,
12),
SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_neon,
12),
SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_neon,
12),
SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon,
10),
SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon,
@ -1594,6 +1598,10 @@ INSTANTIATE_TEST_SUITE_P(
10),
SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon,
10),
SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_neon,
10),
SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_neon,
10),
SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon,
8),
SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon,
@ -1613,7 +1621,9 @@ INSTANTIATE_TEST_SUITE_P(
SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon,
8),
SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8),
SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon,
SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8),
SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_neon, 8),
SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_neon,
8)));
INSTANTIATE_TEST_SUITE_P(
@ -1652,6 +1662,12 @@ INSTANTIATE_TEST_SUITE_P(
SubpelAvgVarianceParams(3, 2,
&vpx_highbd_12_sub_pixel_avg_variance8x4_neon,
12),
SubpelAvgVarianceParams(2, 3,
&vpx_highbd_12_sub_pixel_avg_variance4x8_neon,
12),
SubpelAvgVarianceParams(2, 2,
&vpx_highbd_12_sub_pixel_avg_variance4x4_neon,
12),
SubpelAvgVarianceParams(6, 6,
&vpx_highbd_10_sub_pixel_avg_variance64x64_neon,
10),
@ -1685,6 +1701,12 @@ INSTANTIATE_TEST_SUITE_P(
SubpelAvgVarianceParams(3, 2,
&vpx_highbd_10_sub_pixel_avg_variance8x4_neon,
10),
SubpelAvgVarianceParams(2, 3,
&vpx_highbd_10_sub_pixel_avg_variance4x8_neon,
10),
SubpelAvgVarianceParams(2, 2,
&vpx_highbd_10_sub_pixel_avg_variance4x4_neon,
10),
SubpelAvgVarianceParams(6, 6,
&vpx_highbd_8_sub_pixel_avg_variance64x64_neon,
8),
@ -1717,6 +1739,12 @@ INSTANTIATE_TEST_SUITE_P(
8),
SubpelAvgVarianceParams(3, 2,
&vpx_highbd_8_sub_pixel_avg_variance8x4_neon,
8),
SubpelAvgVarianceParams(2, 3,
&vpx_highbd_8_sub_pixel_avg_variance4x8_neon,
8),
SubpelAvgVarianceParams(2, 2,
&vpx_highbd_8_sub_pixel_avg_variance4x4_neon,
8)));
#endif // CONFIG_VP9_HIGHBITDEPTH


@ -127,14 +127,15 @@ class Vp8RcInterfaceTest
encoder->Control(VP8E_SET_CPUUSED, -6);
encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1);
encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000);
} else if (frame_params_.frame_type == INTER_FRAME) {
} else if (frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) {
// Disable golden frame update.
frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
}
}
frame_params_.frame_type =
video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME;
frame_params_.frame_type = video->frame() % key_interval_ == 0
? libvpx::RcFrameType::kKeyFrame
: libvpx::RcFrameType::kInterFrame;
encoder_exit_ = video->frame() == test_video_.frames;
}


@ -41,7 +41,7 @@ constexpr int kDefaultMaxGfInterval = 16;
constexpr int kReadMinGfInterval = 5;
constexpr int kReadMaxGfInterval = 13;
const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv";
const double kPsnrThreshold = 30.50;
const double kPsnrThreshold = 30.4;
struct ToyRateCtrl {
int magic_number;


@ -57,9 +57,11 @@ class RcInterfaceTest
encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000);
encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1);
}
frame_params_.frame_type =
video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME;
if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) {
frame_params_.frame_type = video->frame() % key_interval_ == 0
? libvpx::RcFrameType::kKeyFrame
: libvpx::RcFrameType::kInterFrame;
if (rc_cfg_.rc_mode == VPX_CBR &&
frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) {
// Disable golden frame update.
frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
@ -183,8 +185,9 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
encoder->Control(VP9E_SET_SVC, 1);
encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
}
frame_params_.frame_type =
video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME;
frame_params_.frame_type = video->frame() % key_interval_ == 0
? libvpx::RcFrameType::kKeyFrame
: libvpx::RcFrameType::kInterFrame;
encoder_exit_ = video->frame() == kNumFrames;
current_superframe_ = video->frame();
if (dynamic_spatial_layers_ == 1) {
@ -247,7 +250,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
else
frame_params_.temporal_layer_id = 0;
rc_api_->ComputeQP(frame_params_);
frame_params_.frame_type = INTER_FRAME;
frame_params_.frame_type = libvpx::RcFrameType::kInterFrame;
rc_api_->PostEncodeUpdate(sizes_[sl]);
}
}


@ -26,13 +26,6 @@ struct VP8_COMP;
/* Create/destroy static data structures. */
typedef enum {
NORMAL = 0,
FOURFIVE = 1,
THREEFIVE = 2,
ONETWO = 3
} VPX_SCALING;
typedef enum {
USAGE_LOCAL_FILE_PLAYBACK = 0x0,
USAGE_STREAM_FROM_SERVER = 0x1,
@ -58,19 +51,19 @@ typedef enum {
#include <assert.h>
static INLINE void Scale2Ratio(int mode, int *hr, int *hs) {
switch (mode) {
case NORMAL:
case VP8E_NORMAL:
*hr = 1;
*hs = 1;
break;
case FOURFIVE:
case VP8E_FOURFIVE:
*hr = 4;
*hs = 5;
break;
case THREEFIVE:
case VP8E_THREEFIVE:
*hr = 3;
*hs = 5;
break;
case ONETWO:
case VP8E_ONETWO:
*hr = 1;
*hs = 2;
break;
@ -273,8 +266,8 @@ int vp8_set_roimap(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows,
unsigned int threshold[4]);
int vp8_set_active_map(struct VP8_COMP *cpi, unsigned char *map,
unsigned int rows, unsigned int cols);
int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING horiz_mode,
VPX_SCALING vert_mode);
int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode,
VPX_SCALING_MODE vert_mode);
int vp8_get_quantizer(struct VP8_COMP *cpi);
#ifdef __cplusplus
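For context, the renamed VP8E_* constants above are the public scaling-mode enum that reaches this code through the VP8E_SET_SCALEMODE codec control. A minimal sketch of how an application might request internal downscaling follows; `request_downscale` is an illustrative helper and the chosen ratios are only an example:

```c
#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

/* Sketch: ask the encoder to code frames at 3/5 width and 1/2 height.
 * The VP8E_* constants replace the removed internal
 * NORMAL/FOURFIVE/THREEFIVE/ONETWO enum used by Scale2Ratio(). */
static vpx_codec_err_t request_downscale(vpx_codec_ctx_t *encoder) {
  vpx_scaling_mode_t mode;
  mode.h_scaling_mode = VP8E_THREEFIVE; /* Scale2Ratio(): hr = 3, hs = 5 */
  mode.v_scaling_mode = VP8E_ONETWO;    /* Scale2Ratio(): hr = 1, hs = 2 */
  return vpx_codec_control(encoder, VP8E_SET_SCALEMODE, &mode);
}
```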


@ -92,8 +92,7 @@ typedef struct macroblock {
signed int last_act_zbin_adj;
int *mvcost[2];
/* MSVC generates code that thinks this is 16-byte aligned */
DECLARE_ALIGNED(16, int*, mvsadcost[2]);
int *mvsadcost[2];
int (*mbmode_cost)[MB_MODE_COUNT];
int (*intra_uv_mode_cost)[MB_MODE_COUNT];
int (*bmode_costs)[10][10];


@ -2990,8 +2990,8 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
}
/* Set back to unscaled by defaults */
cpi->common.horiz_scale = NORMAL;
cpi->common.vert_scale = NORMAL;
cpi->common.horiz_scale = VP8E_NORMAL;
cpi->common.vert_scale = VP8E_NORMAL;
/* Calculate Average bits per frame. */
av_bits_per_frame = cpi->oxcf.target_bandwidth /


@ -1667,7 +1667,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
cm->sharpness_level = cpi->oxcf.Sharpness;
if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) {
if (cm->horiz_scale != VP8E_NORMAL || cm->vert_scale != VP8E_NORMAL) {
int hr, hs, vr, vs;
Scale2Ratio(cm->horiz_scale, &hr, &hs);
@ -2504,15 +2504,17 @@ static int resize_key_frame(VP8_COMP *cpi) {
if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark *
cpi->oxcf.optimal_buffer_level / 100)) {
cm->horiz_scale =
(cm->horiz_scale < ONETWO) ? cm->horiz_scale + 1 : ONETWO;
cm->vert_scale = (cm->vert_scale < ONETWO) ? cm->vert_scale + 1 : ONETWO;
(cm->horiz_scale < VP8E_ONETWO) ? cm->horiz_scale + 1 : VP8E_ONETWO;
cm->vert_scale =
(cm->vert_scale < VP8E_ONETWO) ? cm->vert_scale + 1 : VP8E_ONETWO;
}
/* Should we now start scaling back up */
else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark *
cpi->oxcf.optimal_buffer_level / 100)) {
cm->horiz_scale =
(cm->horiz_scale > NORMAL) ? cm->horiz_scale - 1 : NORMAL;
cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL;
(cm->horiz_scale > VP8E_NORMAL) ? cm->horiz_scale - 1 : VP8E_NORMAL;
cm->vert_scale =
(cm->vert_scale > VP8E_NORMAL) ? cm->vert_scale - 1 : VP8E_NORMAL;
}
/* Get the new height and width */
@ -5380,15 +5382,15 @@ int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows,
}
}
int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING horiz_mode,
VPX_SCALING vert_mode) {
if (horiz_mode <= ONETWO) {
int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode,
VPX_SCALING_MODE vert_mode) {
if (horiz_mode <= VP8E_ONETWO) {
cpi->common.horiz_scale = horiz_mode;
} else {
return -1;
}
if (vert_mode <= ONETWO) {
if (vert_mode <= VP8E_ONETWO) {
cpi->common.vert_scale = vert_mode;
} else {
return -1;


@ -947,19 +947,10 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
if (img != NULL) {
res = image2yuvconfig(img, &sd);
if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) {
/* from vpx_encoder.h for g_w/g_h:
"Note that the frames passed as input to the encoder must have this
resolution"
*/
ctx->base.err_detail = "Invalid input frame resolution";
res = VPX_CODEC_INVALID_PARAM;
} else {
if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags,
&sd, dst_time_stamp, dst_end_time_stamp)) {
VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
res = update_error_state(ctx, &cpi->common.error);
}
if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, &sd,
dst_time_stamp, dst_end_time_stamp)) {
VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
res = update_error_state(ctx, &cpi->common.error);
}
/* reset for next frame */
@ -1233,8 +1224,8 @@ static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx,
if (data) {
int res;
vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data;
res = vp8_set_internal_size(ctx->cpi, (VPX_SCALING)scalemode.h_scaling_mode,
(VPX_SCALING)scalemode.v_scaling_mode);
res = vp8_set_internal_size(ctx->cpi, scalemode.h_scaling_mode,
scalemode.v_scaling_mode);
if (!res) {
/*force next frame a key frame to effect scaling mode */


@ -10,7 +10,9 @@
#include <math.h>
#include <new>
#include "vp8/common/common.h"
#include "vp8/vp8_ratectrl_rtc.h"
#include "vp8/encoder/onyx_int.h"
#include "vp8/encoder/ratectrl.h"
#include "vpx_ports/system_state.h"
@ -65,6 +67,13 @@ std::unique_ptr<VP8RateControlRTC> VP8RateControlRTC::Create(
return rc_api;
}
VP8RateControlRTC::~VP8RateControlRTC() {
if (cpi_) {
vpx_free(cpi_->gf_active_flags);
vpx_free(cpi_);
}
}
void VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) {
VP8_COMMON *cm = &cpi_->common;
VP8_CONFIG *oxcf = &cpi_->oxcf;
@ -203,7 +212,7 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) {
vp8_restore_layer_context(cpi_, layer);
vp8_new_framerate(cpi_, cpi_->layer_context[layer].framerate);
}
cm->frame_type = frame_params.frame_type;
cm->frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type);
cm->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
cm->refresh_alt_ref_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
if (cm->frame_type == KEY_FRAME && cpi_->common.current_video_frame > 0) {


@ -12,23 +12,24 @@
#define VPX_VP8_RATECTRL_RTC_H_
#include <cstdint>
#include <cstring>
#include <memory>
#include "vp8/encoder/onyx_int.h"
#include "vp8/common/common.h"
#include "vpx/internal/vpx_ratectrl_rtc.h"
struct VP8_COMP;
namespace libvpx {
struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig {
public:
VP8RateControlRtcConfig() {
vp8_zero(layer_target_bitrate);
vp8_zero(ts_rate_decimator);
memset(&layer_target_bitrate, 0, sizeof(layer_target_bitrate));
memset(&ts_rate_decimator, 0, sizeof(ts_rate_decimator));
}
};
struct VP8FrameParamsQpRTC {
FRAME_TYPE frame_type;
RcFrameType frame_type;
int temporal_layer_id;
};
@ -36,12 +37,7 @@ class VP8RateControlRTC {
public:
static std::unique_ptr<VP8RateControlRTC> Create(
const VP8RateControlRtcConfig &cfg);
~VP8RateControlRTC() {
if (cpi_) {
vpx_free(cpi_->gf_active_flags);
vpx_free(cpi_);
}
}
~VP8RateControlRTC();
void UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg);
// GetQP() needs to be called after ComputeQP() to get the latest QP
@ -54,7 +50,7 @@ class VP8RateControlRTC {
private:
VP8RateControlRTC() {}
void InitRateControl(const VP8RateControlRtcConfig &cfg);
VP8_COMP *cpi_;
struct VP8_COMP *cpi_;
int q_;
};


@ -220,7 +220,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
// Look up the component cost of the residual motion vector
{
uint32_t cost[4];
int16_t __attribute__((aligned(16))) rowcol[8];
DECLARE_ALIGNED(16, int16_t, rowcol[8]);
vst1q_s16(rowcol, v_diff_mv_w);
// Note: This is a use case for gather instruction


@ -1980,6 +1980,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
int64_t best_rd = INT64_MAX;
vpx_clear_system_state();
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, rd_pick_sb_modes_time);
#endif
// Use the lower precision, but faster, 32x32 fdct for mode selection.
x->use_lp32x32fdct = 1;
@ -2047,15 +2050,27 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
} else {
if (bsize >= BLOCK_8X8) {
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, vp9_rd_pick_inter_mode_sb_time);
#endif
if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP))
vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
ctx, best_rd);
else
vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
bsize, ctx, best_rd);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, vp9_rd_pick_inter_mode_sb_time);
#endif
} else {
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time);
#endif
vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost,
bsize, ctx, best_rd);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time);
#endif
}
}
@ -2078,6 +2093,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
ctx->rate = rd_cost->rate;
ctx->dist = rd_cost->dist;
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, rd_pick_sb_modes_time);
#endif
}
#endif // !CONFIG_REALTIME_ONLY
@ -4411,8 +4429,14 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
if (should_encode_sb && pc_tree->index != 3) {
int output_enabled = (bsize == BLOCK_64X64);
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, encode_sb_time);
#endif
encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
pc_tree);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, encode_sb_time);
#endif
#if CONFIG_RATE_CTRL
if (oxcf->use_simple_encode_api) {
// Store partition, motion vector of the superblock.
@ -4539,8 +4563,15 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td,
&x->min_partition_size, &x->max_partition_size);
}
td->pc_root->none.rdcost = 0;
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, rd_pick_partition_time);
#endif
rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64,
&dummy_rdc, dummy_rdc, td->pc_root);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, rd_pick_partition_time);
#endif
}
(*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row,
sb_col_in_tile, num_sb_cols);
@ -5810,14 +5841,7 @@ void vp9_init_tile_data(VP9_COMP *cpi) {
for (i = 0; i < BLOCK_SIZES; ++i) {
for (j = 0; j < MAX_MODES; ++j) {
tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT;
#if CONFIG_RATE_CTRL
if (cpi->oxcf.use_simple_encode_api) {
tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT;
}
#endif // CONFIG_RATE_CTRL
#if CONFIG_CONSISTENT_RECODE
tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT;
#endif // CONFIG_CONSISTENT_RECODE
tile_data->mode_map[i][j] = j;
}
}
@ -6037,9 +6061,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4;
#endif // CONFIG_VP9_HIGHBITDEPTH
x->inv_txfm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
#if CONFIG_CONSISTENT_RECODE
x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1;
#endif
if (xd->lossless) x->optimize = 0;
x->sharpness = cpi->oxcf.sharpness;
x->adjust_rdmult_by_segment = (cpi->oxcf.aq_mode == VARIANCE_AQ);
@ -6184,13 +6206,11 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) {
return sum_delta / (cm->mi_rows * cm->mi_cols);
}
#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
static void restore_encode_params(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
int tile_col, tile_row;
int tile_idx;
int i, j;
TileDataEnc *tile_data;
RD_OPT *rd_opt = &cpi->rd;
for (i = 0; i < MAX_REF_FRAMES; i++) {
for (j = 0; j < REFERENCE_MODES; j++)
@ -6201,35 +6221,19 @@ static void restore_encode_params(VP9_COMP *cpi) {
rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j];
}
if (cpi->tile_data != NULL) {
for (tile_row = 0; tile_row < tile_rows; ++tile_row)
for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
TileDataEnc *tile_data =
&cpi->tile_data[tile_row * tile_cols + tile_col];
for (i = 0; i < BLOCK_SIZES; ++i) {
for (j = 0; j < MAX_MODES; ++j) {
tile_data->thresh_freq_fact[i][j] =
tile_data->thresh_freq_fact_prev[i][j];
}
}
}
for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) {
assert(cpi->tile_data);
tile_data = &cpi->tile_data[tile_idx];
vp9_copy(tile_data->thresh_freq_fact, tile_data->thresh_freq_fact_prev);
}
cm->interp_filter = cpi->sf.default_interp_filter;
}
#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
void vp9_encode_frame(VP9_COMP *cpi) {
VP9_COMMON *const cm = &cpi->common;
#if CONFIG_RATE_CTRL
if (cpi->oxcf.use_simple_encode_api) {
restore_encode_params(cpi);
}
#endif // CONFIG_RATE_CTRL
#if CONFIG_CONSISTENT_RECODE
restore_encode_params(cpi);
#endif
#if CONFIG_MISMATCH_DEBUG
mismatch_reset_frame(MAX_MB_PLANE);
@ -6283,7 +6287,13 @@ void vp9_encode_frame(VP9_COMP *cpi) {
if (cm->interp_filter == SWITCHABLE)
cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref);
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, encode_frame_internal_time);
#endif
encode_frame_internal(cpi);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, encode_frame_internal_time);
#endif
for (i = 0; i < REFERENCE_MODES; ++i)
mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;

The diff for this file is not shown because it is too large.


@ -90,13 +90,6 @@ typedef enum {
ENCODE_BREAKOUT_LIMITED = 2
} ENCODE_BREAKOUT_TYPE;
typedef enum {
NORMAL = 0,
FOURFIVE = 1,
THREEFIVE = 2,
ONETWO = 3
} VPX_SCALING;
typedef enum {
// Good Quality Fast Encoding. The encoder balances quality with the amount of
// time it takes to encode the output. Speed setting controls how fast.
@ -336,9 +329,7 @@ typedef struct TplDepFrame {
typedef struct TileDataEnc {
TileInfo tile_info;
int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES];
#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
int8_t mode_map[BLOCK_SIZES][MAX_MODES];
FIRSTPASS_DATA fp_data;
VP9RowMTSync row_mt_sync;
@ -659,6 +650,72 @@ static INLINE int get_num_unit_4x4(int size) { return (size + 3) >> 2; }
static INLINE int get_num_unit_16x16(int size) { return (size + 15) >> 4; }
#endif // CONFIG_RATE_CTRL
#if CONFIG_COLLECT_COMPONENT_TIMING
#include "vpx_ports/vpx_timer.h"
// Adjust the following to add new components.
typedef enum {
vp9_get_compressed_data_time,
vp9_temporal_filter_time,
vp9_rc_get_second_pass_params_time,
setup_tpl_stats_time,
Pass2Encode_time,
encode_with_recode_loop_time,
loopfilter_frame_time,
vp9_pack_bitstream_time,
encode_frame_internal_time,
rd_pick_partition_time,
rd_pick_sb_modes_time,
encode_sb_time,
vp9_rd_pick_inter_mode_sb_time,
vp9_rd_pick_inter_mode_sub8x8_time,
intra_mode_search_time,
handle_inter_mode_time,
single_motion_search_time,
joint_motion_search_time,
interp_filter_time,
kTimingComponents,
} TIMING_COMPONENT;
static INLINE char const *get_component_name(int index) {
switch (index) {
case vp9_get_compressed_data_time: return "vp9_get_compressed_data_time";
case vp9_temporal_filter_time: return "vp9_temporal_filter_time";
case vp9_rc_get_second_pass_params_time:
return "vp9_rc_get_second_pass_params_time";
case setup_tpl_stats_time: return "setup_tpl_stats_time";
case Pass2Encode_time: return "Pass2Encode_time";
case encode_with_recode_loop_time: return "encode_with_recode_loop_time";
case loopfilter_frame_time: return "loopfilter_frame_time";
case vp9_pack_bitstream_time: return "vp9_pack_bitstream_time";
case encode_frame_internal_time: return "encode_frame_internal_time";
case rd_pick_partition_time: return "rd_pick_partition_time";
case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time";
case encode_sb_time: return "encode_sb_time";
case vp9_rd_pick_inter_mode_sb_time:
return "vp9_rd_pick_inter_mode_sb_time";
case vp9_rd_pick_inter_mode_sub8x8_time:
return "vp9_rd_pick_inter_mode_sub8x8_time";
case intra_mode_search_time: return "intra_mode_search_time";
case handle_inter_mode_time: return "handle_inter_mode_time";
case single_motion_search_time: return "single_motion_search_time";
case joint_motion_search_time: return "joint_motion_search_time";
case interp_filter_time: return "interp_filter_time";
default: assert(0);
}
return "error";
}
#endif
typedef struct VP9_COMP {
FRAME_INFO frame_info;
QUANTS quants;
@ -973,6 +1030,22 @@ typedef struct VP9_COMP {
EXT_RATECTRL ext_ratectrl;
int fixed_qp_onepass;
#if CONFIG_COLLECT_COMPONENT_TIMING
/*!
* component_time[] are initialized to zero while encoder starts.
*/
uint64_t component_time[kTimingComponents];
/*!
* Stores timing for individual components between calls of start_timing()
* and end_timing().
*/
struct vpx_usec_timer component_timer[kTimingComponents];
/*!
* frame_component_time[] are initialized to zero at beginning of each frame.
*/
uint64_t frame_component_time[kTimingComponents];
#endif
} VP9_COMP;
#if CONFIG_RATE_CTRL
@ -1154,8 +1227,8 @@ int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
int cols);
int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode,
VPX_SCALING vert_mode);
int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode,
VPX_SCALING_MODE vert_mode);
int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
unsigned int height);
@ -1392,6 +1465,38 @@ int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr);
#define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) {
RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx];
if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows ||
new_fb_ptr->mi_cols < cm->mi_cols) {
vpx_free(new_fb_ptr->mvs);
CHECK_MEM_ERROR(cm, new_fb_ptr->mvs,
(MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols,
sizeof(*new_fb_ptr->mvs)));
new_fb_ptr->mi_rows = cm->mi_rows;
new_fb_ptr->mi_cols = cm->mi_cols;
}
}
#if CONFIG_COLLECT_COMPONENT_TIMING
static INLINE void start_timing(VP9_COMP *cpi, int component) {
vpx_usec_timer_start(&cpi->component_timer[component]);
}
static INLINE void end_timing(VP9_COMP *cpi, int component) {
vpx_usec_timer_mark(&cpi->component_timer[component]);
cpi->frame_component_time[component] +=
vpx_usec_timer_elapsed(&cpi->component_timer[component]);
}
static INLINE char const *get_frame_type_enum(int type) {
switch (type) {
case 0: return "KEY_FRAME";
case 1: return "INTER_FRAME";
default: assert(0);
}
return "error";
}
#endif
#ifdef __cplusplus
} // extern "C"
#endif
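To make the "Adjust the following to add new components" note above concrete, here is a sketch of how a new component would be instrumented. `my_new_component_time` and `do_work()` are hypothetical names, not part of this change; the snippet assumes it sits inside encoder code where `cpi` is in scope, mirroring the existing usages in this patch:

```c
/* 1. Add my_new_component_time to the TIMING_COMPONENT enum (before
 *    kTimingComponents) and to get_component_name().
 * 2. Bracket the code to be measured; the elapsed time is accumulated
 *    into cpi->frame_component_time[my_new_component_time]. */
#if CONFIG_COLLECT_COMPONENT_TIMING
  start_timing(cpi, my_new_component_time);
#endif
  do_work(cpi); /* hypothetical work being timed */
#if CONFIG_COLLECT_COMPONENT_TIMING
  end_timing(cpi, my_new_component_time);
#endif
```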


@ -121,11 +121,9 @@ typedef struct RD_OPT {
int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES];
int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
int RDMULT;
int RDDIV;
double r0;


@ -2832,8 +2832,14 @@ static int64_t handle_inter_mode(
frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, joint_motion_search_time);
#endif
joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col,
single_newmv, &rate_mv);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, joint_motion_search_time);
#endif
} else {
rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
&x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
@ -2845,7 +2851,13 @@ static int64_t handle_inter_mode(
*rate2 += rate_mv;
} else {
int_mv tmp_mv;
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, single_motion_search_time);
#endif
single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, single_motion_search_time);
#endif
if (tmp_mv.as_int == INVALID_MV) return INT64_MAX;
frame_mv[refs[0]].as_int = xd->mi[0]->bmi[0].as_mv[0].as_int =
@ -2908,6 +2920,9 @@ static int64_t handle_inter_mode(
intpel_mv = !mv_has_subpel(&mi->mv[0].as_mv);
if (is_comp_pred) intpel_mv &= !mv_has_subpel(&mi->mv[1].as_mv);
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, interp_filter_time);
#endif
// Search for best switchable filter by checking the variance of
// pred error irrespective of whether the filter will be used
for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX;
@ -3005,6 +3020,9 @@ static int64_t handle_inter_mode(
restore_dst_buf(xd, orig_dst, orig_dst_stride);
}
}
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, interp_filter_time);
#endif
// Set the appropriate filter
mi->interp_filter =
cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter;
@ -3707,19 +3725,30 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
if (ref_frame == INTRA_FRAME) {
TX_SIZE uv_tx;
struct macroblockd_plane *const pd = &xd->plane[1];
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, intra_mode_search_time);
#endif
memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize,
best_rd, recon);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, intra_mode_search_time);
#endif
if (rate_y == INT_MAX) continue;
uv_tx = uv_txsize_lookup[bsize][mi->tx_size][pd->subsampling_x]
[pd->subsampling_y];
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, intra_mode_search_time);
#endif
if (rate_uv_intra[uv_tx] == INT_MAX) {
choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx],
&rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx],
&skip_uv[uv_tx], &mode_uv[uv_tx]);
}
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, intra_mode_search_time);
#endif
rate_uv = rate_uv_tokenonly[uv_tx];
distortion_uv = dist_uv[uv_tx];
skippable = skippable && skip_uv[uv_tx];
@ -3730,11 +3759,17 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
rate2 += intra_cost_penalty;
distortion2 = distortion_y + distortion_uv;
} else {
#if CONFIG_COLLECT_COMPONENT_TIMING
start_timing(cpi, handle_inter_mode_time);
#endif
this_rd = handle_inter_mode(
cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv,
recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv,
single_inter_filter, single_skippable, &total_sse, best_rd,
&mask_filter, filter_cache);
#if CONFIG_COLLECT_COMPONENT_TIMING
end_timing(cpi, handle_inter_mode_time);
#endif
if (this_rd == INT64_MAX) continue;
compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
@ -3970,13 +4005,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
}
if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
// If adaptive interp filter is enabled, then the current leaf node of 8x8
// data is needed for sub8x8. Hence preserve the context.
#if CONFIG_CONSISTENT_RECODE
// If adaptive interp filter is enabled, then the current leaf node of 8x8
// data is needed for sub8x8. Hence preserve the context.
if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0];
#else
if (cpi->row_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0];
#endif
rd_cost->rate = INT_MAX;
rd_cost->rdcost = INT64_MAX;
return;


@ -16,8 +16,11 @@
#include "vpx_dsp/vpx_dsp_common.h"
// Mesh search patters for various speed settings
static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = {
{ 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 }
// Define 2 mesh density levels for FC_GRAPHICS_ANIMATION content type and non
// FC_GRAPHICS_ANIMATION content type.
static MESH_PATTERN best_quality_mesh_pattern[2][MAX_MESH_STEP] = {
{ { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } },
{ { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } },
};
#if !CONFIG_REALTIME_ONLY
@ -209,15 +212,18 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
const int boosted = frame_is_boosted(cpi);
int i;
sf->tx_size_search_breakout = 1;
sf->adaptive_pred_interp_filter = 1;
sf->adaptive_rd_thresh = 1;
sf->adaptive_rd_thresh_row_mt = 0;
sf->allow_skip_recode = 1;
sf->less_rectangular_check = 1;
sf->use_square_partition_only = !boosted;
sf->mv.auto_mv_step_size = 1;
sf->prune_ref_frame_for_rect_partitions = 1;
sf->rd_ml_partition.var_pruning = 1;
sf->temporal_filter_search_method = NSTEP;
sf->tx_size_search_breakout = 1;
sf->use_square_partition_only = !boosted;
sf->rd_ml_partition.var_pruning = 1;
sf->rd_ml_partition.prune_rect_thresh[0] = -1;
sf->rd_ml_partition.prune_rect_thresh[1] = 350;
sf->rd_ml_partition.prune_rect_thresh[2] = 325;
@ -238,7 +244,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
}
if (speed >= 1) {
sf->temporal_filter_search_method = NSTEP;
sf->rd_ml_partition.var_pruning = !boosted;
sf->rd_ml_partition.prune_rect_thresh[1] = 225;
sf->rd_ml_partition.prune_rect_thresh[2] = 225;
@ -263,11 +268,9 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
sf->less_rectangular_check = 1;
sf->use_rd_breakout = 1;
sf->adaptive_motion_search = 1;
sf->mv.auto_mv_step_size = 1;
sf->adaptive_rd_thresh = 2;
sf->mv.subpel_search_level = 1;
if (cpi->oxcf.content != VP9E_CONTENT_FILM) sf->mode_skip_start = 10;
sf->adaptive_pred_interp_filter = 1;
sf->allow_acl = 0;
sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V;
@ -991,10 +994,14 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) {
sf->exhaustive_searches_thresh =
(cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20)
: INT_MAX;
if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) {
{
const int mesh_density_level =
(cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? 0 : 1;
for (i = 0; i < MAX_MESH_STEP; ++i) {
sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range;
sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval;
sf->mesh_patterns[i].range =
best_quality_mesh_pattern[mesh_density_level][i].range;
sf->mesh_patterns[i].interval =
best_quality_mesh_pattern[mesh_density_level][i].interval;
}
}

The diff for this file is not shown because it is too large.


@ -0,0 +1,44 @@
/*
* Copyright (c) 2023 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
#define VPX_VP9_ENCODER_VP9_TPL_MODEL_H_
#ifdef __cplusplus
extern "C" {
#endif
#ifndef M_LOG2_E
#define M_LOG2_E 0.693147180559945309417
#endif
#define log2f(x) (log(x) / (float)M_LOG2_E)
typedef struct GF_PICTURE {
YV12_BUFFER_CONFIG *frame;
int ref_frame[3];
FRAME_UPDATE_TYPE update_type;
} GF_PICTURE;
void vp9_init_tpl_buffer(VP9_COMP *cpi);
void vp9_setup_tpl_stats(VP9_COMP *cpi);
void vp9_free_tpl_buffer(VP9_COMP *cpi);
void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
TX_SIZE tx_size);
#if CONFIG_VP9_HIGHBITDEPTH
void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
TX_SIZE tx_size);
#endif
#ifdef __cplusplus
} // extern "C"
#endif
#endif // VPX_VP9_ENCODER_VP9_TPL_MODEL_H_


@ -48,6 +48,29 @@ std::unique_ptr<VP9RateControlRTC> VP9RateControlRTC::Create(
return rc_api;
}
VP9RateControlRTC::~VP9RateControlRTC() {
if (cpi_) {
if (cpi_->svc.number_spatial_layers > 1 ||
cpi_->svc.number_temporal_layers > 1) {
for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) {
for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) {
int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers);
LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer];
vpx_free(lc->map);
vpx_free(lc->last_coded_q_map);
vpx_free(lc->consec_zero_mv);
}
}
}
if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
vpx_free(cpi_->segmentation_map);
cpi_->segmentation_map = NULL;
vp9_cyclic_refresh_free(cpi_->cyclic_refresh);
}
vpx_free(cpi_);
}
}
void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) {
VP9_COMMON *cm = &cpi_->common;
VP9EncoderConfig *oxcf = &cpi_->oxcf;
@ -157,7 +180,7 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) {
cm->height = height;
}
vp9_set_mb_mi(cm, cm->width, cm->height);
cm->frame_type = frame_params.frame_type;
cm->frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type);
// This is needed to ensure key frame does not get unset in rc_get_svc_params.
cpi_->frame_flags = (cm->frame_type == KEY_FRAME) ? FRAMEFLAGS_KEY : 0;
cpi_->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;


@ -19,14 +19,14 @@
#include "vp9/common/vp9_onyxc_int.h"
#include "vp9/vp9_iface_common.h"
#include "vp9/encoder/vp9_aq_cyclicrefresh.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_firstpass.h"
#include "vp9/vp9_cx_iface.h"
#include "vpx/internal/vpx_ratectrl_rtc.h"
#include "vpx_mem/vpx_mem.h"
namespace libvpx {
struct VP9_COMP;
namespace libvpx {
struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
public:
VP9RateControlRtcConfig() {
@ -53,7 +53,7 @@ struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
};
struct VP9FrameParamsQpRTC {
FRAME_TYPE frame_type;
RcFrameType frame_type;
int spatial_layer_id;
int temporal_layer_id;
};
@ -90,28 +90,7 @@ class VP9RateControlRTC {
public:
static std::unique_ptr<VP9RateControlRTC> Create(
const VP9RateControlRtcConfig &cfg);
~VP9RateControlRTC() {
if (cpi_) {
if (cpi_->svc.number_spatial_layers > 1 ||
cpi_->svc.number_temporal_layers > 1) {
for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) {
for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) {
int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers);
LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer];
vpx_free(lc->map);
vpx_free(lc->last_coded_q_map);
vpx_free(lc->consec_zero_mv);
}
}
}
if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
vpx_free(cpi_->segmentation_map);
cpi_->segmentation_map = NULL;
vp9_cyclic_refresh_free(cpi_->cyclic_refresh);
}
vpx_free(cpi_);
}
}
~VP9RateControlRTC();
void UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg);
// GetQP() needs to be called after ComputeQP() to get the latest QP
@ -125,7 +104,7 @@ class VP9RateControlRTC {
private:
VP9RateControlRTC() {}
void InitRateControl(const VP9RateControlRtcConfig &cfg);
VP9_COMP *cpi_;
struct VP9_COMP *cpi_;
};
} // namespace libvpx


@ -1372,22 +1372,13 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
timebase_units_to_ticks(timestamp_ratio, pts + duration);
res = image2yuvconfig(img, &sd);
if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) {
/* from vpx_encoder.h for g_w/g_h:
"Note that the frames passed as input to the encoder must have this
resolution"
*/
ctx->base.err_detail = "Invalid input frame resolution";
res = VPX_CODEC_INVALID_PARAM;
} else {
// Store the original flags in to the frame buffer. Will extract the
// key frame flag when we actually encode this frame.
if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
// Store the original flags in to the frame buffer. Will extract the
// key frame flag when we actually encode this frame.
if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
dst_time_stamp, dst_end_time_stamp)) {
res = update_error_state(ctx, &cpi->common.error);
}
ctx->next_frame_flags = 0;
res = update_error_state(ctx, &cpi->common.error);
}
ctx->next_frame_flags = 0;
}
cx_data = ctx->cx_data;
@ -1684,9 +1675,8 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx,
vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *);
if (mode) {
const int res =
vp9_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode,
(VPX_SCALING)mode->v_scaling_mode);
const int res = vp9_set_internal_size(ctx->cpi, mode->h_scaling_mode,
mode->v_scaling_mode);
return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM;
}
return VPX_CODEC_INVALID_PARAM;


@ -104,6 +104,8 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c
endif
VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.c
VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h
VP9_CX_SRCS-yes += encoder/vp9_tpl_model.c
VP9_CX_SRCS-yes += encoder/vp9_tpl_model.h
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h


@ -14,6 +14,9 @@
#include "vpx/vpx_encoder.h"
namespace libvpx {
enum class RcFrameType { kKeyFrame = 0, kInterFrame = 1 };
struct VpxRateControlRtcConfig {
public:
VpxRateControlRtcConfig() {


@ -165,8 +165,8 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
// Transpose top left and top right quarters into one contiguous location to
// process to the top half.
transpose_s16_8x8_new(&temp0[0], &temp2[0]);
transpose_s16_8x8_new(&temp1[0], &temp2[8]);
transpose_s16_8x8q(&temp0[0], &temp2[0]);
transpose_s16_8x8q(&temp1[0], &temp2[8]);
partial_round_shift(temp2);
cross_input(temp2, temp3);
vpx_fdct8x16_body(temp3, temp2);
@ -180,7 +180,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
// Transpose bottom left and bottom right quarters into one contiguous
// location to process to the bottom half.
transpose_s16_8x8_new(&temp0[8], &temp1[0]);
transpose_s16_8x8q(&temp0[8], &temp1[0]);
transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
&temp1[13], &temp1[14], &temp1[15]);


@ -60,10 +60,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
dct_body_first_pass(temp5, temp4);
// Generate the top row by munging the first set of 8 from each one together.
transpose_s16_8x8_new(&temp1[0], &temp0[0]);
transpose_s16_8x8_new(&temp2[0], &temp0[8]);
transpose_s16_8x8_new(&temp3[0], &temp0[16]);
transpose_s16_8x8_new(&temp4[0], &temp0[24]);
transpose_s16_8x8q(&temp1[0], &temp0[0]);
transpose_s16_8x8q(&temp2[0], &temp0[8]);
transpose_s16_8x8q(&temp3[0], &temp0[16]);
transpose_s16_8x8q(&temp4[0], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@ -78,10 +78,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
store(output, temp5);
// Second row of 8x32.
transpose_s16_8x8_new(&temp1[8], &temp0[0]);
transpose_s16_8x8_new(&temp2[8], &temp0[8]);
transpose_s16_8x8_new(&temp3[8], &temp0[16]);
transpose_s16_8x8_new(&temp4[8], &temp0[24]);
transpose_s16_8x8q(&temp1[8], &temp0[0]);
transpose_s16_8x8q(&temp2[8], &temp0[8]);
transpose_s16_8x8q(&temp3[8], &temp0[16]);
transpose_s16_8x8q(&temp4[8], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@ -96,10 +96,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
store(output + 8 * 32, temp5);
// Third row of 8x32
transpose_s16_8x8_new(&temp1[16], &temp0[0]);
transpose_s16_8x8_new(&temp2[16], &temp0[8]);
transpose_s16_8x8_new(&temp3[16], &temp0[16]);
transpose_s16_8x8_new(&temp4[16], &temp0[24]);
transpose_s16_8x8q(&temp1[16], &temp0[0]);
transpose_s16_8x8q(&temp2[16], &temp0[8]);
transpose_s16_8x8q(&temp3[16], &temp0[16]);
transpose_s16_8x8q(&temp4[16], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@ -114,10 +114,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
store(output + 16 * 32, temp5);
// Final row of 8x32.
transpose_s16_8x8_new(&temp1[24], &temp0[0]);
transpose_s16_8x8_new(&temp2[24], &temp0[8]);
transpose_s16_8x8_new(&temp3[24], &temp0[16]);
transpose_s16_8x8_new(&temp4[24], &temp0[24]);
transpose_s16_8x8q(&temp1[24], &temp0[0]);
transpose_s16_8x8q(&temp2[24], &temp0[8]);
transpose_s16_8x8q(&temp3[24], &temp0[16]);
transpose_s16_8x8q(&temp4[24], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@ -159,10 +159,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
dct_body_first_pass(temp5, temp4);
// Generate the top row by munging the first set of 8 from each one together.
transpose_s16_8x8_new(&temp1[0], &temp0[0]);
transpose_s16_8x8_new(&temp2[0], &temp0[8]);
transpose_s16_8x8_new(&temp3[0], &temp0[16]);
transpose_s16_8x8_new(&temp4[0], &temp0[24]);
transpose_s16_8x8q(&temp1[0], &temp0[0]);
transpose_s16_8x8q(&temp2[0], &temp0[8]);
transpose_s16_8x8q(&temp3[0], &temp0[16]);
transpose_s16_8x8q(&temp4[0], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
@ -177,10 +177,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
store(output, temp5);
// Second row of 8x32.
transpose_s16_8x8_new(&temp1[8], &temp0[0]);
transpose_s16_8x8_new(&temp2[8], &temp0[8]);
transpose_s16_8x8_new(&temp3[8], &temp0[16]);
transpose_s16_8x8_new(&temp4[8], &temp0[24]);
transpose_s16_8x8q(&temp1[8], &temp0[0]);
transpose_s16_8x8q(&temp2[8], &temp0[8]);
transpose_s16_8x8q(&temp3[8], &temp0[16]);
transpose_s16_8x8q(&temp4[8], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
@ -195,10 +195,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
store(output + 8 * 32, temp5);
// Third row of 8x32
transpose_s16_8x8_new(&temp1[16], &temp0[0]);
transpose_s16_8x8_new(&temp2[16], &temp0[8]);
transpose_s16_8x8_new(&temp3[16], &temp0[16]);
transpose_s16_8x8_new(&temp4[16], &temp0[24]);
transpose_s16_8x8q(&temp1[16], &temp0[0]);
transpose_s16_8x8q(&temp2[16], &temp0[8]);
transpose_s16_8x8q(&temp3[16], &temp0[16]);
transpose_s16_8x8q(&temp4[16], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
@ -213,10 +213,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
store(output + 16 * 32, temp5);
// Final row of 8x32.
transpose_s16_8x8_new(&temp1[24], &temp0[0]);
transpose_s16_8x8_new(&temp2[24], &temp0[8]);
transpose_s16_8x8_new(&temp3[24], &temp0[16]);
transpose_s16_8x8_new(&temp4[24], &temp0[24]);
transpose_s16_8x8q(&temp1[24], &temp0[0]);
transpose_s16_8x8q(&temp2[24], &temp0[8]);
transpose_s16_8x8q(&temp3[24], &temp0[16]);
transpose_s16_8x8q(&temp4[24], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);


@ -0,0 +1,64 @@
/*
* Copyright (c) 2023 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred,
int width, int height, const uint16_t *ref,
int ref_stride) {
int i = height;
if (width > 8) {
do {
int j = 0;
do {
const uint16x8_t p = vld1q_u16(pred + j);
const uint16x8_t r = vld1q_u16(ref + j);
uint16x8_t avg = vrhaddq_u16(p, r);
vst1q_u16(comp_pred + j, avg);
j += 8;
} while (j < width);
comp_pred += width;
pred += width;
ref += ref_stride;
} while (--i != 0);
} else if (width == 8) {
do {
const uint16x8_t p = vld1q_u16(pred);
const uint16x8_t r = vld1q_u16(ref);
uint16x8_t avg = vrhaddq_u16(p, r);
vst1q_u16(comp_pred, avg);
comp_pred += width;
pred += width;
ref += ref_stride;
} while (--i != 0);
} else {
assert(width == 4);
do {
const uint16x4_t p = vld1_u16(pred);
const uint16x4_t r = vld1_u16(ref);
uint16x4_t avg = vrhadd_u16(p, r);
vst1_u16(comp_pred, avg);
comp_pred += width;
pred += width;
ref += ref_stride;
} while (--i != 0);
}
}


@ -0,0 +1,233 @@
/*
* Copyright (c) 2023 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/sum_neon.h"
static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_ptr[4],
int ref_stride, uint32_t res[4],
int h) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
int i = 0;
do {
uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride);
sum[0] = vabal_u16(sum[0], s, r0);
sum[1] = vabal_u16(sum[1], s, r1);
sum[2] = vabal_u16(sum[2], s, r2);
sum[3] = vabal_u16(sum[3], s, r3);
} while (++i < h);
vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
}
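// Accumulation strategy for the 8-wide and wider paths below: vabdq_u16
// produces per-lane absolute differences, and vpadalq_u16 pairwise-adds
// adjacent 16-bit results into 32-bit accumulator lanes, avoiding the 16-bit
// overflow that a plain uint16x8_t accumulator would hit for larger blocks
// and higher bit depths.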
static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref,
uint32x4_t *const sad_sum) {
uint16x8_t abs_diff = vabdq_u16(src, ref);
*sad_sum = vpadalq_u16(*sad_sum, abs_diff);
}
static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_ptr[4],
int ref_stride, uint32_t res[4],
int h) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
int i = 0;
do {
uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]);
sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]);
sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]);
sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]);
} while (++i < h);
vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
}
static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *const ref_ptr[4],
int ref_stride, uint32_t res[4],
int h) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
uint32x4_t sum[4];
int i = 0;
do {
uint16x8_t s0, s1;
s0 = vld1q_u16(src16_ptr + i * src_stride);
sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]);
s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]);
} while (++i < h);
sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
}
static INLINE void highbd_sadwxhx4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_ptr[4],
int ref_stride, uint32_t res[4], int w,
int h) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
uint32x4_t sum[4];
int i = 0;
do {
int j = 0;
do {
uint16x8_t s0, s1, s2, s3;
s0 = vld1q_u16(src16_ptr + i * src_stride + j);
sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]);
s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]);
s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
&sum_lo[0]);
sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
&sum_lo[1]);
sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
&sum_lo[2]);
sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16),
&sum_lo[3]);
s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
&sum_hi[0]);
sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
&sum_hi[1]);
sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
&sum_hi[2]);
sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24),
&sum_hi[3]);
j += 32;
} while (j < w);
} while (++i < h);
sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
}
static INLINE void highbd_sad64xhx4d_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *const ref_ptr[4],
int ref_stride, uint32_t res[4],
int h) {
highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64, h);
}
static INLINE void highbd_sad32xhx4d_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *const ref_ptr[4],
int ref_stride, uint32_t res[4],
int h) {
highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32, h);
}
#define HBD_SAD_WXH_4D_NEON(w, h) \
void vpx_highbd_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \
const uint8_t *const ref[4], \
int ref_stride, uint32_t res[4]) { \
highbd_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \
}
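// For example, HBD_SAD_WXH_4D_NEON(16, 8) below expands to
// vpx_highbd_sad16x8x4d_neon(), which simply forwards to
// highbd_sad16xhx4d_neon(..., 8).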
HBD_SAD_WXH_4D_NEON(4, 4)
HBD_SAD_WXH_4D_NEON(4, 8)
HBD_SAD_WXH_4D_NEON(8, 4)
HBD_SAD_WXH_4D_NEON(8, 8)
HBD_SAD_WXH_4D_NEON(8, 16)
HBD_SAD_WXH_4D_NEON(16, 8)
HBD_SAD_WXH_4D_NEON(16, 16)
HBD_SAD_WXH_4D_NEON(16, 32)
HBD_SAD_WXH_4D_NEON(32, 16)
HBD_SAD_WXH_4D_NEON(32, 32)
HBD_SAD_WXH_4D_NEON(32, 64)
HBD_SAD_WXH_4D_NEON(64, 32)
HBD_SAD_WXH_4D_NEON(64, 64)


@ -17,209 +17,363 @@
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/sum_neon.h"
static VPX_FORCE_INLINE uint32_t highbd_sad4_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int width,
int height) {
int i, j;
uint32x4_t sum_abs_diff = vdupq_n_u32(0);
static INLINE uint32_t highbd_sad4xh_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j += 4) {
const uint16x4_t src_u16 = vld1_u16(src16_ptr + j);
const uint16x4_t ref_u16 = vld1_u16(ref16_ptr + j);
sum_abs_diff = vabal_u16(sum_abs_diff, src_u16, ref_u16);
}
uint32x4_t sum = vdupq_n_u32(0);
int i = h;
do {
uint16x4_t s = vld1_u16(src16_ptr);
uint16x4_t r = vld1_u16(ref16_ptr);
sum = vabal_u16(sum, s, r);
src16_ptr += src_stride;
ref16_ptr += ref_stride;
}
} while (--i != 0);
return horizontal_add_uint32x4(sum_abs_diff);
return horizontal_add_uint32x4(sum);
}
static VPX_FORCE_INLINE uint32_t highbd_sad8_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int width,
int height) {
int i, j;
uint32x4_t sum_abs_diff = vdupq_n_u32(0);
static INLINE uint32_t highbd_sad8xh_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j += 8) {
const uint16x8_t src_u16 = vld1q_u16(src16_ptr + j);
const uint16x8_t ref_u16 = vld1q_u16(ref16_ptr + j);
sum_abs_diff =
vabal_u16(sum_abs_diff, vget_low_u16(src_u16), vget_low_u16(ref_u16));
sum_abs_diff = vabal_u16(sum_abs_diff, vget_high_u16(src_u16),
vget_high_u16(ref_u16));
}
uint32x4_t sum = vdupq_n_u32(0);
int i = h;
do {
uint16x8_t s = vld1q_u16(src16_ptr);
uint16x8_t r = vld1q_u16(ref16_ptr);
uint16x8_t diff = vabdq_u16(s, r);
sum = vpadalq_u16(sum, diff);
src16_ptr += src_stride;
ref16_ptr += ref_stride;
}
} while (--i != 0);
return horizontal_add_uint32x4(sum_abs_diff);
return horizontal_add_uint32x4(sum);
}
static VPX_FORCE_INLINE uint32_t highbd_sad4_avg_neon(
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
int ref_stride, const uint8_t *second_pred, int width, int height) {
int i, j;
uint32x4_t sum_abs_diff = vdupq_n_u32(0);
static INLINE uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j += 4) {
const uint16x4_t a_u16 = vld1_u16(src16_ptr + j);
const uint16x4_t b_u16 = vld1_u16(ref16_ptr + j);
const uint16x4_t c_u16 = vld1_u16(pred_ptr + j);
const uint16x4_t avg = vrhadd_u16(b_u16, c_u16);
sum_abs_diff = vabal_u16(sum_abs_diff, a_u16, avg);
}
uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
int i = h;
do {
uint16x8_t s0, s1, r0, r1;
uint16x8_t diff0, diff1;
s0 = vld1q_u16(src16_ptr);
r0 = vld1q_u16(ref16_ptr);
diff0 = vabdq_u16(s0, r0);
sum[0] = vpadalq_u16(sum[0], diff0);
s1 = vld1q_u16(src16_ptr + 8);
r1 = vld1q_u16(ref16_ptr + 8);
diff1 = vabdq_u16(s1, r1);
sum[1] = vpadalq_u16(sum[1], diff1);
src16_ptr += src_stride;
ref16_ptr += ref_stride;
pred_ptr += width;
}
} while (--i != 0);
return horizontal_add_uint32x4(sum_abs_diff);
sum[0] = vaddq_u32(sum[0], sum[1]);
return horizontal_add_uint32x4(sum[0]);
}
static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon(
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
int ref_stride, const uint8_t *second_pred, int width, int height) {
int i, j;
uint32x4_t sum_abs_diff = vdupq_n_u32(0);
static INLINE uint32_t highbd_sadwxh_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int w, int h) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred);
for (i = 0; i < height; i++) {
for (j = 0; j < width; j += 8) {
const uint16x8_t a_u16 = vld1q_u16(src16_ptr + j);
const uint16x8_t b_u16 = vld1q_u16(ref16_ptr + j);
const uint16x8_t c_u16 = vld1q_u16(pred_ptr + j);
const uint16x8_t avg = vrhaddq_u16(b_u16, c_u16);
sum_abs_diff =
vabal_u16(sum_abs_diff, vget_low_u16(a_u16), vget_low_u16(avg));
sum_abs_diff =
vabal_u16(sum_abs_diff, vget_high_u16(a_u16), vget_high_u16(avg));
}
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
int i = h;
do {
int j = 0;
do {
uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3;
uint16x8_t diff0, diff1, diff2, diff3;
s0 = vld1q_u16(src16_ptr + j);
r0 = vld1q_u16(ref16_ptr + j);
diff0 = vabdq_u16(s0, r0);
sum[0] = vpadalq_u16(sum[0], diff0);
s1 = vld1q_u16(src16_ptr + j + 8);
r1 = vld1q_u16(ref16_ptr + j + 8);
diff1 = vabdq_u16(s1, r1);
sum[1] = vpadalq_u16(sum[1], diff1);
s2 = vld1q_u16(src16_ptr + j + 16);
r2 = vld1q_u16(ref16_ptr + j + 16);
diff2 = vabdq_u16(s2, r2);
sum[2] = vpadalq_u16(sum[2], diff2);
s3 = vld1q_u16(src16_ptr + j + 24);
r3 = vld1q_u16(ref16_ptr + j + 24);
diff3 = vabdq_u16(s3, r3);
sum[3] = vpadalq_u16(sum[3], diff3);
j += 32;
} while (j < w);
src16_ptr += src_stride;
ref16_ptr += ref_stride;
pred_ptr += width;
}
} while (--i != 0);
return horizontal_add_uint32x4(sum_abs_diff);
sum[0] = vaddq_u32(sum[0], sum[1]);
sum[2] = vaddq_u32(sum[2], sum[3]);
sum[0] = vaddq_u32(sum[0], sum[2]);
return horizontal_add_uint32x4(sum[0]);
}
#define highbd_sad4MxN(m, n) \
unsigned int vpx_highbd_sad##m##x##n##_neon( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride) { \
return highbd_sad4_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
static INLINE unsigned int highbd_sad64xh_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h) {
return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
}
static INLINE unsigned int highbd_sad32xh_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h) {
return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
}
#define HBD_SAD_WXH_NEON(w, h) \
unsigned int vpx_highbd_sad##w##x##h##_neon( \
const uint8_t *src, int src_stride, const uint8_t *ref, \
int ref_stride) { \
return highbd_sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \
}
#define highbd_sadMxN(m, n) \
unsigned int vpx_highbd_sad##m##x##n##_neon( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride) { \
return highbd_sad8_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
HBD_SAD_WXH_NEON(4, 4)
HBD_SAD_WXH_NEON(4, 8)
HBD_SAD_WXH_NEON(8, 4)
HBD_SAD_WXH_NEON(8, 8)
HBD_SAD_WXH_NEON(8, 16)
HBD_SAD_WXH_NEON(16, 8)
HBD_SAD_WXH_NEON(16, 16)
HBD_SAD_WXH_NEON(16, 32)
HBD_SAD_WXH_NEON(32, 16)
HBD_SAD_WXH_NEON(32, 32)
HBD_SAD_WXH_NEON(32, 64)
HBD_SAD_WXH_NEON(64, 32)
HBD_SAD_WXH_NEON(64, 64)
static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h,
const uint8_t *second_pred) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
uint32x4_t sum = vdupq_n_u32(0);
int i = h;
do {
uint16x4_t s = vld1_u16(src16_ptr);
uint16x4_t r = vld1_u16(ref16_ptr);
uint16x4_t p = vld1_u16(pred16_ptr);
uint16x4_t avg = vrhadd_u16(r, p);
sum = vabal_u16(sum, s, avg);
src16_ptr += src_stride;
ref16_ptr += ref_stride;
pred16_ptr += 4;
} while (--i != 0);
return horizontal_add_uint32x4(sum);
}
static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h,
const uint8_t *second_pred) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
uint32x4_t sum = vdupq_n_u32(0);
int i = h;
do {
uint16x8_t s = vld1q_u16(src16_ptr);
uint16x8_t r = vld1q_u16(ref16_ptr);
uint16x8_t p = vld1q_u16(pred16_ptr);
uint16x8_t avg = vrhaddq_u16(r, p);
uint16x8_t diff = vabdq_u16(s, avg);
sum = vpadalq_u16(sum, diff);
src16_ptr += src_stride;
ref16_ptr += ref_stride;
pred16_ptr += 8;
} while (--i != 0);
return horizontal_add_uint32x4(sum);
}
static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h,
const uint8_t *second_pred) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
int i = h;
do {
uint16x8_t s0, s1, r0, r1, p0, p1;
uint16x8_t avg0, avg1, diff0, diff1;
s0 = vld1q_u16(src16_ptr);
r0 = vld1q_u16(ref16_ptr);
p0 = vld1q_u16(pred16_ptr);
avg0 = vrhaddq_u16(r0, p0);
diff0 = vabdq_u16(s0, avg0);
sum[0] = vpadalq_u16(sum[0], diff0);
s1 = vld1q_u16(src16_ptr + 8);
r1 = vld1q_u16(ref16_ptr + 8);
p1 = vld1q_u16(pred16_ptr + 8);
avg1 = vrhaddq_u16(r1, p1);
diff1 = vabdq_u16(s1, avg1);
sum[1] = vpadalq_u16(sum[1], diff1);
src16_ptr += src_stride;
ref16_ptr += ref_stride;
pred16_ptr += 16;
} while (--i != 0);
sum[0] = vaddq_u32(sum[0], sum[1]);
return horizontal_add_uint32x4(sum[0]);
}
static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int w, int h,
const uint8_t *second_pred) {
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
int i = h;
do {
int j = 0;
do {
uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
s0 = vld1q_u16(src16_ptr + j);
r0 = vld1q_u16(ref16_ptr + j);
p0 = vld1q_u16(pred16_ptr + j);
avg0 = vrhaddq_u16(r0, p0);
diff0 = vabdq_u16(s0, avg0);
sum[0] = vpadalq_u16(sum[0], diff0);
s1 = vld1q_u16(src16_ptr + j + 8);
r1 = vld1q_u16(ref16_ptr + j + 8);
p1 = vld1q_u16(pred16_ptr + j + 8);
avg1 = vrhaddq_u16(r1, p1);
diff1 = vabdq_u16(s1, avg1);
sum[1] = vpadalq_u16(sum[1], diff1);
s2 = vld1q_u16(src16_ptr + j + 16);
r2 = vld1q_u16(ref16_ptr + j + 16);
p2 = vld1q_u16(pred16_ptr + j + 16);
avg2 = vrhaddq_u16(r2, p2);
diff2 = vabdq_u16(s2, avg2);
sum[2] = vpadalq_u16(sum[2], diff2);
s3 = vld1q_u16(src16_ptr + j + 24);
r3 = vld1q_u16(ref16_ptr + j + 24);
p3 = vld1q_u16(pred16_ptr + j + 24);
avg3 = vrhaddq_u16(r3, p3);
diff3 = vabdq_u16(s3, avg3);
sum[3] = vpadalq_u16(sum[3], diff3);
j += 32;
} while (j < w);
src16_ptr += src_stride;
ref16_ptr += ref_stride;
pred16_ptr += w;
} while (--i != 0);
sum[0] = vaddq_u32(sum[0], sum[1]);
sum[2] = vaddq_u32(sum[2], sum[3]);
sum[0] = vaddq_u32(sum[0], sum[2]);
return horizontal_add_uint32x4(sum[0]);
}
static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h,
const uint8_t *second_pred) {
return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
second_pred);
}
static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride, int h,
const uint8_t *second_pred) {
return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
second_pred);
}
#define HBD_SAD_WXH_AVG_NEON(w, h) \
uint32_t vpx_highbd_sad##w##x##h##_avg_neon( \
const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
const uint8_t *second_pred) { \
return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
second_pred); \
}
#define highbd_sad4MxN_avg(m, n) \
unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride, const uint8_t *second_pred) { \
return highbd_sad4_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \
second_pred, m, n); \
}
HBD_SAD_WXH_AVG_NEON(4, 4)
HBD_SAD_WXH_AVG_NEON(4, 8)
#define highbd_sadMxN_avg(m, n) \
unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride, const uint8_t *second_pred) { \
return highbd_sad8_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \
second_pred, m, n); \
}
HBD_SAD_WXH_AVG_NEON(8, 4)
HBD_SAD_WXH_AVG_NEON(8, 8)
HBD_SAD_WXH_AVG_NEON(8, 16)
#define highbd_sadMxNx4D(m, n) \
void vpx_highbd_sad##m##x##n##x4d_neon( \
const uint8_t *src_ptr, int src_stride, \
const uint8_t *const ref_array[4], int ref_stride, \
uint32_t sad_array[4]) { \
int i; \
for (i = 0; i < 4; ++i) { \
sad_array[i] = vpx_highbd_sad##m##x##n##_neon(src_ptr, src_stride, \
ref_array[i], ref_stride); \
} \
}
HBD_SAD_WXH_AVG_NEON(16, 8)
HBD_SAD_WXH_AVG_NEON(16, 16)
HBD_SAD_WXH_AVG_NEON(16, 32)
/* clang-format off */
// 4x4
highbd_sad4MxN(4, 4)
highbd_sad4MxN_avg(4, 4)
highbd_sadMxNx4D(4, 4)
HBD_SAD_WXH_AVG_NEON(32, 16)
HBD_SAD_WXH_AVG_NEON(32, 32)
HBD_SAD_WXH_AVG_NEON(32, 64)
// 4x8
highbd_sad4MxN(4, 8)
highbd_sad4MxN_avg(4, 8)
highbd_sadMxNx4D(4, 8)
// 8x4
highbd_sadMxN(8, 4)
highbd_sadMxN_avg(8, 4)
highbd_sadMxNx4D(8, 4)
// 8x8
highbd_sadMxN(8, 8)
highbd_sadMxN_avg(8, 8)
highbd_sadMxNx4D(8, 8)
// 8x16
highbd_sadMxN(8, 16)
highbd_sadMxN_avg(8, 16)
highbd_sadMxNx4D(8, 16)
// 16x8
highbd_sadMxN(16, 8)
highbd_sadMxN_avg(16, 8)
highbd_sadMxNx4D(16, 8)
// 16x16
highbd_sadMxN(16, 16)
highbd_sadMxN_avg(16, 16)
highbd_sadMxNx4D(16, 16)
// 16x32
highbd_sadMxN(16, 32)
highbd_sadMxN_avg(16, 32)
highbd_sadMxNx4D(16, 32)
// 32x16
highbd_sadMxN(32, 16)
highbd_sadMxN_avg(32, 16)
highbd_sadMxNx4D(32, 16)
// 32x32
highbd_sadMxN(32, 32)
highbd_sadMxN_avg(32, 32)
highbd_sadMxNx4D(32, 32)
// 32x64
highbd_sadMxN(32, 64)
highbd_sadMxN_avg(32, 64)
highbd_sadMxNx4D(32, 64)
// 64x32
highbd_sadMxN(64, 32)
highbd_sadMxN_avg(64, 32)
highbd_sadMxNx4D(64, 32)
// 64x64
highbd_sadMxN(64, 64)
highbd_sadMxN_avg(64, 64)
highbd_sadMxNx4D(64, 64)
/* clang-format on */
HBD_SAD_WXH_AVG_NEON(64, 32)
HBD_SAD_WXH_AVG_NEON(64, 64)


@ -0,0 +1,594 @@
/*
* Copyright (c) 2023 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <arm_neon.h>
#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
// The bilinear filters look like this:
//
// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
// { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }}
//
// We can factor out the highest common factor, such that the sum of both
// weights will be 8 instead of 128. The benefits of this are two-fold:
//
// 1) We can infer the filter values from the filter_offset parameter in the
// bilinear filter functions below - we don't have to actually load the values
// from memory:
// f0 = 8 - filter_offset
// f1 = filter_offset
//
// 2) Scaling the pixel values by 8 instead of 128 enables us to operate on
// 16-bit data types at all times, rather than widening out to 32-bit and
// requiring double the number of data processing instructions. (12-bit * 8 =
// 15-bit.)
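//
// For instance, with filter_offset == 2 the original filter pair would be
// { 96, 32 }; after dividing through by 16 the functions below use
// f0 = 8 - 2 = 6 and f1 = 2, and each output pixel is
// (6 * s0 + 2 * s1 + 4) >> 3, which is exactly the
// vmulq_u16/vmlaq_u16/vrshrq_n_u16(..., 3) sequence used throughout this
// file.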
// Process a block exactly 4 wide and a multiple of 2 high.
static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
uint16_t *dst_ptr, int src_stride,
int pixel_step, int dst_height,
int filter_offset) {
const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
const uint16x8_t f1 = vdupq_n_u16(filter_offset);
int i = dst_height;
do {
uint16x8_t s0 = load_unaligned_u16q(src_ptr, src_stride);
uint16x8_t s1 = load_unaligned_u16q(src_ptr + pixel_step, src_stride);
uint16x8_t blend = vmulq_u16(s0, f0);
blend = vmlaq_u16(blend, s1, f1);
blend = vrshrq_n_u16(blend, 3);
vst1q_u16(dst_ptr, blend);
src_ptr += 2 * src_stride;
dst_ptr += 8;
i -= 2;
} while (i != 0);
}
// Process a block whose width is a multiple of 8, with any height.
static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
uint16_t *dst_ptr,
int src_stride, int pixel_step,
int dst_width, int dst_height,
int filter_offset) {
const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
const uint16x8_t f1 = vdupq_n_u16(filter_offset);
int i = dst_height;
do {
int j = 0;
do {
uint16x8_t s0 = vld1q_u16(src_ptr + j);
uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
uint16x8_t blend = vmulq_u16(s0, f0);
blend = vmlaq_u16(blend, s1, f1);
blend = vrshrq_n_u16(blend, 3);
vst1q_u16(dst_ptr + j, blend);
j += 8;
} while (j < dst_width);
src_ptr += src_stride;
dst_ptr += dst_width;
} while (--i != 0);
}
static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
uint16_t *dst_ptr, int src_stride,
int pixel_step, int dst_height,
int filter_offset) {
highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
8, dst_height, filter_offset);
}
static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
uint16_t *dst_ptr, int src_stride,
int pixel_step, int dst_height,
int filter_offset) {
highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
16, dst_height, filter_offset);
}
static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
uint16_t *dst_ptr, int src_stride,
int pixel_step, int dst_height,
int filter_offset) {
highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
32, dst_height, filter_offset);
}
static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
uint16_t *dst_ptr, int src_stride,
int pixel_step, int dst_height,
int filter_offset) {
highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
64, dst_height, filter_offset);
}
static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
uint16_t *dst_ptr, int src_stride,
int pixel_step, int dst_width,
int dst_height) {
int i = dst_height;
// We only specialize on the filter values for large block sizes (>= 16x16.)
assert(dst_width >= 16 && dst_width % 16 == 0);
do {
int j = 0;
do {
uint16x8_t s0 = vld1q_u16(src_ptr + j);
uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
uint16x8_t avg = vrhaddq_u16(s0, s1);
vst1q_u16(dst_ptr + j, avg);
j += 8;
} while (j < dst_width);
src_ptr += src_stride;
dst_ptr += dst_width;
} while (--i != 0);
}
#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \
unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
const uint8_t *src, int src_stride, int xoffset, int yoffset, \
const uint8_t *ref, int ref_stride, uint32_t *sse) { \
uint16_t tmp0[w * (h + padding)]; \
uint16_t tmp1[w * h]; \
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
\
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
(h + padding), xoffset); \
highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
\
return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
w, ref, ref_stride, sse); \
}
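// The specialized variant below uses the observation that, after the
// factoring described above, a sub-pel offset of 0 needs no filtering at all
// and an offset of 4 corresponds to the equal-weight pair { 4, 4 }, i.e. a
// simple average of two pixels, so those cases can be served by a straight
// variance call or by highbd_var_filter_block2d_avg() instead of the general
// bilinear filter.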
#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \
unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
const uint8_t *src, int src_stride, int xoffset, int yoffset, \
const uint8_t *ref, int ref_stride, unsigned int *sse) { \
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
\
if (xoffset == 0) { \
if (yoffset == 0) { \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp[w * h]; \
highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
h); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \
src_stride, h, yoffset); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} \
} else if (xoffset == 4) { \
uint16_t tmp0[w * (h + padding)]; \
if (yoffset == 0) { \
highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * (h + padding)]; \
highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
(h + padding)); \
highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * (h + padding)]; \
highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
(h + padding)); \
highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} else { \
uint16_t tmp0[w * (h + padding)]; \
if (yoffset == 0) { \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
xoffset); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
(h + padding), xoffset); \
highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
(h + padding), xoffset); \
highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} \
}
// 4x<h> blocks are processed two rows at a time, so require an extra row of
// padding.
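// For example, HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8, 2) below allocates
// tmp0[4 * (8 + 2)] so the horizontal pass can emit the even number of
// intermediate rows (h + 2) that the two-row-at-a-time 4-wide filter needs.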
// 8-bit
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4, 2)
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8, 2)
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4, 1)
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8, 1)
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64, 1)
// 10-bit
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4, 2)
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8, 2)
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4, 1)
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8, 1)
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64, 1)
// 12-bit
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4, 2)
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8, 2)
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4, 1)
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8, 1)
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32, 1)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64, 1)
// Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having
// width 4.
static void highbd_avg_pred_var_filter_block2d_bil_w4(
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
int dst_height, int filter_offset, const uint16_t *second_pred) {
const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
const uint16x8_t f1 = vdupq_n_u16(filter_offset);
int i = dst_height;
do {
uint16x8_t s0 = load_unaligned_u16q(src_ptr, src_stride);
uint16x8_t s1 = load_unaligned_u16q(src_ptr + pixel_step, src_stride);
uint16x8_t p = vld1q_u16(second_pred);
uint16x8_t blend = vmulq_u16(s0, f0);
blend = vmlaq_u16(blend, s1, f1);
blend = vrshrq_n_u16(blend, 3);
vst1q_u16(dst_ptr, vrhaddq_u16(blend, p));
src_ptr += 2 * src_stride;
dst_ptr += 2 * 4;
second_pred += 2 * 4;
i -= 2;
} while (i != 0);
}
// Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks.
static void highbd_avg_pred_var_filter_block2d_bil_large(
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
int dst_width, int dst_height, int filter_offset,
const uint16_t *second_pred) {
const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
const uint16x8_t f1 = vdupq_n_u16(filter_offset);
int i = dst_height;
do {
int j = 0;
do {
uint16x8_t s0 = vld1q_u16(src_ptr + j);
uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
uint16x8_t p = vld1q_u16(second_pred);
uint16x8_t blend = vmulq_u16(s0, f0);
blend = vmlaq_u16(blend, s1, f1);
blend = vrshrq_n_u16(blend, 3);
vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p));
j += 8;
second_pred += 8;
} while (j < dst_width);
src_ptr += src_stride;
dst_ptr += dst_width;
} while (--i != 0);
}
static void highbd_avg_pred_var_filter_block2d_bil_w8(
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
int dst_height, int filter_offset, const uint16_t *second_pred) {
highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
pixel_step, 8, dst_height,
filter_offset, second_pred);
}
static void highbd_avg_pred_var_filter_block2d_bil_w16(
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
int dst_height, int filter_offset, const uint16_t *second_pred) {
highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
pixel_step, 16, dst_height,
filter_offset, second_pred);
}
static void highbd_avg_pred_var_filter_block2d_bil_w32(
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
int dst_height, int filter_offset, const uint16_t *second_pred) {
highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
pixel_step, 32, dst_height,
filter_offset, second_pred);
}
static void highbd_avg_pred_var_filter_block2d_bil_w64(
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
int dst_height, int filter_offset, const uint16_t *second_pred) {
highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
pixel_step, 64, dst_height,
filter_offset, second_pred);
}
// Combine averaging subpel filter with vpx_highbd_comp_avg_pred.
static void highbd_avg_pred_var_filter_block2d_avg(
const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
int dst_width, int dst_height, const uint16_t *second_pred) {
int i = dst_height;
// We only specialize on the filter values for large block sizes (>= 16x16.)
assert(dst_width >= 16 && dst_width % 16 == 0);
do {
int j = 0;
do {
uint16x8_t s0 = vld1q_u16(src_ptr + j);
uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
uint16x8_t avg = vrhaddq_u16(s0, s1);
uint16x8_t p = vld1q_u16(second_pred);
avg = vrhaddq_u16(avg, p);
vst1q_u16(dst_ptr + j, avg);
j += 8;
second_pred += 8;
} while (j < dst_width);
src_ptr += src_stride;
dst_ptr += dst_width;
} while (--i != 0);
}
// Implementation of vpx_highbd_comp_avg_pred for blocks having width >= 16.
static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
int src_stride, int dst_width, int dst_height,
const uint16_t *second_pred) {
int i = dst_height;
// We only specialize on the filter values for large block sizes (>= 16x16.)
assert(dst_width >= 16 && dst_width % 16 == 0);
do {
int j = 0;
do {
uint16x8_t s = vld1q_u16(src_ptr + j);
uint16x8_t p = vld1q_u16(second_pred);
uint16x8_t avg = vrhaddq_u16(s, p);
vst1q_u16(dst_ptr + j, avg);
j += 8;
second_pred += 8;
} while (j < dst_width);
src_ptr += src_stride;
dst_ptr += dst_width;
} while (--i != 0);
}
#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \
uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
const uint8_t *src, int src_stride, int xoffset, int yoffset, \
const uint8_t *ref, int ref_stride, uint32_t *sse, \
const uint8_t *second_pred) { \
uint16_t tmp0[w * (h + padding)]; \
uint16_t tmp1[w * h]; \
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
\
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
(h + padding), xoffset); \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
\
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
}
#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \
unsigned int vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
const uint8_t *src, int source_stride, int xoffset, int yoffset, \
const uint8_t *ref, int ref_stride, unsigned int *sse, \
const uint8_t *second_pred) { \
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
\
if (xoffset == 0) { \
uint16_t tmp[w * h]; \
if (yoffset == 0) { \
highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \
CONVERT_TO_SHORTPTR(second_pred)); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
highbd_avg_pred_var_filter_block2d_avg( \
src_ptr, tmp, source_stride, source_stride, w, h, \
CONVERT_TO_SHORTPTR(second_pred)); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} else { \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
src_ptr, tmp, source_stride, source_stride, h, yoffset, \
CONVERT_TO_SHORTPTR(second_pred)); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
} \
} else if (xoffset == 4) { \
uint16_t tmp0[w * (h + padding)]; \
if (yoffset == 0) { \
highbd_avg_pred_var_filter_block2d_avg( \
src_ptr, tmp0, source_stride, 1, w, h, \
CONVERT_TO_SHORTPTR(second_pred)); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * (h + padding)]; \
highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
(h + padding)); \
highbd_avg_pred_var_filter_block2d_avg( \
tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * (h + padding)]; \
highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
(h + padding)); \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} else { \
uint16_t tmp0[w * (h + padding)]; \
if (yoffset == 0) { \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
src_ptr, tmp0, source_stride, 1, h, xoffset, \
CONVERT_TO_SHORTPTR(second_pred)); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
} else if (yoffset == 4) { \
uint16_t tmp1[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
(h + padding), xoffset); \
highbd_avg_pred_var_filter_block2d_avg( \
tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} else { \
uint16_t tmp1[w * h]; \
highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
(h + padding), xoffset); \
highbd_avg_pred_var_filter_block2d_bil_w##w( \
tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
} \
} \
}
// 4x<h> blocks are processed two rows at a time, so require an extra row of
// padding.
// 8-bit
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4, 2)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8, 2)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4, 1)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8, 1)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64, 1)
// 10-bit
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4, 2)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8, 2)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4, 1)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8, 1)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64, 1)
// 12-bit
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4, 2)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8, 2)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4, 1)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8, 1)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32, 1)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64, 1)


@ -18,11 +18,6 @@
#include "vpx_dsp/arm/sum_neon.h"
#include "vpx_ports/mem.h"
static const uint8_t bilinear_filters[8][2] = {
{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
{ 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};
static INLINE void highbd_variance16(const uint16_t *src_ptr, int src_stride,
const uint16_t *ref_ptr, int ref_stride,
int w, int h, uint64_t *sse,
@ -136,7 +131,7 @@ static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride,
*sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
}
#define HIGHBD_VAR(W, H) \
#define HBD_VARIANCE_WXH_NEON(W, H) \
uint32_t vpx_highbd_8_variance##W##x##H##_neon( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride, uint32_t *sse) { \
@ -218,274 +213,19 @@ static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride,
return *sse; \
}
static INLINE void highbd_var_filter_block2d_bil_first_pass(
const uint8_t *src_ptr8, uint16_t *output_ptr,
unsigned int src_pixels_per_line, int pixel_step,
unsigned int output_height, unsigned int output_width,
const uint8_t *filter) {
uint32_t i, j;
uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1);
uint16x4_t filter1_u16 = vdup_n_u16(filter[0]);
uint16x4_t filter2_u16 = vdup_n_u16(filter[1]);
if (output_width >= 8) {
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; j += 8) {
const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]);
const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]);
uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16));
uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16));
uint16x4_t out1_u16;
uint16x4_t out2_u16;
sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16));
sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16));
out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS);
out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS);
vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16));
}
// Next row...
src_ptr += src_pixels_per_line;
output_ptr += output_width;
}
} else {
assert(output_width >= 4);
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; j += 4) {
const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]);
const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]);
uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16);
uint16x4_t out_u16;
sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16);
out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS);
vst1_u16(&output_ptr[j], out_u16);
}
// Next row...
src_ptr += src_pixels_per_line;
output_ptr += output_width;
}
}
}
static INLINE void highbd_var_filter_block2d_bil_second_pass(
const uint16_t *src_ptr, uint16_t *output_ptr,
unsigned int src_pixels_per_line, unsigned int pixel_step,
unsigned int output_height, unsigned int output_width,
const uint8_t *filter) {
uint32_t i, j;
uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1);
uint16x4_t filter1_u16 = vdup_n_u16(filter[0]);
uint16x4_t filter2_u16 = vdup_n_u16(filter[1]);
if (output_width >= 8) {
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; j += 8) {
const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]);
const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]);
uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16));
uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16));
uint16x4_t out1_u16;
uint16x4_t out2_u16;
sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16));
sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16));
out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS);
out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS);
vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16));
}
// Next row...
src_ptr += src_pixels_per_line;
output_ptr += output_width;
}
} else {
assert(output_width >= 4);
for (i = 0; i < output_height; ++i) {
for (j = 0; j < output_width; j += 4) {
const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]);
const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]);
uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16);
uint16x4_t out_u16;
sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16);
out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS);
vst1_u16(&output_ptr[j], out_u16);
}
// Next row...
src_ptr += src_pixels_per_line;
output_ptr += output_width;
}
}
}
#define HIGHBD_SUBPIX_VAR(W, H) \
uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_neon( \
const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass( \
src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[y_offset]); \
\
return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp2), W, \
ref_ptr, ref_stride, sse); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_neon( \
const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass( \
src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[y_offset]); \
\
return vpx_highbd_10_variance##W##x##H##_neon( \
CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \
} \
\
uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_neon( \
const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
\
highbd_var_filter_block2d_bil_first_pass( \
src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[y_offset]); \
\
return vpx_highbd_12_variance##W##x##H##_neon( \
CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \
}
#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_neon( \
const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass( \
src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[y_offset]); \
\
vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \
H, temp2, W); \
\
return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp3), W, \
ref_ptr, ref_stride, sse); \
} \
\
uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_neon( \
const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass( \
src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[y_offset]); \
\
vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \
H, temp2, W); \
\
return vpx_highbd_10_variance##W##x##H##_neon( \
CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \
} \
\
uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_neon( \
const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
const uint8_t *second_pred) { \
uint16_t fdata3[(H + 1) * W]; \
uint16_t temp2[H * W]; \
DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
\
highbd_var_filter_block2d_bil_first_pass( \
src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
bilinear_filters[y_offset]); \
\
vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \
H, temp2, W); \
\
return vpx_highbd_12_variance##W##x##H##_neon( \
CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \
}
void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred,
int width, int height, const uint16_t *ref,
int ref_stride) {
int i, j;
uint32x4_t one_u32 = vdupq_n_u32(1);
if (width >= 8) {
for (i = 0; i < height; ++i) {
for (j = 0; j < width; j += 8) {
const uint16x8_t pred_u16 = vld1q_u16(&pred[j]);
const uint16x8_t ref_u16 = vld1q_u16(&ref[j]);
const uint32x4_t sum1_u32 =
vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16));
const uint32x4_t sum2_u32 =
vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16));
const uint16x4_t sum1_u16 =
vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1);
const uint16x4_t sum2_u16 =
vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1);
const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16);
vst1q_u16(&comp_pred[j], vcomp_pred);
}
comp_pred += width;
pred += width;
ref += ref_stride;
}
} else {
assert(width >= 4);
for (i = 0; i < height; ++i) {
for (j = 0; j < width; j += 4) {
const uint16x4_t pred_u16 = vld1_u16(&pred[j]);
const uint16x4_t ref_u16 = vld1_u16(&ref[j]);
const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16);
const uint16x4_t vcomp_pred =
vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1);
vst1_u16(&comp_pred[j], vcomp_pred);
}
comp_pred += width;
pred += width;
ref += ref_stride;
}
}
}
/* All three forms of the variance are available in the same sizes. */
#define HIGHBD_VARIANCES(W, H) \
HIGHBD_VAR(W, H) \
HIGHBD_SUBPIX_VAR(W, H) \
HIGHBD_SUBPIX_AVG_VAR(W, H)
HIGHBD_VARIANCES(64, 64)
HIGHBD_VARIANCES(64, 32)
HIGHBD_VARIANCES(32, 64)
HIGHBD_VARIANCES(32, 32)
HIGHBD_VARIANCES(32, 16)
HIGHBD_VARIANCES(16, 32)
HIGHBD_VARIANCES(16, 16)
HIGHBD_VARIANCES(16, 8)
HIGHBD_VARIANCES(8, 16)
HIGHBD_VARIANCES(8, 8)
HIGHBD_VARIANCES(8, 4)
HIGHBD_VARIANCES(4, 8)
HIGHBD_VARIANCES(4, 4)
HBD_VARIANCE_WXH_NEON(64, 64)
HBD_VARIANCE_WXH_NEON(64, 32)
HBD_VARIANCE_WXH_NEON(32, 64)
HBD_VARIANCE_WXH_NEON(32, 32)
HBD_VARIANCE_WXH_NEON(32, 16)
HBD_VARIANCE_WXH_NEON(16, 32)
HBD_VARIANCE_WXH_NEON(16, 16)
HBD_VARIANCE_WXH_NEON(16, 8)
HBD_VARIANCE_WXH_NEON(8, 16)
HBD_VARIANCE_WXH_NEON(8, 8)
HBD_VARIANCE_WXH_NEON(8, 4)
HBD_VARIANCE_WXH_NEON(4, 8)
HBD_VARIANCE_WXH_NEON(4, 4)
HIGHBD_GET_VAR(8)
HIGHBD_GET_VAR(16)


@ -26,76 +26,88 @@ void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
(void)bd;
if (w < 8) { // copy4
uint16x4_t s0, s1;
do {
vst1_u16(dst, vld1_u16(src));
s0 = vld1_u16(src);
src += src_stride;
s1 = vld1_u16(src);
src += src_stride;
vst1_u16(dst, s0);
dst += dst_stride;
vst1_u16(dst, vld1_u16(src));
src += src_stride;
vst1_u16(dst, s1);
dst += dst_stride;
h -= 2;
} while (h > 0);
} while (h != 0);
} else if (w == 8) { // copy8
uint16x8_t s0, s1;
do {
vst1q_u16(dst, vld1q_u16(src));
s0 = vld1q_u16(src);
src += src_stride;
s1 = vld1q_u16(src);
src += src_stride;
vst1q_u16(dst, s0);
dst += dst_stride;
vst1q_u16(dst, vld1q_u16(src));
src += src_stride;
vst1q_u16(dst, s1);
dst += dst_stride;
h -= 2;
} while (h > 0);
} while (h != 0);
} else if (w < 32) { // copy16
uint16x8_t s0, s1, s2, s3;
do {
vst2q_u16(dst, vld2q_u16(src));
s0 = vld1q_u16(src);
s1 = vld1q_u16(src + 8);
src += src_stride;
dst += dst_stride;
vst2q_u16(dst, vld2q_u16(src));
s2 = vld1q_u16(src);
s3 = vld1q_u16(src + 8);
src += src_stride;
vst1q_u16(dst, s0);
vst1q_u16(dst + 8, s1);
dst += dst_stride;
vst2q_u16(dst, vld2q_u16(src));
src += src_stride;
vst1q_u16(dst, s2);
vst1q_u16(dst + 8, s3);
dst += dst_stride;
vst2q_u16(dst, vld2q_u16(src));
src += src_stride;
dst += dst_stride;
h -= 4;
} while (h > 0);
h -= 2;
} while (h != 0);
} else if (w == 32) { // copy32
uint16x8_t s0, s1, s2, s3;
do {
vst4q_u16(dst, vld4q_u16(src));
s0 = vld1q_u16(src);
s1 = vld1q_u16(src + 8);
s2 = vld1q_u16(src + 16);
s3 = vld1q_u16(src + 24);
src += src_stride;
vst1q_u16(dst, s0);
vst1q_u16(dst + 8, s1);
vst1q_u16(dst + 16, s2);
vst1q_u16(dst + 24, s3);
dst += dst_stride;
vst4q_u16(dst, vld4q_u16(src));
src += src_stride;
dst += dst_stride;
vst4q_u16(dst, vld4q_u16(src));
src += src_stride;
dst += dst_stride;
vst4q_u16(dst, vld4q_u16(src));
src += src_stride;
dst += dst_stride;
h -= 4;
} while (h > 0);
} while (--h != 0);
} else { // copy64
uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
do {
vst4q_u16(dst, vld4q_u16(src));
vst4q_u16(dst + 32, vld4q_u16(src + 32));
s0 = vld1q_u16(src);
s1 = vld1q_u16(src + 8);
s2 = vld1q_u16(src + 16);
s3 = vld1q_u16(src + 24);
s4 = vld1q_u16(src + 32);
s5 = vld1q_u16(src + 40);
s6 = vld1q_u16(src + 48);
s7 = vld1q_u16(src + 56);
src += src_stride;
vst1q_u16(dst, s0);
vst1q_u16(dst + 8, s1);
vst1q_u16(dst + 16, s2);
vst1q_u16(dst + 24, s3);
vst1q_u16(dst + 32, s4);
vst1q_u16(dst + 40, s5);
vst1q_u16(dst + 48, s6);
vst1q_u16(dst + 56, s7);
dst += dst_stride;
vst4q_u16(dst, vld4q_u16(src));
vst4q_u16(dst + 32, vld4q_u16(src + 32));
src += src_stride;
dst += dst_stride;
vst4q_u16(dst, vld4q_u16(src));
vst4q_u16(dst + 32, vld4q_u16(src + 32));
src += src_stride;
dst += dst_stride;
vst4q_u16(dst, vld4q_u16(src));
vst4q_u16(dst + 32, vld4q_u16(src + 32));
src += src_stride;
dst += dst_stride;
h -= 4;
} while (h > 0);
} while (--h != 0);
}
}


@ -126,6 +126,20 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
return vreinterpret_u8_u32(a_u32);
}
// Load 2 sets of 8 bytes when alignment is not guaranteed.
static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf,
ptrdiff_t stride) {
uint64_t a;
uint64x2_t a_u64;
if (stride == 4) return vld1q_u16(buf);
memcpy(&a, buf, 8);
buf += stride;
a_u64 = vdupq_n_u64(a);
memcpy(&a, buf, 8);
a_u64 = vsetq_lane_u64(a, a_u64, 1);
return vreinterpretq_u16_u64(a_u64);
}
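// The helper above reads two rows of four 16-bit pixels via memcpy into a
// 64-bit scalar, keeping the loads well-defined when buf is not 8-byte
// aligned; when the stride is 4 the rows are contiguous and a single
// vld1q_u16 suffices.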
// Store 2 sets of 4 bytes when alignment is not guaranteed.
static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
const uint8x8_t a) {


@ -17,633 +17,316 @@
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/sum_neon.h"
static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0,
const void *const buf1) {
uint32_t a;
uint32x2_t aa;
memcpy(&a, buf0, 4);
aa = vdup_n_u32(a);
memcpy(&a, buf1, 4);
aa = vset_lane_u32(a, aa, 1);
return vreinterpret_u8_u32(aa);
#if defined(__ARM_FEATURE_DOTPROD)
static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
uint32x4_t *const sad_sum) {
uint8x16_t abs_diff = vabdq_u8(src, ref);
*sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
}
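// With the Arm dot-product extension (__ARM_FEATURE_DOTPROD), vdotq_u32
// against an all-ones vector folds each group of four byte absolute
// differences directly into a 32-bit accumulator lane, so no 16-bit
// intermediate accumulator can overflow regardless of block height.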
static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride,
const uint8_t *const ref_array[4],
const int ref_stride, const int height,
uint32_t sad_array[4]) {
int i;
uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
#if !defined(__aarch64__)
uint16x4_t a[2];
#endif
uint32x4_t r;
static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
uint32x4_t sum[4];
assert(!((intptr_t)src_ptr % sizeof(uint32_t)));
assert(!(src_stride % sizeof(uint32_t)));
int i = 0;
do {
uint8x16_t s0, s1, s2, s3;
for (i = 0; i < height; ++i) {
const uint8x8_t s = vreinterpret_u8_u32(
vld1_dup_u32((const uint32_t *)(src_ptr + i * src_stride)));
const uint8x8_t ref01 = load_unaligned_2_buffers(
ref_array[0] + i * ref_stride, ref_array[1] + i * ref_stride);
const uint8x8_t ref23 = load_unaligned_2_buffers(
ref_array[2] + i * ref_stride, ref_array[3] + i * ref_stride);
abs[0] = vabal_u8(abs[0], s, ref01);
abs[1] = vabal_u8(abs[1], s, ref23);
}
s0 = vld1q_u8(src + i * src_stride);
sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
#if defined(__aarch64__)
abs[0] = vpaddq_u16(abs[0], abs[1]);
r = vpaddlq_u16(abs[0]);
#else
a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0]));
a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1]));
r = vpaddlq_u16(vcombine_u16(a[0], a[1]));
#endif
vst1q_u32(sad_array, r);
s1 = vld1q_u8(src + i * src_stride + 16);
sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
s2 = vld1q_u8(src + i * src_stride + 32);
sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
s3 = vld1q_u8(src + i * src_stride + 48);
sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
i++;
} while (i < h);
sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
}
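// Columns 0-15 and 32-47 of each 64-wide row accumulate into sum_lo while
// columns 16-31 and 48-63 accumulate into sum_hi, keeping two independent
// accumulator chains in flight before the final reduction.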
void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, sad_array);
static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
uint32x4_t sum[4];
int i = 0;
do {
uint8x16_t s0, s1;
s0 = vld1q_u8(src + i * src_stride);
sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
s1 = vld1q_u8(src + i * src_stride + 16);
sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
i++;
} while (i < h);
sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
}
void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, sad_array);
}
////////////////////////////////////////////////////////////////////////////////
// Can handle 512 pixels' sad sum (such as 16x32 or 32x16)
static INLINE void sad_512_pel_final_neon(const uint16x8_t sum[4],
uint32_t sad_array[4]) {
#if defined(__aarch64__)
const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
const uint16x8_t b0 = vpaddq_u16(a0, a1);
const uint32x4_t r = vpaddlq_u16(b0);
#else
const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
const uint16x4_t b0 = vpadd_u16(a0, a1);
const uint16x4_t b1 = vpadd_u16(a2, a3);
const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1));
#endif
vst1q_u32(sad_array, r);
}
#if defined(__arm__) || !defined(__ARM_FEATURE_DOTPROD)
// Can handle 1024 pixels' sad sum (such as 32x32)
static INLINE void sad_1024_pel_final_neon(const uint16x8_t sum[4],
uint32_t sad_array[4]) {
#if defined(__aarch64__)
const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
const uint32x4_t b0 = vpaddlq_u16(a0);
const uint32x4_t b1 = vpaddlq_u16(a1);
const uint32x4_t r = vpaddq_u32(b0, b1);
vst1q_u32(sad_array, r);
#else
const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
const uint16x4_t a3 = vpadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
const uint32x4_t b0 = vpaddlq_u16(vcombine_u16(a0, a1));
const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3));
const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0));
const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1));
vst1q_u32(sad_array, vcombine_u32(c0, c1));
#endif
}
// Can handle 2048 pixels' sad sum (such as 32x64 or 64x32)
static INLINE void sad_2048_pel_final_neon(const uint16x8_t sum[4],
uint32_t sad_array[4]) {
#if defined(__aarch64__)
const uint32x4_t a0 = vpaddlq_u16(sum[0]);
const uint32x4_t a1 = vpaddlq_u16(sum[1]);
const uint32x4_t a2 = vpaddlq_u16(sum[2]);
const uint32x4_t a3 = vpaddlq_u16(sum[3]);
const uint32x4_t b0 = vpaddq_u32(a0, a1);
const uint32x4_t b1 = vpaddq_u32(a2, a3);
const uint32x4_t r = vpaddq_u32(b0, b1);
vst1q_u32(sad_array, r);
#else
const uint32x4_t a0 = vpaddlq_u16(sum[0]);
const uint32x4_t a1 = vpaddlq_u16(sum[1]);
const uint32x4_t a2 = vpaddlq_u16(sum[2]);
const uint32x4_t a3 = vpaddlq_u16(sum[3]);
const uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0));
const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1));
const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2));
const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3));
const uint32x2_t c0 = vpadd_u32(b0, b1);
const uint32x2_t c1 = vpadd_u32(b2, b3);
vst1q_u32(sad_array, vcombine_u32(c0, c1));
#endif
}
// Can handle 4096 pixels' sad sum (such as 64x64)
static INLINE void sad_4096_pel_final_neon(const uint16x8_t sum[8],
uint32_t sad_array[4]) {
#if defined(__aarch64__)
const uint32x4_t a0 = vpaddlq_u16(sum[0]);
const uint32x4_t a1 = vpaddlq_u16(sum[1]);
const uint32x4_t a2 = vpaddlq_u16(sum[2]);
const uint32x4_t a3 = vpaddlq_u16(sum[3]);
const uint32x4_t a4 = vpaddlq_u16(sum[4]);
const uint32x4_t a5 = vpaddlq_u16(sum[5]);
const uint32x4_t a6 = vpaddlq_u16(sum[6]);
const uint32x4_t a7 = vpaddlq_u16(sum[7]);
const uint32x4_t b0 = vaddq_u32(a0, a1);
const uint32x4_t b1 = vaddq_u32(a2, a3);
const uint32x4_t b2 = vaddq_u32(a4, a5);
const uint32x4_t b3 = vaddq_u32(a6, a7);
const uint32x4_t c0 = vpaddq_u32(b0, b1);
const uint32x4_t c1 = vpaddq_u32(b2, b3);
const uint32x4_t r = vpaddq_u32(c0, c1);
vst1q_u32(sad_array, r);
#else
const uint32x4_t a0 = vpaddlq_u16(sum[0]);
const uint32x4_t a1 = vpaddlq_u16(sum[1]);
const uint32x4_t a2 = vpaddlq_u16(sum[2]);
const uint32x4_t a3 = vpaddlq_u16(sum[3]);
const uint32x4_t a4 = vpaddlq_u16(sum[4]);
const uint32x4_t a5 = vpaddlq_u16(sum[5]);
const uint32x4_t a6 = vpaddlq_u16(sum[6]);
const uint32x4_t a7 = vpaddlq_u16(sum[7]);
const uint32x4_t b0 = vaddq_u32(a0, a1);
const uint32x4_t b1 = vaddq_u32(a2, a3);
const uint32x4_t b2 = vaddq_u32(a4, a5);
const uint32x4_t b3 = vaddq_u32(a6, a7);
const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));
const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));
const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));
const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));
const uint32x2_t d0 = vpadd_u32(c0, c1);
const uint32x2_t d1 = vpadd_u32(c2, c3);
vst1q_u32(sad_array, vcombine_u32(d0, d1));
#endif
}
#endif
static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4], const int height) {
int i, j;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
for (i = 0; i < height; ++i) {
const uint8x8_t s = vld1_u8(src_ptr);
src_ptr += src_stride;
for (j = 0; j < 4; ++j) {
const uint8x8_t b_u8 = vld1_u8(ref_loop[j]);
ref_loop[j] += ref_stride;
sum[j] = vabal_u8(sum[j], s, b_u8);
}
}
sad_512_pel_final_neon(sum, sad_array);
}
void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 4);
}
void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8);
}
void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16);
}
////////////////////////////////////////////////////////////////////////////////
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
uint32x4_t *const sum) {
const uint8x16_t r = vld1q_u8(ref_ptr);
const uint8x16_t diff = vabdq_u8(src_ptr, r);
*sum = vdotq_u32(*sum, diff, vdupq_n_u8(1));
}
static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4], const int height) {
int i;
uint32x4_t r0, r1;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
for (i = 0; i < height; ++i) {
const uint8x16_t s = vld1q_u8(src_ptr + i * src_stride);
sad16_neon(ref_loop[0] + i * ref_stride, s, &sum[0]);
sad16_neon(ref_loop[1] + i * ref_stride, s, &sum[1]);
sad16_neon(ref_loop[2] + i * ref_stride, s, &sum[2]);
sad16_neon(ref_loop[3] + i * ref_stride, s, &sum[3]);
}
int i = 0;
do {
const uint8x16_t s = vld1q_u8(src + i * src_stride);
sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
r0 = vpaddq_u32(sum[0], sum[1]);
r1 = vpaddq_u32(sum[2], sum[3]);
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
i++;
} while (i < h);
vst1q_u32(res, horizontal_add_4d_uint32x4(sum));
}
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
#else // !defined(__ARM_FEATURE_DOTPROD)
static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
uint16x8_t *const sum) {
const uint8x16_t r = vld1q_u8(ref_ptr);
*sum = vabal_u8(*sum, vget_low_u8(src_ptr), vget_low_u8(r));
*sum = vabal_u8(*sum, vget_high_u8(src_ptr), vget_high_u8(r));
static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
uint16x8_t *const sad_sum) {
uint8x16_t abs_diff = vabdq_u8(src, ref);
*sad_sum = vpadalq_u8(*sad_sum, abs_diff);
}
static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4], const int height) {
int i;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
int h_tmp = h > 64 ? 64 : h;
int i = 0;
vst1q_u32(res, vdupq_n_u32(0));
do {
uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
do {
uint8x16_t s0, s1, s2, s3;
s0 = vld1q_u8(src + i * src_stride);
sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
s1 = vld1q_u8(src + i * src_stride + 16);
sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
s2 = vld1q_u8(src + i * src_stride + 32);
sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]);
sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]);
sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]);
sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]);
s3 = vld1q_u8(src + i * src_stride + 48);
sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]);
sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]);
sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]);
sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]);
i++;
} while (i < h_tmp);
res[0] += horizontal_long_add_uint16x8(sum_lo[0], sum_hi[0]);
res[1] += horizontal_long_add_uint16x8(sum_lo[1], sum_hi[1]);
res[2] += horizontal_long_add_uint16x8(sum_lo[2], sum_hi[2]);
res[3] += horizontal_long_add_uint16x8(sum_lo[3], sum_hi[3]);
h_tmp += 64;
} while (i < h);
}
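// Without the dot-product extension the absolute differences accumulate in
// 16-bit lanes (vpadalq_u8), so the 64-wide path above flushes the running
// totals into res[] every 64 rows (h_tmp) to keep the uint16 accumulators
// from overflowing.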
static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
int i = 0;
do {
uint8x16_t s0, s1;
s0 = vld1q_u8(src + i * src_stride);
sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]);
sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]);
sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]);
sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]);
s1 = vld1q_u8(src + i * src_stride + 16);
sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]);
sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]);
sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]);
sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]);
i++;
} while (i < h);
res[0] = horizontal_long_add_uint16x8(sum_lo[0], sum_hi[0]);
res[1] = horizontal_long_add_uint16x8(sum_lo[1], sum_hi[1]);
res[2] = horizontal_long_add_uint16x8(sum_lo[2], sum_hi[2]);
res[3] = horizontal_long_add_uint16x8(sum_lo[3], sum_hi[3]);
}
static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
for (i = 0; i < height; ++i) {
const uint8x16_t s = vld1q_u8(src_ptr);
src_ptr += src_stride;
/* Manual unrolling here stops the compiler from getting confused. */
sad16_neon(ref_loop[0], s, &sum[0]);
ref_loop[0] += ref_stride;
sad16_neon(ref_loop[1], s, &sum[1]);
ref_loop[1] += ref_stride;
sad16_neon(ref_loop[2], s, &sum[2]);
ref_loop[2] += ref_stride;
sad16_neon(ref_loop[3], s, &sum[3]);
ref_loop[3] += ref_stride;
}
int i = 0;
do {
const uint8x16_t s = vld1q_u8(src + i * src_stride);
sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]);
sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]);
sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]);
sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]);
sad_512_pel_final_neon(sum, sad_array);
i++;
} while (i < h);
res[0] = horizontal_add_uint16x8(sum[0]);
res[1] = horizontal_add_uint16x8(sum[1]);
res[2] = horizontal_add_uint16x8(sum[2]);
res[3] = horizontal_add_uint16x8(sum[3]);
}
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
#endif // defined(__ARM_FEATURE_DOTPROD)
void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8);
static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref,
uint16x8_t *const sad_sum) {
uint8x8_t abs_diff = vabd_u8(src, ref);
*sad_sum = vaddw_u8(*sad_sum, abs_diff);
}
void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16);
}
void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32);
}
////////////////////////////////////////////////////////////////////////////////
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4], const int height) {
int i;
uint32x4_t r0, r1;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
for (i = 0; i < height; ++i) {
uint8x16_t s;
s = vld1q_u8(src_ptr + 0 * 16);
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
s = vld1q_u8(src_ptr + 1 * 16);
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
src_ptr += src_stride;
ref_loop[0] += ref_stride;
ref_loop[1] += ref_stride;
ref_loop[2] += ref_stride;
ref_loop[3] += ref_stride;
}
r0 = vpaddq_u32(sum[0], sum[1]);
r1 = vpaddq_u32(sum[2], sum[3]);
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
}
void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16);
}
void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32);
}
void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64);
}
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
const int height, uint16x8_t *const sum) {
int i;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0);
for (i = 0; i < height; ++i) {
uint8x16_t s;
s = vld1q_u8(src_ptr + 0 * 16);
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
s = vld1q_u8(src_ptr + 1 * 16);
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
src_ptr += src_stride;
ref_loop[0] += ref_stride;
ref_loop[1] += ref_stride;
ref_loop[2] += ref_stride;
ref_loop[3] += ref_stride;
}
}
void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
uint16x8_t sum[4];
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum);
sad_512_pel_final_neon(sum, sad_array);
}
void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
uint16x8_t sum[4];
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum);
sad_1024_pel_final_neon(sum, sad_array);
}
void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
uint16x8_t sum[4];
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum);
sad_2048_pel_final_neon(sum, sad_array);
}
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
////////////////////////////////////////////////////////////////////////////////
#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
int i;
uint32x4_t r0, r1;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0) };
for (i = 0; i < 32; ++i) {
uint8x16_t s;
s = vld1q_u8(src_ptr + 0 * 16);
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
s = vld1q_u8(src_ptr + 1 * 16);
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
s = vld1q_u8(src_ptr + 2 * 16);
sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]);
s = vld1q_u8(src_ptr + 3 * 16);
sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]);
src_ptr += src_stride;
ref_loop[0] += ref_stride;
ref_loop[1] += ref_stride;
ref_loop[2] += ref_stride;
ref_loop[3] += ref_stride;
}
r0 = vpaddq_u32(sum[0], sum[1]);
r1 = vpaddq_u32(sum[2], sum[3]);
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
}
void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
int i;
uint32x4_t r0, r1, r2, r3;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
uint32x4_t sum[8] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
vdupq_n_u32(0), vdupq_n_u32(0) };
for (i = 0; i < 64; ++i) {
uint8x16_t s;
s = vld1q_u8(src_ptr + 0 * 16);
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]);
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]);
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]);
s = vld1q_u8(src_ptr + 1 * 16);
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]);
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]);
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]);
s = vld1q_u8(src_ptr + 2 * 16);
sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]);
sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]);
sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]);
sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]);
s = vld1q_u8(src_ptr + 3 * 16);
sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]);
sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]);
sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]);
sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]);
src_ptr += src_stride;
ref_loop[0] += ref_stride;
ref_loop[1] += ref_stride;
ref_loop[2] += ref_stride;
ref_loop[3] += ref_stride;
}
r0 = vpaddq_u32(sum[0], sum[1]);
r1 = vpaddq_u32(sum[2], sum[3]);
r2 = vpaddq_u32(sum[4], sum[5]);
r3 = vpaddq_u32(sum[6], sum[7]);
r0 = vpaddq_u32(r0, r1);
r1 = vpaddq_u32(r2, r3);
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
}
#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
int i;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
for (i = 0; i < 32; ++i) {
uint8x16_t s;
int i = 0;
do {
const uint8x8_t s = vld1_u8(src + i * src_stride);
sad8_neon(s, vld1_u8(ref[0] + i * ref_stride), &sum[0]);
sad8_neon(s, vld1_u8(ref[1] + i * ref_stride), &sum[1]);
sad8_neon(s, vld1_u8(ref[2] + i * ref_stride), &sum[2]);
sad8_neon(s, vld1_u8(ref[3] + i * ref_stride), &sum[3]);
s = vld1q_u8(src_ptr + 0 * 16);
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
i++;
} while (i < h);
s = vld1q_u8(src_ptr + 1 * 16);
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
s = vld1q_u8(src_ptr + 2 * 16);
sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]);
s = vld1q_u8(src_ptr + 3 * 16);
sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]);
sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]);
sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]);
src_ptr += src_stride;
ref_loop[0] += ref_stride;
ref_loop[1] += ref_stride;
ref_loop[2] += ref_stride;
ref_loop[3] += ref_stride;
}
sad_2048_pel_final_neon(sum, sad_array);
res[0] = horizontal_add_uint16x8(sum[0]);
res[1] = horizontal_add_uint16x8(sum[1]);
res[2] = horizontal_add_uint16x8(sum[2]);
res[3] = horizontal_add_uint16x8(sum[3]);
}
void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
uint32_t sad_array[4]) {
int i;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0), vdupq_n_u16(0) };
static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t res[4], int h) {
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
for (i = 0; i < 64; ++i) {
uint8x16_t s;
int i = 0;
do {
uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride);
uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride);
uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride);
uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride);
uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride);
s = vld1q_u8(src_ptr + 0 * 16);
sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]);
sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]);
sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]);
sad8_neon(s, r0, &sum[0]);
sad8_neon(s, r1, &sum[1]);
sad8_neon(s, r2, &sum[2]);
sad8_neon(s, r3, &sum[3]);
s = vld1q_u8(src_ptr + 1 * 16);
sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]);
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]);
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]);
i += 2;
} while (i < h);
s = vld1q_u8(src_ptr + 2 * 16);
sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]);
sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]);
sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]);
sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]);
s = vld1q_u8(src_ptr + 3 * 16);
sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]);
sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]);
sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]);
sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]);
src_ptr += src_stride;
ref_loop[0] += ref_stride;
ref_loop[1] += ref_stride;
ref_loop[2] += ref_stride;
ref_loop[3] += ref_stride;
}
sad_4096_pel_final_neon(sum, sad_array);
res[0] = horizontal_add_uint16x8(sum[0]);
res[1] = horizontal_add_uint16x8(sum[1]);
res[2] = horizontal_add_uint16x8(sum[2]);
res[3] = horizontal_add_uint16x8(sum[3]);
}
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
#define SAD_WXH_4D_NEON(w, h) \
void vpx_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \
const uint8_t *const ref[4], int ref_stride, \
uint32_t res[4]) { \
sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \
}
SAD_WXH_4D_NEON(4, 4)
SAD_WXH_4D_NEON(4, 8)
SAD_WXH_4D_NEON(8, 4)
SAD_WXH_4D_NEON(8, 8)
SAD_WXH_4D_NEON(8, 16)
SAD_WXH_4D_NEON(16, 8)
SAD_WXH_4D_NEON(16, 16)
SAD_WXH_4D_NEON(16, 32)
SAD_WXH_4D_NEON(32, 16)
SAD_WXH_4D_NEON(32, 32)
SAD_WXH_4D_NEON(32, 64)
SAD_WXH_4D_NEON(64, 32)
SAD_WXH_4D_NEON(64, 64)
#undef SAD_WXH_4D_NEON
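// For illustration, SAD_WXH_4D_NEON(16, 16) above expands to:
//
//   void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride,
//                             const uint8_t *const ref[4], int ref_stride,
//                             uint32_t res[4]) {
//     sad16xhx4d_neon(src, src_stride, ref, ref_stride, res, (16));
//   }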


@ -214,24 +214,13 @@ static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
int i = h / 2;
do {
uint32x2_t s, r;
uint32_t s0, s1, r0, r1;
uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
memcpy(&s0, src_ptr, 4);
memcpy(&r0, ref_ptr, 4);
s = vdup_n_u32(s0);
r = vdup_n_u32(r0);
src_ptr += src_stride;
ref_ptr += ref_stride;
sum = vabal_u8(sum, s, r);
memcpy(&s1, src_ptr, 4);
memcpy(&r1, ref_ptr, 4);
s = vset_lane_u32(s1, s, 1);
r = vset_lane_u32(r1, r, 1);
src_ptr += src_stride;
ref_ptr += ref_stride;
sum = vabal_u8(sum, vreinterpret_u8_u32(s), vreinterpret_u8_u32(r));
src_ptr += 2 * src_stride;
ref_ptr += 2 * ref_stride;
} while (--i != 0);
return horizontal_add_uint16x8(sum);
@ -509,28 +498,15 @@ static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr,
int i = h / 2;
do {
uint32x2_t s, r;
uint32_t s0, s1, r0, r1;
uint8x8_t p, avg;
uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
uint8x8_t p = vld1_u8(second_pred);
memcpy(&s0, src_ptr, 4);
memcpy(&r0, ref_ptr, 4);
s = vdup_n_u32(s0);
r = vdup_n_u32(r0);
src_ptr += src_stride;
ref_ptr += ref_stride;
uint8x8_t avg = vrhadd_u8(r, p);
sum = vabal_u8(sum, s, avg);
memcpy(&s1, src_ptr, 4);
memcpy(&r1, ref_ptr, 4);
s = vset_lane_u32(s1, s, 1);
r = vset_lane_u32(r1, r, 1);
src_ptr += src_stride;
ref_ptr += ref_stride;
p = vld1_u8(second_pred);
avg = vrhadd_u8(vreinterpret_u8_u32(r), p);
sum = vabal_u8(sum, vreinterpret_u8_u32(s), avg);
src_ptr += 2 * src_stride;
ref_ptr += 2 * ref_stride;
second_pred += 8;
} while (--i != 0);


@ -40,6 +40,23 @@ static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) {
#endif
}
static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo,
const uint16x8_t vec_hi) {
#if defined(__aarch64__)
return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
#else
const uint32x4_t vec_l_lo =
vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
const uint32x4_t vec_l_hi =
vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
const uint64x2_t b = vpaddlq_u32(a);
const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
vreinterpret_u32_u64(vget_high_u64(b)));
return vget_lane_u32(c, 0);
#endif
}
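// horizontal_long_add_uint16x8 widens two uint16x8 accumulators and returns
// their combined scalar total; on AArch64 each reduction is a single
// vaddlvq_u16.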
static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) {
#if defined(__aarch64__)
return vaddv_s32(a);
@ -77,4 +94,20 @@ static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
return vget_lane_u32(c, 0);
#endif
}
static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) {
#if defined(__aarch64__)
uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]);
uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]);
return vpaddq_u32(res01, res23);
#else
uint32x4_t res = vdupq_n_u32(0);
res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0);
res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1);
res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2);
res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3);
return res;
#endif
}
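// horizontal_add_4d_uint32x4 reduces four independent vector accumulators to
// one uint32x4_t holding a total per lane: on AArch64 three pairwise adds
// (vpaddq_u32) perform the whole reduction, while the Armv7 fallback sums
// each vector separately and packs the scalars back into lanes.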
#endif // VPX_VPX_DSP_ARM_SUM_NEON_H_


@ -23,10 +23,17 @@
// b0.val[1]: 04 05 06 07 20 21 22 23
static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
int16x8x2_t b0;
#if defined(__aarch64__)
b0.val[0] = vreinterpretq_s16_s64(
vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
b0.val[1] = vreinterpretq_s16_s64(
vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
#else
b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
vreinterpret_s16_s32(vget_low_s32(a1)));
b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
vreinterpret_s16_s32(vget_high_s32(a1)));
#endif
return b0;
}
@ -57,10 +64,17 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
uint16x8x2_t b0;
#if defined(__aarch64__)
b0.val[0] = vreinterpretq_u16_u64(
vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
b0.val[1] = vreinterpretq_u16_u64(
vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
#else
b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
vreinterpret_u16_u32(vget_low_u32(a1)));
b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
vreinterpret_u16_u32(vget_high_u32(a1)));
#endif
return b0;
}
@ -569,37 +583,73 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
}
// Transpose 8x8 to a new location.
static INLINE void transpose_s16_8x8_new(const int16x8_t *a, int16x8_t *b) {
// Swap 16 bit elements.
const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) {
// Swap 16 bit elements. Goes from:
// a0: 00 01 02 03 04 05 06 07
// a1: 10 11 12 13 14 15 16 17
// a2: 20 21 22 23 24 25 26 27
// a3: 30 31 32 33 34 35 36 37
// a4: 40 41 42 43 44 45 46 47
// a5: 50 51 52 53 54 55 56 57
// a6: 60 61 62 63 64 65 66 67
// a7: 70 71 72 73 74 75 76 77
// to:
// b0.val[0]: 00 10 02 12 04 14 06 16
// b0.val[1]: 01 11 03 13 05 15 07 17
// b1.val[0]: 20 30 22 32 24 34 26 36
// b1.val[1]: 21 31 23 33 25 35 27 37
// b2.val[0]: 40 50 42 52 44 54 46 56
// b2.val[1]: 41 51 43 53 45 55 47 57
// b3.val[0]: 60 70 62 72 64 74 66 76
// b3.val[1]: 61 71 63 73 65 75 67 77
// Swap 32 bit elements.
const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
vreinterpretq_s32_s16(c1.val[0]));
const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
vreinterpretq_s32_s16(c1.val[1]));
const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
vreinterpretq_s32_s16(c3.val[0]));
const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
vreinterpretq_s32_s16(c3.val[1]));
const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);
// Swap 64 bit elements
const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
// Swap 32 bit elements resulting in:
// c0.val[0]: 00 10 20 30 04 14 24 34
// c0.val[1]: 02 12 22 32 06 16 26 36
// c1.val[0]: 01 11 21 31 05 15 25 35
// c1.val[1]: 03 13 23 33 07 17 27 37
// c2.val[0]: 40 50 60 70 44 54 64 74
// c2.val[1]: 42 52 62 72 46 56 66 76
// c3.val[0]: 41 51 61 71 45 55 65 75
// c3.val[1]: 43 53 63 73 47 57 67 77
b[0] = e0.val[0];
b[1] = e1.val[0];
b[2] = e2.val[0];
b[3] = e3.val[0];
b[4] = e0.val[1];
b[5] = e1.val[1];
b[6] = e2.val[1];
b[7] = e3.val[1];
const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
vreinterpretq_s32_s16(b1.val[0]));
const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
vreinterpretq_s32_s16(b1.val[1]));
const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
vreinterpretq_s32_s16(b3.val[0]));
const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
vreinterpretq_s32_s16(b3.val[1]));
// Swap 64 bit elements resulting in:
// d0.val[0]: 00 10 20 30 40 50 60 70
// d0.val[1]: 04 14 24 34 44 54 64 74
// d1.val[0]: 01 11 21 31 41 51 61 71
// d1.val[1]: 05 15 25 35 45 55 65 75
// d2.val[0]: 02 12 22 32 42 52 62 72
// d2.val[1]: 06 16 26 36 46 56 66 76
// d3.val[0]: 03 13 23 33 43 53 63 73
// d3.val[1]: 07 17 27 37 47 57 67 77
const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
out[0] = d0.val[0];
out[1] = d1.val[0];
out[2] = d2.val[0];
out[3] = d3.val[0];
out[4] = d0.val[1];
out[5] = d1.val[1];
out[6] = d2.val[1];
out[7] = d3.val[1];
}
static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
@ -658,6 +708,7 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
// d2.val[1]: 06 16 26 36 46 56 66 76
// d3.val[0]: 03 13 23 33 43 53 63 73
// d3.val[1]: 07 17 27 37 47 57 67 77
const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
@ -729,6 +780,7 @@ static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
// d2.val[1]: 06 16 26 36 46 56 66 76
// d3.val[0]: 03 13 23 33 43 53 63 73
// d3.val[1]: 07 17 27 37 47 57 67 77
const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);


@ -26,57 +26,44 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) {
/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
* and highbd_8_variance(). It should not.
*/
static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
int b_stride, int w, int h, unsigned int *sse,
int *sum) {
static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b,
int b_stride, int w, int h) {
int i, j;
*sum = 0;
*sse = 0;
int64_t sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
return sse;
}
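/* Plain C accumulation of the sum of squared differences over a w x h block;
 * the 64-bit accumulator avoids overflow for large block areas. */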
#if CONFIG_VP9_HIGHBITDEPTH
static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride, int w,
int h, uint64_t *sse, int64_t *sum) {
static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride, int w,
int h) {
int i, j;
int64_t sse = 0;
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
*sum = 0;
*sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
*sum += diff;
*sse += diff * diff;
sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
}
static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
const uint8_t *b8, int b_stride, int w,
int h, unsigned int *sse, int *sum) {
uint64_t sse_long = 0;
int64_t sum_long = 0;
encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long,
&sum_long);
*sse = (unsigned int)sse_long;
*sum = (int)sum_long;
return sse;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
@ -85,26 +72,23 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
const int dw = width % 16;
const int dh = height % 16;
int64_t total_sse = 0;
unsigned int sse = 0;
int sum = 0;
int x, y;
if (dw > 0) {
encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw,
height, &sse, &sum);
total_sse += sse;
total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride,
dw, height);
}
if (dh > 0) {
encoder_variance(&a[(height - dh) * a_stride], a_stride,
&b[(height - dh) * b_stride], b_stride, width - dw, dh,
&sse, &sum);
total_sse += sse;
total_sse +=
encoder_sse(&a[(height - dh) * a_stride], a_stride,
&b[(height - dh) * b_stride], b_stride, width - dw, dh);
}
for (y = 0; y < height / 16; ++y) {
const uint8_t *pa = a;
const uint8_t *pb = b;
unsigned int sse;
for (x = 0; x < width / 16; ++x) {
vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
total_sse += sse;
@ -146,22 +130,19 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
int x, y;
const int dw = width % 16;
const int dh = height % 16;
unsigned int sse = 0;
int sum = 0;
if (dw > 0) {
encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw],
b_stride, dw, height, &sse, &sum);
total_sse += sse;
total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw],
b_stride, dw, height);
}
if (dh > 0) {
encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
&b[(height - dh) * b_stride], b_stride,
width - dw, dh, &sse, &sum);
total_sse += sse;
total_sse += encoder_highbd_8_sse(&a[(height - dh) * a_stride], a_stride,
&b[(height - dh) * b_stride], b_stride,
width - dw, dh);
}
for (y = 0; y < height / 16; ++y) {
const uint8_t *pa = a;
const uint8_t *pb = b;
unsigned int sse;
for (x = 0; x < width / 16; ++x) {
vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
total_sse += sse;


@ -392,6 +392,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad4d_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c
DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c
@ -432,7 +433,9 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC


@ -392,6 +392,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad4d_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c
DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c
@ -432,7 +433,9 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC


@ -1148,6 +1148,7 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *_img) {
_img->fmt = _y4m->vpx_fmt;
_img->w = _img->d_w = _y4m->pic_w;
_img->h = _img->d_h = _y4m->pic_h;
_img->bit_depth = _y4m->bit_depth;
_img->x_chroma_shift = _y4m->dst_c_dec_h >> 1;
_img->y_chroma_shift = _y4m->dst_c_dec_v >> 1;
_img->bps = _y4m->bps;


@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 5c38ffbfa3aba5ea4d8d0ae05a50cc76ec99bed9 (Thu Jan 26 21:31:14 2023).
release: bc2965ff72af7d7b21ffeab10549fcc67ed66ccf (Tue Feb 14 02:46:51 2023).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 5c38ffbfa3aba5ea4d8d0ae05a50cc76ec99bed9
revision: bc2965ff72af7d7b21ffeab10549fcc67ed66ccf
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/


@ -181,6 +181,7 @@ files = {
'libvpx/vp9/encoder/vp9_svc_layercontext.c',
'libvpx/vp9/encoder/vp9_temporal_filter.c',
'libvpx/vp9/encoder/vp9_tokenize.c',
'libvpx/vp9/encoder/vp9_tpl_model.c',
'libvpx/vp9/encoder/vp9_treewriter.c',
'libvpx/vp9/encoder/x86/temporal_filter_sse4.c',
'libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c',
@ -450,6 +451,7 @@ files = {
'libvpx/vp9/encoder/vp9_svc_layercontext.c',
'libvpx/vp9/encoder/vp9_temporal_filter.c',
'libvpx/vp9/encoder/vp9_tokenize.c',
'libvpx/vp9/encoder/vp9_tpl_model.c',
'libvpx/vp9/encoder/vp9_treewriter.c',
'libvpx/vp9/encoder/x86/temporal_filter_sse4.c',
'libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c',
@ -699,6 +701,7 @@ files = {
'libvpx/vp9/encoder/vp9_subexp.c',
'libvpx/vp9/encoder/vp9_svc_layercontext.c',
'libvpx/vp9/encoder/vp9_tokenize.c',
'libvpx/vp9/encoder/vp9_tpl_model.c',
'libvpx/vp9/encoder/vp9_treewriter.c',
'libvpx/vp9/vp9_cx_iface.c',
'libvpx/vp9/vp9_dx_iface.c',
@ -944,6 +947,7 @@ files = {
'libvpx/vp9/encoder/vp9_subexp.c',
'libvpx/vp9/encoder/vp9_svc_layercontext.c',
'libvpx/vp9/encoder/vp9_tokenize.c',
'libvpx/vp9/encoder/vp9_tpl_model.c',
'libvpx/vp9/encoder/vp9_treewriter.c',
'libvpx/vp9/vp9_cx_iface.c',
'libvpx/vp9/vp9_dx_iface.c',
@ -1158,6 +1162,7 @@ files = {
'libvpx/vp9/encoder/vp9_svc_layercontext.c',
'libvpx/vp9/encoder/vp9_temporal_filter.c',
'libvpx/vp9/encoder/vp9_tokenize.c',
'libvpx/vp9/encoder/vp9_tpl_model.c',
'libvpx/vp9/encoder/vp9_treewriter.c',
'libvpx/vp9/vp9_cx_iface.c',
'libvpx/vp9/vp9_dx_iface.c',