From 1b617be5fa9e13f8d599e850456f2d8cde868c2b Mon Sep 17 00:00:00 2001
From: Chun-Min Chang
Date: Tue, 14 Feb 2023 22:26:52 +0000
Subject: [PATCH] Bug 1816486 - Update libvpx to bc2965f r=webrtc-reviewers,ng

Run `./mach vendor media/libvpx/moz.yaml --patch-mode=none`, as described
in media/libvpx/README_MOZILLA. The updated libvpx revision is
bc2965ff72af7d7b21ffeab10549fcc67ed66ccf.

Differential Revision: https://phabricator.services.mozilla.com/D169840
---
 media/libvpx/config/generic/vpx_config.asm | 2 +-
 media/libvpx/config/generic/vpx_config.h | 2 +-
 media/libvpx/config/linux/arm/vpx_config.asm | 2 +-
 media/libvpx/config/linux/arm/vpx_config.h | 2 +-
 .../libvpx/config/linux/arm64/vpx_config.asm | 2 +-
 media/libvpx/config/linux/arm64/vpx_config.h | 2 +-
 media/libvpx/config/linux/ia32/vpx_config.asm | 2 +-
 media/libvpx/config/linux/ia32/vpx_config.h | 2 +-
 media/libvpx/config/linux/x64/vpx_config.asm | 2 +-
 media/libvpx/config/linux/x64/vpx_config.h | 2 +-
 media/libvpx/config/mac/ia32/vpx_config.asm | 2 +-
 media/libvpx/config/mac/ia32/vpx_config.h | 2 +-
 media/libvpx/config/mac/x64/vpx_config.asm | 2 +-
 media/libvpx/config/mac/x64/vpx_config.h | 2 +-
 media/libvpx/config/vpx_version.h | 6 +-
 .../libvpx/config/win/aarch64/vpx_config.asm | 2 +-
 media/libvpx/config/win/aarch64/vpx_config.h | 2 +-
 media/libvpx/config/win/ia32/vpx_config.asm | 2 +-
 media/libvpx/config/win/ia32/vpx_config.h | 2 +-
 media/libvpx/config/win/x64/vpx_config.asm | 2 +-
 media/libvpx/config/win/x64/vpx_config.h | 2 +-
 media/libvpx/libvpx/.mailmap | 1 +
 media/libvpx/libvpx/AUTHORS | 3 +
 media/libvpx/libvpx/CHANGELOG | 36 +
 media/libvpx/libvpx/README | 2 +-
 media/libvpx/libvpx/configure | 3 +-
 media/libvpx/libvpx/libs.mk | 4 +-
 .../libvpx/libvpx/test/comp_avg_pred_test.cc | 6 +
 media/libvpx/libvpx/test/variance_test.cc | 30 +-
 .../libvpx/test/vp8_ratectrl_rtc_test.cc | 7 +-
 .../libvpx/test/vp9_ext_ratectrl_test.cc | 2 +-
 .../libvpx/test/vp9_ratectrl_rtc_test.cc | 15 +-
 media/libvpx/libvpx/vp8/common/onyx.h | 19 +-
 media/libvpx/libvpx/vp8/encoder/block.h | 3 +-
 media/libvpx/libvpx/vp8/encoder/firstpass.c | 4 +-
 media/libvpx/libvpx/vp8/encoder/onyx_if.c | 20 +-
 media/libvpx/libvpx/vp8/vp8_cx_iface.c | 21 +-
 media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc | 11 +-
 media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h | 20 +-
 .../arm/neon/vp9_diamond_search_sad_neon.c | 2 +-
 .../libvpx/vp9/encoder/vp9_encodeframe.c | 76 +-
 media/libvpx/libvpx/vp9/encoder/vp9_encoder.c | 1565 ++---------------
 media/libvpx/libvpx/vp9/encoder/vp9_encoder.h | 127 +-
 media/libvpx/libvpx/vp9/encoder/vp9_rd.h | 2 -
 media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c | 45 +-
 .../libvpx/vp9/encoder/vp9_speed_features.c | 29 +-
 .../libvpx/libvpx/vp9/encoder/vp9_tpl_model.c | 1400 +++++++++++++++
 .../libvpx/libvpx/vp9/encoder/vp9_tpl_model.h | 44 +
 media/libvpx/libvpx/vp9/ratectrl_rtc.cc | 25 +-
 media/libvpx/libvpx/vp9/ratectrl_rtc.h | 31 +-
 media/libvpx/libvpx/vp9/vp9_cx_iface.c | 24 +-
 media/libvpx/libvpx/vp9/vp9cx.mk | 2 +
 .../libvpx/vpx/internal/vpx_ratectrl_rtc.h | 3 +
 .../libvpx/vpx_dsp/arm/fdct16x16_neon.c | 6 +-
 .../libvpx/vpx_dsp/arm/fdct32x32_neon.c | 64 +-
 .../libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c | 64 +
 .../libvpx/vpx_dsp/arm/highbd_sad4d_neon.c | 233 +++
 .../libvpx/vpx_dsp/arm/highbd_sad_neon.c | 494 ++++--
 .../vpx_dsp/arm/highbd_subpel_variance_neon.c | 594 +++++++
 .../libvpx/vpx_dsp/arm/highbd_variance_neon.c | 288 +--
 .../arm/highbd_vpx_convolve_copy_neon.c | 104 +-
 media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h | 14 +
 media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c
| 863 +++------ media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c | 48 +- media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h | 33 + .../libvpx/vpx_dsp/arm/transpose_neon.h | 108 +- media/libvpx/libvpx/vpx_dsp/psnr.c | 67 +- media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk | 3 + media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk.orig | 3 + media/libvpx/libvpx/y4minput.c | 1 + media/libvpx/moz.yaml | 4 +- media/libvpx/sources.mozbuild | 5 + 72 files changed, 3743 insertions(+), 2881 deletions(-) create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c create mode 100644 media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c create mode 100644 media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c diff --git a/media/libvpx/config/generic/vpx_config.asm b/media/libvpx/config/generic/vpx_config.asm index e7aee68f608d..d4b4fdb773c6 100644 --- a/media/libvpx/config/generic/vpx_config.asm +++ b/media/libvpx/config/generic/vpx_config.asm @@ -77,7 +77,6 @@ .equ CONFIG_MULTI_RES_ENCODING , 1 .equ CONFIG_TEMPORAL_DENOISING , 1 .equ CONFIG_VP9_TEMPORAL_DENOISING , 0 -.equ CONFIG_CONSISTENT_RECODE , 0 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 .equ CONFIG_VP9_HIGHBITDEPTH , 0 .equ CONFIG_BETTER_HW_COMPATIBILITY , 0 @@ -90,4 +89,5 @@ .equ CONFIG_EMULATE_HARDWARE , 0 .equ CONFIG_NON_GREEDY_MV , 0 .equ CONFIG_RATE_CTRL , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 .section .note.GNU-stack,"",%progbits diff --git a/media/libvpx/config/generic/vpx_config.h b/media/libvpx/config/generic/vpx_config.h index 89b205bc66f3..a0ddbdbb6393 100644 --- a/media/libvpx/config/generic/vpx_config.h +++ b/media/libvpx/config/generic/vpx_config.h @@ -86,7 +86,6 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 -#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -99,6 +98,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 8192 #define DECODE_HEIGHT_LIMIT 4608 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/linux/arm/vpx_config.asm b/media/libvpx/config/linux/arm/vpx_config.asm index fa4a931d277d..8997c1e27d5f 100644 --- a/media/libvpx/config/linux/arm/vpx_config.asm +++ b/media/libvpx/config/linux/arm/vpx_config.asm @@ -77,7 +77,6 @@ .equ CONFIG_MULTI_RES_ENCODING , 1 .equ CONFIG_TEMPORAL_DENOISING , 1 .equ CONFIG_VP9_TEMPORAL_DENOISING , 0 -.equ CONFIG_CONSISTENT_RECODE , 0 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 .equ CONFIG_VP9_HIGHBITDEPTH , 0 .equ CONFIG_BETTER_HW_COMPATIBILITY , 0 @@ -90,4 +89,5 @@ .equ CONFIG_EMULATE_HARDWARE , 0 .equ CONFIG_NON_GREEDY_MV , 0 .equ CONFIG_RATE_CTRL , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 .section .note.GNU-stack,"",%progbits diff --git a/media/libvpx/config/linux/arm/vpx_config.h b/media/libvpx/config/linux/arm/vpx_config.h index adb8cd8aaf3f..d478b445c366 100644 --- a/media/libvpx/config/linux/arm/vpx_config.h +++ b/media/libvpx/config/linux/arm/vpx_config.h @@ -86,7 +86,6 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 -#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -99,6 +98,7 @@ #define 
CONFIG_EMULATE_HARDWARE 0 #define CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 8192 #define DECODE_HEIGHT_LIMIT 4608 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/linux/arm64/vpx_config.asm b/media/libvpx/config/linux/arm64/vpx_config.asm index c8e1926d703c..1431e51eb7ce 100644 --- a/media/libvpx/config/linux/arm64/vpx_config.asm +++ b/media/libvpx/config/linux/arm64/vpx_config.asm @@ -77,7 +77,6 @@ .equ CONFIG_MULTI_RES_ENCODING , 1 .equ CONFIG_TEMPORAL_DENOISING , 1 .equ CONFIG_VP9_TEMPORAL_DENOISING , 0 -.equ CONFIG_CONSISTENT_RECODE , 0 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 .equ CONFIG_VP9_HIGHBITDEPTH , 0 .equ CONFIG_BETTER_HW_COMPATIBILITY , 0 @@ -90,4 +89,5 @@ .equ CONFIG_EMULATE_HARDWARE , 0 .equ CONFIG_NON_GREEDY_MV , 0 .equ CONFIG_RATE_CTRL , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 .section .note.GNU-stack,"",%progbits diff --git a/media/libvpx/config/linux/arm64/vpx_config.h b/media/libvpx/config/linux/arm64/vpx_config.h index c242b7973f57..f0eac244a5e7 100644 --- a/media/libvpx/config/linux/arm64/vpx_config.h +++ b/media/libvpx/config/linux/arm64/vpx_config.h @@ -86,7 +86,6 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 -#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -99,6 +98,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 8192 #define DECODE_HEIGHT_LIMIT 4608 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/linux/ia32/vpx_config.asm b/media/libvpx/config/linux/ia32/vpx_config.asm index 46943b13e08b..c3924873a3d6 100644 --- a/media/libvpx/config/linux/ia32/vpx_config.asm +++ b/media/libvpx/config/linux/ia32/vpx_config.asm @@ -74,7 +74,6 @@ %define CONFIG_MULTI_RES_ENCODING 1 %define CONFIG_TEMPORAL_DENOISING 1 %define CONFIG_VP9_TEMPORAL_DENOISING 0 -%define CONFIG_CONSISTENT_RECODE 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 %define CONFIG_VP9_HIGHBITDEPTH 0 %define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -87,3 +86,4 @@ %define CONFIG_EMULATE_HARDWARE 0 %define CONFIG_NON_GREEDY_MV 0 %define CONFIG_RATE_CTRL 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/media/libvpx/config/linux/ia32/vpx_config.h b/media/libvpx/config/linux/ia32/vpx_config.h index 0515701f7aba..17ce90965316 100644 --- a/media/libvpx/config/linux/ia32/vpx_config.h +++ b/media/libvpx/config/linux/ia32/vpx_config.h @@ -86,7 +86,6 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 -#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -99,6 +98,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 8192 #define DECODE_HEIGHT_LIMIT 4608 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/linux/x64/vpx_config.asm b/media/libvpx/config/linux/x64/vpx_config.asm index dcf5bc4f4e4b..f9963eecd576 100644 --- a/media/libvpx/config/linux/x64/vpx_config.asm +++ b/media/libvpx/config/linux/x64/vpx_config.asm @@ -74,7 +74,6 @@ %define CONFIG_MULTI_RES_ENCODING 1 %define CONFIG_TEMPORAL_DENOISING 1 %define CONFIG_VP9_TEMPORAL_DENOISING 0 
-%define CONFIG_CONSISTENT_RECODE 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 %define CONFIG_VP9_HIGHBITDEPTH 0 %define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -87,3 +86,4 @@ %define CONFIG_EMULATE_HARDWARE 0 %define CONFIG_NON_GREEDY_MV 0 %define CONFIG_RATE_CTRL 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/media/libvpx/config/linux/x64/vpx_config.h b/media/libvpx/config/linux/x64/vpx_config.h index 965715ac1a97..b2877a17f153 100644 --- a/media/libvpx/config/linux/x64/vpx_config.h +++ b/media/libvpx/config/linux/x64/vpx_config.h @@ -86,7 +86,6 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 -#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -99,6 +98,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 8192 #define DECODE_HEIGHT_LIMIT 4608 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/mac/ia32/vpx_config.asm b/media/libvpx/config/mac/ia32/vpx_config.asm index 46943b13e08b..c3924873a3d6 100644 --- a/media/libvpx/config/mac/ia32/vpx_config.asm +++ b/media/libvpx/config/mac/ia32/vpx_config.asm @@ -74,7 +74,6 @@ %define CONFIG_MULTI_RES_ENCODING 1 %define CONFIG_TEMPORAL_DENOISING 1 %define CONFIG_VP9_TEMPORAL_DENOISING 0 -%define CONFIG_CONSISTENT_RECODE 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 %define CONFIG_VP9_HIGHBITDEPTH 0 %define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -87,3 +86,4 @@ %define CONFIG_EMULATE_HARDWARE 0 %define CONFIG_NON_GREEDY_MV 0 %define CONFIG_RATE_CTRL 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/media/libvpx/config/mac/ia32/vpx_config.h b/media/libvpx/config/mac/ia32/vpx_config.h index 0515701f7aba..17ce90965316 100644 --- a/media/libvpx/config/mac/ia32/vpx_config.h +++ b/media/libvpx/config/mac/ia32/vpx_config.h @@ -86,7 +86,6 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 -#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -99,6 +98,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 8192 #define DECODE_HEIGHT_LIMIT 4608 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/mac/x64/vpx_config.asm b/media/libvpx/config/mac/x64/vpx_config.asm index dcf5bc4f4e4b..f9963eecd576 100644 --- a/media/libvpx/config/mac/x64/vpx_config.asm +++ b/media/libvpx/config/mac/x64/vpx_config.asm @@ -74,7 +74,6 @@ %define CONFIG_MULTI_RES_ENCODING 1 %define CONFIG_TEMPORAL_DENOISING 1 %define CONFIG_VP9_TEMPORAL_DENOISING 0 -%define CONFIG_CONSISTENT_RECODE 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 %define CONFIG_VP9_HIGHBITDEPTH 0 %define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -87,3 +86,4 @@ %define CONFIG_EMULATE_HARDWARE 0 %define CONFIG_NON_GREEDY_MV 0 %define CONFIG_RATE_CTRL 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/media/libvpx/config/mac/x64/vpx_config.h b/media/libvpx/config/mac/x64/vpx_config.h index 965715ac1a97..b2877a17f153 100644 --- a/media/libvpx/config/mac/x64/vpx_config.h +++ b/media/libvpx/config/mac/x64/vpx_config.h @@ -86,7 +86,6 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define 
CONFIG_VP9_TEMPORAL_DENOISING 0 -#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -99,6 +98,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 8192 #define DECODE_HEIGHT_LIMIT 4608 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/vpx_version.h b/media/libvpx/config/vpx_version.h index a90ab60d9fb2..a58bfac01cfe 100644 --- a/media/libvpx/config/vpx_version.h +++ b/media/libvpx/config/vpx_version.h @@ -1,8 +1,8 @@ // This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 12 +#define VERSION_MINOR 13 #define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.12.0" -#define VERSION_STRING " v1.12.0" +#define VERSION_STRING_NOSP "v1.13.0" +#define VERSION_STRING " v1.13.0" diff --git a/media/libvpx/config/win/aarch64/vpx_config.asm b/media/libvpx/config/win/aarch64/vpx_config.asm index 00c94b294d1b..4018002a5f7c 100644 --- a/media/libvpx/config/win/aarch64/vpx_config.asm +++ b/media/libvpx/config/win/aarch64/vpx_config.asm @@ -77,7 +77,6 @@ .equ CONFIG_MULTI_RES_ENCODING , 1 .equ CONFIG_TEMPORAL_DENOISING , 1 .equ CONFIG_VP9_TEMPORAL_DENOISING , 0 -.equ CONFIG_CONSISTENT_RECODE , 0 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 .equ CONFIG_VP9_HIGHBITDEPTH , 0 .equ CONFIG_BETTER_HW_COMPATIBILITY , 0 @@ -90,4 +89,5 @@ .equ CONFIG_EMULATE_HARDWARE , 0 .equ CONFIG_NON_GREEDY_MV , 0 .equ CONFIG_RATE_CTRL , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 .section .note.GNU-stack,"",%progbits diff --git a/media/libvpx/config/win/aarch64/vpx_config.h b/media/libvpx/config/win/aarch64/vpx_config.h index d61231df1502..c6cdf1113f74 100644 --- a/media/libvpx/config/win/aarch64/vpx_config.h +++ b/media/libvpx/config/win/aarch64/vpx_config.h @@ -86,7 +86,6 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 -#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -99,6 +98,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 8192 #define DECODE_HEIGHT_LIMIT 4608 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/win/ia32/vpx_config.asm b/media/libvpx/config/win/ia32/vpx_config.asm index 4e0936a17a29..e2b99053a30a 100755 --- a/media/libvpx/config/win/ia32/vpx_config.asm +++ b/media/libvpx/config/win/ia32/vpx_config.asm @@ -74,7 +74,6 @@ %define CONFIG_MULTI_RES_ENCODING 1 %define CONFIG_TEMPORAL_DENOISING 1 %define CONFIG_VP9_TEMPORAL_DENOISING 0 -%define CONFIG_CONSISTENT_RECODE 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 %define CONFIG_VP9_HIGHBITDEPTH 0 %define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -87,3 +86,4 @@ %define CONFIG_EMULATE_HARDWARE 0 %define CONFIG_NON_GREEDY_MV 0 %define CONFIG_RATE_CTRL 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/media/libvpx/config/win/ia32/vpx_config.h b/media/libvpx/config/win/ia32/vpx_config.h index 0dfdb386d7ee..77ca1955abbf 100644 --- a/media/libvpx/config/win/ia32/vpx_config.h +++ b/media/libvpx/config/win/ia32/vpx_config.h @@ -86,7 +86,6 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define 
CONFIG_VP9_TEMPORAL_DENOISING 0 -#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -99,6 +98,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 8192 #define DECODE_HEIGHT_LIMIT 4608 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/config/win/x64/vpx_config.asm b/media/libvpx/config/win/x64/vpx_config.asm index 5843076b08ef..c9bdc4a0247a 100644 --- a/media/libvpx/config/win/x64/vpx_config.asm +++ b/media/libvpx/config/win/x64/vpx_config.asm @@ -74,7 +74,6 @@ %define CONFIG_MULTI_RES_ENCODING 1 %define CONFIG_TEMPORAL_DENOISING 1 %define CONFIG_VP9_TEMPORAL_DENOISING 0 -%define CONFIG_CONSISTENT_RECODE 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 %define CONFIG_VP9_HIGHBITDEPTH 0 %define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -87,3 +86,4 @@ %define CONFIG_EMULATE_HARDWARE 0 %define CONFIG_NON_GREEDY_MV 0 %define CONFIG_RATE_CTRL 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/media/libvpx/config/win/x64/vpx_config.h b/media/libvpx/config/win/x64/vpx_config.h index 75269ecee676..e16e820ffb2b 100644 --- a/media/libvpx/config/win/x64/vpx_config.h +++ b/media/libvpx/config/win/x64/vpx_config.h @@ -86,7 +86,6 @@ #define CONFIG_MULTI_RES_ENCODING 1 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 -#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 0 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -99,6 +98,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 8192 #define DECODE_HEIGHT_LIMIT 4608 #endif /* VPX_CONFIG_H */ diff --git a/media/libvpx/libvpx/.mailmap b/media/libvpx/libvpx/.mailmap index 3593de4b9ce7..bb0ddd95b249 100644 --- a/media/libvpx/libvpx/.mailmap +++ b/media/libvpx/libvpx/.mailmap @@ -25,6 +25,7 @@ Johann Koenig Johann John Koleszar Joshua Litt +Konstantinos Margaritis Marco Paniconi Marco Paniconi Martin Storsjö diff --git a/media/libvpx/libvpx/AUTHORS b/media/libvpx/libvpx/AUTHORS index 536e0e7cf0d7..2db4a113e4ca 100644 --- a/media/libvpx/libvpx/AUTHORS +++ b/media/libvpx/libvpx/AUTHORS @@ -21,6 +21,7 @@ Andoni Morales Alastruey Andres Mejia Andrew Lewis Andrew Russell +Andrew Salkeld Angie Chen Angie Chiang Anton Venema @@ -175,7 +176,9 @@ Rob Bradford Ronald S. Bultje Rui Ueyama Sai Deng +Salome Thirot Sami Pietilä +Sam James Sarah Parker Sasi Inguva Scott Graham diff --git a/media/libvpx/libvpx/CHANGELOG b/media/libvpx/libvpx/CHANGELOG index 4f5dcbd44ee7..3fb2d19bbefa 100644 --- a/media/libvpx/libvpx/CHANGELOG +++ b/media/libvpx/libvpx/CHANGELOG @@ -1,3 +1,39 @@ +2023-01-31 v1.13.0 "Ugly Duckling" + This release includes more Neon and AVX2 optimizations, adds a new codec + control to set per frame QP, upgrades GoogleTest to v1.12.1, and includes + numerous bug fixes. + + - Upgrading: + This release is ABI incompatible with the previous release. + + New codec control VP9E_SET_QUANTIZER_ONE_PASS to set per frame QP. + + GoogleTest is upgraded to v1.12.1. + + .clang-format is upgraded to clang-format-11. + + VPX_EXT_RATECTRL_ABI_VERSION was bumped due to incompatible changes to the + feature of using external rate control models for vp9. + + - Enhancement: + Numerous improvements on Neon optimizations. + Numerous improvements on AVX2 optimizations. 
+ Additional ARM targets added for Visual Studio. + + - Bug fixes: + Fix to calculating internal stats when frame dropped. + Fix to segfault for external resize test in vp9. + Fix to build system with replacing egrep with grep -E. + Fix to a few bugs with external RTC rate control library. + Fix to make SVC work with VBR. + Fix to key frame setting in VP9 external RC. + Fix to -Wimplicit-int (Clang 16). + Fix to VP8 external RC for buffer levels. + Fix to VP8 external RC for dynamic update of layers. + Fix to VP9 auto level. + Fix to off-by-one error of max w/h in validate_config. + Fix to make SVC work for Profile 1. + 2022-06-17 v1.12.0 "Torrent Duck" This release adds optimizations for Loongarch, adds support for vp8 in the real-time rate control library, upgrades GoogleTest to v1.11.0, updates diff --git a/media/libvpx/libvpx/README b/media/libvpx/libvpx/README index 477a145ba32a..e360df05f629 100644 --- a/media/libvpx/libvpx/README +++ b/media/libvpx/libvpx/README @@ -1,4 +1,4 @@ -v1.12.0 Torrent Duck +v1.13.0 Ugly Duckling Welcome to the WebM VP8/VP9 Codec SDK! diff --git a/media/libvpx/libvpx/configure b/media/libvpx/libvpx/configure index ae289f77b4a1..890ad3968a35 100755 --- a/media/libvpx/libvpx/configure +++ b/media/libvpx/libvpx/configure @@ -293,6 +293,7 @@ EXPERIMENT_LIST=" emulate_hardware non_greedy_mv rate_ctrl + collect_component_timing " CONFIG_LIST=" dependency_tracking @@ -342,7 +343,6 @@ CONFIG_LIST=" multi_res_encoding temporal_denoising vp9_temporal_denoising - consistent_recode coefficient_range_checking vp9_highbitdepth better_hw_compatibility @@ -406,7 +406,6 @@ CMDLINE_SELECT=" multi_res_encoding temporal_denoising vp9_temporal_denoising - consistent_recode coefficient_range_checking better_hw_compatibility vp9_highbitdepth diff --git a/media/libvpx/libvpx/libs.mk b/media/libvpx/libvpx/libs.mk index fb6fbbeb202c..1f7f03aa38ec 100644 --- a/media/libvpx/libvpx/libs.mk +++ b/media/libvpx/libvpx/libs.mk @@ -312,8 +312,8 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) # To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current # SO_VERSION_* then follow the rules in the link to detemine the new version # (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1 -SO_VERSION_MAJOR := 7 -SO_VERSION_MINOR := 1 +SO_VERSION_MAJOR := 8 +SO_VERSION_MINOR := 0 SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib diff --git a/media/libvpx/libvpx/test/comp_avg_pred_test.cc b/media/libvpx/libvpx/test/comp_avg_pred_test.cc index 70aeab8d7e35..f747c3524e93 100644 --- a/media/libvpx/libvpx/test/comp_avg_pred_test.cc +++ b/media/libvpx/libvpx/test/comp_avg_pred_test.cc @@ -260,5 +260,11 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(&highbd_wrapper)); #endif // HAVE_SSE2 +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, AvgPredTestHBD, + ::testing::Values(&highbd_wrapper)); +#endif // HAVE_NEON + #endif // CONFIG_VP9_HIGHBITDEPTH } // namespace diff --git a/media/libvpx/libvpx/test/variance_test.cc b/media/libvpx/libvpx/test/variance_test.cc index a6c8ef048019..33f09209f4c4 100644 --- a/media/libvpx/libvpx/test/variance_test.cc +++ b/media/libvpx/libvpx/test/variance_test.cc @@ -1572,6 +1572,10 @@ INSTANTIATE_TEST_SUITE_P( 12), SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon, 12), + SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_neon, + 12), + SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_neon, + 12), SubpelVarianceParams(6, 6, 
&vpx_highbd_10_sub_pixel_variance64x64_neon, 10), SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon, @@ -1594,6 +1598,10 @@ INSTANTIATE_TEST_SUITE_P( 10), SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon, 10), + SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_neon, + 10), + SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_neon, + 10), SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon, 8), SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon, @@ -1613,7 +1621,9 @@ INSTANTIATE_TEST_SUITE_P( SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon, 8), SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8), - SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8), + SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_neon, 8), + SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_neon, 8))); INSTANTIATE_TEST_SUITE_P( @@ -1652,6 +1662,12 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_neon, 12), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_12_sub_pixel_avg_variance4x8_neon, + 12), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_12_sub_pixel_avg_variance4x4_neon, + 12), SubpelAvgVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_neon, 10), @@ -1685,6 +1701,12 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_neon, 10), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_10_sub_pixel_avg_variance4x8_neon, + 10), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_10_sub_pixel_avg_variance4x4_neon, + 10), SubpelAvgVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_neon, 8), @@ -1717,6 +1739,12 @@ INSTANTIATE_TEST_SUITE_P( 8), SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_neon, + 8), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_8_sub_pixel_avg_variance4x8_neon, + 8), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_8_sub_pixel_avg_variance4x4_neon, 8))); #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc index 7410f3c01dcf..56c26a99f4fc 100644 --- a/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc +++ b/media/libvpx/libvpx/test/vp8_ratectrl_rtc_test.cc @@ -127,14 +127,15 @@ class Vp8RcInterfaceTest encoder->Control(VP8E_SET_CPUUSED, -6); encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1); encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); - } else if (frame_params_.frame_type == INTER_FRAME) { + } else if (frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) { // Disable golden frame update. frame_flags_ |= VP8_EFLAG_NO_UPD_GF; frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; } } - frame_params_.frame_type = - video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? 
libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; encoder_exit_ = video->frame() == test_video_.frames; } diff --git a/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc index 2bfa6281d738..739c0b7f8e48 100644 --- a/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc +++ b/media/libvpx/libvpx/test/vp9_ext_ratectrl_test.cc @@ -41,7 +41,7 @@ constexpr int kDefaultMaxGfInterval = 16; constexpr int kReadMinGfInterval = 5; constexpr int kReadMaxGfInterval = 13; const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv"; -const double kPsnrThreshold = 30.50; +const double kPsnrThreshold = 30.4; struct ToyRateCtrl { int magic_number; diff --git a/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc index 1d1a78f43de0..cce73fcce260 100644 --- a/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc +++ b/media/libvpx/libvpx/test/vp9_ratectrl_rtc_test.cc @@ -57,9 +57,11 @@ class RcInterfaceTest encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); } - frame_params_.frame_type = - video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; - if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) { + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; + if (rc_cfg_.rc_mode == VPX_CBR && + frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) { // Disable golden frame update. frame_flags_ |= VP8_EFLAG_NO_UPD_GF; frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; @@ -183,8 +185,9 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_SVC, 1); encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); } - frame_params_.frame_type = - video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; encoder_exit_ = video->frame() == kNumFrames; current_superframe_ = video->frame(); if (dynamic_spatial_layers_ == 1) { @@ -247,7 +250,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, else frame_params_.temporal_layer_id = 0; rc_api_->ComputeQP(frame_params_); - frame_params_.frame_type = INTER_FRAME; + frame_params_.frame_type = libvpx::RcFrameType::kInterFrame; rc_api_->PostEncodeUpdate(sizes_[sl]); } } diff --git a/media/libvpx/libvpx/vp8/common/onyx.h b/media/libvpx/libvpx/vp8/common/onyx.h index 05c72df3faa4..8c35e433e70b 100644 --- a/media/libvpx/libvpx/vp8/common/onyx.h +++ b/media/libvpx/libvpx/vp8/common/onyx.h @@ -26,13 +26,6 @@ struct VP8_COMP; /* Create/destroy static data structures. 
*/ -typedef enum { - NORMAL = 0, - FOURFIVE = 1, - THREEFIVE = 2, - ONETWO = 3 -} VPX_SCALING; - typedef enum { USAGE_LOCAL_FILE_PLAYBACK = 0x0, USAGE_STREAM_FROM_SERVER = 0x1, @@ -58,19 +51,19 @@ typedef enum { #include static INLINE void Scale2Ratio(int mode, int *hr, int *hs) { switch (mode) { - case NORMAL: + case VP8E_NORMAL: *hr = 1; *hs = 1; break; - case FOURFIVE: + case VP8E_FOURFIVE: *hr = 4; *hs = 5; break; - case THREEFIVE: + case VP8E_THREEFIVE: *hr = 3; *hs = 5; break; - case ONETWO: + case VP8E_ONETWO: *hr = 1; *hs = 2; break; @@ -273,8 +266,8 @@ int vp8_set_roimap(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int threshold[4]); int vp8_set_active_map(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols); -int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode); +int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode); int vp8_get_quantizer(struct VP8_COMP *cpi); #ifdef __cplusplus diff --git a/media/libvpx/libvpx/vp8/encoder/block.h b/media/libvpx/libvpx/vp8/encoder/block.h index f0efd3e1e255..1bc5ef75bc29 100644 --- a/media/libvpx/libvpx/vp8/encoder/block.h +++ b/media/libvpx/libvpx/vp8/encoder/block.h @@ -92,8 +92,7 @@ typedef struct macroblock { signed int last_act_zbin_adj; int *mvcost[2]; - /* MSVC generates code that thinks this is 16-byte aligned */ - DECLARE_ALIGNED(16, int*, mvsadcost[2]); + int *mvsadcost[2]; int (*mbmode_cost)[MB_MODE_COUNT]; int (*intra_uv_mode_cost)[MB_MODE_COUNT]; int (*bmode_costs)[10][10]; diff --git a/media/libvpx/libvpx/vp8/encoder/firstpass.c b/media/libvpx/libvpx/vp8/encoder/firstpass.c index 65d2681c91d8..4149fb4bf8fe 100644 --- a/media/libvpx/libvpx/vp8/encoder/firstpass.c +++ b/media/libvpx/libvpx/vp8/encoder/firstpass.c @@ -2990,8 +2990,8 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { } /* Set back to unscaled by defaults */ - cpi->common.horiz_scale = NORMAL; - cpi->common.vert_scale = NORMAL; + cpi->common.horiz_scale = VP8E_NORMAL; + cpi->common.vert_scale = VP8E_NORMAL; /* Calculate Average bits per frame. */ av_bits_per_frame = cpi->oxcf.target_bandwidth / diff --git a/media/libvpx/libvpx/vp8/encoder/onyx_if.c b/media/libvpx/libvpx/vp8/encoder/onyx_if.c index 4bbeadef013c..bcf522702973 100644 --- a/media/libvpx/libvpx/vp8/encoder/onyx_if.c +++ b/media/libvpx/libvpx/vp8/encoder/onyx_if.c @@ -1667,7 +1667,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { cm->sharpness_level = cpi->oxcf.Sharpness; - if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) { + if (cm->horiz_scale != VP8E_NORMAL || cm->vert_scale != VP8E_NORMAL) { int hr, hs, vr, vs; Scale2Ratio(cm->horiz_scale, &hr, &hs); @@ -2504,15 +2504,17 @@ static int resize_key_frame(VP8_COMP *cpi) { if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100)) { cm->horiz_scale = - (cm->horiz_scale < ONETWO) ? cm->horiz_scale + 1 : ONETWO; - cm->vert_scale = (cm->vert_scale < ONETWO) ? cm->vert_scale + 1 : ONETWO; + (cm->horiz_scale < VP8E_ONETWO) ? cm->horiz_scale + 1 : VP8E_ONETWO; + cm->vert_scale = + (cm->vert_scale < VP8E_ONETWO) ? cm->vert_scale + 1 : VP8E_ONETWO; } /* Should we now start scaling back up */ else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100)) { cm->horiz_scale = - (cm->horiz_scale > NORMAL) ? cm->horiz_scale - 1 : NORMAL; - cm->vert_scale = (cm->vert_scale > NORMAL) ? 
cm->vert_scale - 1 : NORMAL; + (cm->horiz_scale > VP8E_NORMAL) ? cm->horiz_scale - 1 : VP8E_NORMAL; + cm->vert_scale = + (cm->vert_scale > VP8E_NORMAL) ? cm->vert_scale - 1 : VP8E_NORMAL; } /* Get the new height and width */ @@ -5380,15 +5382,15 @@ int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, } } -int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode) { - if (horiz_mode <= ONETWO) { +int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode) { + if (horiz_mode <= VP8E_ONETWO) { cpi->common.horiz_scale = horiz_mode; } else { return -1; } - if (vert_mode <= ONETWO) { + if (vert_mode <= VP8E_ONETWO) { cpi->common.vert_scale = vert_mode; } else { return -1; diff --git a/media/libvpx/libvpx/vp8/vp8_cx_iface.c b/media/libvpx/libvpx/vp8/vp8_cx_iface.c index a12ea6c42a20..a9d1f8005da8 100644 --- a/media/libvpx/libvpx/vp8/vp8_cx_iface.c +++ b/media/libvpx/libvpx/vp8/vp8_cx_iface.c @@ -947,19 +947,10 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, if (img != NULL) { res = image2yuvconfig(img, &sd); - if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) { - /* from vpx_encoder.h for g_w/g_h: - "Note that the frames passed as input to the encoder must have this - resolution" - */ - ctx->base.err_detail = "Invalid input frame resolution"; - res = VPX_CODEC_INVALID_PARAM; - } else { - if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, - &sd, dst_time_stamp, dst_end_time_stamp)) { - VP8_COMP *cpi = (VP8_COMP *)ctx->cpi; - res = update_error_state(ctx, &cpi->common.error); - } + if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, &sd, + dst_time_stamp, dst_end_time_stamp)) { + VP8_COMP *cpi = (VP8_COMP *)ctx->cpi; + res = update_error_state(ctx, &cpi->common.error); } /* reset for next frame */ @@ -1233,8 +1224,8 @@ static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx, if (data) { int res; vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data; - res = vp8_set_internal_size(ctx->cpi, (VPX_SCALING)scalemode.h_scaling_mode, - (VPX_SCALING)scalemode.v_scaling_mode); + res = vp8_set_internal_size(ctx->cpi, scalemode.h_scaling_mode, + scalemode.v_scaling_mode); if (!res) { /*force next frame a key frame to effect scaling mode */ diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc index f3f42529dbaa..c36cfea4855d 100644 --- a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc +++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.cc @@ -10,7 +10,9 @@ #include #include +#include "vp8/common/common.h" #include "vp8/vp8_ratectrl_rtc.h" +#include "vp8/encoder/onyx_int.h" #include "vp8/encoder/ratectrl.h" #include "vpx_ports/system_state.h" @@ -65,6 +67,13 @@ std::unique_ptr VP8RateControlRTC::Create( return rc_api; } +VP8RateControlRTC::~VP8RateControlRTC() { + if (cpi_) { + vpx_free(cpi_->gf_active_flags); + vpx_free(cpi_); + } +} + void VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) { VP8_COMMON *cm = &cpi_->common; VP8_CONFIG *oxcf = &cpi_->oxcf; @@ -203,7 +212,7 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { vp8_restore_layer_context(cpi_, layer); vp8_new_framerate(cpi_, cpi_->layer_context[layer].framerate); } - cm->frame_type = frame_params.frame_type; + cm->frame_type = static_cast(frame_params.frame_type); cm->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 
1 : 0; cm->refresh_alt_ref_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; if (cm->frame_type == KEY_FRAME && cpi_->common.current_video_frame > 0) { diff --git a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h index def7dd8f9e10..0e81592eca2c 100644 --- a/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h +++ b/media/libvpx/libvpx/vp8/vp8_ratectrl_rtc.h @@ -12,23 +12,24 @@ #define VPX_VP8_RATECTRL_RTC_H_ #include +#include #include -#include "vp8/encoder/onyx_int.h" -#include "vp8/common/common.h" #include "vpx/internal/vpx_ratectrl_rtc.h" +struct VP8_COMP; + namespace libvpx { struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig { public: VP8RateControlRtcConfig() { - vp8_zero(layer_target_bitrate); - vp8_zero(ts_rate_decimator); + memset(&layer_target_bitrate, 0, sizeof(layer_target_bitrate)); + memset(&ts_rate_decimator, 0, sizeof(ts_rate_decimator)); } }; struct VP8FrameParamsQpRTC { - FRAME_TYPE frame_type; + RcFrameType frame_type; int temporal_layer_id; }; @@ -36,12 +37,7 @@ class VP8RateControlRTC { public: static std::unique_ptr Create( const VP8RateControlRtcConfig &cfg); - ~VP8RateControlRTC() { - if (cpi_) { - vpx_free(cpi_->gf_active_flags); - vpx_free(cpi_); - } - } + ~VP8RateControlRTC(); void UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg); // GetQP() needs to be called after ComputeQP() to get the latest QP @@ -54,7 +50,7 @@ class VP8RateControlRTC { private: VP8RateControlRTC() {} void InitRateControl(const VP8RateControlRtcConfig &cfg); - VP8_COMP *cpi_; + struct VP8_COMP *cpi_; int q_; }; diff --git a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c index 33753f77b0a9..997775a668e1 100644 --- a/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c +++ b/media/libvpx/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -220,7 +220,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, // Look up the component cost of the residual motion vector { uint32_t cost[4]; - int16_t __attribute__((aligned(16))) rowcol[8]; + DECLARE_ALIGNED(16, int16_t, rowcol[8]); vst1q_s16(rowcol, v_diff_mv_w); // Note: This is a use case for gather instruction diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c index 1483ac069dd4..a522097e614f 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encodeframe.c @@ -1980,6 +1980,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t best_rd = INT64_MAX; vpx_clear_system_state(); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_sb_modes_time); +#endif // Use the lower precision, but faster, 32x32 fdct for mode selection. 
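
Note on the timing hooks that begin here: the start_timing()/end_timing() pairs added throughout this patch compile in only under the new collect_component_timing experiment (added to EXPERIMENT_LIST in configure above). The macro definitions live in vp9_encoder.h and are not part of this patch excerpt; the following is a minimal sketch of the usual pattern, assuming a per-component vpx_usec_timer plus a per-frame accumulator on VP9_COMP (the component_timer and frame_component_time field names are illustrative, not taken from the patch):

  #if CONFIG_COLLECT_COMPONENT_TIMING
  #include "vpx_ports/vpx_timer.h"
  /* Start the stopwatch for one encoder component. */
  #define start_timing(cpi, component) \
    vpx_usec_timer_start(&(cpi)->component_timer[component])
  /* Stop it and accumulate the elapsed microseconds for this frame. */
  #define end_timing(cpi, component)                                  \
    do {                                                              \
      vpx_usec_timer_mark(&(cpi)->component_timer[component]);        \
      (cpi)->frame_component_time[component] +=                       \
          vpx_usec_timer_elapsed(&(cpi)->component_timer[component]); \
    } while (0)
  #endif

Since every call site in this patch is itself wrapped in #if CONFIG_COLLECT_COMPONENT_TIMING, and the vendored vpx_config.h files above all define that symbol to 0, the hooks add no cost to Firefox builds.
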
x->use_lp32x32fdct = 1; @@ -2047,15 +2050,27 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd); } else { if (bsize >= BLOCK_8X8) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rd_pick_inter_mode_sb_time); +#endif if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize, ctx, best_rd); else vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost, bsize, ctx, best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rd_pick_inter_mode_sb_time); +#endif } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time); +#endif vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost, bsize, ctx, best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time); +#endif } } @@ -2078,6 +2093,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, ctx->rate = rd_cost->rate; ctx->dist = rd_cost->dist; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_sb_modes_time); +#endif } #endif // !CONFIG_REALTIME_ONLY @@ -4411,8 +4429,14 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (should_encode_sb && pc_tree->index != 3) { int output_enabled = (bsize == BLOCK_64X64); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_sb_time); +#endif encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_sb_time); +#endif #if CONFIG_RATE_CTRL if (oxcf->use_simple_encode_api) { // Store partition, motion vector of the superblock. @@ -4539,8 +4563,15 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, &x->min_partition_size, &x->max_partition_size); } td->pc_root->none.rdcost = 0; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_partition_time); +#endif rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rdc, dummy_rdc, td->pc_root); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_partition_time); +#endif } (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, sb_col_in_tile, num_sb_cols); @@ -5810,14 +5841,7 @@ void vp9_init_tile_data(VP9_COMP *cpi) { for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) { tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; -#if CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; - } -#endif // CONFIG_RATE_CTRL -#if CONFIG_CONSISTENT_RECODE tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; -#endif // CONFIG_CONSISTENT_RECODE tile_data->mode_map[i][j] = j; } } @@ -6037,9 +6061,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH x->inv_txfm_add = xd->lossless ? 
vp9_iwht4x4_add : vp9_idct4x4_add; -#if CONFIG_CONSISTENT_RECODE x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1; -#endif if (xd->lossless) x->optimize = 0; x->sharpness = cpi->oxcf.sharpness; x->adjust_rdmult_by_segment = (cpi->oxcf.aq_mode == VARIANCE_AQ); @@ -6184,13 +6206,11 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { return sum_delta / (cm->mi_rows * cm->mi_cols); } -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL static void restore_encode_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - const int tile_cols = 1 << cm->log2_tile_cols; - const int tile_rows = 1 << cm->log2_tile_rows; - int tile_col, tile_row; + int tile_idx; int i, j; + TileDataEnc *tile_data; RD_OPT *rd_opt = &cpi->rd; for (i = 0; i < MAX_REF_FRAMES; i++) { for (j = 0; j < REFERENCE_MODES; j++) @@ -6201,35 +6221,19 @@ static void restore_encode_params(VP9_COMP *cpi) { rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j]; } - if (cpi->tile_data != NULL) { - for (tile_row = 0; tile_row < tile_rows; ++tile_row) - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileDataEnc *tile_data = - &cpi->tile_data[tile_row * tile_cols + tile_col]; - for (i = 0; i < BLOCK_SIZES; ++i) { - for (j = 0; j < MAX_MODES; ++j) { - tile_data->thresh_freq_fact[i][j] = - tile_data->thresh_freq_fact_prev[i][j]; - } - } - } + for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) { + assert(cpi->tile_data); + tile_data = &cpi->tile_data[tile_idx]; + vp9_copy(tile_data->thresh_freq_fact, tile_data->thresh_freq_fact_prev); } cm->interp_filter = cpi->sf.default_interp_filter; } -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL void vp9_encode_frame(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; -#if CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - restore_encode_params(cpi); - } -#endif // CONFIG_RATE_CTRL -#if CONFIG_CONSISTENT_RECODE restore_encode_params(cpi); -#endif #if CONFIG_MISMATCH_DEBUG mismatch_reset_frame(MAX_MB_PLANE); @@ -6283,7 +6287,13 @@ void vp9_encode_frame(VP9_COMP *cpi) { if (cm->interp_filter == SWITCHABLE) cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_frame_internal_time); +#endif encode_frame_internal(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_frame_internal_time); +#endif for (i = 0; i < REFERENCE_MODES; ++i) mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c index b66fdc0bca30..22fbb899fdec 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c @@ -34,16 +34,12 @@ #include "vp9/common/vp9_alloccommon.h" #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_idct.h" -#if CONFIG_NON_GREEDY_MV -#include "vp9/common/vp9_mvref_common.h" -#endif #if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" #endif #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_tile_common.h" -#include "vp9/common/vp9_scan.h" #if !CONFIG_REALTIME_ONLY #include "vp9/encoder/vp9_alt_ref_aq.h" @@ -81,6 +77,7 @@ #include "vp9/encoder/vp9_speed_features.h" #include "vp9/encoder/vp9_svc_layercontext.h" #include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_tpl_model.h" #include "vp9/vp9_cx_iface.h" #define AM_SEGMENT_ID_INACTIVE 7 @@ -126,13 +123,6 @@ static int 
is_spatial_denoise_enabled(VP9_COMP *cpi) { } #endif -#if CONFIG_VP9_HIGHBITDEPTH -void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size); -#endif -void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size); - #if !CONFIG_REALTIME_ONLY // compute adaptive threshold for skip recoding static int compute_context_model_thresh(const VP9_COMP *const cpi) { @@ -502,22 +492,22 @@ static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = { "Too many reference buffers are used." }; -static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { +static INLINE void Scale2Ratio(VPX_SCALING_MODE mode, int *hr, int *hs) { switch (mode) { - case NORMAL: + case VP8E_NORMAL: *hr = 1; *hs = 1; break; - case FOURFIVE: + case VP8E_FOURFIVE: *hr = 4; *hs = 5; break; - case THREEFIVE: + case VP8E_THREEFIVE: *hr = 3; *hs = 5; break; default: - assert(mode == ONETWO); + assert(mode == VP8E_ONETWO); *hr = 1; *hs = 2; break; @@ -2109,11 +2099,6 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { vp9_set_row_mt(cpi); } -#ifndef M_LOG2_E -#define M_LOG2_E 0.693147180559945309417 -#endif -#define log2f(x) (log(x) / (float)M_LOG2_E) - /*********************************************************************** * Read before modifying 'cal_nmvjointsadcost' or 'cal_nmvsadcosts' * *********************************************************************** @@ -2666,8 +2651,6 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V)) #endif // CONFIG_INTERNAL_STATS -static void free_tpl_buffer(VP9_COMP *cpi); - void vp9_remove_compressor(VP9_COMP *cpi) { VP9_COMMON *cm; unsigned int i; @@ -2781,7 +2764,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vpx_free(cpi->kmeans_data_arr); } - free_tpl_buffer(cpi); + vp9_free_tpl_buffer(cpi); vp9_loop_filter_dealloc(&cpi->lf_row_sync); vp9_bitstream_encode_tiles_buffer_dealloc(cpi); @@ -3329,19 +3312,6 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { vpx_extend_frame_inner_borders(cm->frame_to_show); } -static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { - RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx]; - if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || - new_fb_ptr->mi_cols < cm->mi_cols) { - vpx_free(new_fb_ptr->mvs); - CHECK_MEM_ERROR(cm, new_fb_ptr->mvs, - (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, - sizeof(*new_fb_ptr->mvs))); - new_fb_ptr->mi_rows = cm->mi_rows; - new_fb_ptr->mi_cols = cm->mi_cols; - } -} - void vp9_scale_references(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; MV_REFERENCE_FRAME ref_frame; @@ -3804,13 +3774,10 @@ static void set_frame_size(VP9_COMP *cpi) { set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); } -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL static void save_encode_params(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - const int tile_cols = 1 << cm->log2_tile_cols; - const int tile_rows = 1 << cm->log2_tile_rows; - int tile_col, tile_row; + int tile_idx; int i, j; + TileDataEnc *tile_data; RD_OPT *rd_opt = &cpi->rd; for (i = 0; i < MAX_REF_FRAMES; i++) { for (j = 0; j < REFERENCE_MODES; j++) @@ -3821,21 +3788,12 @@ static void save_encode_params(VP9_COMP *cpi) { rd_opt->filter_threshes_prev[i][j] = rd_opt->filter_threshes[i][j]; } - if (cpi->tile_data != NULL) { - for (tile_row = 0; tile_row < tile_rows; ++tile_row) - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileDataEnc 
*tile_data = - &cpi->tile_data[tile_row * tile_cols + tile_col]; - for (i = 0; i < BLOCK_SIZES; ++i) { - for (j = 0; j < MAX_MODES; ++j) { - tile_data->thresh_freq_fact_prev[i][j] = - tile_data->thresh_freq_fact[i][j]; - } - } - } + for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) { + assert(cpi->tile_data); + tile_data = &cpi->tile_data[tile_idx]; + vp9_copy(tile_data->thresh_freq_fact_prev, tile_data->thresh_freq_fact); } } -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL static INLINE void set_raw_source_frame(VP9_COMP *cpi) { #ifdef ENABLE_KF_DENOISE @@ -4415,6 +4373,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest (cpi->twopass.gf_group.index == 1) : 0; +#if CONFIG_COLLECT_COMPONENT_TIMING + printf("\n Encoding a frame: \n"); +#endif do { vpx_clear_system_state(); @@ -4802,6 +4763,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) if (loop) restore_coding_context(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + if (loop) printf("\n Recoding:"); +#endif } while (loop); rc->max_frame_bandwidth = orig_rc_max_frame_bandwidth; @@ -5296,16 +5260,16 @@ static void set_mb_wiener_variance(VP9_COMP *cpi) { vpx_highbd_subtract_block(block_size, block_size, src_diff, block_size, mb_buffer, buf_stride, zero_pred, block_size, xd->bd); - highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + vp9_highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); } else { vpx_subtract_block(block_size, block_size, src_diff, block_size, mb_buffer, buf_stride, zero_pred, block_size); - wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); } #else vpx_subtract_block(block_size, block_size, src_diff, block_size, mb_buffer, buf_stride, zero_pred, block_size); - wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); #endif // CONFIG_VP9_HIGHBITDEPTH coeff[0] = 0; @@ -5508,14 +5472,8 @@ static void encode_frame_to_data_rate( memset(cpi->mode_chosen_counts, 0, MAX_MODES * sizeof(*cpi->mode_chosen_counts)); #endif -#if CONFIG_CONSISTENT_RECODE // Backup to ensure consistency between recodes save_encode_params(cpi); -#elif CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - save_encode_params(cpi); - } -#endif if (cpi->ext_ratectrl.ready && (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0) { vpx_codec_err_t codec_status; @@ -5549,8 +5507,14 @@ static void encode_frame_to_data_rate( #if !CONFIG_REALTIME_ONLY #if CONFIG_RATE_CTRL encode_with_recode_loop(cpi, size, dest, &encode_frame_result->rq_history); -#else // CONFIG_RATE_CTRL +#else // CONFIG_RATE_CTRL +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_with_recode_loop_time); +#endif encode_with_recode_loop(cpi, size, dest); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_with_recode_loop_time); +#endif #endif // CONFIG_RATE_CTRL #endif // !CONFIG_REALTIME_ONLY } @@ -5609,13 +5573,25 @@ static void encode_frame_to_data_rate( cm->frame_to_show->render_width = cm->render_width; cm->frame_to_show->render_height = cm->render_height; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, loopfilter_frame_time); +#endif // Pick the loop filter level for the frame. 
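
Note on the save/restore refactor above: save_encode_params() (and its counterpart restore_encode_params() in vp9_encodeframe.c) now walk the flat cpi->allocated_tiles array instead of recomputing the tile grid from log2_tile_cols/log2_tile_rows, and they snapshot each tile's whole thresh_freq_fact table in one shot via vp9_copy(). For reference, vp9_copy() is the size-checked memcpy helper from vp9/common/vp9_common.h, reproduced here from memory as a sketch:

  /* Copy a fixed-size array, asserting both sides have the same size. */
  #define vp9_copy(dest, src)              \
    {                                      \
      assert(sizeof(dest) == sizeof(src)); \
      memcpy(dest, src, sizeof(src));      \
    }

Because thresh_freq_fact and thresh_freq_fact_prev are fixed-size members of TileDataEnc, the sizeof() assertion makes this bulk copy safer than the old element-by-element loops.
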
   loopfilter_frame(cpi, cm);
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, loopfilter_frame_time);
+#endif
+
   if (cpi->rc.use_post_encode_drop) save_coding_context(cpi);
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, vp9_pack_bitstream_time);
+#endif
   // build the bitstream
   vp9_pack_bitstream(cpi, dest, size);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, vp9_pack_bitstream_time);
+#endif
 
   if (cpi->ext_ratectrl.ready) {
     const RefCntBuffer *coded_frame_buf =
@@ -6205,1391 +6181,6 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) {
   }
 }
 
-typedef struct GF_PICTURE {
-  YV12_BUFFER_CONFIG *frame;
-  int ref_frame[3];
-  FRAME_UPDATE_TYPE update_type;
-} GF_PICTURE;
-
-static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture,
-                            const GF_GROUP *gf_group, int *tpl_group_frames) {
-  VP9_COMMON *cm = &cpi->common;
-  int frame_idx = 0;
-  int i;
-  int gld_index = -1;
-  int alt_index = -1;
-  int lst_index = -1;
-  int arf_index_stack[MAX_ARF_LAYERS];
-  int arf_stack_size = 0;
-  int extend_frame_count = 0;
-  int pframe_qindex = cpi->tpl_stats[2].base_qindex;
-  int frame_gop_offset = 0;
-
-  RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs;
-  int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS];
-
-  memset(recon_frame_index, -1, sizeof(recon_frame_index));
-  stack_init(arf_index_stack, MAX_ARF_LAYERS);
-
-  // TODO(jingning): To be used later for gf frame type parsing.
-  (void)gf_group;
-
-  for (i = 0; i < FRAME_BUFFERS; ++i) {
-    if (frame_bufs[i].ref_count == 0) {
-      alloc_frame_mvs(cm, i);
-      if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height,
-                                   cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
-                                   cm->use_highbitdepth,
-#endif
-                                   VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
-                                   NULL, NULL, NULL))
-        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                           "Failed to allocate frame buffer");
-
-      recon_frame_index[frame_idx] = i;
-      ++frame_idx;
-
-      if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break;
-    }
-  }
-
-  for (i = 0; i < REFS_PER_FRAME + 1; ++i) {
-    assert(recon_frame_index[i] >= 0);
-    cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf;
-  }
-
-  *tpl_group_frames = 0;
-
-  // Initialize Golden reference frame.
-  gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
-  for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1;
-  gf_picture[0].update_type = gf_group->update_type[0];
-  gld_index = 0;
-  ++*tpl_group_frames;
-
-  // Initialize base layer ARF frame
-  gf_picture[1].frame = cpi->Source;
-  gf_picture[1].ref_frame[0] = gld_index;
-  gf_picture[1].ref_frame[1] = lst_index;
-  gf_picture[1].ref_frame[2] = alt_index;
-  gf_picture[1].update_type = gf_group->update_type[1];
-  alt_index = 1;
-  ++*tpl_group_frames;
-
-  // Initialize P frames
-  for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) {
-    struct lookahead_entry *buf;
-    frame_gop_offset = gf_group->frame_gop_index[frame_idx];
-    buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1);
-
-    if (buf == NULL) break;
-
-    gf_picture[frame_idx].frame = &buf->img;
-    gf_picture[frame_idx].ref_frame[0] = gld_index;
-    gf_picture[frame_idx].ref_frame[1] = lst_index;
-    gf_picture[frame_idx].ref_frame[2] = alt_index;
-    gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx];
-
-    switch (gf_group->update_type[frame_idx]) {
-      case ARF_UPDATE:
-        stack_push(arf_index_stack, alt_index, arf_stack_size);
-        ++arf_stack_size;
-        alt_index = frame_idx;
-        break;
-      case LF_UPDATE: lst_index = frame_idx; break;
-      case OVERLAY_UPDATE:
-        gld_index = frame_idx;
-        alt_index = stack_pop(arf_index_stack, arf_stack_size);
-        --arf_stack_size;
-        break;
-      case USE_BUF_FRAME:
-        lst_index = alt_index;
-        alt_index = stack_pop(arf_index_stack, arf_stack_size);
-        --arf_stack_size;
-        break;
-      default: break;
-    }
-
-    ++*tpl_group_frames;
-
-    // The length of group of pictures is baseline_gf_interval, plus the
-    // beginning golden frame from last GOP, plus the last overlay frame in
-    // the same GOP.
-    if (frame_idx == gf_group->gf_group_size) break;
-  }
-
-  alt_index = -1;
-  ++frame_idx;
-  ++frame_gop_offset;
-
-  // Extend two frames outside the current gf group.
-  for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) {
-    struct lookahead_entry *buf =
-        vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1);
-
-    if (buf == NULL) break;
-
-    cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex;
-
-    gf_picture[frame_idx].frame = &buf->img;
-    gf_picture[frame_idx].ref_frame[0] = gld_index;
-    gf_picture[frame_idx].ref_frame[1] = lst_index;
-    gf_picture[frame_idx].ref_frame[2] = alt_index;
-    gf_picture[frame_idx].update_type = LF_UPDATE;
-    lst_index = frame_idx;
-    ++*tpl_group_frames;
-    ++extend_frame_count;
-    ++frame_gop_offset;
-  }
-}
-
-static void init_tpl_stats(VP9_COMP *cpi) {
-  int frame_idx;
-  for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) {
-    TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
-    memset(tpl_frame->tpl_stats_ptr, 0,
-           tpl_frame->height * tpl_frame->width *
-               sizeof(*tpl_frame->tpl_stats_ptr));
-    tpl_frame->is_valid = 0;
-  }
-}
-
-#if CONFIG_NON_GREEDY_MV
-static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td,
-                                         MotionField *motion_field,
-                                         int frame_idx, uint8_t *cur_frame_buf,
-                                         uint8_t *ref_frame_buf, int stride,
-                                         BLOCK_SIZE bsize, int mi_row,
-                                         int mi_col, MV *mv) {
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
-  int step_param;
-  uint32_t bestsme = UINT_MAX;
-  const MvLimits tmp_mv_limits = x->mv_limits;
-  // lambda is used to adjust the importance of motion vector consistency.
-  // TODO(angiebird): Figure out lambda's proper value.
-  const int lambda = cpi->tpl_stats[frame_idx].lambda;
-  int_mv nb_full_mvs[NB_MVS_NUM];
-  int nb_full_mv_num;
-
-  MV best_ref_mv1 = { 0, 0 };
-  MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
-
-  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
-  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
-
-  // Setup frame pointers
-  x->plane[0].src.buf = cur_frame_buf;
-  x->plane[0].src.stride = stride;
-  xd->plane[0].pre[0].buf = ref_frame_buf;
-  xd->plane[0].pre[0].stride = stride;
-
-  step_param = mv_sf->reduce_first_step_size;
-  step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
-
-  vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
-
-  nb_full_mv_num =
-      vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs);
-  vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param,
-                             lambda, 1, nb_full_mvs, nb_full_mv_num, mv);
-
-  /* restore UMV window */
-  x->mv_limits = tmp_mv_limits;
-
-  return bestsme;
-}
-
-static uint32_t sub_pixel_motion_search(VP9_COMP *cpi, ThreadData *td,
-                                        uint8_t *cur_frame_buf,
-                                        uint8_t *ref_frame_buf, int stride,
-                                        BLOCK_SIZE bsize, MV *mv) {
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
-  uint32_t bestsme = UINT_MAX;
-  uint32_t distortion;
-  uint32_t sse;
-  int cost_list[5];
-
-  MV best_ref_mv1 = { 0, 0 };
-
-  // Setup frame pointers
-  x->plane[0].src.buf = cur_frame_buf;
-  x->plane[0].src.stride = stride;
-  xd->plane[0].pre[0].buf = ref_frame_buf;
-  xd->plane[0].pre[0].stride = stride;
-
-  // TODO(yunqing): may use higher tap interp filter than 2 taps.
-  // Ignore mv costing by sending NULL pointer instead of cost array
-  bestsme = cpi->find_fractional_mv_step(
-      x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
-      &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level,
-      cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
-      USE_2_TAPS);
-
-  return bestsme;
-}
-
-#else  // CONFIG_NON_GREEDY_MV
-static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td,
-                                              uint8_t *cur_frame_buf,
-                                              uint8_t *ref_frame_buf,
-                                              int stride, BLOCK_SIZE bsize,
-                                              MV *mv) {
-  MACROBLOCK *const x = &td->mb;
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
-  const SEARCH_METHODS search_method = NSTEP;
-  int step_param;
-  int sadpb = x->sadperbit16;
-  uint32_t bestsme = UINT_MAX;
-  uint32_t distortion;
-  uint32_t sse;
-  int cost_list[5];
-  const MvLimits tmp_mv_limits = x->mv_limits;
-
-  MV best_ref_mv1 = { 0, 0 };
-  MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
-
-  best_ref_mv1_full.col = best_ref_mv1.col >> 3;
-  best_ref_mv1_full.row = best_ref_mv1.row >> 3;
-
-  // Setup frame pointers
-  x->plane[0].src.buf = cur_frame_buf;
-  x->plane[0].src.stride = stride;
-  xd->plane[0].pre[0].buf = ref_frame_buf;
-  xd->plane[0].pre[0].stride = stride;
-
-  step_param = mv_sf->reduce_first_step_size;
-  step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
-
-  vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1);
-
-  vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param,
-                        search_method, sadpb, cond_cost_list(cpi, cost_list),
-                        &best_ref_mv1, mv, 0, 0);
-
-  /* restore UMV window */
-  x->mv_limits = tmp_mv_limits;
-
-  // TODO(yunqing): may use higher tap interp filter than 2 taps.
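The `>> 3` conversions in these helpers reflect VP9's convention of storing motion vectors in eighth-pel units: the full-pixel diamond search needs integer pixel positions, so the predictor MV is shifted down by 3 first, and the fractional search then refines back to eighth-pel precision. A self-contained sketch of just that unit conversion (the DemoMV struct is a stand-in for the library's MV, kept only so the example compiles on its own):

#include <assert.h>
#include <stdio.h>

/* Stand-in for vp9's MV: row/col stored in 1/8-pel units. */
typedef struct {
  short row;
  short col;
} DemoMV;

/* Convert an eighth-pel MV to full-pel; an arithmetic right shift by 3
 * floors toward negative infinity rather than rounding to nearest. */
static DemoMV demo_full_mv(DemoMV mv_q3) {
  DemoMV full;
  full.row = mv_q3.row >> 3;
  full.col = mv_q3.col >> 3;
  return full;
}

int main(void) {
  DemoMV mv = { -13, 21 }; /* -1.625 and +2.625 pixels */
  DemoMV full = demo_full_mv(mv);
  assert(full.row == -2 && full.col == 2); /* -13 >> 3 == -2, 21 >> 3 == 2 */
  printf("eighth-pel (%d, %d) -> full-pel (%d, %d)\n", mv.row, mv.col,
         full.row, full.col);
  return 0;
}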
-  // Ignore mv costing by sending NULL pointer instead of cost array
-  bestsme = cpi->find_fractional_mv_step(
-      x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit,
-      &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level,
-      cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0,
-      USE_2_TAPS);
-
-  return bestsme;
-}
-#endif
-
-static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row,
-                            int ref_pos_col, int block, BLOCK_SIZE bsize) {
-  int width = 0, height = 0;
-  int bw = 4 << b_width_log2_lookup[bsize];
-  int bh = 4 << b_height_log2_lookup[bsize];
-
-  switch (block) {
-    case 0:
-      width = grid_pos_col + bw - ref_pos_col;
-      height = grid_pos_row + bh - ref_pos_row;
-      break;
-    case 1:
-      width = ref_pos_col + bw - grid_pos_col;
-      height = grid_pos_row + bh - ref_pos_row;
-      break;
-    case 2:
-      width = grid_pos_col + bw - ref_pos_col;
-      height = ref_pos_row + bh - grid_pos_row;
-      break;
-    case 3:
-      width = ref_pos_col + bw - grid_pos_col;
-      height = ref_pos_row + bh - grid_pos_row;
-      break;
-    default: assert(0);
-  }
-
-  return width * height;
-}
-
-static int round_floor(int ref_pos, int bsize_pix) {
-  int round;
-  if (ref_pos < 0)
-    round = -(1 + (-ref_pos - 1) / bsize_pix);
-  else
-    round = ref_pos / bsize_pix;
-
-  return round;
-}
-
-static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col,
-                            BLOCK_SIZE bsize, int stride) {
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col];
-  int idx, idy;
-
-  for (idy = 0; idy < mi_height; ++idy) {
-    for (idx = 0; idx < mi_width; ++idx) {
-      TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx];
-      const int64_t mc_flow = tpl_ptr->mc_flow;
-      const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost;
-      *tpl_ptr = *src_stats;
-      tpl_ptr->mc_flow = mc_flow;
-      tpl_ptr->mc_ref_cost = mc_ref_cost;
-      tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow;
-    }
-  }
-}
-
-static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
-                               int mi_row, int mi_col, const BLOCK_SIZE bsize) {
-  TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index];
-  TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr;
-  MV mv = tpl_stats->mv.as_mv;
-  int mv_row = mv.row >> 3;
-  int mv_col = mv.col >> 3;
-
-  int ref_pos_row = mi_row * MI_SIZE + mv_row;
-  int ref_pos_col = mi_col * MI_SIZE + mv_col;
-
-  const int bw = 4 << b_width_log2_lookup[bsize];
-  const int bh = 4 << b_height_log2_lookup[bsize];
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  const int pix_num = bw * bh;
-
-  // top-left on grid block location in pixel
-  int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh;
-  int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw;
-  int block;
-
-  for (block = 0; block < 4; ++block) {
-    int grid_pos_row = grid_pos_row_base + bh * (block >> 1);
-    int grid_pos_col = grid_pos_col_base + bw * (block & 0x01);
-
-    if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE &&
-        grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) {
-      int overlap_area = get_overlap_area(
-          grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize);
-      int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height;
-      int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width;
-
-      int64_t mc_flow = tpl_stats->mc_dep_cost -
-                        (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) /
-                            tpl_stats->intra_cost;
-
-      int idx, idy;
-
-      for (idy = 0; idy < mi_height; ++idy) {
-        for (idx = 0; idx < mi_width; ++idx) {
-          TplDepStats *des_stats =
-              &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride +
-                         (ref_mi_col + idx)];
-
-          des_stats->mc_flow += (mc_flow * overlap_area) / pix_num;
-          des_stats->mc_ref_cost +=
-              ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) /
-              pix_num;
-          assert(overlap_area >= 0);
-        }
-      }
-    }
-  }
-}
-
-static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats,
-                             int mi_row, int mi_col, const BLOCK_SIZE bsize) {
-  int idx, idy;
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-
-  for (idy = 0; idy < mi_height; ++idy) {
-    for (idx = 0; idx < mi_width; ++idx) {
-      TplDepStats *tpl_ptr =
-          &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)];
-      tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx,
-                         BLOCK_8X8);
-    }
-  }
-}
-
-static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff,
-                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
-                               TX_SIZE tx_size, int64_t *recon_error,
-                               int64_t *sse) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const struct macroblock_plane *const p = &x->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
-  uint16_t eob;
-  int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
-  const int shift = tx_size == TX_32X32 ? 0 : 2;
-
-  // skip block condition should be handled before this is called.
-  assert(!x->skip_block);
-
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp,
-                                 qcoeff, dqcoeff, pd->dequant, &eob,
-                                 scan_order->scan, scan_order->iscan);
-  } else {
-    vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff,
-                          dqcoeff, pd->dequant, &eob, scan_order->scan,
-                          scan_order->iscan);
-  }
-#else
-  vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff,
-                        dqcoeff, pd->dequant, &eob, scan_order->scan,
-                        scan_order->iscan);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-  *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift;
-  *recon_error = VPXMAX(*recon_error, 1);
-
-  *sse = (*sse) >> shift;
-  *sse = VPXMAX(*sse, 1);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
-                         TX_SIZE tx_size) {
-  // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms.
-  switch (tx_size) {
-    case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break;
-    case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break;
-    case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break;
-    default: assert(0);
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff,
-                  TX_SIZE tx_size) {
-  switch (tx_size) {
-    case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break;
-    case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break;
-    case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break;
-    default: assert(0);
-  }
-}
-
-static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row,
-                          int mi_col) {
-  x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
-  x->mv_limits.row_max =
-      (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND);
-  x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND));
-  x->mv_limits.col_max =
-      ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND);
-}
-
-static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
-                            struct scale_factors *sf, GF_PICTURE *gf_picture,
-                            int frame_idx, TplDepFrame *tpl_frame,
-                            int16_t *src_diff, tran_low_t *coeff,
-                            tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row,
-                            int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size,
-                            YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor,
-                            int64_t *recon_error, int64_t *sse) {
-  VP9_COMMON *cm = &cpi->common;
-  ThreadData *td = &cpi->td;
-
-  const int bw = 4 << b_width_log2_lookup[bsize];
-  const int bh = 4 << b_height_log2_lookup[bsize];
-  const int pix_num = bw * bh;
-  int best_rf_idx = -1;
-  int_mv best_mv;
-  int64_t best_inter_cost = INT64_MAX;
-  int64_t inter_cost;
-  int rf_idx;
-  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
-
-  int64_t best_intra_cost = INT64_MAX;
-  int64_t intra_cost;
-  PREDICTION_MODE mode;
-  int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
-  MODE_INFO mi_above, mi_left;
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  TplDepStats *tpl_stats =
-      &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
-
-  xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8);
-  xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8;
-  xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8);
-  xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8;
-  xd->above_mi = (mi_row > 0) ? &mi_above : NULL;
-  xd->left_mi = (mi_col > 0) ? &mi_left : NULL;
-
-  // Intra prediction search
-  for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
-    uint8_t *src, *dst;
-    int src_stride, dst_stride;
-
-    src = xd->cur_buf->y_buffer + mb_y_offset;
-    src_stride = xd->cur_buf->y_stride;
-
-    dst = &predictor[0];
-    dst_stride = bw;
-
-    xd->mi[0]->sb_type = bsize;
-    xd->mi[0]->ref_frame[0] = INTRA_FRAME;
-
-    vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src,
-                            src_stride, dst, dst_stride, 0, 0, 0);
-
-#if CONFIG_VP9_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
-                                dst_stride, xd->bd);
-      highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-      intra_cost = vpx_highbd_satd(coeff, pix_num);
-    } else {
-      vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst,
-                         dst_stride);
-      wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-      intra_cost = vpx_satd(coeff, pix_num);
-    }
-#else
-    vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride);
-    wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-    intra_cost = vpx_satd(coeff, pix_num);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-    if (intra_cost < best_intra_cost) best_intra_cost = intra_cost;
-  }
-
-  // Motion compensated prediction
-  best_mv.as_int = 0;
-
-  set_mv_limits(cm, x, mi_row, mi_col);
-
-  for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
-    int_mv mv;
-#if CONFIG_NON_GREEDY_MV
-    MotionField *motion_field;
-#endif
-    if (ref_frame[rf_idx] == NULL) continue;
-
-#if CONFIG_NON_GREEDY_MV
-    (void)td;
-    motion_field = vp9_motion_field_info_get_motion_field(
-        &cpi->motion_field_info, frame_idx, rf_idx, bsize);
-    mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
-#else
-    motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset,
-                                  ref_frame[rf_idx]->y_buffer + mb_y_offset,
-                                  xd->cur_buf->y_stride, bsize, &mv.as_mv);
-#endif
-
-#if CONFIG_VP9_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      vp9_highbd_build_inter_predictor(
-          CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset),
-          ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw,
-          &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE,
-          mi_row * MI_SIZE, xd->bd);
-      vpx_highbd_subtract_block(
-          bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset,
-          xd->cur_buf->y_stride, &predictor[0], bw, xd->bd);
-      highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-      inter_cost = vpx_highbd_satd(coeff, pix_num);
-    } else {
-      vp9_build_inter_predictor(
-          ref_frame[rf_idx]->y_buffer + mb_y_offset,
-          ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh,
-          0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE);
-      vpx_subtract_block(bh, bw, src_diff, bw,
-                         xd->cur_buf->y_buffer + mb_y_offset,
-                         xd->cur_buf->y_stride, &predictor[0], bw);
-      wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-      inter_cost = vpx_satd(coeff, pix_num);
-    }
-#else
-    vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset,
-                              ref_frame[rf_idx]->y_stride, &predictor[0], bw,
-                              &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3,
-                              mi_col * MI_SIZE, mi_row * MI_SIZE);
-    vpx_subtract_block(bh, bw, src_diff, bw,
-                       xd->cur_buf->y_buffer + mb_y_offset,
-                       xd->cur_buf->y_stride, &predictor[0], bw);
-    wht_fwd_txfm(src_diff, bw, coeff, tx_size);
-    inter_cost = vpx_satd(coeff, pix_num);
-#endif
-
-    if (inter_cost < best_inter_cost) {
-      best_rf_idx = rf_idx;
-      best_inter_cost = inter_cost;
-      best_mv.as_int = mv.as_int;
-      get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error,
-                         sse);
-    }
-  }
-  best_intra_cost = VPXMAX(best_intra_cost, 1);
-  best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost);
-  tpl_stats->inter_cost = VPXMAX(
-      1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
-  tpl_stats->intra_cost = VPXMAX(
-      1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width));
-  tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx];
-  tpl_stats->mv.as_int = best_mv.as_int;
-}
-
-#if CONFIG_NON_GREEDY_MV
-static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture,
-                                  int frame_idx, int rf_idx, int mi_row,
-                                  int mi_col, struct buf_2d *src,
-                                  struct buf_2d *pre) {
-  const int mb_y_offset =
-      mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
-  YV12_BUFFER_CONFIG *ref_frame = NULL;
-  int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
-  if (ref_frame_idx != -1) {
-    ref_frame = gf_picture[ref_frame_idx].frame;
-    src->buf = xd->cur_buf->y_buffer + mb_y_offset;
-    src->stride = xd->cur_buf->y_stride;
-    pre->buf = ref_frame->y_buffer + mb_y_offset;
-    pre->stride = ref_frame->y_stride;
-    assert(src->stride == pre->stride);
-    return 1;
-  } else {
-    printf("invalid ref_frame_idx");
-    assert(ref_frame_idx != -1);
-    return 0;
-  }
-}
-
-#define kMvPreCheckLines 5
-#define kMvPreCheckSize 15
-
-#define MV_REF_POS_NUM 3
-POSITION mv_ref_pos[MV_REF_POS_NUM] = {
-  { -1, 0 },
-  { 0, -1 },
-  { -1, -1 },
-};
-
-static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row,
-                             int mi_col) {
-  return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col];
-}
-
-static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame,
-                          BLOCK_SIZE bsize, int mi_row, int mi_col) {
-  int i;
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  int_mv nearest_mv, near_mv, invalid_mv;
-  nearest_mv.as_int = INVALID_MV;
-  near_mv.as_int = INVALID_MV;
-  invalid_mv.as_int = INVALID_MV;
-  for (i = 0; i < MV_REF_POS_NUM; ++i) {
-    int nb_row = mi_row + mv_ref_pos[i].row * mi_height;
-    int nb_col = mi_col + mv_ref_pos[i].col * mi_width;
-    assert(mv_ref_pos[i].row <= 0);
-    assert(mv_ref_pos[i].col <= 0);
-    if (nb_row >= 0 && nb_col >= 0) {
-      if (nearest_mv.as_int == INVALID_MV) {
-        nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col);
-      } else {
-        int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col);
-        if (mv.as_int == nearest_mv.as_int) {
-          continue;
-        } else {
-          near_mv = mv;
-          break;
-        }
-      }
-    }
-  }
-  if (nearest_mv.as_int == INVALID_MV) {
-    nearest_mv.as_mv.row = 0;
-    nearest_mv.as_mv.col = 0;
-  }
-  if (near_mv.as_int == INVALID_MV) {
-    near_mv.as_mv.row = 0;
-    near_mv.as_mv.col = 0;
-  }
-  if (mv_mode == NEAREST_MV_MODE) {
-    return nearest_mv;
-  }
-  if (mv_mode == NEAR_MV_MODE) {
-    return near_mv;
-  }
-  assert(0);
-  return invalid_mv;
-}
-
-static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi,
-                                  MotionField *motion_field,
-                                  TplDepFrame *tpl_frame, BLOCK_SIZE bsize,
-                                  int mi_row, int mi_col) {
-  int_mv mv;
-  switch (mv_mode) {
-    case ZERO_MV_MODE:
-      mv.as_mv.row = 0;
-      mv.as_mv.col = 0;
-      break;
-    case NEW_MV_MODE:
-      mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
-      break;
-    case NEAREST_MV_MODE:
-      mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col);
-      break;
-    case NEAR_MV_MODE:
-      mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col);
-      break;
-    default:
-      mv.as_int = INVALID_MV;
-      assert(0);
-      break;
-  }
-  return mv;
-}
-
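find_ref_mv above scans at most three causal neighbors (above, left, above-left, per mv_ref_pos) and keeps the first MV it sees as NEAREST and the first distinct MV as NEAR, falling back to the zero MV for any slot it never fills. A minimal standalone sketch of that dedup logic, using plain ints in place of int_mv over a hypothetical 4x4 grid of already-selected MVs:

#include <stdio.h>

#define INVALID (-1)

/* Offsets mirror mv_ref_pos: above, left, above-left (row/col deltas). */
static const int kRefPos[3][2] = { { -1, 0 }, { 0, -1 }, { -1, -1 } };

int main(void) {
  /* Invented example values; each int stands for a packed MV. */
  int grid[4][4] = {
    { 5, 5, 7, 7 },
    { 9, 0, 7, 7 },
    { 3, 3, 3, 3 },
    { 3, 3, 3, 3 },
  };
  int row = 1, col = 1;
  int nearest = INVALID, near = INVALID;
  int i;
  for (i = 0; i < 3; ++i) {
    int r = row + kRefPos[i][0];
    int c = col + kRefPos[i][1];
    if (r < 0 || c < 0) continue;
    if (nearest == INVALID) {
      nearest = grid[r][c]; /* first candidate becomes NEAREST */
    } else if (grid[r][c] != nearest) {
      near = grid[r][c]; /* first distinct candidate becomes NEAR */
      break;
    }
  }
  /* For (1,1): above yields 5, left yields 9 -> nearest=5, near=9. */
  printf("nearest=%d near=%d\n", nearest, near);
  return 0;
}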
-static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd,
-                          GF_PICTURE *gf_picture, MotionField *motion_field,
-                          int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
-                          BLOCK_SIZE bsize, int mi_row, int mi_col,
-                          int_mv *mv) {
-  uint32_t sse;
-  struct buf_2d src;
-  struct buf_2d pre;
-  MV full_mv;
-  *mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, bsize,
-                            mi_row, mi_col);
-  full_mv = get_full_mv(&mv->as_mv);
-  if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col,
-                             &src, &pre)) {
-    // TODO(angiebird): Consider subpixel when computing the sse.
-    cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv),
-                          pre.stride, &sse);
-    return (double)(sse << VP9_DIST_SCALE_LOG2);
-  } else {
-    assert(0);
-    return 0;
-  }
-}
-
-static int get_mv_mode_cost(int mv_mode) {
-  // TODO(angiebird): The probabilities are roughly inferred from
-  // default_inter_mode_probs. Check if there is a better way to set the
-  // probabilities.
-  const int zero_mv_prob = 16;
-  const int new_mv_prob = 24 * 1;
-  const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob;
-  assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256);
-  switch (mv_mode) {
-    case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break;
-    case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break;
-    case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break;
-    case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break;
-    default: assert(0); return -1;
-  }
-}
-
-static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) {
-  double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) +
-                        log2(1 + abs(new_mv->col - ref_mv->col));
-  mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT);
-  return mv_diff_cost;
-}
-static double get_mv_cost(int mv_mode, VP9_COMP *cpi, MotionField *motion_field,
-                          TplDepFrame *tpl_frame, BLOCK_SIZE bsize, int mi_row,
-                          int mi_col) {
-  double mv_cost = get_mv_mode_cost(mv_mode);
-  if (mv_mode == NEW_MV_MODE) {
-    MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame,
-                                    bsize, mi_row, mi_col)
-                    .as_mv;
-    MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, motion_field,
-                                        tpl_frame, bsize, mi_row, mi_col)
-                        .as_mv;
-    MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, motion_field, tpl_frame,
-                                     bsize, mi_row, mi_col)
-                     .as_mv;
-    double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv);
-    double near_cost = get_mv_diff_cost(&new_mv, &near_mv);
-    mv_cost += nearest_cost < near_cost ? nearest_cost : near_cost;
-  }
-  return mv_cost;
-}
-
-static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x,
-                           GF_PICTURE *gf_picture, MotionField *motion_field,
-                           int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
-                           BLOCK_SIZE bsize, int mi_row, int mi_col,
-                           int_mv *mv) {
-  MACROBLOCKD *xd = &x->e_mbd;
-  double mv_dist =
-      get_mv_dist(mv_mode, cpi, xd, gf_picture, motion_field, frame_idx,
-                  tpl_frame, rf_idx, bsize, mi_row, mi_col, mv);
-  double mv_cost =
-      get_mv_cost(mv_mode, cpi, motion_field, tpl_frame, bsize, mi_row, mi_col);
-  double mult = 180;
-
-  return mv_cost + mult * log2f(1 + mv_dist);
-}
-
-static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                                 GF_PICTURE *gf_picture,
-                                 MotionField *motion_field, int frame_idx,
-                                 TplDepFrame *tpl_frame, int rf_idx,
-                                 BLOCK_SIZE bsize, int mi_row, int mi_col,
-                                 double *rd, int_mv *mv) {
-  int best_mv_mode = ZERO_MV_MODE;
-  int update = 0;
-  int mv_mode;
-  *rd = 0;
-  for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) {
-    double this_rd;
-    int_mv this_mv;
-    if (mv_mode == NEW_MV_MODE) {
-      continue;
-    }
-    this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, motion_field, frame_idx,
-                           tpl_frame, rf_idx, bsize, mi_row, mi_col, &this_mv);
-    if (update == 0) {
-      *rd = this_rd;
-      *mv = this_mv;
-      best_mv_mode = mv_mode;
-      update = 1;
-    } else {
-      if (this_rd < *rd) {
-        *rd = this_rd;
-        *mv = this_mv;
-        best_mv_mode = mv_mode;
-      }
-    }
-  }
-  return best_mv_mode;
-}
-
-static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x,
-                            GF_PICTURE *gf_picture, MotionField *motion_field,
-                            int frame_idx, TplDepFrame *tpl_frame, int rf_idx,
-                            BLOCK_SIZE bsize, int mi_row, int mi_col) {
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  int tmp_mv_mode_arr[kMvPreCheckSize];
-  int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx];
-  double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx];
-  int_mv *select_mv_arr = cpi->select_mv_arr;
-  int_mv tmp_select_mv_arr[kMvPreCheckSize];
-  int stride = tpl_frame->stride;
-  double new_mv_rd = 0;
-  double no_new_mv_rd = 0;
-  double this_new_mv_rd = 0;
-  double this_no_new_mv_rd = 0;
-  int idx;
-  int tmp_idx;
-  assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1);
-
-  // no new mv
-  // diagonal scan order
-  tmp_idx = 0;
-  for (idx = 0; idx < kMvPreCheckLines; ++idx) {
-    int r;
-    for (r = 0; r <= idx; ++r) {
-      int c = idx - r;
-      int nb_row = mi_row + r * mi_height;
-      int nb_col = mi_col + c * mi_width;
-      if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
-        double this_rd;
-        int_mv *mv = &select_mv_arr[nb_row * stride + nb_col];
-        mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode(
-            cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx,
-            bsize, nb_row, nb_col, &this_rd, mv);
-        if (r == 0 && c == 0) {
-          this_no_new_mv_rd = this_rd;
-        }
-        no_new_mv_rd += this_rd;
-        tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col];
-        tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col];
-        ++tmp_idx;
-      }
-    }
-  }
-
-  // new mv
-  mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE;
-  this_new_mv_rd = eval_mv_mode(
-      NEW_MV_MODE, cpi, x, gf_picture, motion_field, frame_idx, tpl_frame,
-      rf_idx, bsize, mi_row, mi_col, &select_mv_arr[mi_row * stride + mi_col]);
-  new_mv_rd = this_new_mv_rd;
-  // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE
-  // beforehand.
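The assert in predict_mv_mode encodes the fact that a diagonal scan over kMvPreCheckLines = 5 anti-diagonals visits 1 + 2 + ... + 5 = 15 blocks, so kMvPreCheckSize is the fifth triangular number; the loop that follows re-walks the same order. A self-contained sketch of the scan (indices only, no encoder state):

#include <assert.h>
#include <stdio.h>

#define LINES 5
#define SIZE 15 /* LINES * (LINES + 1) / 2 */

int main(void) {
  int idx, r, count = 0;
  /* Enumerate blocks in the anti-diagonal order predict_mv_mode uses:
   * diagonal idx holds every (r, c) with r + c == idx. */
  for (idx = 0; idx < LINES; ++idx) {
    for (r = 0; r <= idx; ++r) {
      int c = idx - r;
      printf("(%d,%d) ", r, c);
      ++count;
    }
    printf("\n");
  }
  assert(count == SIZE); /* 1 + 2 + 3 + 4 + 5 == 15 */
  return 0;
}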
-  for (idx = 1; idx < kMvPreCheckLines; ++idx) {
-    int r;
-    for (r = 0; r <= idx; ++r) {
-      int c = idx - r;
-      int nb_row = mi_row + r * mi_height;
-      int nb_col = mi_col + c * mi_width;
-      if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
-        double this_rd;
-        int_mv *mv = &select_mv_arr[nb_row * stride + nb_col];
-        mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode(
-            cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx,
-            bsize, nb_row, nb_col, &this_rd, mv);
-        new_mv_rd += this_rd;
-      }
-    }
-  }
-
-  // update best_mv_mode
-  tmp_idx = 0;
-  if (no_new_mv_rd < new_mv_rd) {
-    for (idx = 0; idx < kMvPreCheckLines; ++idx) {
-      int r;
-      for (r = 0; r <= idx; ++r) {
-        int c = idx - r;
-        int nb_row = mi_row + r * mi_height;
-        int nb_col = mi_col + c * mi_width;
-        if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) {
-          mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx];
-          select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx];
-          ++tmp_idx;
-        }
-      }
-    }
-    rd_diff_arr[mi_row * stride + mi_col] = 0;
-  } else {
-    rd_diff_arr[mi_row * stride + mi_col] =
-        (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd);
-  }
-}
-
-static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x,
-                                GF_PICTURE *gf_picture,
-                                MotionField *motion_field, int frame_idx,
-                                TplDepFrame *tpl_frame, int rf_idx,
-                                BLOCK_SIZE bsize) {
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  const int unit_rows = tpl_frame->mi_rows / mi_height;
-  const int unit_cols = tpl_frame->mi_cols / mi_width;
-  const int max_diagonal_lines = unit_rows + unit_cols - 1;
-  int idx;
-  for (idx = 0; idx < max_diagonal_lines; ++idx) {
-    int r;
-    for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1);
-         ++r) {
-      int c = idx - r;
-      int mi_row = r * mi_height;
-      int mi_col = c * mi_width;
-      assert(c >= 0 && c < unit_cols);
-      assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows);
-      assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols);
-      predict_mv_mode(cpi, x, gf_picture, motion_field, frame_idx, tpl_frame,
-                      rf_idx, bsize, mi_row, mi_col);
-    }
-  }
-}
-
-static void do_motion_search(VP9_COMP *cpi, ThreadData *td,
-                             MotionField *motion_field, int frame_idx,
-                             YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE bsize,
-                             int mi_row, int mi_col) {
-  VP9_COMMON *cm = &cpi->common;
-  MACROBLOCK *x = &td->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
-  const int mb_y_offset =
-      mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE;
-  assert(ref_frame != NULL);
-  set_mv_limits(cm, x, mi_row, mi_col);
-  {
-    int_mv mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col);
-    uint8_t *cur_frame_buf = xd->cur_buf->y_buffer + mb_y_offset;
-    uint8_t *ref_frame_buf = ref_frame->y_buffer + mb_y_offset;
-    const int stride = xd->cur_buf->y_stride;
-    full_pixel_motion_search(cpi, td, motion_field, frame_idx, cur_frame_buf,
-                             ref_frame_buf, stride, bsize, mi_row, mi_col,
-                             &mv.as_mv);
-    sub_pixel_motion_search(cpi, td, cur_frame_buf, ref_frame_buf, stride,
-                            bsize, &mv.as_mv);
-    vp9_motion_field_mi_set_mv(motion_field, mi_row, mi_col, mv);
-  }
-}
-
-static void build_motion_field(
-    VP9_COMP *cpi, int frame_idx,
-    YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES], BLOCK_SIZE bsize) {
-  VP9_COMMON *cm = &cpi->common;
-  ThreadData *td = &cpi->td;
-  TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  const int pw = num_4x4_blocks_wide_lookup[bsize] << 2;
-  const int ph = num_4x4_blocks_high_lookup[bsize] << 2;
-  int mi_row, mi_col;
-  int rf_idx;
-
-  tpl_frame->lambda = (pw * ph) >> 2;
-  assert(pw * ph == tpl_frame->lambda << 2);
-
-  for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
-    MotionField *motion_field = vp9_motion_field_info_get_motion_field(
-        &cpi->motion_field_info, frame_idx, rf_idx, bsize);
-    if (ref_frame[rf_idx] == NULL) {
-      continue;
-    }
-    vp9_motion_field_reset_mvs(motion_field);
-    for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
-      for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
-        do_motion_search(cpi, td, motion_field, frame_idx, ref_frame[rf_idx],
-                         bsize, mi_row, mi_col);
-      }
-    }
-  }
-}
-#endif  // CONFIG_NON_GREEDY_MV
-
-static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture,
-                              int frame_idx, BLOCK_SIZE bsize) {
-  TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
-  YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame;
-  YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL };
-
-  VP9_COMMON *cm = &cpi->common;
-  struct scale_factors sf;
-  int rdmult, idx;
-  ThreadData *td = &cpi->td;
-  MACROBLOCK *x = &td->mb;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int mi_row, mi_col;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]);
-  DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]);
-  uint8_t *predictor;
-#else
-  DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]);
-#endif
-  DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]);
-  DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]);
-  DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]);
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
-
-  const TX_SIZE tx_size = max_txsize_lookup[bsize];
-  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-  int64_t recon_error, sse;
-#if CONFIG_NON_GREEDY_MV
-  int square_block_idx;
-  int rf_idx;
-#endif
-
-  // Setup scaling factor
-#if CONFIG_VP9_HIGHBITDEPTH
-  vp9_setup_scale_factors_for_frame(
-      &sf, this_frame->y_crop_width, this_frame->y_crop_height,
-      this_frame->y_crop_width, this_frame->y_crop_height,
-      cpi->common.use_highbitdepth);
-
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    predictor = CONVERT_TO_BYTEPTR(predictor16);
-  else
-    predictor = predictor8;
-#else
-  vp9_setup_scale_factors_for_frame(
-      &sf, this_frame->y_crop_width, this_frame->y_crop_height,
-      this_frame->y_crop_width, this_frame->y_crop_height);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-  // Prepare reference frame pointers. If any reference frame slot is
-  // unavailable, the pointer will be set to Null.
-  for (idx = 0; idx < MAX_INTER_REF_FRAMES; ++idx) {
-    int rf_idx = gf_picture[frame_idx].ref_frame[idx];
-    if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame;
-  }
-
-  xd->mi = cm->mi_grid_visible;
-  xd->mi[0] = cm->mi;
-  xd->cur_buf = this_frame;
-
-  // Get rd multiplier set up.
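The call below re-derives the rate-distortion multiplier from the TPL frame's base_qindex. In libvpx the multiplier grows roughly with the square of the dequantization step, so doubling the quantizer quadruples the weight of distortion relative to rate in the RD cost. A toy illustration of that scaling — the 88/24 constant matches my reading of vp9_rd.c, but treat the formula and the sample q values here as an approximation, not the library's full implementation (which also modulates by frame type and bit depth):

#include <stdio.h>

/* Approximation of the core of vp9_compute_rd_mult: rdmult ~ 88*q*q/24,
 * where q would be the dequant step for a qindex. The q values below are
 * invented sample points, not the library's lookup table. */
static long long demo_rd_mult(int q) { return 88LL * q * q / 24; }

int main(void) {
  int q;
  for (q = 4; q <= 64; q *= 2) {
    printf("q=%3d -> rdmult=%lld\n", q, demo_rd_mult(q));
  }
  /* The output roughly quadruples per doubling of q:
   * 58, 234, 938, 3754, 15018. */
  return 0;
}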
-  rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex);
-  set_error_per_bit(&cpi->td.mb, rdmult);
-  vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex);
-
-  tpl_frame->is_valid = 1;
-
-  cm->base_qindex = tpl_frame->base_qindex;
-  vp9_frame_init_quantizer(cpi);
-
-#if CONFIG_NON_GREEDY_MV
-  for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES;
-       ++square_block_idx) {
-    BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx);
-    build_motion_field(cpi, frame_idx, ref_frame, square_bsize);
-  }
-  for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
-    int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
-    if (ref_frame_idx != -1) {
-      MotionField *motion_field = vp9_motion_field_info_get_motion_field(
-          &cpi->motion_field_info, frame_idx, rf_idx, bsize);
-      predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx,
-                          tpl_frame, rf_idx, bsize);
-    }
-  }
-#endif
-
-  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
-      mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame,
-                      src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize,
-                      tx_size, ref_frame, predictor, &recon_error, &sse);
-      // Motion flow dependency dispenser.
-      tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize,
-                      tpl_frame->stride);
-
-      tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col,
-                       bsize);
-    }
-  }
-}
-
-#if CONFIG_NON_GREEDY_MV
-#define DUMP_TPL_STATS 0
-#if DUMP_TPL_STATS
-static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) {
-  int i, j;
-  printf("%d %d\n", h, w);
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; ++j) {
-      printf("%d ", buf[(row + i) * stride + col + j]);
-    }
-  }
-  printf("\n");
-}
-
-static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) {
-  dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height,
-           frame_buf->y_width);
-  dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0,
-           frame_buf->uv_height, frame_buf->uv_width);
-  dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0,
-           frame_buf->uv_height, frame_buf->uv_width);
-}
-
-static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames,
-                           const GF_GROUP *gf_group,
-                           const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) {
-  int frame_idx;
-  const VP9_COMMON *cm = &cpi->common;
-  int rf_idx;
-  for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) {
-    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
-      const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
-      int mi_row, mi_col;
-      int ref_frame_idx;
-      const int mi_height = num_8x8_blocks_high_lookup[bsize];
-      const int mi_width = num_8x8_blocks_wide_lookup[bsize];
-      ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx];
-      if (ref_frame_idx != -1) {
-        YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame;
-        const int gf_frame_offset = gf_group->frame_gop_index[frame_idx];
-        const int ref_gf_frame_offset =
-            gf_group->frame_gop_index[ref_frame_idx];
-        printf("=\n");
-        printf(
-            "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d "
-            "rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n",
-            frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE,
-            ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset);
-        for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) {
-          for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
-            if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) {
-              int_mv mv = vp9_motion_field_info_get_mv(&cpi->motion_field_info,
-                                                       frame_idx, rf_idx, bsize,
-                                                       mi_row, mi_col);
-              printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row,
-                     mv.as_mv.col);
-            }
-          }
-        }
-        for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) {
-          for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) {
-            if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) {
-              const TplDepStats *tpl_ptr =
-                  &tpl_frame
-                       ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col];
-              printf("%f ", tpl_ptr->feature_score);
-            }
-          }
-        }
-        printf("\n");
-
-        for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) {
-          for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) {
-            const int mv_mode =
-                tpl_frame
-                    ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col];
-            printf("%d ", mv_mode);
-          }
-        }
-        printf("\n");
-
-        dump_frame_buf(gf_picture[frame_idx].frame);
-        dump_frame_buf(ref_frame_buf);
-      }
-    }
-  }
-}
-#endif  // DUMP_TPL_STATS
-#endif  // CONFIG_NON_GREEDY_MV
-
-static void init_tpl_buffer(VP9_COMP *cpi) {
-  VP9_COMMON *cm = &cpi->common;
-  int frame;
-
-  const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
-  const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows);
-#if CONFIG_NON_GREEDY_MV
-  int rf_idx;
-
-  vpx_free(cpi->select_mv_arr);
-  CHECK_MEM_ERROR(
-      cm, cpi->select_mv_arr,
-      vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr)));
-#endif
-
-  // TODO(jingning): Reduce the actual memory use for tpl model build up.
-  for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
-    if (cpi->tpl_stats[frame].width >= mi_cols &&
-        cpi->tpl_stats[frame].height >= mi_rows &&
-        cpi->tpl_stats[frame].tpl_stats_ptr)
-      continue;
-
-#if CONFIG_NON_GREEDY_MV
-    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
-      vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
-      CHECK_MEM_ERROR(
-          cm, cpi->tpl_stats[frame].mv_mode_arr[rf_idx],
-          vpx_calloc(mi_rows * mi_cols * 4,
-                     sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx])));
-      vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
-      CHECK_MEM_ERROR(
-          cm, cpi->tpl_stats[frame].rd_diff_arr[rf_idx],
-          vpx_calloc(mi_rows * mi_cols * 4,
-                     sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx])));
-    }
-#endif
-    vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
-    CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr,
-                    vpx_calloc(mi_rows * mi_cols,
-                               sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr)));
-    cpi->tpl_stats[frame].is_valid = 0;
-    cpi->tpl_stats[frame].width = mi_cols;
-    cpi->tpl_stats[frame].height = mi_rows;
-    cpi->tpl_stats[frame].stride = mi_cols;
-    cpi->tpl_stats[frame].mi_rows = cm->mi_rows;
-    cpi->tpl_stats[frame].mi_cols = cm->mi_cols;
-  }
-
-  for (frame = 0; frame < REF_FRAMES; ++frame) {
-    cpi->enc_frame_buf[frame].mem_valid = 0;
-    cpi->enc_frame_buf[frame].released = 1;
-  }
-}
-
-static void free_tpl_buffer(VP9_COMP *cpi) {
-  int frame;
-#if CONFIG_NON_GREEDY_MV
-  vp9_free_motion_field_info(&cpi->motion_field_info);
-  vpx_free(cpi->select_mv_arr);
-#endif
-  for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
-#if CONFIG_NON_GREEDY_MV
-    int rf_idx;
-    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
-      vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
-      vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
-    }
-#endif
-    vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
-    cpi->tpl_stats[frame].is_valid = 0;
-  }
-}
-
-#if CONFIG_RATE_CTRL
-static void accumulate_frame_tpl_stats(VP9_COMP *cpi) {
-  VP9_COMMON *const cm = &cpi->common;
-  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
-  int show_frame_count = 0;
-  int frame_idx;
-  // Accumulate tpl stats for each frame in the current group of picture.
-  for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) {
-    TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx];
-    TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr;
-    const int tpl_stride = tpl_frame->stride;
-    int64_t intra_cost_base = 0;
-    int64_t inter_cost_base = 0;
-    int64_t mc_dep_cost_base = 0;
-    int64_t mc_ref_cost_base = 0;
-    int64_t mc_flow_base = 0;
-    int row, col;
-
-    if (!tpl_frame->is_valid) continue;
-
-    for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) {
-      for (col = 0; col < cm->mi_cols; ++col) {
-        TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col];
-        intra_cost_base += this_stats->intra_cost;
-        inter_cost_base += this_stats->inter_cost;
-        mc_dep_cost_base += this_stats->mc_dep_cost;
-        mc_ref_cost_base += this_stats->mc_ref_cost;
-        mc_flow_base += this_stats->mc_flow;
-      }
-    }
-
-    cpi->tpl_stats_info[show_frame_count].intra_cost = intra_cost_base;
-    cpi->tpl_stats_info[show_frame_count].inter_cost = inter_cost_base;
-    cpi->tpl_stats_info[show_frame_count].mc_dep_cost = mc_dep_cost_base;
-    cpi->tpl_stats_info[show_frame_count].mc_ref_cost = mc_ref_cost_base;
-    cpi->tpl_stats_info[show_frame_count].mc_flow = mc_flow_base;
-
-    ++show_frame_count;
-  }
-}
-#endif  // CONFIG_RATE_CTRL
-
-static void setup_tpl_stats(VP9_COMP *cpi) {
-  GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE];
-  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
-  int tpl_group_frames = 0;
-  int frame_idx;
-  cpi->tpl_bsize = BLOCK_32X32;
-
-  init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames);
-
-  init_tpl_stats(cpi);
-
-  // Backward propagation from tpl_group_frames to 1.
-  for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) {
-    if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue;
-    mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize);
-  }
-#if CONFIG_NON_GREEDY_MV
-  cpi->tpl_ready = 1;
-#if DUMP_TPL_STATS
-  dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize);
-#endif  // DUMP_TPL_STATS
-#endif  // CONFIG_NON_GREEDY_MV
-
-#if CONFIG_RATE_CTRL
-  if (cpi->oxcf.use_simple_encode_api) {
-    accumulate_frame_tpl_stats(cpi);
-  }
-#endif  // CONFIG_RATE_CTRL
-}
-
 void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags,
                             RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES],
                             int *ref_frame_coding_indexes,
@@ -7640,6 +6231,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   const int gf_group_index = cpi->twopass.gf_group.index;
   int i;
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  if (oxcf->pass == 2) start_timing(cpi, vp9_get_compressed_data_time);
+#endif
+
   if (is_one_pass_svc(cpi)) {
     vp9_one_pass_svc_start_layer(cpi);
   }
@@ -7704,9 +6299,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
     int not_last_frame = (cpi->lookahead->sz - arf_src_index > 1);
     not_last_frame |= ALT_REF_AQ_APPLY_TO_LAST_FRAME;
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, vp9_temporal_filter_time);
+#endif
     // Produce the filtered ARF frame.
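The pattern this patch threads through vp9_get_compressed_data brackets each encoding stage, such as the temporal filter below, with start_timing()/end_timing(), accumulating per-frame and cumulative microsecond totals that are printed after each frame. A self-contained sketch of the same accumulate-on-stop stopwatch, with the portable clock() standing in for vpx_usec_timer (the Demo names are invented, not the library's):

#include <stdio.h>
#include <time.h>

/* Accumulating stopwatch in the spirit of component_timer[] plus
 * frame_component_time[]; clock() replaces vpx_usec_timer. */
typedef struct {
  clock_t started;
  long long accumulated_us;
} DemoTimer;

static void demo_start(DemoTimer *t) { t->started = clock(); }

static void demo_stop(DemoTimer *t) {
  t->accumulated_us +=
      (long long)(clock() - t->started) * 1000000 / CLOCKS_PER_SEC;
}

static void busy_work(void) {
  volatile long i, sink = 0;
  for (i = 0; i < 10000000; ++i) sink += i;
}

int main(void) {
  DemoTimer timer = { 0, 0 };
  int frame;
  for (frame = 0; frame < 3; ++frame) {
    demo_start(&timer);
    busy_work(); /* stands in for a stage such as the temporal filter */
    demo_stop(&timer);
  }
  printf("stage total: %lld us over 3 frames\n", timer.accumulated_us);
  return 0;
}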
     vp9_temporal_filter(cpi, arf_src_index);
     vpx_extend_frame_borders(&cpi->alt_ref_buffer);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, vp9_temporal_filter_time);
+#endif
 
     // for small bitrates segmentation overhead usually
     // eats all bitrate gain from enabling delta quantizers
@@ -7820,7 +6421,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
 
 #if !CONFIG_REALTIME_ONLY
   if ((oxcf->pass == 2) && !cpi->use_svc) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    start_timing(cpi, vp9_rc_get_second_pass_params_time);
+#endif
     vp9_rc_get_second_pass_params(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, vp9_rc_get_second_pass_params_time);
+#endif
   } else if (oxcf->pass == 1) {
     set_frame_size(cpi);
   }
@@ -7860,13 +6467,19 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
   }
 #endif  // CONFIG_NON_GREEDY_MV
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  start_timing(cpi, setup_tpl_stats_time);
+#endif
   if (gf_group_index == 1 &&
       cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE &&
       cpi->sf.enable_tpl_model) {
-    init_tpl_buffer(cpi);
+    vp9_init_tpl_buffer(cpi);
     vp9_estimate_qp_gop(cpi);
-    setup_tpl_stats(cpi);
+    vp9_setup_tpl_stats(cpi);
   }
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  end_timing(cpi, setup_tpl_stats_time);
+#endif
 
 #if CONFIG_BITSTREAM_DEBUG
   assert(cpi->oxcf.max_threads == 0 &&
@@ -7903,8 +6516,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
     cpi->td.mb.inv_txfm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
     vp9_first_pass(cpi, source);
   } else if (oxcf->pass == 2 && !cpi->use_svc) {
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    // Accumulate 2nd pass time in 2-pass case.
+    start_timing(cpi, Pass2Encode_time);
+#endif
     Pass2Encode(cpi, size, dest, frame_flags, encode_frame_result);
     vp9_twopass_postencode_update(cpi);
+#if CONFIG_COLLECT_COMPONENT_TIMING
+    end_timing(cpi, Pass2Encode_time);
+#endif
   } else if (cpi->use_svc) {
     SvcEncode(cpi, size, dest, frame_flags);
   } else {
@@ -8107,6 +6727,41 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
 
 #endif
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+  if (oxcf->pass == 2) end_timing(cpi, vp9_get_compressed_data_time);
+
+  // Print out timing information.
+  // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid showing of
+  // show_existing_frame and lag-in-frames.
+  // if (cpi->frame_component_time[0] > 100)
+  if (oxcf->pass == 2) {
+    uint64_t frame_total = 0, total = 0;
+    int i;
+
+    fprintf(stderr,
+            "\n Frame number: %d, Frame type: %s, Show Frame: %d, Q: %d\n",
+            cm->current_video_frame, get_frame_type_enum(cm->frame_type),
+            cm->show_frame, cm->base_qindex);
+    for (i = 0; i < kTimingComponents; i++) {
+      cpi->component_time[i] += cpi->frame_component_time[i];
+      // Use vp9_get_compressed_data_time (i = 0) as the total time.
+      if (i == 0) {
+        frame_total = cpi->frame_component_time[0];
+        total = cpi->component_time[0];
+      }
+      fprintf(stderr,
+              " %50s: %15" PRId64 " us [%6.2f%%] (total: %15" PRId64
+              " us [%6.2f%%])\n",
+              get_component_name(i), cpi->frame_component_time[i],
+              (float)((float)cpi->frame_component_time[i] * 100.0 /
+                      (float)frame_total),
+              cpi->component_time[i],
+              (float)((float)cpi->component_time[i] * 100.0 / (float)total));
+      cpi->frame_component_time[i] = 0;
+    }
+  }
+#endif
+
   if (is_one_pass_svc(cpi)) {
     if (cm->show_frame) {
       ++cpi->svc.spatial_layer_to_encode;
@@ -8149,12 +6804,12 @@ int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest,
   }
 }
 
-int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode,
-                          VPX_SCALING vert_mode) {
+int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode,
+                          VPX_SCALING_MODE vert_mode) {
   VP9_COMMON *cm = &cpi->common;
   int hr = 0, hs = 0, vr = 0, vs = 0;
 
-  if (horiz_mode > ONETWO || vert_mode > ONETWO) return -1;
+  if (horiz_mode > VP8E_ONETWO || vert_mode > VP8E_ONETWO) return -1;
 
   Scale2Ratio(horiz_mode, &hr, &hs);
   Scale2Ratio(vert_mode, &vr, &vs);
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
index cca8b53f8eaf..79c0b36a17a8 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.h
@@ -90,13 +90,6 @@ typedef enum {
   ENCODE_BREAKOUT_LIMITED = 2
 } ENCODE_BREAKOUT_TYPE;
 
-typedef enum {
-  NORMAL = 0,
-  FOURFIVE = 1,
-  THREEFIVE = 2,
-  ONETWO = 3
-} VPX_SCALING;
-
 typedef enum {
   // Good Quality Fast Encoding. The encoder balances quality with the amount of
   // time it takes to encode the output. Speed setting controls how fast.
@@ -336,9 +329,7 @@ typedef struct TplDepFrame {
 typedef struct TileDataEnc {
   TileInfo tile_info;
   int thresh_freq_fact[BLOCK_SIZES][MAX_MODES];
-#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
   int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES];
-#endif  // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL
   int8_t mode_map[BLOCK_SIZES][MAX_MODES];
   FIRSTPASS_DATA fp_data;
   VP9RowMTSync row_mt_sync;
@@ -659,6 +650,72 @@ static INLINE int get_num_unit_4x4(int size) { return (size + 3) >> 2; }
 static INLINE int get_num_unit_16x16(int size) { return (size + 15) >> 4; }
 #endif  // CONFIG_RATE_CTRL
 
+#if CONFIG_COLLECT_COMPONENT_TIMING
+#include "vpx_ports/vpx_timer.h"
+// Adjust the following to add new components.
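As that comment notes, the component list is meant to be extended: the TIMING_COMPONENT enum that follows pairs each entry with a matching case in get_component_name(), and kTimingComponents stays last so the arrays in VP9_COMP are sized automatically. A minimal standalone model of that enum-plus-name-table idiom (the Demo stage names are invented for illustration):

#include <assert.h>
#include <stdio.h>

/* Same idiom as TIMING_COMPONENT: a trailing count member sizes any
 * parallel array, and a switch keeps names in sync with entries. */
typedef enum {
  demo_filter_time,
  demo_pack_time,
  /* New stages are added here, before the count... */
  kDemoComponents,
} DEMO_COMPONENT;

static const char *demo_component_name(int index) {
  switch (index) {
    case demo_filter_time: return "demo_filter_time";
    case demo_pack_time: return "demo_pack_time";
    /* ...with a matching case here. */
    default: assert(0);
  }
  return "error";
}

int main(void) {
  unsigned long long totals[kDemoComponents] = { 0 };
  int i;
  totals[demo_filter_time] = 120; /* invented sample values */
  totals[demo_pack_time] = 45;
  for (i = 0; i < kDemoComponents; i++) {
    printf("%s: %llu us\n", demo_component_name(i), totals[i]);
  }
  return 0;
}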
+typedef enum { + vp9_get_compressed_data_time, + vp9_temporal_filter_time, + vp9_rc_get_second_pass_params_time, + setup_tpl_stats_time, + Pass2Encode_time, + + encode_with_recode_loop_time, + loopfilter_frame_time, + vp9_pack_bitstream_time, + + encode_frame_internal_time, + rd_pick_partition_time, + rd_pick_sb_modes_time, + encode_sb_time, + + vp9_rd_pick_inter_mode_sb_time, + vp9_rd_pick_inter_mode_sub8x8_time, + + intra_mode_search_time, + handle_inter_mode_time, + single_motion_search_time, + joint_motion_search_time, + interp_filter_time, + + kTimingComponents, +} TIMING_COMPONENT; + +static INLINE char const *get_component_name(int index) { + switch (index) { + case vp9_get_compressed_data_time: return "vp9_get_compressed_data_time"; + case vp9_temporal_filter_time: return "vp9_temporal_filter_time"; + case vp9_rc_get_second_pass_params_time: + return "vp9_rc_get_second_pass_params_time"; + case setup_tpl_stats_time: return "setup_tpl_stats_time"; + case Pass2Encode_time: return "Pass2Encode_time"; + + case encode_with_recode_loop_time: return "encode_with_recode_loop_time"; + case loopfilter_frame_time: return "loopfilter_frame_time"; + case vp9_pack_bitstream_time: return "vp9_pack_bitstream_time"; + + case encode_frame_internal_time: return "encode_frame_internal_time"; + case rd_pick_partition_time: return "rd_pick_partition_time"; + case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time"; + case encode_sb_time: return "encode_sb_time"; + + case vp9_rd_pick_inter_mode_sb_time: + return "vp9_rd_pick_inter_mode_sb_time"; + case vp9_rd_pick_inter_mode_sub8x8_time: + return "vp9_rd_pick_inter_mode_sub8x8_time"; + + case intra_mode_search_time: return "intra_mode_search_time"; + case handle_inter_mode_time: return "handle_inter_mode_time"; + case single_motion_search_time: return "single_motion_search_time"; + case joint_motion_search_time: return "joint_motion_search_time"; + case interp_filter_time: return "interp_filter_time"; + + default: assert(0); + } + return "error"; +} +#endif + typedef struct VP9_COMP { FRAME_INFO frame_info; QUANTS quants; @@ -973,6 +1030,22 @@ typedef struct VP9_COMP { EXT_RATECTRL ext_ratectrl; int fixed_qp_onepass; + +#if CONFIG_COLLECT_COMPONENT_TIMING + /*! + * component_time[] are initialized to zero while encoder starts. + */ + uint64_t component_time[kTimingComponents]; + /*! + * Stores timing for individual components between calls of start_timing() + * and end_timing(). + */ + struct vpx_usec_timer component_timer[kTimingComponents]; + /*! + * frame_component_time[] are initialized to zero at beginning of each frame. 
+ */ + uint64_t frame_component_time[kTimingComponents]; +#endif } VP9_COMP; #if CONFIG_RATE_CTRL @@ -1154,8 +1227,8 @@ int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, int cols); -int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode); +int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode); int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, unsigned int height); @@ -1392,6 +1465,38 @@ int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr); #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) +static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { + RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx]; + if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || + new_fb_ptr->mi_cols < cm->mi_cols) { + vpx_free(new_fb_ptr->mvs); + CHECK_MEM_ERROR(cm, new_fb_ptr->mvs, + (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, + sizeof(*new_fb_ptr->mvs))); + new_fb_ptr->mi_rows = cm->mi_rows; + new_fb_ptr->mi_cols = cm->mi_cols; + } +} + +#if CONFIG_COLLECT_COMPONENT_TIMING +static INLINE void start_timing(VP9_COMP *cpi, int component) { + vpx_usec_timer_start(&cpi->component_timer[component]); +} +static INLINE void end_timing(VP9_COMP *cpi, int component) { + vpx_usec_timer_mark(&cpi->component_timer[component]); + cpi->frame_component_time[component] += + vpx_usec_timer_elapsed(&cpi->component_timer[component]); +} +static INLINE char const *get_frame_type_enum(int type) { + switch (type) { + case 0: return "KEY_FRAME"; + case 1: return "INTER_FRAME"; + default: assert(0); + } + return "error"; +} +#endif + #ifdef __cplusplus } // extern "C" #endif diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rd.h b/media/libvpx/libvpx/vp9/encoder/vp9_rd.h index d2bc5e60ed48..efd854edf403 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_rd.h +++ b/media/libvpx/libvpx/vp9/encoder/vp9_rd.h @@ -121,11 +121,9 @@ typedef struct RD_OPT { int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES]; int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES]; int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL int RDMULT; int RDDIV; double r0; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c index a464ce38f115..498bc0fbd5e2 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c @@ -2832,8 +2832,14 @@ static int64_t handle_inter_mode( frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, joint_motion_search_time); +#endif joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, single_newmv, &rate_mv); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, joint_motion_search_time); +#endif } else { rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv, &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv, @@ -2845,7 +2851,13 @@ static int64_t handle_inter_mode( *rate2 += rate_mv; } else { int_mv tmp_mv; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, single_motion_search_time); +#endif single_motion_search(cpi, x, bsize, mi_row, mi_col, 
&tmp_mv, &rate_mv); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, single_motion_search_time); +#endif if (tmp_mv.as_int == INVALID_MV) return INT64_MAX; frame_mv[refs[0]].as_int = xd->mi[0]->bmi[0].as_mv[0].as_int = @@ -2908,6 +2920,9 @@ static int64_t handle_inter_mode( intpel_mv = !mv_has_subpel(&mi->mv[0].as_mv); if (is_comp_pred) intpel_mv &= !mv_has_subpel(&mi->mv[1].as_mv); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, interp_filter_time); +#endif // Search for best switchable filter by checking the variance of // pred error irrespective of whether the filter will be used for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX; @@ -3005,6 +3020,9 @@ static int64_t handle_inter_mode( restore_dst_buf(xd, orig_dst, orig_dst_stride); } } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, interp_filter_time); +#endif // Set the appropriate filter mi->interp_filter = cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter; @@ -3707,19 +3725,30 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (ref_frame == INTRA_FRAME) { TX_SIZE uv_tx; struct macroblockd_plane *const pd = &xd->plane[1]; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, intra_mode_search_time); +#endif memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize, best_rd, recon); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, intra_mode_search_time); +#endif if (rate_y == INT_MAX) continue; uv_tx = uv_txsize_lookup[bsize][mi->tx_size][pd->subsampling_x] [pd->subsampling_y]; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, intra_mode_search_time); +#endif if (rate_uv_intra[uv_tx] == INT_MAX) { choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]); } - +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, intra_mode_search_time); +#endif rate_uv = rate_uv_tokenonly[uv_tx]; distortion_uv = dist_uv[uv_tx]; skippable = skippable && skip_uv[uv_tx]; @@ -3730,11 +3759,17 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, handle_inter_mode_time); +#endif this_rd = handle_inter_mode( cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv, recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv, single_inter_filter, single_skippable, &total_sse, best_rd, &mask_filter, filter_cache); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, handle_inter_mode_time); +#endif if (this_rd == INT64_MAX) continue; compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred); @@ -3970,13 +4005,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } if (best_mode_index < 0 || best_rd >= best_rd_so_far) { -// If adaptive interp filter is enabled, then the current leaf node of 8x8 -// data is needed for sub8x8. Hence preserve the context. -#if CONFIG_CONSISTENT_RECODE + // If adaptive interp filter is enabled, then the current leaf node of 8x8 + // data is needed for sub8x8. Hence preserve the context. 
if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; -#else - if (cpi->row_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; -#endif rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c index 0431d8a452fb..19e37537b23c 100644 --- a/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c +++ b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c @@ -16,8 +16,11 @@ #include "vpx_dsp/vpx_dsp_common.h" // Mesh search patters for various speed settings -static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = { - { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } +// Define 2 mesh density levels for FC_GRAPHICS_ANIMATION content type and non +// FC_GRAPHICS_ANIMATION content type. +static MESH_PATTERN best_quality_mesh_pattern[2][MAX_MESH_STEP] = { + { { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } }, + { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, }; #if !CONFIG_REALTIME_ONLY @@ -209,15 +212,18 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, const int boosted = frame_is_boosted(cpi); int i; - sf->tx_size_search_breakout = 1; + sf->adaptive_pred_interp_filter = 1; sf->adaptive_rd_thresh = 1; sf->adaptive_rd_thresh_row_mt = 0; sf->allow_skip_recode = 1; sf->less_rectangular_check = 1; - sf->use_square_partition_only = !boosted; + sf->mv.auto_mv_step_size = 1; sf->prune_ref_frame_for_rect_partitions = 1; - sf->rd_ml_partition.var_pruning = 1; + sf->temporal_filter_search_method = NSTEP; + sf->tx_size_search_breakout = 1; + sf->use_square_partition_only = !boosted; + sf->rd_ml_partition.var_pruning = 1; sf->rd_ml_partition.prune_rect_thresh[0] = -1; sf->rd_ml_partition.prune_rect_thresh[1] = 350; sf->rd_ml_partition.prune_rect_thresh[2] = 325; @@ -238,7 +244,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, } if (speed >= 1) { - sf->temporal_filter_search_method = NSTEP; sf->rd_ml_partition.var_pruning = !boosted; sf->rd_ml_partition.prune_rect_thresh[1] = 225; sf->rd_ml_partition.prune_rect_thresh[2] = 225; @@ -263,11 +268,9 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->less_rectangular_check = 1; sf->use_rd_breakout = 1; sf->adaptive_motion_search = 1; - sf->mv.auto_mv_step_size = 1; sf->adaptive_rd_thresh = 2; sf->mv.subpel_search_level = 1; if (cpi->oxcf.content != VP9E_CONTENT_FILM) sf->mode_skip_start = 10; - sf->adaptive_pred_interp_filter = 1; sf->allow_acl = 0; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; @@ -991,10 +994,14 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->exhaustive_searches_thresh = (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20) : INT_MAX; - if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + { + const int mesh_density_level = + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? 
0 : 1; for (i = 0; i < MAX_MESH_STEP; ++i) { - sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range; - sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval; + sf->mesh_patterns[i].range = + best_quality_mesh_pattern[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + best_quality_mesh_pattern[mesh_density_level][i].interval; } } diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c new file mode 100644 index 000000000000..b0c735167ef9 --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.c @@ -0,0 +1,1400 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <math.h> + +#include "./vpx_dsp_rtcd.h" +#if CONFIG_NON_GREEDY_MV +#include "vp9/common/vp9_mvref_common.h" +#endif +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_tpl_model.h" + +static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, + const GF_GROUP *gf_group, int *tpl_group_frames) { + VP9_COMMON *cm = &cpi->common; + int frame_idx = 0; + int i; + int gld_index = -1; + int alt_index = -1; + int lst_index = -1; + int arf_index_stack[MAX_ARF_LAYERS]; + int arf_stack_size = 0; + int extend_frame_count = 0; + int pframe_qindex = cpi->tpl_stats[2].base_qindex; + int frame_gop_offset = 0; + + RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs; + int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS]; + + memset(recon_frame_index, -1, sizeof(recon_frame_index)); + stack_init(arf_index_stack, MAX_ARF_LAYERS); + + // TODO(jingning): To be used later for gf frame type parsing. + (void)gf_group; + + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (frame_bufs[i].ref_count == 0) { + alloc_frame_mvs(cm, i); + if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + + recon_frame_index[frame_idx] = i; + ++frame_idx; + + if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break; + } + } + + for (i = 0; i < REFS_PER_FRAME + 1; ++i) { + assert(recon_frame_index[i] >= 0); + cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf; + } + + *tpl_group_frames = 0; + + // Initialize Golden reference frame.
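// A sketch of the gf_picture[] layout this function builds (inferred from
// the assignments below, not normative): slot 0 is the golden frame carried
// over from the previous group, slot 1 the base-layer ARF, and slots 2..
// the remaining frames in coding order. Each ref_frame[0..2] entry is an
// index back into gf_picture[] in {golden, last, altref} order, with -1
// marking an empty slot; e.g. a frame referencing only the group's golden
// frame and its ARF carries ref_frame[] = { 0, -1, 1 }.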
+ gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1; + gf_picture[0].update_type = gf_group->update_type[0]; + gld_index = 0; + ++*tpl_group_frames; + + // Initialize base layer ARF frame + gf_picture[1].frame = cpi->Source; + gf_picture[1].ref_frame[0] = gld_index; + gf_picture[1].ref_frame[1] = lst_index; + gf_picture[1].ref_frame[2] = alt_index; + gf_picture[1].update_type = gf_group->update_type[1]; + alt_index = 1; + ++*tpl_group_frames; + + // Initialize P frames + for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + struct lookahead_entry *buf; + frame_gop_offset = gf_group->frame_gop_index[frame_idx]; + buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx]; + + switch (gf_group->update_type[frame_idx]) { + case ARF_UPDATE: + stack_push(arf_index_stack, alt_index, arf_stack_size); + ++arf_stack_size; + alt_index = frame_idx; + break; + case LF_UPDATE: lst_index = frame_idx; break; + case OVERLAY_UPDATE: + gld_index = frame_idx; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + case USE_BUF_FRAME: + lst_index = alt_index; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + default: break; + } + + ++*tpl_group_frames; + + // The length of group of pictures is baseline_gf_interval, plus the + // beginning golden frame from last GOP, plus the last overlay frame in + // the same GOP. + if (frame_idx == gf_group->gf_group_size) break; + } + + alt_index = -1; + ++frame_idx; + ++frame_gop_offset; + + // Extend two frames outside the current gf group. + for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) { + struct lookahead_entry *buf = + vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = LF_UPDATE; + lst_index = frame_idx; + ++*tpl_group_frames; + ++extend_frame_count; + ++frame_gop_offset; + } +} + +static void init_tpl_stats(VP9_COMP *cpi) { + int frame_idx; + for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + memset(tpl_frame->tpl_stats_ptr, 0, + tpl_frame->height * tpl_frame->width * + sizeof(*tpl_frame->tpl_stats_ptr)); + tpl_frame->is_valid = 0; + } +} + +#if CONFIG_NON_GREEDY_MV +static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, + MotionField *motion_field, + int frame_idx, uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, + BLOCK_SIZE bsize, int mi_row, + int mi_col, MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + int step_param; + uint32_t bestsme = UINT_MAX; + const MvLimits tmp_mv_limits = x->mv_limits; + // lambda is used to adjust the importance of motion vector consistency. + // TODO(angiebird): Figure out lambda's proper value. 
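// build_motion_field() below sets lambda = (pw * ph) >> 2, i.e. 256 for a
// 32x32 block, so the consistency penalty scales with block area roughly
// in step with the SAD term it is traded off against.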
+ const int lambda = cpi->tpl_stats[frame_idx].lambda; + int_mv nb_full_mvs[NB_MVS_NUM]; + int nb_full_mv_num; + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + + nb_full_mv_num = + vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs); + vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param, + lambda, 1, nb_full_mvs, nb_full_mv_num, mv); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + return bestsme; +} + +static uint32_t sub_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, + BLOCK_SIZE bsize, MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + uint32_t bestsme = UINT_MAX; + uint32_t distortion; + uint32_t sse; + int cost_list[5]; + + MV best_ref_mv1 = { 0, 0 }; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + // TODO(yunqing): may use higher tap interp filter than 2 taps. + // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step( + x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); + + return bestsme; +} + +#else // CONFIG_NON_GREEDY_MV +static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, + int stride, BLOCK_SIZE bsize, + MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + const SEARCH_METHODS search_method = NSTEP; + int step_param; + int sadpb = x->sadperbit16; + uint32_t bestsme = UINT_MAX; + uint32_t distortion; + uint32_t sse; + int cost_list[5]; + const MvLimits tmp_mv_limits = x->mv_limits; + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + + vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param, + search_method, sadpb, cond_cost_list(cpi, cost_list), + &best_ref_mv1, mv, 0, 0); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + // TODO(yunqing): may use higher tap interp filter than 2 taps. 
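// As in the non-greedy sub-pel helper above, the cheap 2-tap (bilinear)
// kernel is enough here since the TPL pass only needs a distortion
// estimate, not a final coding decision. MVs are in 1/8-pel units
// throughout, hence the earlier >> 3 conversion to full-pel (a component
// of 16 is exactly 2 whole pixels).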
+ // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step( + x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); + + return bestsme; +} +#endif + +static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, + int ref_pos_col, int block, BLOCK_SIZE bsize) { + int width = 0, height = 0; + int bw = 4 << b_width_log2_lookup[bsize]; + int bh = 4 << b_height_log2_lookup[bsize]; + + switch (block) { + case 0: + width = grid_pos_col + bw - ref_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 1: + width = ref_pos_col + bw - grid_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 2: + width = grid_pos_col + bw - ref_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + case 3: + width = ref_pos_col + bw - grid_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + default: assert(0); + } + + return width * height; +} + +static int round_floor(int ref_pos, int bsize_pix) { + int round; + if (ref_pos < 0) + round = -(1 + (-ref_pos - 1) / bsize_pix); + else + round = ref_pos / bsize_pix; + + return round; +} + +static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, + BLOCK_SIZE bsize, int stride) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx]; + const int64_t mc_flow = tpl_ptr->mc_flow; + const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost; + *tpl_ptr = *src_stats; + tpl_ptr->mc_flow = mc_flow; + tpl_ptr->mc_ref_cost = mc_ref_cost; + tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow; + } + } +} + +static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index]; + TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr; + MV mv = tpl_stats->mv.as_mv; + int mv_row = mv.row >> 3; + int mv_col = mv.col >> 3; + + int ref_pos_row = mi_row * MI_SIZE + mv_row; + int ref_pos_col = mi_col * MI_SIZE + mv_col; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int pix_num = bw * bh; + + // top-left on grid block location in pixel + int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; + int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; + int block; + + for (block = 0; block < 4; ++block) { + int grid_pos_row = grid_pos_row_base + bh * (block >> 1); + int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); + + if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && + grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { + int overlap_area = get_overlap_area( + grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); + int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; + int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; + + int64_t mc_flow = tpl_stats->mc_dep_cost - + 
(tpl_stats->mc_dep_cost * tpl_stats->inter_cost) / + tpl_stats->intra_cost; + + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *des_stats = + &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride + + (ref_mi_col + idx)]; + + des_stats->mc_flow += (mc_flow * overlap_area) / pix_num; + des_stats->mc_ref_cost += + ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) / + pix_num; + assert(overlap_area >= 0); + } + } + } + } +} + +static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + int idx, idy; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = + &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)]; + tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx, + BLOCK_8X8); + } + } +} + +static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + TX_SIZE tx_size, int64_t *recon_error, + int64_t *sse) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + uint16_t eob; + int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; + const int shift = tx_size == TX_32X32 ? 0 : 2; + + // skip block condition should be handled before this is called. + assert(!x->skip_block); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, + qcoeff, dqcoeff, pd->dequant, &eob, + scan_order->scan, scan_order->iscan); + } else { + vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, &eob, scan_order->scan, + scan_order->iscan); + } +#else + vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, + dqcoeff, pd->dequant, &eob, scan_order->scan, + scan_order->iscan); +#endif // CONFIG_VP9_HIGHBITDEPTH + + *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift; + *recon_error = VPXMAX(*recon_error, 1); + + *sse = (*sse) >> shift; + *sse = VPXMAX(*sse, 1); +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms. 
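// Until such SIMD versions exist, the portable vpx_highbd_hadamard_*
// kernels dispatched below do the work. The Hadamard butterfly stands in
// for a true forward DCT here: mode_estimation() only feeds the
// coefficients to vpx_satd() / vpx_highbd_satd() to get a
// transform-domain SAD, so an exact DCT is unnecessary.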
+ switch (tx_size) { + case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + switch (tx_size) { + case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} + +static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, + int mi_col) { + x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.row_max = + (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND); + x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.col_max = + ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); +} + +static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + struct scale_factors *sf, GF_PICTURE *gf_picture, + int frame_idx, TplDepFrame *tpl_frame, + int16_t *src_diff, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, + int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, + YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, + int64_t *recon_error, int64_t *sse) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int pix_num = bw * bh; + int best_rf_idx = -1; + int_mv best_mv; + int64_t best_inter_cost = INT64_MAX; + int64_t inter_cost; + int rf_idx; + const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; + + int64_t best_intra_cost = INT64_MAX; + int64_t intra_cost; + PREDICTION_MODE mode; + int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + MODE_INFO mi_above, mi_left; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + TplDepStats *tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + + xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); + xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; + xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); + xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8; + xd->above_mi = (mi_row > 0) ? &mi_above : NULL; + xd->left_mi = (mi_col > 0) ? 
&mi_left : NULL; + + // Intra prediction search + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + uint8_t *src, *dst; + int src_stride, dst_stride; + + src = xd->cur_buf->y_buffer + mb_y_offset; + src_stride = xd->cur_buf->y_stride; + + dst = &predictor[0]; + dst_stride = bw; + + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + + vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src, + src_stride, dst, dst_stride, 0, 0, 0); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride, xd->bd); + vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); + } +#else + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (intra_cost < best_intra_cost) best_intra_cost = intra_cost; + } + + // Motion compensated prediction + best_mv.as_int = 0; + + set_mv_limits(cm, x, mi_row, mi_col); + + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + int_mv mv; +#if CONFIG_NON_GREEDY_MV + MotionField *motion_field; +#endif + if (ref_frame[rf_idx] == NULL) continue; + +#if CONFIG_NON_GREEDY_MV + (void)td; + motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); +#else + motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, bsize, &mv.as_mv); +#endif + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset), + ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, + mi_row * MI_SIZE, xd->bd); + vpx_highbd_subtract_block( + bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw, xd->bd); + vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vp9_build_inter_predictor( + ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh, + 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); + } +#else + vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, + mi_col * MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); +#endif + + if (inter_cost < best_inter_cost) { + best_rf_idx = rf_idx; + best_inter_cost = inter_cost; + best_mv.as_int = mv.as_int; + 
get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, + sse); + } + } + best_intra_cost = VPXMAX(best_intra_cost, 1); + best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost); + tpl_stats->inter_cost = VPXMAX( + 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->intra_cost = VPXMAX( + 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; + tpl_stats->mv.as_int = best_mv.as_int; +} + +#if CONFIG_NON_GREEDY_MV +static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture, + int frame_idx, int rf_idx, int mi_row, + int mi_col, struct buf_2d *src, + struct buf_2d *pre) { + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + YV12_BUFFER_CONFIG *ref_frame = NULL; + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + ref_frame = gf_picture[ref_frame_idx].frame; + src->buf = xd->cur_buf->y_buffer + mb_y_offset; + src->stride = xd->cur_buf->y_stride; + pre->buf = ref_frame->y_buffer + mb_y_offset; + pre->stride = ref_frame->y_stride; + assert(src->stride == pre->stride); + return 1; + } else { + printf("invalid ref_frame_idx"); + assert(ref_frame_idx != -1); + return 0; + } +} + +#define kMvPreCheckLines 5 +#define kMvPreCheckSize 15 + +#define MV_REF_POS_NUM 3 +POSITION mv_ref_pos[MV_REF_POS_NUM] = { + { -1, 0 }, + { 0, -1 }, + { -1, -1 }, +}; + +static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row, + int mi_col) { + return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col]; +} + +static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + int i; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int_mv nearest_mv, near_mv, invalid_mv; + nearest_mv.as_int = INVALID_MV; + near_mv.as_int = INVALID_MV; + invalid_mv.as_int = INVALID_MV; + for (i = 0; i < MV_REF_POS_NUM; ++i) { + int nb_row = mi_row + mv_ref_pos[i].row * mi_height; + int nb_col = mi_col + mv_ref_pos[i].col * mi_width; + assert(mv_ref_pos[i].row <= 0); + assert(mv_ref_pos[i].col <= 0); + if (nb_row >= 0 && nb_col >= 0) { + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + } else { + int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + if (mv.as_int == nearest_mv.as_int) { + continue; + } else { + near_mv = mv; + break; + } + } + } + } + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv.as_mv.row = 0; + nearest_mv.as_mv.col = 0; + } + if (near_mv.as_int == INVALID_MV) { + near_mv.as_mv.row = 0; + near_mv.as_mv.col = 0; + } + if (mv_mode == NEAREST_MV_MODE) { + return nearest_mv; + } + if (mv_mode == NEAR_MV_MODE) { + return near_mv; + } + assert(0); + return invalid_mv; +} + +static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi, + MotionField *motion_field, + TplDepFrame *tpl_frame, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + int_mv mv; + switch (mv_mode) { + case ZERO_MV_MODE: + mv.as_mv.row = 0; + mv.as_mv.col = 0; + break; + case NEW_MV_MODE: + mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); + break; + case NEAREST_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + case NEAR_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + default: + mv.as_int = INVALID_MV; + 
assert(0); + break; + } + return mv; +} + +static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *mv) { + uint32_t sse; + struct buf_2d src; + struct buf_2d pre; + MV full_mv; + *mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, bsize, + mi_row, mi_col); + full_mv = get_full_mv(&mv->as_mv); + if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col, + &src, &pre)) { + // TODO(angiebird): Consider subpixel when computing the sse. + cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv), + pre.stride, &sse); + return (double)(sse << VP9_DIST_SCALE_LOG2); + } else { + assert(0); + return 0; + } +} + +static int get_mv_mode_cost(int mv_mode) { + // TODO(angiebird): The probabilities are roughly inferred from + // default_inter_mode_probs. Check if there is a better way to set the + // probabilities. + const int zero_mv_prob = 16; + const int new_mv_prob = 24 * 1; + const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob; + assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256); + switch (mv_mode) { + case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break; + case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break; + case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + default: assert(0); return -1; + } +} + +static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) { + double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) + + log2(1 + abs(new_mv->col - ref_mv->col)); + mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT); + return mv_diff_cost; +} +static double get_mv_cost(int mv_mode, VP9_COMP *cpi, MotionField *motion_field, + TplDepFrame *tpl_frame, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + double mv_cost = get_mv_mode_cost(mv_mode); + if (mv_mode == NEW_MV_MODE) { + MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, + bsize, mi_row, mi_col) + .as_mv; + MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, motion_field, + tpl_frame, bsize, mi_row, mi_col) + .as_mv; + MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, motion_field, tpl_frame, + bsize, mi_row, mi_col) + .as_mv; + double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv); + double near_cost = get_mv_diff_cost(&new_mv, &near_mv); + mv_cost += nearest_cost < near_cost ? 
nearest_cost : near_cost; + } + return mv_cost; +} + +static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *mv) { + MACROBLOCKD *xd = &x->e_mbd; + double mv_dist = + get_mv_dist(mv_mode, cpi, xd, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, mv); + double mv_cost = + get_mv_cost(mv_mode, cpi, motion_field, tpl_frame, bsize, mi_row, mi_col); + double mult = 180; + + return mv_cost + mult * log2f(1 + mv_dist); +} + +static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, + MotionField *motion_field, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + double *rd, int_mv *mv) { + int best_mv_mode = ZERO_MV_MODE; + int update = 0; + int mv_mode; + *rd = 0; + for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) { + double this_rd; + int_mv this_mv; + if (mv_mode == NEW_MV_MODE) { + continue; + } + this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, &this_mv); + if (update == 0) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + update = 1; + } else { + if (this_rd < *rd) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + } + } + } + return best_mv_mode; +} + +static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int tmp_mv_mode_arr[kMvPreCheckSize]; + int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx]; + double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx]; + int_mv *select_mv_arr = cpi->select_mv_arr; + int_mv tmp_select_mv_arr[kMvPreCheckSize]; + int stride = tpl_frame->stride; + double new_mv_rd = 0; + double no_new_mv_rd = 0; + double this_new_mv_rd = 0; + double this_no_new_mv_rd = 0; + int idx; + int tmp_idx; + assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1); + + // no new mv + // diagonal scan order + tmp_idx = 0; + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( + cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, + bsize, nb_row, nb_col, &this_rd, mv); + if (r == 0 && c == 0) { + this_no_new_mv_rd = this_rd; + } + no_new_mv_rd += this_rd; + tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col]; + tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col]; + ++tmp_idx; + } + } + } + + // new mv + mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE; + this_new_mv_rd = eval_mv_mode( + NEW_MV_MODE, cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, + rf_idx, bsize, mi_row, mi_col, &select_mv_arr[mi_row * stride + mi_col]); + new_mv_rd = this_new_mv_rd; + // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE + // beforehand. 
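// The scan below revisits the same triangular neighborhood as the
// "no new mv" pass above: kMvPreCheckLines diagonals cover
// 1 + 2 + ... + kMvPreCheckLines = (5 * 6) / 2 = 15 block units (the
// kMvPreCheckSize checked by the assert), i.e. every unit whose
// best_mv_mode choice could be perturbed by switching this block to
// NEW_MV_MODE.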
+ for (idx = 1; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( + cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, + bsize, nb_row, nb_col, &this_rd, mv); + new_mv_rd += this_rd; + } + } + } + + // update best_mv_mode + tmp_idx = 0; + if (no_new_mv_rd < new_mv_rd) { + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx]; + select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx]; + ++tmp_idx; + } + } + } + rd_diff_arr[mi_row * stride + mi_col] = 0; + } else { + rd_diff_arr[mi_row * stride + mi_col] = + (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd); + } +} + +static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, + MotionField *motion_field, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int unit_rows = tpl_frame->mi_rows / mi_height; + const int unit_cols = tpl_frame->mi_cols / mi_width; + const int max_diagonal_lines = unit_rows + unit_cols - 1; + int idx; + for (idx = 0; idx < max_diagonal_lines; ++idx) { + int r; + for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1); + ++r) { + int c = idx - r; + int mi_row = r * mi_height; + int mi_col = c * mi_width; + assert(c >= 0 && c < unit_cols); + assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows); + assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols); + predict_mv_mode(cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, + rf_idx, bsize, mi_row, mi_col); + } + } +} + +static void do_motion_search(VP9_COMP *cpi, ThreadData *td, + MotionField *motion_field, int frame_idx, + YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + VP9_COMMON *cm = &cpi->common; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + assert(ref_frame != NULL); + set_mv_limits(cm, x, mi_row, mi_col); + { + int_mv mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); + uint8_t *cur_frame_buf = xd->cur_buf->y_buffer + mb_y_offset; + uint8_t *ref_frame_buf = ref_frame->y_buffer + mb_y_offset; + const int stride = xd->cur_buf->y_stride; + full_pixel_motion_search(cpi, td, motion_field, frame_idx, cur_frame_buf, + ref_frame_buf, stride, bsize, mi_row, mi_col, + &mv.as_mv); + sub_pixel_motion_search(cpi, td, cur_frame_buf, ref_frame_buf, stride, + bsize, &mv.as_mv); + vp9_motion_field_mi_set_mv(motion_field, mi_row, mi_col, mv); + } +} + +static void build_motion_field( + VP9_COMP *cpi, int frame_idx, + YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES], BLOCK_SIZE bsize) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int 
pw = num_4x4_blocks_wide_lookup[bsize] << 2; + const int ph = num_4x4_blocks_high_lookup[bsize] << 2; + int mi_row, mi_col; + int rf_idx; + + tpl_frame->lambda = (pw * ph) >> 2; + assert(pw * ph == tpl_frame->lambda << 2); + + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + if (ref_frame[rf_idx] == NULL) { + continue; + } + vp9_motion_field_reset_mvs(motion_field); + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + do_motion_search(cpi, td, motion_field, frame_idx, ref_frame[rf_idx], + bsize, mi_row, mi_col); + } + } + } +} +#endif // CONFIG_NON_GREEDY_MV + +static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, + int frame_idx, BLOCK_SIZE bsize) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; + YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL }; + + VP9_COMMON *cm = &cpi->common; + struct scale_factors sf; + int rdmult, idx; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + int mi_row, mi_col; + +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]); + uint8_t *predictor; +#else + DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]); +#endif + DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); + + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int64_t recon_error, sse; +#if CONFIG_NON_GREEDY_MV + int square_block_idx; + int rf_idx; +#endif + + // Setup scaling factor +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height, + cpi->common.use_highbitdepth); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + predictor = CONVERT_TO_BYTEPTR(predictor16); + else + predictor = predictor8; +#else + vp9_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Prepare reference frame pointers. If any reference frame slot is + // unavailable, the pointer will be set to Null. + for (idx = 0; idx < MAX_INTER_REF_FRAMES; ++idx) { + int rf_idx = gf_picture[frame_idx].ref_frame[idx]; + if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame; + } + + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + xd->cur_buf = this_frame; + + // Get rd multiplier set up. 
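// Everything rd-related in this pass is keyed off the stored
// tpl_frame->base_qindex rather than the quantizer of the frame currently
// being coded: the rd multiplier (roughly 88 * q * q / 24 on that
// qindex's DC quant step, per vp9_rd.c), the error-per-bit constant
// derived from it, and the motion estimation cost constants.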
+ rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex); + set_error_per_bit(&cpi->td.mb, rdmult); + vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex); + + tpl_frame->is_valid = 1; + + cm->base_qindex = tpl_frame->base_qindex; + vp9_frame_init_quantizer(cpi); + +#if CONFIG_NON_GREEDY_MV + for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; + ++square_block_idx) { + BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx); + build_motion_field(cpi, frame_idx, ref_frame, square_bsize); + } + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize); + } + } +#endif + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, + src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, + tx_size, ref_frame, predictor, &recon_error, &sse); + // Motion flow dependency dispenser. + tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, + tpl_frame->stride); + + tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, + bsize); + } + } +} + +#if CONFIG_NON_GREEDY_MV +#define DUMP_TPL_STATS 0 +#if DUMP_TPL_STATS +static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) { + int i, j; + printf("%d %d\n", h, w); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + printf("%d ", buf[(row + i) * stride + col + j]); + } + } + printf("\n"); +} + +static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) { + dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height, + frame_buf->y_width); + dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); + dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); +} + +static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames, + const GF_GROUP *gf_group, + const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) { + int frame_idx; + const VP9_COMMON *cm = &cpi->common; + int rf_idx; + for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) { + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + int mi_row, mi_col; + int ref_frame_idx; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame; + const int gf_frame_offset = gf_group->frame_gop_index[frame_idx]; + const int ref_gf_frame_offset = + gf_group->frame_gop_index[ref_frame_idx]; + printf("=\n"); + printf( + "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d " + "rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n", + frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE, + ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset); + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + int_mv mv = 
vp9_motion_field_info_get_mv(&cpi->motion_field_info, + frame_idx, rf_idx, bsize, + mi_row, mi_col); + printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row, + mv.as_mv.col); + } + } + } + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + const TplDepStats *tpl_ptr = + &tpl_frame + ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + printf("%f ", tpl_ptr->feature_score); + } + } + } + printf("\n"); + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + const int mv_mode = + tpl_frame + ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col]; + printf("%d ", mv_mode); + } + } + printf("\n"); + + dump_frame_buf(gf_picture[frame_idx].frame); + dump_frame_buf(ref_frame_buf); + } + } + } +} +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV + +void vp9_init_tpl_buffer(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int frame; + + const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); +#if CONFIG_NON_GREEDY_MV + int rf_idx; + + vpx_free(cpi->select_mv_arr); + CHECK_MEM_ERROR( + cm, cpi->select_mv_arr, + vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr))); +#endif + + // TODO(jingning): Reduce the actual memory use for tpl model build up. + for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { + if (cpi->tpl_stats[frame].width >= mi_cols && + cpi->tpl_stats[frame].height >= mi_rows && + cpi->tpl_stats[frame].tpl_stats_ptr) + continue; + +#if CONFIG_NON_GREEDY_MV + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); + CHECK_MEM_ERROR( + cm, cpi->tpl_stats[frame].mv_mode_arr[rf_idx], + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx]))); + vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); + CHECK_MEM_ERROR( + cm, cpi->tpl_stats[frame].rd_diff_arr[rf_idx], + vpx_calloc(mi_rows * mi_cols * 4, + sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx]))); + } +#endif + vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); + CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr, + vpx_calloc(mi_rows * mi_cols, + sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr))); + cpi->tpl_stats[frame].is_valid = 0; + cpi->tpl_stats[frame].width = mi_cols; + cpi->tpl_stats[frame].height = mi_rows; + cpi->tpl_stats[frame].stride = mi_cols; + cpi->tpl_stats[frame].mi_rows = cm->mi_rows; + cpi->tpl_stats[frame].mi_cols = cm->mi_cols; + } + + for (frame = 0; frame < REF_FRAMES; ++frame) { + cpi->enc_frame_buf[frame].mem_valid = 0; + cpi->enc_frame_buf[frame].released = 1; + } +} + +void vp9_free_tpl_buffer(VP9_COMP *cpi) { + int frame; +#if CONFIG_NON_GREEDY_MV + vp9_free_motion_field_info(&cpi->motion_field_info); + vpx_free(cpi->select_mv_arr); +#endif + for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { +#if CONFIG_NON_GREEDY_MV + int rf_idx; + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); + vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); + } +#endif + vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); + cpi->tpl_stats[frame].is_valid = 0; + } +} + +#if CONFIG_RATE_CTRL +static void accumulate_frame_tpl_stats(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int show_frame_count = 0; + int frame_idx; + // Accumulate tpl stats 
for each frame in the current group of picture. + for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const int tpl_stride = tpl_frame->stride; + int64_t intra_cost_base = 0; + int64_t inter_cost_base = 0; + int64_t mc_dep_cost_base = 0; + int64_t mc_ref_cost_base = 0; + int64_t mc_flow_base = 0; + int row, col; + + if (!tpl_frame->is_valid) continue; + + for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { + for (col = 0; col < cm->mi_cols; ++col) { + TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; + intra_cost_base += this_stats->intra_cost; + inter_cost_base += this_stats->inter_cost; + mc_dep_cost_base += this_stats->mc_dep_cost; + mc_ref_cost_base += this_stats->mc_ref_cost; + mc_flow_base += this_stats->mc_flow; + } + } + + cpi->tpl_stats_info[show_frame_count].intra_cost = intra_cost_base; + cpi->tpl_stats_info[show_frame_count].inter_cost = inter_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_dep_cost = mc_dep_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_ref_cost = mc_ref_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_flow = mc_flow_base; + + ++show_frame_count; + } +} +#endif // CONFIG_RATE_CTRL + +void vp9_setup_tpl_stats(VP9_COMP *cpi) { + GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE]; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int tpl_group_frames = 0; + int frame_idx; + cpi->tpl_bsize = BLOCK_32X32; + + init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); + + init_tpl_stats(cpi); + + // Backward propagation from tpl_group_frames to 1. + for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) { + if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue; + mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize); + } +#if CONFIG_NON_GREEDY_MV + cpi->tpl_ready = 1; +#if DUMP_TPL_STATS + dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize); +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV + +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + accumulate_frame_tpl_stats(cpi); + } +#endif // CONFIG_RATE_CTRL +} diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h new file mode 100644 index 000000000000..86a7734f829c --- /dev/null +++ b/media/libvpx/libvpx/vp9/encoder/vp9_tpl_model.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ +#define VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ + +#ifdef __cplusplus extern "C" { +#endif + +#ifndef M_LOG2_E +#define M_LOG2_E 0.693147180559945309417 +#endif +#define log2f(x) (log(x) / (float)M_LOG2_E) + +typedef struct GF_PICTURE { + YV12_BUFFER_CONFIG *frame; + int ref_frame[3]; + FRAME_UPDATE_TYPE update_type; +} GF_PICTURE; + +void vp9_init_tpl_buffer(VP9_COMP *cpi); +void vp9_setup_tpl_stats(VP9_COMP *cpi); +void vp9_free_tpl_buffer(VP9_COMP *cpi); + +void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); +#endif + +#ifdef __cplusplus } // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.cc b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc index 02e50a857ca9..944c526ac169 100644 --- a/media/libvpx/libvpx/vp9/ratectrl_rtc.cc +++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.cc @@ -48,6 +48,29 @@ std::unique_ptr<VP9RateControlRTC> VP9RateControlRTC::Create( return rc_api; } +VP9RateControlRTC::~VP9RateControlRTC() { + if (cpi_) { + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) { + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) { + int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers); + LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer]; + vpx_free(lc->map); + vpx_free(lc->last_coded_q_map); + vpx_free(lc->consec_zero_mv); + } + } + } + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + vpx_free(cpi_->segmentation_map); + cpi_->segmentation_map = NULL; + vp9_cyclic_refresh_free(cpi_->cyclic_refresh); + } + vpx_free(cpi_); + } +} + void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { VP9_COMMON *cm = &cpi_->common; VP9EncoderConfig *oxcf = &cpi_->oxcf; @@ -157,7 +180,7 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { cm->height = height; } vp9_set_mb_mi(cm, cm->width, cm->height); - cm->frame_type = frame_params.frame_type; + cm->frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type); // This is needed to ensure key frame does not get unset in rc_get_svc_params. cpi_->frame_flags = (cm->frame_type == KEY_FRAME) ? FRAMEFLAGS_KEY : 0; cpi_->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ?
1 : 0; diff --git a/media/libvpx/libvpx/vp9/ratectrl_rtc.h b/media/libvpx/libvpx/vp9/ratectrl_rtc.h index b209e4db669f..162a04883e73 100644 --- a/media/libvpx/libvpx/vp9/ratectrl_rtc.h +++ b/media/libvpx/libvpx/vp9/ratectrl_rtc.h @@ -19,14 +19,14 @@ #include "vp9/common/vp9_onyxc_int.h" #include "vp9/vp9_iface_common.h" #include "vp9/encoder/vp9_aq_cyclicrefresh.h" -#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/vp9_cx_iface.h" #include "vpx/internal/vpx_ratectrl_rtc.h" #include "vpx_mem/vpx_mem.h" -namespace libvpx { +struct VP9_COMP; +namespace libvpx { struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { public: VP9RateControlRtcConfig() { @@ -53,7 +53,7 @@ struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { }; struct VP9FrameParamsQpRTC { - FRAME_TYPE frame_type; + RcFrameType frame_type; int spatial_layer_id; int temporal_layer_id; }; @@ -90,28 +90,7 @@ class VP9RateControlRTC { public: static std::unique_ptr<VP9RateControlRTC> Create( const VP9RateControlRtcConfig &cfg); - ~VP9RateControlRTC() { - if (cpi_) { - if (cpi_->svc.number_spatial_layers > 1 || - cpi_->svc.number_temporal_layers > 1) { - for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) { - for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) { - int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers); - LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer]; - vpx_free(lc->map); - vpx_free(lc->last_coded_q_map); - vpx_free(lc->consec_zero_mv); - } - } - } - if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { - vpx_free(cpi_->segmentation_map); - cpi_->segmentation_map = NULL; - vp9_cyclic_refresh_free(cpi_->cyclic_refresh); - } - vpx_free(cpi_); - } - } + ~VP9RateControlRTC(); void UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg); // GetQP() needs to be called after ComputeQP() to get the latest QP @@ -125,7 +104,7 @@ class VP9RateControlRTC { private: VP9RateControlRTC() {} void InitRateControl(const VP9RateControlRtcConfig &cfg); - VP9_COMP *cpi_; + struct VP9_COMP *cpi_; } // namespace libvpx diff --git a/media/libvpx/libvpx/vp9/vp9_cx_iface.c b/media/libvpx/libvpx/vp9/vp9_cx_iface.c index 94593826d451..4c7eaed72507 100644 --- a/media/libvpx/libvpx/vp9/vp9_cx_iface.c +++ b/media/libvpx/libvpx/vp9/vp9_cx_iface.c @@ -1372,22 +1372,13 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, timebase_units_to_ticks(timestamp_ratio, pts + duration); res = image2yuvconfig(img, &sd); - if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) { - /* from vpx_encoder.h for g_w/g_h: - "Note that the frames passed as input to the encoder must have this - resolution" - */ - ctx->base.err_detail = "Invalid input frame resolution"; - res = VPX_CODEC_INVALID_PARAM; - } else { - // Store the original flags in to the frame buffer. Will extract the - // key frame flag when we actually encode this frame. - if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd, + // Store the original flags in to the frame buffer. Will extract the + // key frame flag when we actually encode this frame.
+ if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd, dst_time_stamp, dst_end_time_stamp)) { - res = update_error_state(ctx, &cpi->common.error); - } - ctx->next_frame_flags = 0; + res = update_error_state(ctx, &cpi->common.error); } + ctx->next_frame_flags = 0; } cx_data = ctx->cx_data; @@ -1684,9 +1675,8 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *); if (mode) { - const int res = - vp9_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode, - (VPX_SCALING)mode->v_scaling_mode); + const int res = vp9_set_internal_size(ctx->cpi, mode->h_scaling_mode, + mode->v_scaling_mode); return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM; } return VPX_CODEC_INVALID_PARAM; diff --git a/media/libvpx/libvpx/vp9/vp9cx.mk b/media/libvpx/libvpx/vp9/vp9cx.mk index 9072628f23f0..ae8fb85d8763 100644 --- a/media/libvpx/libvpx/vp9/vp9cx.mk +++ b/media/libvpx/libvpx/vp9/vp9cx.mk @@ -104,6 +104,8 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c endif VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.c VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h +VP9_CX_SRCS-yes += encoder/vp9_tpl_model.c +VP9_CX_SRCS-yes += encoder/vp9_tpl_model.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h diff --git a/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h index 65398c654d4a..33c57e219be1 100644 --- a/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h +++ b/media/libvpx/libvpx/vpx/internal/vpx_ratectrl_rtc.h @@ -14,6 +14,9 @@ #include "vpx/vpx_encoder.h" namespace libvpx { + +enum class RcFrameType { kKeyFrame = 0, kInterFrame = 1 }; + struct VpxRateControlRtcConfig { public: VpxRateControlRtcConfig() { diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c index 8a8aaa1ed467..fde71ff30df4 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct16x16_neon.c @@ -165,8 +165,8 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose top left and top right quarters into one contiguous location to // process to the top half. - transpose_s16_8x8_new(&temp0[0], &temp2[0]); - transpose_s16_8x8_new(&temp1[0], &temp2[8]); + transpose_s16_8x8q(&temp0[0], &temp2[0]); + transpose_s16_8x8q(&temp1[0], &temp2[8]); partial_round_shift(temp2); cross_input(temp2, temp3); vpx_fdct8x16_body(temp3, temp2); @@ -180,7 +180,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose bottom left and bottom right quarters into one contiguous // location to process to the bottom half. - transpose_s16_8x8_new(&temp0[8], &temp1[0]); + transpose_s16_8x8q(&temp0[8], &temp1[0]); transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], &temp1[13], &temp1[14], &temp1[15]); diff --git a/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c index d6818d2ec635..a91730ce8bf8 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/fdct32x32_neon.c @@ -60,10 +60,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. 
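// (Each dct_body_first_pass() call above leaves one transposed 8x32
// column strip in temp1..temp4; the four 8x8-tile transposes that follow
// gather the matching tiles from all four strips into one contiguous 8x32
// row for dct_body_second_pass(), and the same pattern repeats below for
// rows 8, 16 and 24.)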
- transpose_s16_8x8_new(&temp1[0], &temp0[0]); - transpose_s16_8x8_new(&temp2[0], &temp0[8]); - transpose_s16_8x8_new(&temp3[0], &temp0[16]); - transpose_s16_8x8_new(&temp4[0], &temp0[24]); + transpose_s16_8x8q(&temp1[0], &temp0[0]); + transpose_s16_8x8q(&temp2[0], &temp0[8]); + transpose_s16_8x8q(&temp3[0], &temp0[16]); + transpose_s16_8x8q(&temp4[0], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -78,10 +78,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output, temp5); // Second row of 8x32. - transpose_s16_8x8_new(&temp1[8], &temp0[0]); - transpose_s16_8x8_new(&temp2[8], &temp0[8]); - transpose_s16_8x8_new(&temp3[8], &temp0[16]); - transpose_s16_8x8_new(&temp4[8], &temp0[24]); + transpose_s16_8x8q(&temp1[8], &temp0[0]); + transpose_s16_8x8q(&temp2[8], &temp0[8]); + transpose_s16_8x8q(&temp3[8], &temp0[16]); + transpose_s16_8x8q(&temp4[8], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -96,10 +96,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_s16_8x8_new(&temp1[16], &temp0[0]); - transpose_s16_8x8_new(&temp2[16], &temp0[8]); - transpose_s16_8x8_new(&temp3[16], &temp0[16]); - transpose_s16_8x8_new(&temp4[16], &temp0[24]); + transpose_s16_8x8q(&temp1[16], &temp0[0]); + transpose_s16_8x8q(&temp2[16], &temp0[8]); + transpose_s16_8x8q(&temp3[16], &temp0[16]); + transpose_s16_8x8q(&temp4[16], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -114,10 +114,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 16 * 32, temp5); // Final row of 8x32. - transpose_s16_8x8_new(&temp1[24], &temp0[0]); - transpose_s16_8x8_new(&temp2[24], &temp0[8]); - transpose_s16_8x8_new(&temp3[24], &temp0[16]); - transpose_s16_8x8_new(&temp4[24], &temp0[24]); + transpose_s16_8x8q(&temp1[24], &temp0[0]); + transpose_s16_8x8q(&temp2[24], &temp0[8]); + transpose_s16_8x8q(&temp3[24], &temp0[16]); + transpose_s16_8x8q(&temp4[24], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -159,10 +159,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. - transpose_s16_8x8_new(&temp1[0], &temp0[0]); - transpose_s16_8x8_new(&temp2[0], &temp0[8]); - transpose_s16_8x8_new(&temp3[0], &temp0[16]); - transpose_s16_8x8_new(&temp4[0], &temp0[24]); + transpose_s16_8x8q(&temp1[0], &temp0[0]); + transpose_s16_8x8q(&temp2[0], &temp0[8]); + transpose_s16_8x8q(&temp3[0], &temp0[16]); + transpose_s16_8x8q(&temp4[0], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -177,10 +177,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output, temp5); // Second row of 8x32. 
- transpose_s16_8x8_new(&temp1[8], &temp0[0]); - transpose_s16_8x8_new(&temp2[8], &temp0[8]); - transpose_s16_8x8_new(&temp3[8], &temp0[16]); - transpose_s16_8x8_new(&temp4[8], &temp0[24]); + transpose_s16_8x8q(&temp1[8], &temp0[0]); + transpose_s16_8x8q(&temp2[8], &temp0[8]); + transpose_s16_8x8q(&temp3[8], &temp0[16]); + transpose_s16_8x8q(&temp4[8], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -195,10 +195,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_s16_8x8_new(&temp1[16], &temp0[0]); - transpose_s16_8x8_new(&temp2[16], &temp0[8]); - transpose_s16_8x8_new(&temp3[16], &temp0[16]); - transpose_s16_8x8_new(&temp4[16], &temp0[24]); + transpose_s16_8x8q(&temp1[16], &temp0[0]); + transpose_s16_8x8q(&temp2[16], &temp0[8]); + transpose_s16_8x8q(&temp3[16], &temp0[16]); + transpose_s16_8x8q(&temp4[16], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -213,10 +213,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 16 * 32, temp5); // Final row of 8x32. - transpose_s16_8x8_new(&temp1[24], &temp0[0]); - transpose_s16_8x8_new(&temp2[24], &temp0[8]); - transpose_s16_8x8_new(&temp3[24], &temp0[16]); - transpose_s16_8x8_new(&temp4[24], &temp0[24]); + transpose_s16_8x8q(&temp1[24], &temp0[0]); + transpose_s16_8x8q(&temp2[24], &temp0[8]); + transpose_s16_8x8q(&temp3[24], &temp0[16]); + transpose_s16_8x8q(&temp4[24], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c new file mode 100644 index 000000000000..3063acbb3e3f --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_avg_pred_neon.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i = height; + if (width > 8) { + do { + int j = 0; + do { + const uint16x8_t p = vld1q_u16(pred + j); + const uint16x8_t r = vld1q_u16(ref + j); + + uint16x8_t avg = vrhaddq_u16(p, r); + vst1q_u16(comp_pred + j, avg); + + j += 8; + } while (j < width); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } else if (width == 8) { + do { + const uint16x8_t p = vld1q_u16(pred); + const uint16x8_t r = vld1q_u16(ref); + + uint16x8_t avg = vrhaddq_u16(p, r); + vst1q_u16(comp_pred, avg); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } else { + assert(width == 4); + do { + const uint16x4_t p = vld1_u16(pred); + const uint16x4_t r = vld1_u16(ref); + + uint16x4_t avg = vrhadd_u16(p, r); + vst1_u16(comp_pred, avg); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } +} diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c new file mode 100644 index 000000000000..f731d38cc1a3 --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + uint16x4_t s = vld1_u16(src16_ptr + i * src_stride); + uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride); + uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride); + uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride); + uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride); + + sum[0] = vabal_u16(sum[0], s, r0); + sum[1] = vabal_u16(sum[1], s, r1); + sum[2] = vabal_u16(sum[2], s, r2); + sum[3] = vabal_u16(sum[3], s, r3); + + } while (++i < h); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref, + uint32x4_t *const sad_sum) { + uint16x8_t abs_diff = vabdq_u16(src, ref); + *sad_sum = vpadalq_u16(*sad_sum, abs_diff); +} + +static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride); + + sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]); + sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]); + sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]); + sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]); + + } while (++i < h); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint16x8_t s0, s1; + + s0 = vld1q_u16(src16_ptr + i * src_stride); + sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]); + sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]); + sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]); + sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]); + + s1 =
vld1q_u16(src16_ptr + i * src_stride + 8); + sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]); + sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]); + sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]); + sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void highbd_sadwxhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], int w, + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3; + + s0 = vld1q_u16(src16_ptr + i * src_stride + j); + sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]); + sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]); + sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]); + sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]); + + s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8); + sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]); + sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]); + sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]); + sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]); + + s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16); + sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16), + &sum_lo[0]); + sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16), + &sum_lo[1]); + sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16), + &sum_lo[2]); + sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16), + &sum_lo[3]); + + s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24); + sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24), + &sum_hi[0]); + sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24), + &sum_hi[1]); + sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24), + &sum_hi[2]); + sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24), + &sum_hi[3]); + + j += 32; + } while (j < w); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void highbd_sad64xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64, h); +} + +static INLINE void highbd_sad32xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + 
highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32, h); +} + +#define HBD_SAD_WXH_4D_NEON(w, h) \ + void vpx_highbd_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ + const uint8_t *const ref[4], \ + int ref_stride, uint32_t res[4]) { \ + highbd_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \ + } + +HBD_SAD_WXH_4D_NEON(4, 4) +HBD_SAD_WXH_4D_NEON(4, 8) + +HBD_SAD_WXH_4D_NEON(8, 4) +HBD_SAD_WXH_4D_NEON(8, 8) +HBD_SAD_WXH_4D_NEON(8, 16) + +HBD_SAD_WXH_4D_NEON(16, 8) +HBD_SAD_WXH_4D_NEON(16, 16) +HBD_SAD_WXH_4D_NEON(16, 32) + +HBD_SAD_WXH_4D_NEON(32, 16) +HBD_SAD_WXH_4D_NEON(32, 32) +HBD_SAD_WXH_4D_NEON(32, 64) + +HBD_SAD_WXH_4D_NEON(64, 32) +HBD_SAD_WXH_4D_NEON(64, 64) diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c index ecb52ce5a53d..90971f6009ec 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c @@ -17,209 +17,363 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -static VPX_FORCE_INLINE uint32_t highbd_sad4_neon(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, int width, - int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t highbd_sad4xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 4) { - const uint16x4_t src_u16 = vld1_u16(src16_ptr + j); - const uint16x4_t ref_u16 = vld1_u16(ref16_ptr + j); - sum_abs_diff = vabal_u16(sum_abs_diff, src_u16, ref_u16); - } + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x4_t s = vld1_u16(src16_ptr); + uint16x4_t r = vld1_u16(ref16_ptr); + sum = vabal_u16(sum, s, r); + src16_ptr += src_stride; ref16_ptr += ref_stride; - } + } while (--i != 0); - return horizontal_add_uint32x4(sum_abs_diff); + return horizontal_add_uint32x4(sum); } -static VPX_FORCE_INLINE uint32_t highbd_sad8_neon(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, int width, - int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t highbd_sad8xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 8) { - const uint16x8_t src_u16 = vld1q_u16(src16_ptr + j); - const uint16x8_t ref_u16 = vld1q_u16(ref16_ptr + j); - sum_abs_diff = - vabal_u16(sum_abs_diff, vget_low_u16(src_u16), vget_low_u16(ref_u16)); - sum_abs_diff = vabal_u16(sum_abs_diff, vget_high_u16(src_u16), - vget_high_u16(ref_u16)); - } + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s = vld1q_u16(src16_ptr); + uint16x8_t r = vld1q_u16(ref16_ptr); + uint16x8_t diff = vabdq_u16(s, r); + sum = vpadalq_u16(sum, diff); + src16_ptr += src_stride; ref16_ptr += ref_stride; - } + } while (--i != 0); - return horizontal_add_uint32x4(sum_abs_diff); + return horizontal_add_uint32x4(sum); } -static VPX_FORCE_INLINE uint32_t highbd_sad4_avg_neon( - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, - int ref_stride, const uint8_t *second_pred, int width, int height) { - int i, j; - uint32x4_t 
sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 4) { - const uint16x4_t a_u16 = vld1_u16(src16_ptr + j); - const uint16x4_t b_u16 = vld1_u16(ref16_ptr + j); - const uint16x4_t c_u16 = vld1_u16(pred_ptr + j); - const uint16x4_t avg = vrhadd_u16(b_u16, c_u16); - sum_abs_diff = vabal_u16(sum_abs_diff, a_u16, avg); - } + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint16x8_t s0, s1, r0, r1; + uint16x8_t diff0, diff1; + + s0 = vld1q_u16(src16_ptr); + r0 = vld1q_u16(ref16_ptr); + diff0 = vabdq_u16(s0, r0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + 8); + r1 = vld1q_u16(ref16_ptr + 8); + diff1 = vabdq_u16(s1, r1); + sum[1] = vpadalq_u16(sum[1], diff1); + src16_ptr += src_stride; ref16_ptr += ref_stride; - pred_ptr += width; - } + } while (--i != 0); - return horizontal_add_uint32x4(sum_abs_diff); + sum[0] = vaddq_u32(sum[0], sum[1]); + return horizontal_add_uint32x4(sum[0]); } -static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon( - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, - int ref_stride, const uint8_t *second_pred, int width, int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t highbd_sadwxh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 8) { - const uint16x8_t a_u16 = vld1q_u16(src16_ptr + j); - const uint16x8_t b_u16 = vld1q_u16(ref16_ptr + j); - const uint16x8_t c_u16 = vld1q_u16(pred_ptr + j); - const uint16x8_t avg = vrhaddq_u16(b_u16, c_u16); - sum_abs_diff = - vabal_u16(sum_abs_diff, vget_low_u16(a_u16), vget_low_u16(avg)); - sum_abs_diff = - vabal_u16(sum_abs_diff, vget_high_u16(a_u16), vget_high_u16(avg)); - } + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3; + uint16x8_t diff0, diff1, diff2, diff3; + + s0 = vld1q_u16(src16_ptr + j); + r0 = vld1q_u16(ref16_ptr + j); + diff0 = vabdq_u16(s0, r0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + j + 8); + r1 = vld1q_u16(ref16_ptr + j + 8); + diff1 = vabdq_u16(s1, r1); + sum[1] = vpadalq_u16(sum[1], diff1); + + s2 = vld1q_u16(src16_ptr + j + 16); + r2 = vld1q_u16(ref16_ptr + j + 16); + diff2 = vabdq_u16(s2, r2); + sum[2] = vpadalq_u16(sum[2], diff2); + + s3 = vld1q_u16(src16_ptr + j + 24); + r3 = vld1q_u16(ref16_ptr + j + 24); + diff3 = vabdq_u16(s3, r3); + sum[3] = vpadalq_u16(sum[3], diff3); + + j += 32; + } while (j < w); + src16_ptr += src_stride; ref16_ptr += ref_stride; - pred_ptr += width; - } + } while (--i != 0); - return horizontal_add_uint32x4(sum_abs_diff); + sum[0] = vaddq_u32(sum[0], sum[1]); + sum[2] = vaddq_u32(sum[2], sum[3]); + sum[0] = vaddq_u32(sum[0], sum[2]); + + return horizontal_add_uint32x4(sum[0]); } -#define highbd_sad4MxN(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_neon( \ - const 
uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - return highbd_sad4_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ +static INLINE unsigned int highbd_sad64xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h); +} + +static INLINE unsigned int highbd_sad32xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h); +} + +#define HBD_SAD_WXH_NEON(w, h) \ + unsigned int vpx_highbd_sad##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \ } -#define highbd_sadMxN(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - return highbd_sad8_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ +HBD_SAD_WXH_NEON(4, 4) +HBD_SAD_WXH_NEON(4, 8) + +HBD_SAD_WXH_NEON(8, 4) +HBD_SAD_WXH_NEON(8, 8) +HBD_SAD_WXH_NEON(8, 16) + +HBD_SAD_WXH_NEON(16, 8) +HBD_SAD_WXH_NEON(16, 16) +HBD_SAD_WXH_NEON(16, 32) + +HBD_SAD_WXH_NEON(32, 16) +HBD_SAD_WXH_NEON(32, 32) +HBD_SAD_WXH_NEON(32, 64) + +HBD_SAD_WXH_NEON(64, 32) +HBD_SAD_WXH_NEON(64, 64) + +static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x4_t s = vld1_u16(src16_ptr); + uint16x4_t r = vld1_u16(ref16_ptr); + uint16x4_t p = vld1_u16(pred16_ptr); + + uint16x4_t avg = vrhadd_u16(r, p); + sum = vabal_u16(sum, s, avg); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 4; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); +} + +static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s = vld1q_u16(src16_ptr); + uint16x8_t r = vld1q_u16(ref16_ptr); + uint16x8_t p = vld1q_u16(pred16_ptr); + + uint16x8_t avg = vrhaddq_u16(r, p); + uint16x8_t diff = vabdq_u16(s, avg); + sum = vpadalq_u16(sum, diff); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 8; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); +} + +static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint16x8_t s0, s1, r0, r1, p0, p1; + uint16x8_t avg0, avg1, diff0, diff1; + + s0 = vld1q_u16(src16_ptr); + r0 = vld1q_u16(ref16_ptr); + p0 = 
vld1q_u16(pred16_ptr); + avg0 = vrhaddq_u16(r0, p0); + diff0 = vabdq_u16(s0, avg0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + 8); + r1 = vld1q_u16(ref16_ptr + 8); + p1 = vld1q_u16(pred16_ptr + 8); + avg1 = vrhaddq_u16(r1, p1); + diff1 = vabdq_u16(s1, avg1); + sum[1] = vpadalq_u16(sum[1], diff1); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 16; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + return horizontal_add_uint32x4(sum[0]); +} + +static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3; + uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3; + + s0 = vld1q_u16(src16_ptr + j); + r0 = vld1q_u16(ref16_ptr + j); + p0 = vld1q_u16(pred16_ptr + j); + avg0 = vrhaddq_u16(r0, p0); + diff0 = vabdq_u16(s0, avg0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + j + 8); + r1 = vld1q_u16(ref16_ptr + j + 8); + p1 = vld1q_u16(pred16_ptr + j + 8); + avg1 = vrhaddq_u16(r1, p1); + diff1 = vabdq_u16(s1, avg1); + sum[1] = vpadalq_u16(sum[1], diff1); + + s2 = vld1q_u16(src16_ptr + j + 16); + r2 = vld1q_u16(ref16_ptr + j + 16); + p2 = vld1q_u16(pred16_ptr + j + 16); + avg2 = vrhaddq_u16(r2, p2); + diff2 = vabdq_u16(s2, avg2); + sum[2] = vpadalq_u16(sum[2], diff2); + + s3 = vld1q_u16(src16_ptr + j + 24); + r3 = vld1q_u16(ref16_ptr + j + 24); + p3 = vld1q_u16(pred16_ptr + j + 24); + avg3 = vrhaddq_u16(r3, p3); + diff3 = vabdq_u16(s3, avg3); + sum[3] = vpadalq_u16(sum[3], diff3); + + j += 32; + } while (j < w); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += w; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + sum[2] = vaddq_u32(sum[2], sum[3]); + sum[0] = vaddq_u32(sum[0], sum[2]); + + return horizontal_add_uint32x4(sum[0]); +} + +static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h, + second_pred); +} + +static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h, + second_pred); +} + +#define HBD_SAD_WXH_AVG_NEON(w, h) \ + uint32_t vpx_highbd_sad##w##x##h##_avg_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ } -#define highbd_sad4MxN_avg(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, const uint8_t *second_pred) { \ - return highbd_sad4_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \ - second_pred, m, n); \ - } +HBD_SAD_WXH_AVG_NEON(4, 4) +HBD_SAD_WXH_AVG_NEON(4, 8) -#define highbd_sadMxN_avg(m, n) \ - unsigned int 
vpx_highbd_sad##m##x##n##_avg_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, const uint8_t *second_pred) { \ - return highbd_sad8_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \ - second_pred, m, n); \ - } +HBD_SAD_WXH_AVG_NEON(8, 4) +HBD_SAD_WXH_AVG_NEON(8, 8) +HBD_SAD_WXH_AVG_NEON(8, 16) -#define highbd_sadMxNx4D(m, n) \ - void vpx_highbd_sad##m##x##n##x4d_neon( \ - const uint8_t *src_ptr, int src_stride, \ - const uint8_t *const ref_array[4], int ref_stride, \ - uint32_t sad_array[4]) { \ - int i; \ - for (i = 0; i < 4; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_neon(src_ptr, src_stride, \ - ref_array[i], ref_stride); \ - } \ - } +HBD_SAD_WXH_AVG_NEON(16, 8) +HBD_SAD_WXH_AVG_NEON(16, 16) +HBD_SAD_WXH_AVG_NEON(16, 32) -/* clang-format off */ -// 4x4 -highbd_sad4MxN(4, 4) -highbd_sad4MxN_avg(4, 4) -highbd_sadMxNx4D(4, 4) +HBD_SAD_WXH_AVG_NEON(32, 16) +HBD_SAD_WXH_AVG_NEON(32, 32) +HBD_SAD_WXH_AVG_NEON(32, 64) -// 4x8 -highbd_sad4MxN(4, 8) -highbd_sad4MxN_avg(4, 8) -highbd_sadMxNx4D(4, 8) - -// 8x4 -highbd_sadMxN(8, 4) -highbd_sadMxN_avg(8, 4) -highbd_sadMxNx4D(8, 4) - -// 8x8 -highbd_sadMxN(8, 8) -highbd_sadMxN_avg(8, 8) -highbd_sadMxNx4D(8, 8) - -// 8x16 -highbd_sadMxN(8, 16) -highbd_sadMxN_avg(8, 16) -highbd_sadMxNx4D(8, 16) - -// 16x8 -highbd_sadMxN(16, 8) -highbd_sadMxN_avg(16, 8) -highbd_sadMxNx4D(16, 8) - -// 16x16 -highbd_sadMxN(16, 16) -highbd_sadMxN_avg(16, 16) -highbd_sadMxNx4D(16, 16) - -// 16x32 -highbd_sadMxN(16, 32) -highbd_sadMxN_avg(16, 32) -highbd_sadMxNx4D(16, 32) - -// 32x16 -highbd_sadMxN(32, 16) -highbd_sadMxN_avg(32, 16) -highbd_sadMxNx4D(32, 16) - -// 32x32 -highbd_sadMxN(32, 32) -highbd_sadMxN_avg(32, 32) -highbd_sadMxNx4D(32, 32) - -// 32x64 -highbd_sadMxN(32, 64) -highbd_sadMxN_avg(32, 64) -highbd_sadMxNx4D(32, 64) - -// 64x32 -highbd_sadMxN(64, 32) -highbd_sadMxN_avg(64, 32) -highbd_sadMxNx4D(64, 32) - -// 64x64 -highbd_sadMxN(64, 64) -highbd_sadMxN_avg(64, 64) -highbd_sadMxNx4D(64, 64) - /* clang-format on */ +HBD_SAD_WXH_AVG_NEON(64, 32) +HBD_SAD_WXH_AVG_NEON(64, 64) diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c new file mode 100644 index 000000000000..aa64697458ca --- /dev/null +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -0,0 +1,594 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" + +// The bilinear filters look like this: +// +// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, +// { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }} +// +// We can factor out the highest common factor, such that the sum of both +// weights will be 8 instead of 128.
The benefits of this are two-fold: +// +// 1) We can infer the filter values from the filter_offset parameter in the +// bilinear filter functions below - we don't have to actually load the values +// from memory: +// f0 = 8 - filter_offset +// f1 = filter_offset +// +// 2) Scaling the pixel values by 8, instead of 128 enables us to operate on +// 16-bit data types at all times, rather than widening out to 32-bit and +// requiring double the number of data processing instructions. (12-bit * 8 = +// 15-bit.) + +// Process a block exactly 4 wide and a multiple of 2 high. +static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + uint16x8_t s0 = load_unaligned_u16q(src_ptr, src_stride); + uint16x8_t s1 = load_unaligned_u16q(src_ptr + pixel_step, src_stride); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr, blend); + + src_ptr += 2 * src_stride; + dst_ptr += 8; + i -= 2; + } while (i != 0); +} + +// Process a block which is a multiple of 8 and any height. +static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int src_stride, int pixel_step, + int dst_width, int dst_height, + int filter_offset) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr + j, blend); + + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 8, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 16, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 32, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 64, dst_height, filter_offset); +} + +static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) 
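Restating the arithmetic those helpers perform on each sample in scalar form (f0 + f1 == 8, and vrshrq_n_u16(x, 3) is a rounding shift, i.e. (x + 4) >> 3):

  // One filtered sample under the reduced 8-weight scheme. For 12-bit input
  // the accumulator stays within 16 bits: 4095 * 8 = 32760 < 65536.
  static uint16_t bilinear_sample(uint16_t s0, uint16_t s1, int filter_offset) {
    const uint16_t f0 = (uint16_t)(8 - filter_offset);
    const uint16_t f1 = (uint16_t)filter_offset;
    return (uint16_t)((s0 * f0 + s1 * f1 + 4) >> 3);
  }

For filter_offset == 4 both weights are equal and the blend degenerates to a rounding average, which is why the xoffset == 4 / yoffset == 4 cases are routed to highbd_var_filter_block2d_avg (a single vrhaddq_u16 per vector) instead of the general multiply path.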
+ assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t avg = vrhaddq_u16(s0, s1); + vst1q_u16(dst_ptr + j, avg); + + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \ + w, ref, ref_stride, sse); \ + } + +#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp[w * h]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \ + h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \ + src_stride, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint16_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * (h + padding)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ + (h + padding)); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * (h + padding)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ + (h + padding)); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } else { \ + uint16_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \ + xoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + 
CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } \ + } + +// 4x blocks are processed two rows at a time, so require an extra row of +// padding. + +// 8-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4, 2) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8, 2) + +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64, 1) + +// 10-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4, 2) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8, 2) + +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64, 1) + +// 12-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4, 2) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8, 2) + +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8, 1) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64, 1) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32, 1) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64, 1) + +// Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having +// width 4. +static void highbd_avg_pred_var_filter_block2d_bil_w4( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + uint16x8_t s0 = load_unaligned_u16q(src_ptr, src_stride); + uint16x8_t s1 = load_unaligned_u16q(src_ptr + pixel_step, src_stride); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr, vrhaddq_u16(blend, p)); + + src_ptr += 2 * src_stride; + dst_ptr += 2 * 4; + second_pred += 2 * 4; + i -= 2; + } while (i != 0); +} + +// Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks. 
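The fused helpers that follow apply the same 8-weight blend and then fold in the second predictor with a rounding-halving average; vrhaddq_u16 computes (a + b + 1) >> 1 per lane. A scalar equivalent for one sample, under the same assumptions as the sketch above:

  static uint16_t bilinear_then_avg_pred(uint16_t s0, uint16_t s1,
                                         int filter_offset, uint16_t pred) {
    const uint16_t blend =
        (uint16_t)((s0 * (8 - filter_offset) + s1 * filter_offset + 4) >> 3);
    return (uint16_t)((blend + pred + 1) >> 1);  // vrhaddq_u16 semantics
  }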
+static void highbd_avg_pred_var_filter_block2d_bil_large( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, int filter_offset, + const uint16_t *second_pred) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p)); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_avg_pred_var_filter_block2d_bil_w8( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 8, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w16( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 16, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w32( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 32, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w64( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 64, dst_height, + filter_offset, second_pred); +} + +// Combine averaging subpel filter with vpx_highbd_comp_avg_pred. +static void highbd_avg_pred_var_filter_block2d_avg( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, const uint16_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t avg = vrhaddq_u16(s0, s1); + + uint16x8_t p = vld1q_u16(second_pred); + avg = vrhaddq_u16(avg, p); + + vst1q_u16(dst_ptr + j, avg); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Implementation of vpx_highbd_comp_avg_pred for blocks having width >= 16. +static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, + int src_stride, int dst_width, int dst_height, + const uint16_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) 
+ assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t avg = vrhaddq_u16(s, p); + + vst1q_u16(dst_ptr + j, avg); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ + uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + padding)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + padding), xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } + +#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h, padding) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse, \ + const uint8_t *second_pred) { \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + if (xoffset == 0) { \ + uint16_t tmp[w * h]; \ + if (yoffset == 0) { \ + highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + highbd_avg_pred_var_filter_block2d_avg( \ + src_ptr, tmp, source_stride, source_stride, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else { \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + src_ptr, tmp, source_stride, source_stride, h, yoffset, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint16_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + highbd_avg_pred_var_filter_block2d_avg( \ + src_ptr, tmp0, source_stride, 1, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * (h + padding)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ + (h + padding)); \ + highbd_avg_pred_var_filter_block2d_avg( \ + tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * (h + padding)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ + (h + padding)); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } else { \ + uint16_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + 
highbd_avg_pred_var_filter_block2d_bil_w##w( \ + src_ptr, tmp0, source_stride, 1, h, xoffset, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + highbd_avg_pred_var_filter_block2d_avg( \ + tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ + (h + padding), xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } \ + } + +// 4x blocks are processed two rows at a time, so require an extra row of +// padding. + +// 8-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4, 2) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8, 2) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64, 1) + +// 10-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4, 2) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8, 2) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64, 1) + +// 12-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4, 2) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8, 2) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8, 1) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64, 1) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32, 1) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64, 1) diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c index 96a35af01c20..985cc35682e0 100644 --- 
a/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_variance_neon.c @@ -18,11 +18,6 @@ #include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -static const uint8_t bilinear_filters[8][2] = { - { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, -}; - static INLINE void highbd_variance16(const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, int ref_stride, int w, int h, uint64_t *sse, @@ -136,7 +131,7 @@ static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); } -#define HIGHBD_VAR(W, H) \ +#define HBD_VARIANCE_WXH_NEON(W, H) \ uint32_t vpx_highbd_8_variance##W##x##H##_neon( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, uint32_t *sse) { \ @@ -218,274 +213,19 @@ static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, return *sse; \ } -static INLINE void highbd_var_filter_block2d_bil_first_pass( - const uint8_t *src_ptr8, uint16_t *output_ptr, - unsigned int src_pixels_per_line, int pixel_step, - unsigned int output_height, unsigned int output_width, - const uint8_t *filter) { - uint32_t i, j; - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); - - uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1); - uint16x4_t filter1_u16 = vdup_n_u16(filter[0]); - uint16x4_t filter2_u16 = vdup_n_u16(filter[1]); - - if (output_width >= 8) { - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 8) { - const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); - const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); - uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); - uint16x4_t out1_u16; - uint16x4_t out2_u16; - sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); - sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); - out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); - out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS); - vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16)); - } - // Next row... - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } else { - assert(output_width >= 4); - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 4) { - const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); - const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16); - uint16x4_t out_u16; - sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); - out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); - vst1_u16(&output_ptr[j], out_u16); - } - // Next row... 
- src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } -} - -static INLINE void highbd_var_filter_block2d_bil_second_pass( - const uint16_t *src_ptr, uint16_t *output_ptr, - unsigned int src_pixels_per_line, unsigned int pixel_step, - unsigned int output_height, unsigned int output_width, - const uint8_t *filter) { - uint32_t i, j; - - uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1); - uint16x4_t filter1_u16 = vdup_n_u16(filter[0]); - uint16x4_t filter2_u16 = vdup_n_u16(filter[1]); - - if (output_width >= 8) { - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 8) { - const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); - const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); - uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); - uint16x4_t out1_u16; - uint16x4_t out2_u16; - sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); - sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); - out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); - out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS); - vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16)); - } - // Next row... - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } else { - assert(output_width >= 4); - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 4) { - const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); - const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16); - uint16x4_t out_u16; - sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); - out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); - vst1_u16(&output_ptr[j], out_u16); - } - // Next row... 
- src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } -} - -#define HIGHBD_SUBPIX_VAR(W, H) \ - uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp2), W, \ - ref_ptr, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - return vpx_highbd_10_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - return vpx_highbd_12_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \ - } - -#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ - uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ - H, temp2, W); \ - \ - return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp3), W, \ - ref_ptr, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ - H, temp2, W); \ - \ - return vpx_highbd_10_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, 
ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ - H, temp2, W); \ - \ - return vpx_highbd_12_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \ - } - -void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, - int width, int height, const uint16_t *ref, - int ref_stride) { - int i, j; - uint32x4_t one_u32 = vdupq_n_u32(1); - if (width >= 8) { - for (i = 0; i < height; ++i) { - for (j = 0; j < width; j += 8) { - const uint16x8_t pred_u16 = vld1q_u16(&pred[j]); - const uint16x8_t ref_u16 = vld1q_u16(&ref[j]); - const uint32x4_t sum1_u32 = - vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16)); - const uint32x4_t sum2_u32 = - vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16)); - const uint16x4_t sum1_u16 = - vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1); - const uint16x4_t sum2_u16 = - vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1); - const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16); - vst1q_u16(&comp_pred[j], vcomp_pred); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } - } else { - assert(width >= 4); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; j += 4) { - const uint16x4_t pred_u16 = vld1_u16(&pred[j]); - const uint16x4_t ref_u16 = vld1_u16(&ref[j]); - const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16); - const uint16x4_t vcomp_pred = - vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1); - vst1_u16(&comp_pred[j], vcomp_pred); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } - } -} - -/* All three forms of the variance are available in the same sizes. 
*/ -#define HIGHBD_VARIANCES(W, H) \ - HIGHBD_VAR(W, H) \ - HIGHBD_SUBPIX_VAR(W, H) \ - HIGHBD_SUBPIX_AVG_VAR(W, H) - -HIGHBD_VARIANCES(64, 64) -HIGHBD_VARIANCES(64, 32) -HIGHBD_VARIANCES(32, 64) -HIGHBD_VARIANCES(32, 32) -HIGHBD_VARIANCES(32, 16) -HIGHBD_VARIANCES(16, 32) -HIGHBD_VARIANCES(16, 16) -HIGHBD_VARIANCES(16, 8) -HIGHBD_VARIANCES(8, 16) -HIGHBD_VARIANCES(8, 8) -HIGHBD_VARIANCES(8, 4) -HIGHBD_VARIANCES(4, 8) -HIGHBD_VARIANCES(4, 4) +HBD_VARIANCE_WXH_NEON(64, 64) +HBD_VARIANCE_WXH_NEON(64, 32) +HBD_VARIANCE_WXH_NEON(32, 64) +HBD_VARIANCE_WXH_NEON(32, 32) +HBD_VARIANCE_WXH_NEON(32, 16) +HBD_VARIANCE_WXH_NEON(16, 32) +HBD_VARIANCE_WXH_NEON(16, 16) +HBD_VARIANCE_WXH_NEON(16, 8) +HBD_VARIANCE_WXH_NEON(8, 16) +HBD_VARIANCE_WXH_NEON(8, 8) +HBD_VARIANCE_WXH_NEON(8, 4) +HBD_VARIANCE_WXH_NEON(4, 8) +HBD_VARIANCE_WXH_NEON(4, 4) HIGHBD_GET_VAR(8) HIGHBD_GET_VAR(16) diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c index 9d2752e097be..7751082083f3 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c @@ -26,76 +26,88 @@ void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, (void)bd; if (w < 8) { // copy4 + uint16x4_t s0, s1; do { - vst1_u16(dst, vld1_u16(src)); + s0 = vld1_u16(src); src += src_stride; + s1 = vld1_u16(src); + src += src_stride; + + vst1_u16(dst, s0); dst += dst_stride; - vst1_u16(dst, vld1_u16(src)); - src += src_stride; + vst1_u16(dst, s1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w == 8) { // copy8 + uint16x8_t s0, s1; do { - vst1q_u16(dst, vld1q_u16(src)); + s0 = vld1q_u16(src); src += src_stride; + s1 = vld1q_u16(src); + src += src_stride; + + vst1q_u16(dst, s0); dst += dst_stride; - vst1q_u16(dst, vld1q_u16(src)); - src += src_stride; + vst1q_u16(dst, s1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w < 32) { // copy16 + uint16x8_t s0, s1, s2, s3; do { - vst2q_u16(dst, vld2q_u16(src)); + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); src += src_stride; - dst += dst_stride; - vst2q_u16(dst, vld2q_u16(src)); + s2 = vld1q_u16(src); + s3 = vld1q_u16(src + 8); src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); dst += dst_stride; - vst2q_u16(dst, vld2q_u16(src)); - src += src_stride; + vst1q_u16(dst, s2); + vst1q_u16(dst + 8, s3); dst += dst_stride; - vst2q_u16(dst, vld2q_u16(src)); - src += src_stride; - dst += dst_stride; - h -= 4; - } while (h > 0); + h -= 2; + } while (h != 0); } else if (w == 32) { // copy32 + uint16x8_t s0, s1, s2, s3; do { - vst4q_u16(dst, vld4q_u16(src)); + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); + vst1q_u16(dst + 16, s2); + vst1q_u16(dst + 24, s3); dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - src += src_stride; - dst += dst_stride; - h -= 4; - } while (h > 0); + } while (--h != 0); } else { // copy64 + uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7; do { - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + s4 = vld1q_u16(src + 32); + s5 = 
vld1q_u16(src + 40); + s6 = vld1q_u16(src + 48); + s7 = vld1q_u16(src + 56); src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); + vst1q_u16(dst + 16, s2); + vst1q_u16(dst + 24, s3); + vst1q_u16(dst + 32, s4); + vst1q_u16(dst + 40, s5); + vst1q_u16(dst + 48, s6); + vst1q_u16(dst + 56, s7); dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); - src += src_stride; - dst += dst_stride; - h -= 4; - } while (h > 0); + } while (--h != 0); } } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h index 19cfc7c7f22b..866be7439e2d 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h +++ b/media/libvpx/libvpx/vpx_dsp/arm/mem_neon.h @@ -126,6 +126,20 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, return vreinterpret_u8_u32(a_u32); } +// Load 2 sets of 8 bytes when alignment is not guaranteed. +static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf, + ptrdiff_t stride) { + uint64_t a; + uint64x2_t a_u64; + if (stride == 4) return vld1q_u16(buf); + memcpy(&a, buf, 8); + buf += stride; + a_u64 = vdupq_n_u64(a); + memcpy(&a, buf, 8); + a_u64 = vsetq_lane_u64(a, a_u64, 1); + return vreinterpretq_u16_u64(a_u64); +} + // Store 2 sets of 4 bytes when alignment is not guaranteed. static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c index 5fc621aee186..95095739394f 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c @@ -17,633 +17,316 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0, - const void *const buf1) { - uint32_t a; - uint32x2_t aa; - memcpy(&a, buf0, 4); - aa = vdup_n_u32(a); - memcpy(&a, buf1, 4); - aa = vset_lane_u32(a, aa, 1); - return vreinterpret_u8_u32(aa); +#if defined(__ARM_FEATURE_DOTPROD) + +static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, + uint32x4_t *const sad_sum) { + uint8x16_t abs_diff = vabdq_u8(src, ref); + *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1)); } -static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride, - const uint8_t *const ref_array[4], - const int ref_stride, const int height, - uint32_t sad_array[4]) { - int i; - uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; -#if !defined(__aarch64__) - uint16x4_t a[2]; -#endif - uint32x4_t r; +static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; - assert(!((intptr_t)src_ptr % sizeof(uint32_t))); - assert(!(src_stride % sizeof(uint32_t))); + int i = 0; + do { + uint8x16_t s0, s1, s2, s3; - for (i = 0; i < height; ++i) { - const uint8x8_t s = vreinterpret_u8_u32( - vld1_dup_u32((const uint32_t *)(src_ptr + i * src_stride))); - const uint8x8_t ref01 = load_unaligned_2_buffers( - ref_array[0] + i * ref_stride, ref_array[1] + 
i * ref_stride); - const uint8x8_t ref23 = load_unaligned_2_buffers( - ref_array[2] + i * ref_stride, ref_array[3] + i * ref_stride); - abs[0] = vabal_u8(abs[0], s, ref01); - abs[1] = vabal_u8(abs[1], s, ref23); - } + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); -#if defined(__aarch64__) - abs[0] = vpaddq_u16(abs[0], abs[1]); - r = vpaddlq_u16(abs[0]); -#else - a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0])); - a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1])); - r = vpaddlq_u16(vcombine_u16(a[0], a[1])); -#endif - vst1q_u32(sad_array, r); + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + s2 = vld1q_u8(src + i * src_stride + 32); + sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); + sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); + sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); + sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); + + s3 = vld1q_u8(src + i * src_stride + 48); + sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); + sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); + sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); + sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); + + i++; + } while (i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } -void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, sad_array); +static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint8x16_t s0, s1; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + i++; + } while (i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, 
horizontal_add_4d_uint32x4(sum)); } -void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, sad_array); -} - -//////////////////////////////////////////////////////////////////////////////// - -// Can handle 512 pixels' sad sum (such as 16x32 or 32x16) -static INLINE void sad_512_pel_final_neon(const uint16x8_t sum[4], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); - const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); - const uint16x8_t b0 = vpaddq_u16(a0, a1); - const uint32x4_t r = vpaddlq_u16(b0); -#else - const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); - const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); - const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); - const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); - const uint16x4_t b0 = vpadd_u16(a0, a1); - const uint16x4_t b1 = vpadd_u16(a2, a3); - const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1)); -#endif - vst1q_u32(sad_array, r); -} - -#if defined(__arm__) || !defined(__ARM_FEATURE_DOTPROD) - -// Can handle 1024 pixels' sad sum (such as 32x32) -static INLINE void sad_1024_pel_final_neon(const uint16x8_t sum[4], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); - const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); - const uint32x4_t b0 = vpaddlq_u16(a0); - const uint32x4_t b1 = vpaddlq_u16(a1); - const uint32x4_t r = vpaddq_u32(b0, b1); - vst1q_u32(sad_array, r); -#else - const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); - const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); - const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); - const uint16x4_t a3 = vpadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); - const uint32x4_t b0 = vpaddlq_u16(vcombine_u16(a0, a1)); - const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3)); - const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0)); - const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1)); - vst1q_u32(sad_array, vcombine_u32(c0, c1)); -#endif -} - -// Can handle 2048 pixels' sad sum (such as 32x64 or 64x32) -static INLINE void sad_2048_pel_final_neon(const uint16x8_t sum[4], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x4_t b0 = vpaddq_u32(a0, a1); - const uint32x4_t b1 = vpaddq_u32(a2, a3); - const uint32x4_t r = vpaddq_u32(b0, b1); - vst1q_u32(sad_array, r); -#else - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0)); - const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1)); - const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2)); - const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3)); - const uint32x2_t c0 = vpadd_u32(b0, b1); - const uint32x2_t c1 = vpadd_u32(b2, b3); - vst1q_u32(sad_array, vcombine_u32(c0, c1)); -#endif -} - -// Can handle 4096 pixels' sad sum (such as 64x64) -static 
INLINE void sad_4096_pel_final_neon(const uint16x8_t sum[8], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x4_t a4 = vpaddlq_u16(sum[4]); - const uint32x4_t a5 = vpaddlq_u16(sum[5]); - const uint32x4_t a6 = vpaddlq_u16(sum[6]); - const uint32x4_t a7 = vpaddlq_u16(sum[7]); - const uint32x4_t b0 = vaddq_u32(a0, a1); - const uint32x4_t b1 = vaddq_u32(a2, a3); - const uint32x4_t b2 = vaddq_u32(a4, a5); - const uint32x4_t b3 = vaddq_u32(a6, a7); - const uint32x4_t c0 = vpaddq_u32(b0, b1); - const uint32x4_t c1 = vpaddq_u32(b2, b3); - const uint32x4_t r = vpaddq_u32(c0, c1); - vst1q_u32(sad_array, r); -#else - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x4_t a4 = vpaddlq_u16(sum[4]); - const uint32x4_t a5 = vpaddlq_u16(sum[5]); - const uint32x4_t a6 = vpaddlq_u16(sum[6]); - const uint32x4_t a7 = vpaddlq_u16(sum[7]); - const uint32x4_t b0 = vaddq_u32(a0, a1); - const uint32x4_t b1 = vaddq_u32(a2, a3); - const uint32x4_t b2 = vaddq_u32(a4, a5); - const uint32x4_t b3 = vaddq_u32(a6, a7); - const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0)); - const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1)); - const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2)); - const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); - const uint32x2_t d0 = vpadd_u32(c0, c1); - const uint32x2_t d1 = vpadd_u32(c2, c3); - vst1q_u32(sad_array, vcombine_u32(d0, d1)); -#endif -} - -#endif - -static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i, j; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - - for (i = 0; i < height; ++i) { - const uint8x8_t s = vld1_u8(src_ptr); - src_ptr += src_stride; - for (j = 0; j < 4; ++j) { - const uint8x8_t b_u8 = vld1_u8(ref_loop[j]); - ref_loop[j] += ref_stride; - sum[j] = vabal_u8(sum[j], s, b_u8); - } - } - - sad_512_pel_final_neon(sum, sad_array); -} - -void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 4); -} - -void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8); -} - -void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); -} - -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, - uint32x4_t *const sum) { - const uint8x16_t r = vld1q_u8(ref_ptr); - const uint8x16_t diff = vabdq_u8(src_ptr, r); - *sum = vdotq_u32(*sum, diff, vdupq_n_u8(1)); -} - -static INLINE void sad16x_4d(const uint8_t 
*src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i; - uint32x4_t r0, r1; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; +static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) }; - for (i = 0; i < height; ++i) { - const uint8x16_t s = vld1q_u8(src_ptr + i * src_stride); - sad16_neon(ref_loop[0] + i * ref_stride, s, &sum[0]); - sad16_neon(ref_loop[1] + i * ref_stride, s, &sum[1]); - sad16_neon(ref_loop[2] + i * ref_stride, s, &sum[2]); - sad16_neon(ref_loop[3] + i * ref_stride, s, &sum[3]); - } + int i = 0; + do { + const uint8x16_t s = vld1q_u8(src + i * src_stride); + sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]); + sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]); - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); + i++; + } while (i < h); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) +#else // !defined(__ARM_FEATURE_DOTPROD)) -static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, - uint16x8_t *const sum) { - const uint8x16_t r = vld1q_u8(ref_ptr); - *sum = vabal_u8(*sum, vget_low_u8(src_ptr), vget_low_u8(r)); - *sum = vabal_u8(*sum, vget_high_u8(src_ptr), vget_high_u8(r)); +static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, + uint16x8_t *const sad_sum) { + uint8x16_t abs_diff = vabdq_u8(src, ref); + *sad_sum = vpadalq_u8(*sad_sum, abs_diff); } -static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; +static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + int h_tmp = h > 64 ? 
64 : h; + int i = 0; + vst1q_u32(res, vdupq_n_u32(0)); + + do { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + do { + uint8x16_t s0, s1, s2, s3; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + s2 = vld1q_u8(src + i * src_stride + 32); + sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); + sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); + sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); + sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); + + s3 = vld1q_u8(src + i * src_stride + 48); + sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); + sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); + sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); + sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); + + i++; + } while (i < h_tmp); + + res[0] += horizontal_long_add_uint16x8(sum_lo[0], sum_hi[0]); + res[1] += horizontal_long_add_uint16x8(sum_lo[1], sum_hi[1]); + res[2] += horizontal_long_add_uint16x8(sum_lo[2], sum_hi[2]); + res[3] += horizontal_long_add_uint16x8(sum_lo[3], sum_hi[3]); + + h_tmp += 64; + } while (i < h); +} + +static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int i = 0; + do { + uint8x16_t s0, s1; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + i++; + } while (i < h); + + res[0] = horizontal_long_add_uint16x8(sum_lo[0], sum_hi[0]); + res[1] = horizontal_long_add_uint16x8(sum_lo[1], sum_hi[1]); + res[2] = horizontal_long_add_uint16x8(sum_lo[2], sum_hi[2]); + res[3] = horizontal_long_add_uint16x8(sum_lo[3], sum_hi[3]); +} + +static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; - for (i = 0; i < height; ++i) { - const uint8x16_t s = vld1q_u8(src_ptr); - src_ptr += src_stride; - /* Manual unrolling 
here stops the compiler from getting confused. */ - sad16_neon(ref_loop[0], s, &sum[0]); - ref_loop[0] += ref_stride; - sad16_neon(ref_loop[1], s, &sum[1]); - ref_loop[1] += ref_stride; - sad16_neon(ref_loop[2], s, &sum[2]); - ref_loop[2] += ref_stride; - sad16_neon(ref_loop[3], s, &sum[3]); - ref_loop[3] += ref_stride; - } + int i = 0; + do { + const uint8x16_t s = vld1q_u8(src + i * src_stride); + sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]); + sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]); - sad_512_pel_final_neon(sum, sad_array); + i++; + } while (i < h); + + res[0] = horizontal_add_uint16x8(sum[0]); + res[1] = horizontal_add_uint16x8(sum[1]); + res[2] = horizontal_add_uint16x8(sum[2]); + res[3] = horizontal_add_uint16x8(sum[3]); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // defined(__ARM_FEATURE_DOTPROD) -void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8); +static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref, + uint16x8_t *const sad_sum) { + uint8x8_t abs_diff = vabd_u8(src, ref); + *sad_sum = vaddw_u8(*sad_sum, abs_diff); } -void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); -} - -void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32); -} - -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i; - uint32x4_t r0, r1; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - - for (i = 0; i < height; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } - - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); -} - -void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); -} - -void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad32x_4d(src_ptr, src_stride, 
ref_array, ref_stride, sad_array, 32); -} - -void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64); -} - -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) - -static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - const int height, uint16x8_t *const sum) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - - sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } -} - -void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - uint16x8_t sum[4]; - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum); - sad_512_pel_final_neon(sum, sad_array); -} - -void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - uint16x8_t sum[4]; - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum); - sad_1024_pel_final_neon(sum, sad_array); -} - -void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - uint16x8_t sum[4]; - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum); - sad_2048_pel_final_neon(sum, sad_array); -} - -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - uint32x4_t r0, r1; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - - for (i = 0; i < 32; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, 
&sum[0]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } - - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); -} - -void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - uint32x4_t r0, r1, r2, r3; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint32x4_t sum[8] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0), vdupq_n_u32(0) }; - - for (i = 0; i < 64; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]); - - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } - - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - r2 = vpaddq_u32(sum[4], sum[5]); - r3 = vpaddq_u32(sum[6], sum[7]); - r0 = vpaddq_u32(r0, r1); - r1 = vpaddq_u32(r2, r3); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); -} - -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) - -void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; +static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; - for (i = 0; i < 32; ++i) { - uint8x16_t s; + int i = 0; + do { + const uint8x8_t s = vld1_u8(src + i * src_stride); + sad8_neon(s, vld1_u8(ref[0] + i * ref_stride), &sum[0]); + sad8_neon(s, vld1_u8(ref[1] + i * ref_stride), &sum[1]); + sad8_neon(s, vld1_u8(ref[2] + i * ref_stride), &sum[2]); + sad8_neon(s, vld1_u8(ref[3] + i * ref_stride), &sum[3]); - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); + i++; + } while (i < h); - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - 
sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } - - sad_2048_pel_final_neon(sum, sad_array); + res[0] = horizontal_add_uint16x8(sum[0]); + res[1] = horizontal_add_uint16x8(sum[1]); + res[2] = horizontal_add_uint16x8(sum[2]); + res[3] = horizontal_add_uint16x8(sum[3]); } -void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0), vdupq_n_u16(0) }; +static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; - for (i = 0; i < 64; ++i) { - uint8x16_t s; + int i = 0; + do { + uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride); + uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride); + uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride); + uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride); + uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride); - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]); + sad8_neon(s, r0, &sum[0]); + sad8_neon(s, r1, &sum[1]); + sad8_neon(s, r2, &sum[2]); + sad8_neon(s, r3, &sum[3]); - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]); + i += 2; + } while (i < h); - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } - - sad_4096_pel_final_neon(sum, sad_array); + res[0] = horizontal_add_uint16x8(sum[0]); + res[1] = horizontal_add_uint16x8(sum[1]); + res[2] = horizontal_add_uint16x8(sum[2]); + res[3] = horizontal_add_uint16x8(sum[3]); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#define SAD_WXH_4D_NEON(w, h) \ + void vpx_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \ + const uint8_t 
*const ref[4], int ref_stride, \ + uint32_t res[4]) { \ + sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \ + } + +SAD_WXH_4D_NEON(4, 4) +SAD_WXH_4D_NEON(4, 8) + +SAD_WXH_4D_NEON(8, 4) +SAD_WXH_4D_NEON(8, 8) +SAD_WXH_4D_NEON(8, 16) + +SAD_WXH_4D_NEON(16, 8) +SAD_WXH_4D_NEON(16, 16) +SAD_WXH_4D_NEON(16, 32) + +SAD_WXH_4D_NEON(32, 16) +SAD_WXH_4D_NEON(32, 32) +SAD_WXH_4D_NEON(32, 64) + +SAD_WXH_4D_NEON(64, 32) +SAD_WXH_4D_NEON(64, 64) + +#undef SAD_WXH_4D_NEON diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c index 7336edb69403..9382b806261d 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/sad_neon.c @@ -214,24 +214,13 @@ static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride, int i = h / 2; do { - uint32x2_t s, r; - uint32_t s0, s1, r0, r1; + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); - memcpy(&s0, src_ptr, 4); - memcpy(&r0, ref_ptr, 4); - s = vdup_n_u32(s0); - r = vdup_n_u32(r0); - src_ptr += src_stride; - ref_ptr += ref_stride; + sum = vabal_u8(sum, s, r); - memcpy(&s1, src_ptr, 4); - memcpy(&r1, ref_ptr, 4); - s = vset_lane_u32(s1, s, 1); - r = vset_lane_u32(r1, r, 1); - src_ptr += src_stride; - ref_ptr += ref_stride; - - sum = vabal_u8(sum, vreinterpret_u8_u32(s), vreinterpret_u8_u32(r)); + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; } while (--i != 0); return horizontal_add_uint16x8(sum); @@ -509,28 +498,15 @@ static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr, int i = h / 2; do { - uint32x2_t s, r; - uint32_t s0, s1, r0, r1; - uint8x8_t p, avg; + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + uint8x8_t p = vld1_u8(second_pred); - memcpy(&s0, src_ptr, 4); - memcpy(&r0, ref_ptr, 4); - s = vdup_n_u32(s0); - r = vdup_n_u32(r0); - src_ptr += src_stride; - ref_ptr += ref_stride; + uint8x8_t avg = vrhadd_u8(r, p); + sum = vabal_u8(sum, s, avg); - memcpy(&s1, src_ptr, 4); - memcpy(&r1, ref_ptr, 4); - s = vset_lane_u32(s1, s, 1); - r = vset_lane_u32(r1, r, 1); - src_ptr += src_stride; - ref_ptr += ref_stride; - - p = vld1_u8(second_pred); - avg = vrhadd_u8(vreinterpret_u8_u32(r), p); - - sum = vabal_u8(sum, vreinterpret_u8_u32(s), avg); + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; second_pred += 8; } while (--i != 0); diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h index 9a7c424e8e4a..21560837ae2e 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h +++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h @@ -40,6 +40,23 @@ static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { #endif } +static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { +#if defined(__aarch64__) + return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi); +#else + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +#endif +} + static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) { #if defined(__aarch64__) return vaddv_s32(a); @@ -77,4 +94,20 @@ static 
INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) { return vget_lane_u32(c, 0); #endif } + +static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) { +#if defined(__aarch64__) + uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]); + uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]); + return vpaddq_u32(res01, res23); +#else + uint32x4_t res = vdupq_n_u32(0); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3); + return res; +#endif +} + #endif // VPX_VPX_DSP_ARM_SUM_NEON_H_ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h index 6c0bd08f77e3..48292c693633 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h +++ b/media/libvpx/libvpx/vpx_dsp/arm/transpose_neon.h @@ -23,10 +23,17 @@ // b0.val[1]: 04 05 06 07 20 21 22 23 static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { int16x8x2_t b0; +#if defined(__aarch64__) + b0.val[0] = vreinterpretq_s16_s64( + vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); + b0.val[1] = vreinterpretq_s16_s64( + vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); +#else b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), vreinterpret_s16_s32(vget_low_s32(a1))); b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)), vreinterpret_s16_s32(vget_high_s32(a1))); +#endif return b0; } @@ -57,10 +64,17 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) { static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { uint16x8x2_t b0; +#if defined(__aarch64__) + b0.val[0] = vreinterpretq_u16_u64( + vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); + b0.val[1] = vreinterpretq_u16_u64( + vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); +#else b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), vreinterpret_u16_u32(vget_low_u32(a1))); b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)), vreinterpret_u16_u32(vget_high_u32(a1))); +#endif return b0; } @@ -569,37 +583,73 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, } // Transpose 8x8 to a new location. -static INLINE void transpose_s16_8x8_new(const int16x8_t *a, int16x8_t *b) { - // Swap 16 bit elements. - const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); - const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); - const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); - const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); +static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 - // Swap 32 bit elements. 
- const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), - vreinterpretq_s32_s16(c1.val[0])); - const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), - vreinterpretq_s32_s16(c1.val[1])); - const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), - vreinterpretq_s32_s16(c3.val[0])); - const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), - vreinterpretq_s32_s16(c3.val[1])); + const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]); - // Swap 64 bit elements - const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); - const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); - const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); - const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 - b[0] = e0.val[0]; - b[1] = e1.val[0]; - b[2] = e2.val[0]; - b[3] = e3.val[0]; - b[4] = e0.val[1]; - b[5] = e1.val[1]; - b[6] = e2.val[1]; - b[7] = e3.val[1]; + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); + + out[0] = d0.val[0]; + out[1] = d1.val[0]; + out[2] = d2.val[0]; + out[3] = d3.val[0]; + out[4] = d0.val[1]; + out[5] = d1.val[1]; + out[6] = d2.val[1]; + out[7] = d3.val[1]; } static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, @@ -658,6 +708,7 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); @@ -729,6 +780,7 @@ static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1, // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 + const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]); const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]); const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], 
c2.val[1]); diff --git a/media/libvpx/libvpx/vpx_dsp/psnr.c b/media/libvpx/libvpx/vpx_dsp/psnr.c index 48bac04508f5..f0d4e927ae3c 100644 --- a/media/libvpx/libvpx/vpx_dsp/psnr.c +++ b/media/libvpx/libvpx/vpx_dsp/psnr.c @@ -26,57 +26,44 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) { /* TODO(yaowu): The block_variance calls the unoptimized versions of variance() * and highbd_8_variance(). It should not. */ -static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b, - int b_stride, int w, int h, unsigned int *sse, - int *sum) { +static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int w, int h) { int i, j; - - *sum = 0; - *sse = 0; + int64_t sse = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; + sse += diff * diff; } a += a_stride; b += b_stride; } + + return sse; } #if CONFIG_VP9_HIGHBITDEPTH -static void encoder_highbd_variance64(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h, uint64_t *sse, int64_t *sum) { +static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h) { int i, j; + int64_t sse = 0; uint16_t *a = CONVERT_TO_SHORTPTR(a8); uint16_t *b = CONVERT_TO_SHORTPTR(b8); - *sum = 0; - *sse = 0; for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { const int diff = a[j] - b[j]; - *sum += diff; - *sse += diff * diff; + sse += diff * diff; } a += a_stride; b += b_stride; } -} -static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h, unsigned int *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, - &sum_long); - *sse = (unsigned int)sse_long; - *sum = (int)sum_long; + return sse; } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -85,26 +72,23 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, const int dw = width % 16; const int dh = height % 16; int64_t total_sse = 0; - unsigned int sse = 0; - int sum = 0; int x, y; if (dw > 0) { - encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw, - height, &sse, &sum); - total_sse += sse; + total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride, + dw, height); } if (dh > 0) { - encoder_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, width - dw, dh, - &sse, &sum); - total_sse += sse; + total_sse += + encoder_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; + unsigned int sse; for (x = 0; x < width / 16; ++x) { vpx_mse16x16(pa, a_stride, pb, b_stride, &sse); total_sse += sse; @@ -146,22 +130,19 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, int x, y; const int dw = width % 16; const int dh = height % 16; - unsigned int sse = 0; - int sum = 0; if (dw > 0) { - encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], - b_stride, dw, height, &sse, &sum); - total_sse += sse; + total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw], + b_stride, dw, height); } if (dh > 0) { - encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh, &sse, &sum); - total_sse += sse; + total_sse += 
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
index c83e52538073..1d72ee5125ae 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk
@@ -392,6 +392,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad4d_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c
 DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c
 DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c
@@ -432,7 +433,9 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c
 endif  # CONFIG_VP9_HIGHBITDEPTH
 endif  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk.orig b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk.orig
index 1fd9495cf99c..5535f82c076d 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk.orig
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp.mk.orig
@@ -392,6 +392,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad4d_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c
 DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c
 DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c
@@ -432,7 +433,9 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c
 DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c
 endif  # CONFIG_VP9_HIGHBITDEPTH
 endif  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/media/libvpx/libvpx/y4minput.c b/media/libvpx/libvpx/y4minput.c
index 745e2f1cd64c..210ce52fce0c 100644
--- a/media/libvpx/libvpx/y4minput.c
+++ b/media/libvpx/libvpx/y4minput.c
@@ -1148,6 +1148,7 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *_img) {
   _img->fmt = _y4m->vpx_fmt;
   _img->w = _img->d_w = _y4m->pic_w;
   _img->h = _img->d_h = _y4m->pic_h;
+  _img->bit_depth = _y4m->bit_depth;
   _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1;
   _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1;
   _img->bps = _y4m->bps;
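The one-line y4minput.c fix propagates the bit depth parsed from the Y4M header (for example C420p10 for 10-bit 4:2:0) onto the vpx_image_t, so downstream consumers can see the correct depth instead of assuming 8-bit. Bit depth also determines the peak value used when SSE is turned into PSNR by vpx_sse_to_psnr in the psnr.c file patched above. A hedged sketch of that conversion, where the 100 dB cap is assumed to match libvpx's MAX_PSNR:

    #include <math.h>

    /* PSNR in dB from total squared error: peak is (1 << bit_depth) - 1 and
     * samples is the number of pixels compared. The 100 dB cap is assumed to
     * match libvpx's MAX_PSNR; identical frames (sse == 0) return the cap. */
    static double sse_to_psnr(double samples, double peak, double sse) {
      const double max_psnr = 100.0;
      if (sse <= 0.0) return max_psnr;
      {
        const double psnr = 10.0 * log10(samples * peak * peak / sse);
        return psnr > max_psnr ? max_psnr : psnr;
      }
    }

    /* e.g. sse_to_psnr(1920.0 * 1080.0, 255.0, sse) for an 8-bit luma plane,
     * or peak = 1023.0 for the 10-bit case the y4m fix above enables. */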
diff --git a/media/libvpx/moz.yaml b/media/libvpx/moz.yaml
index ccd3483ccbcf..28cda8f6dcd6 100644
--- a/media/libvpx/moz.yaml
+++ b/media/libvpx/moz.yaml
@@ -20,11 +20,11 @@ origin:
 
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: 5c38ffbfa3aba5ea4d8d0ae05a50cc76ec99bed9 (Thu Jan 26 21:31:14 2023).
+  release: bc2965ff72af7d7b21ffeab10549fcc67ed66ccf (Tue Feb 14 02:46:51 2023).
 
   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: 5c38ffbfa3aba5ea4d8d0ae05a50cc76ec99bed9
+  revision: bc2965ff72af7d7b21ffeab10549fcc67ed66ccf
 
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
diff --git a/media/libvpx/sources.mozbuild b/media/libvpx/sources.mozbuild
index 796edd2c04ee..9673ac0471e1 100644
--- a/media/libvpx/sources.mozbuild
+++ b/media/libvpx/sources.mozbuild
@@ -181,6 +181,7 @@ files = {
         'libvpx/vp9/encoder/vp9_svc_layercontext.c',
         'libvpx/vp9/encoder/vp9_temporal_filter.c',
         'libvpx/vp9/encoder/vp9_tokenize.c',
+        'libvpx/vp9/encoder/vp9_tpl_model.c',
         'libvpx/vp9/encoder/vp9_treewriter.c',
         'libvpx/vp9/encoder/x86/temporal_filter_sse4.c',
         'libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c',
@@ -450,6 +451,7 @@ files = {
         'libvpx/vp9/encoder/vp9_svc_layercontext.c',
         'libvpx/vp9/encoder/vp9_temporal_filter.c',
         'libvpx/vp9/encoder/vp9_tokenize.c',
+        'libvpx/vp9/encoder/vp9_tpl_model.c',
         'libvpx/vp9/encoder/vp9_treewriter.c',
         'libvpx/vp9/encoder/x86/temporal_filter_sse4.c',
         'libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c',
@@ -699,6 +701,7 @@ files = {
         'libvpx/vp9/encoder/vp9_subexp.c',
         'libvpx/vp9/encoder/vp9_svc_layercontext.c',
         'libvpx/vp9/encoder/vp9_tokenize.c',
+        'libvpx/vp9/encoder/vp9_tpl_model.c',
         'libvpx/vp9/encoder/vp9_treewriter.c',
         'libvpx/vp9/vp9_cx_iface.c',
         'libvpx/vp9/vp9_dx_iface.c',
@@ -944,6 +947,7 @@ files = {
         'libvpx/vp9/encoder/vp9_subexp.c',
         'libvpx/vp9/encoder/vp9_svc_layercontext.c',
         'libvpx/vp9/encoder/vp9_tokenize.c',
+        'libvpx/vp9/encoder/vp9_tpl_model.c',
         'libvpx/vp9/encoder/vp9_treewriter.c',
         'libvpx/vp9/vp9_cx_iface.c',
         'libvpx/vp9/vp9_dx_iface.c',
@@ -1158,6 +1162,7 @@ files = {
         'libvpx/vp9/encoder/vp9_svc_layercontext.c',
         'libvpx/vp9/encoder/vp9_temporal_filter.c',
         'libvpx/vp9/encoder/vp9_tokenize.c',
+        'libvpx/vp9/encoder/vp9_tpl_model.c',
         'libvpx/vp9/encoder/vp9_treewriter.c',
         'libvpx/vp9/vp9_cx_iface.c',
         'libvpx/vp9/vp9_dx_iface.c',