diff --git a/media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h b/media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h
index c668c12e22e1..e8c7ed72e047 100644
--- a/media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h
+++ b/media/libvpx/config/linux/ia32/vpx_dsp_rtcd.h
@@ -914,14 +914,17 @@ RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_st
 unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
@@ -1400,10 +1403,13 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2;
     vpx_variance8x16 = vpx_variance8x16_c;
     if (flags & HAS_SSE2) vpx_variance8x16 = vpx_variance8x16_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2;
     vpx_variance8x4 = vpx_variance8x4_c;
     if (flags & HAS_SSE2) vpx_variance8x4 = vpx_variance8x4_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2;
     vpx_variance8x8 = vpx_variance8x8_c;
     if (flags & HAS_SSE2) vpx_variance8x8 = vpx_variance8x8_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2;
     vpx_vector_var = vpx_vector_var_c;
     if (flags & HAS_SSE2) vpx_vector_var = vpx_vector_var_sse2;
 }
diff --git a/media/libvpx/config/linux/x64/vpx_dsp_rtcd.h b/media/libvpx/config/linux/x64/vpx_dsp_rtcd.h
index fe6033a5d5c9..3aec60bde8f1 100644
--- a/media/libvpx/config/linux/x64/vpx_dsp_rtcd.h
+++ b/media/libvpx/config/linux/x64/vpx_dsp_rtcd.h
@@ -916,15 +916,18 @@ RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_st
 unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x16 vpx_variance8x16_sse2
+unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x4 vpx_variance8x4_sse2
+unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x8 vpx_variance8x8_sse2
+unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
@@ -1125,6 +1128,12 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2;
     vpx_variance64x64 = vpx_variance64x64_sse2;
     if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2;
+    vpx_variance8x16 = vpx_variance8x16_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2;
+    vpx_variance8x4 = vpx_variance8x4_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2;
+    vpx_variance8x8 = vpx_variance8x8_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2;
 }
 
 #endif
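On x86-64 these generated headers previously bound vpx_variance8x{4,8,16} straight to the SSE2 versions with a #define, since SSE2 is part of the x86-64 baseline; adding AVX2 variants forces them over to the RTCD_EXTERN function-pointer form so the best implementation can be picked at run time. A minimal, self-contained sketch of that dispatch pattern (the names here are illustrative, not the generated header's):

```c
#include <stdio.h>

/* Hypothetical stand-ins for the _c/_sse2/_avx2 implementations. */
static unsigned int my_fn_c(void) { return 1; }
static unsigned int my_fn_sse2(void) { return 2; }
static unsigned int my_fn_avx2(void) { return 3; }

/* The RTCD_EXTERN symbol: a function pointer instead of a #define. */
static unsigned int (*my_fn)(void);

enum { HAS_SSE2 = 1 << 0, HAS_AVX2 = 1 << 1 };

static void setup_rtcd(int flags) {
  my_fn = my_fn_c; /* safe baseline (on x64 the SSE2 version plays this role) */
  if (flags & HAS_SSE2) my_fn = my_fn_sse2;
  if (flags & HAS_AVX2) my_fn = my_fn_avx2; /* later checks override earlier */
}

int main(void) {
  setup_rtcd(HAS_SSE2 | HAS_AVX2);
  printf("%u\n", my_fn()); /* prints 3: the AVX2 variant was selected */
  return 0;
}
```

The checks run from weakest to strongest extension, so the last flag that matches wins; this is why each new AVX2 line sits after the corresponding SSE2 line in the hunks above and below.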
diff --git a/media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h b/media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h
index c668c12e22e1..e8c7ed72e047 100644
--- a/media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h
+++ b/media/libvpx/config/mac/ia32/vpx_dsp_rtcd.h
@@ -914,14 +914,17 @@ RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_st
 unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
@@ -1400,10 +1403,13 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2;
     vpx_variance8x16 = vpx_variance8x16_c;
     if (flags & HAS_SSE2) vpx_variance8x16 = vpx_variance8x16_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2;
     vpx_variance8x4 = vpx_variance8x4_c;
     if (flags & HAS_SSE2) vpx_variance8x4 = vpx_variance8x4_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2;
     vpx_variance8x8 = vpx_variance8x8_c;
     if (flags & HAS_SSE2) vpx_variance8x8 = vpx_variance8x8_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2;
     vpx_vector_var = vpx_vector_var_c;
     if (flags & HAS_SSE2) vpx_vector_var = vpx_vector_var_sse2;
 }
diff --git a/media/libvpx/config/mac/x64/vpx_dsp_rtcd.h b/media/libvpx/config/mac/x64/vpx_dsp_rtcd.h
index fe6033a5d5c9..3aec60bde8f1 100644
--- a/media/libvpx/config/mac/x64/vpx_dsp_rtcd.h
+++ b/media/libvpx/config/mac/x64/vpx_dsp_rtcd.h
@@ -916,15 +916,18 @@ RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_st
 unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x16 vpx_variance8x16_sse2
+unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x4 vpx_variance8x4_sse2
+unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x8 vpx_variance8x8_sse2
+unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
@@ -1125,6 +1128,12 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2;
     vpx_variance64x64 = vpx_variance64x64_sse2;
     if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2;
+    vpx_variance8x16 = vpx_variance8x16_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2;
+    vpx_variance8x4 = vpx_variance8x4_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2;
+    vpx_variance8x8 = vpx_variance8x8_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2;
 }
 
 #endif
diff --git a/media/libvpx/config/win/ia32/vpx_dsp_rtcd.h b/media/libvpx/config/win/ia32/vpx_dsp_rtcd.h
index c668c12e22e1..e8c7ed72e047 100644
--- a/media/libvpx/config/win/ia32/vpx_dsp_rtcd.h
+++ b/media/libvpx/config/win/ia32/vpx_dsp_rtcd.h
@@ -914,14 +914,17 @@ RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_st
 unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
@@ -1400,10 +1403,13 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2;
     vpx_variance8x16 = vpx_variance8x16_c;
     if (flags & HAS_SSE2) vpx_variance8x16 = vpx_variance8x16_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2;
     vpx_variance8x4 = vpx_variance8x4_c;
     if (flags & HAS_SSE2) vpx_variance8x4 = vpx_variance8x4_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2;
     vpx_variance8x8 = vpx_variance8x8_c;
     if (flags & HAS_SSE2) vpx_variance8x8 = vpx_variance8x8_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2;
     vpx_vector_var = vpx_vector_var_c;
     if (flags & HAS_SSE2) vpx_vector_var = vpx_vector_var_sse2;
 }
diff --git a/media/libvpx/config/win/x64/vpx_dsp_rtcd.h b/media/libvpx/config/win/x64/vpx_dsp_rtcd.h
index fe6033a5d5c9..3aec60bde8f1 100644
--- a/media/libvpx/config/win/x64/vpx_dsp_rtcd.h
+++ b/media/libvpx/config/win/x64/vpx_dsp_rtcd.h
@@ -916,15 +916,18 @@ RTCD_EXTERN unsigned int (*vpx_variance64x64)(const uint8_t *src_ptr, int src_st
 unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x16 vpx_variance8x16_sse2
+unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x16)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x4 vpx_variance8x4_sse2
+unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x4)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 unsigned int vpx_variance8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_variance8x8 vpx_variance8x8_sse2
+unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+RTCD_EXTERN unsigned int (*vpx_variance8x8)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 
 void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left);
 #define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
@@ -1125,6 +1128,12 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_AVX2) vpx_variance64x32 = vpx_variance64x32_avx2;
     vpx_variance64x64 = vpx_variance64x64_sse2;
     if (flags & HAS_AVX2) vpx_variance64x64 = vpx_variance64x64_avx2;
+    vpx_variance8x16 = vpx_variance8x16_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x16 = vpx_variance8x16_avx2;
+    vpx_variance8x4 = vpx_variance8x4_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x4 = vpx_variance8x4_avx2;
+    vpx_variance8x8 = vpx_variance8x8_sse2;
+    if (flags & HAS_AVX2) vpx_variance8x8 = vpx_variance8x8_avx2;
 }
 
 #endif
diff --git a/media/libvpx/libvpx/test/svc_datarate_test.cc b/media/libvpx/libvpx/test/svc_datarate_test.cc
index 484252ca43d7..d571f50860e1 100644
--- a/media/libvpx/libvpx/test/svc_datarate_test.cc
+++ b/media/libvpx/libvpx/test/svc_datarate_test.cc
@@ -256,13 +256,13 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
       temporal_layer_id_ = layer_id.temporal_layer_id;
       for (int i = 0; i < number_spatial_layers_; i++) {
         layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_;
-        ref_frame_config.duration[i] = 1;
+        ref_frame_config_.duration[i] = 1;
       }
       encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
       set_frame_flags_bypass_mode(layer_id.temporal_layer_id,
-                                  number_spatial_layers_, 0, &ref_frame_config,
+                                  number_spatial_layers_, 0, &ref_frame_config_,
                                   1);
-      encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config);
+      encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_);
     }
 
     if (update_pattern_ && video->frame() >= 100) {
@@ -277,13 +277,13 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
       temporal_layer_id_ = layer_id.temporal_layer_id;
       for (int i = 0; i < number_spatial_layers_; i++) {
         layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_;
-        ref_frame_config.duration[i] = 1;
+        ref_frame_config_.duration[i] = 1;
       }
       encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
       set_frame_flags_bypass_mode(layer_id.temporal_layer_id,
-                                  number_spatial_layers_, 0, &ref_frame_config,
+                                  number_spatial_layers_, 0, &ref_frame_config_,
                                   0);
-      encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config);
+      encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_);
     }
 
     if (change_bitrate_ && video->frame() == 200) {
@@ -611,7 +611,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
   bool single_layer_resize_;
   unsigned int top_sl_width_;
   unsigned int top_sl_height_;
-  vpx_svc_ref_frame_config_t ref_frame_config;
+  vpx_svc_ref_frame_config_t ref_frame_config_;
   int update_pattern_;
   bool change_bitrate_;
   vpx_codec_pts_t last_pts_ref_;
diff --git a/media/libvpx/libvpx/test/variance_test.cc b/media/libvpx/libvpx/test/variance_test.cc
index 1359bc4baf5a..df9a1c56f622 100644
--- a/media/libvpx/libvpx/test/variance_test.cc
+++ b/media/libvpx/libvpx/test/variance_test.cc
@@ -1429,7 +1429,10 @@ INSTANTIATE_TEST_SUITE_P(
                       VarianceParams(5, 4, &vpx_variance32x16_avx2),
                       VarianceParams(4, 5, &vpx_variance16x32_avx2),
                       VarianceParams(4, 4, &vpx_variance16x16_avx2),
-                      VarianceParams(4, 3, &vpx_variance16x8_avx2)));
+                      VarianceParams(4, 3, &vpx_variance16x8_avx2),
+                      VarianceParams(3, 4, &vpx_variance8x16_avx2),
+                      VarianceParams(3, 3, &vpx_variance8x8_avx2),
+                      VarianceParams(3, 2, &vpx_variance8x4_avx2)));
 
 INSTANTIATE_TEST_SUITE_P(
     AVX2, VpxSubpelVarianceTest,
diff --git a/media/libvpx/libvpx/test/vp9_intrapred_test.cc b/media/libvpx/libvpx/test/vp9_intrapred_test.cc
index cec90316181e..6de7cf8d0f5d 100644
--- a/media/libvpx/libvpx/test/vp9_intrapred_test.cc
+++ b/media/libvpx/libvpx/test/vp9_intrapred_test.cc
@@ -55,6 +55,21 @@ class IntraPredTest : public ::testing::TestWithParam {
     ref_dst_ = ref_dst;
     int error_count = 0;
     for (int i = 0; i < count_test_block; ++i) {
+      // TODO(webm:1797): Some of the optimised predictor implementations rely
+      // on the trailing half of the above_row_ being a copy of the final
+      // element; however, relying on this in some cases can cause the MD5
+      // tests to fail. We have fixed all of these cases for Neon, so fill the
+      // whole of above_row_ randomly.
+#if HAVE_NEON
+      // Fill edges with random data, try first with saturated values.
+      for (int x = -1; x < 2 * block_size; x++) {
+        if (i == 0) {
+          above_row_[x] = mask_;
+        } else {
+          above_row_[x] = rnd.Rand16() & mask_;
+        }
+      }
+#else
       // Fill edges with random data, try first with saturated values.
       for (int x = -1; x < block_size; x++) {
         if (i == 0) {
@@ -66,6 +81,7 @@ class IntraPredTest : public ::testing::TestWithParam {
       for (int x = block_size; x < 2 * block_size; x++) {
         above_row_[x] = above_row_[block_size - 1];
      }
+#endif
       for (int y = 0; y < block_size; y++) {
         if (i == 0) {
           left_col_[y] = mask_;
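The test change above stops relying on the trailing half of above_row_ mirroring its final element when Neon is available, and instead fills all 2 * block_size above-samples (plus the corner at x = -1), saturated on the first iteration and random afterwards. A rough standalone sketch of that "extremes first, then random" fill pattern (BLOCK and MASK are stand-ins for the test's block_size and mask_; the test itself uses its ACMRandom generator rather than rand()):

```c
#include <stdlib.h>

#define BLOCK 8
#define MASK 0xFF /* 8-bit path; the 10/12-bit paths use a wider mask */

static void fill_above(int iteration, int above_row[2 * BLOCK + 1]) {
  int x;
  int *above = above_row + 1; /* leave room for the x = -1 corner sample */
  for (x = -1; x < 2 * BLOCK; x++) {
    /* First iteration: all-saturated edges to probe overflow behaviour;
       later iterations: fuzz with random in-range values. */
    above[x] = (iteration == 0) ? MASK : (rand() & MASK);
  }
}
```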
diff --git a/media/libvpx/libvpx/vp8/encoder/block.h b/media/libvpx/libvpx/vp8/encoder/block.h
index f0efd3e1e255..1bc5ef75bc29 100644
--- a/media/libvpx/libvpx/vp8/encoder/block.h
+++ b/media/libvpx/libvpx/vp8/encoder/block.h
@@ -92,8 +92,7 @@ typedef struct macroblock {
   signed int last_act_zbin_adj;
 
   int *mvcost[2];
-  /* MSVC generates code that thinks this is 16-byte aligned */
-  DECLARE_ALIGNED(16, int*, mvsadcost[2]);
+  int *mvsadcost[2];
   int (*mbmode_cost)[MB_MODE_COUNT];
   int (*intra_uv_mode_cost)[MB_MODE_COUNT];
   int (*bmode_costs)[10][10];
diff --git a/media/libvpx/libvpx/vp8/vp8_cx_iface.c b/media/libvpx/libvpx/vp8/vp8_cx_iface.c
index a1e5396008d5..a9d1f8005da8 100644
--- a/media/libvpx/libvpx/vp8/vp8_cx_iface.c
+++ b/media/libvpx/libvpx/vp8/vp8_cx_iface.c
@@ -947,19 +947,10 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx,
   if (img != NULL) {
     res = image2yuvconfig(img, &sd);
 
-    if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) {
-      /* from vpx_encoder.h for g_w/g_h:
-         "Note that the frames passed as input to the encoder must have this
-         resolution"
-      */
-      ctx->base.err_detail = "Invalid input frame resolution";
-      res = VPX_CODEC_INVALID_PARAM;
-    } else {
-      if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags,
-                                &sd, dst_time_stamp, dst_end_time_stamp)) {
-        VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
-        res = update_error_state(ctx, &cpi->common.error);
-      }
+    if (vp8_receive_raw_frame(ctx->cpi, ctx->next_frame_flag | lib_flags, &sd,
+                              dst_time_stamp, dst_end_time_stamp)) {
+      VP8_COMP *cpi = (VP8_COMP *)ctx->cpi;
+      res = update_error_state(ctx, &cpi->common.error);
     }
 
     /* reset for next frame */
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
index 4cec02eb930c..72a6189d1348 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_encoder.c
@@ -2465,11 +2465,11 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
       cpi->svc.number_temporal_layers > 1) {
     FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf;
     FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = { 0 };
-    int i;
+    int n;
 
-    for (i = 0; i < oxcf->ss_number_layers; ++i) {
+    for (n = 0; n < oxcf->ss_number_layers; ++n) {
       FIRSTPASS_STATS *const last_packet_for_layer =
-          &stats[packets - oxcf->ss_number_layers + i];
+          &stats[packets - oxcf->ss_number_layers + n];
       const int layer_id = (int)last_packet_for_layer->spatial_layer_id;
       const int packets_in_layer = (int)last_packet_for_layer->count + 1;
       if (layer_id >= 0 && layer_id < oxcf->ss_number_layers) {
@@ -2494,11 +2494,11 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf,
       }
     }
 
-    for (i = 0; i < packets; ++i) {
-      const int layer_id = (int)stats[i].spatial_layer_id;
+    for (n = 0; n < packets; ++n) {
+      const int layer_id = (int)stats[n].spatial_layer_id;
       if (layer_id >= 0 && layer_id < oxcf->ss_number_layers &&
           stats_copy[layer_id] != NULL) {
-        *stats_copy[layer_id] = stats[i];
+        *stats_copy[layer_id] = stats[n];
         ++stats_copy[layer_id];
       }
     }
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
index e9250e25c0ca..08b68c93ee6a 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_firstpass.c
@@ -3495,7 +3495,6 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
   const int show_idx = cm->current_video_frame;
 
   if (cpi->common.current_frame_coding_index == 0) {
-    VP9_COMMON *cm = &cpi->common;
     const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats(
         &cpi->ext_ratectrl, &cpi->twopass.first_pass_info);
     if (codec_status != VPX_CODEC_OK) {
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c b/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c
index 1f08aa5de708..207eb4394978 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_mcomp.c
@@ -163,8 +163,8 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
   do {                                                                      \
     if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                 \
       int64_t tmpmse;                                                       \
-      const MV mv = { r, c };                                               \
-      const MV ref_mv = { rr, rc };                                         \
+      const MV cb_mv = { r, c };                                            \
+      const MV cb_ref_mv = { rr, rc };                                      \
      if (second_pred == NULL) {                                            \
        thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
                           src_stride, &sse);                               \
@@ -173,7 +173,8 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
                             src_stride, &sse, second_pred);                 \
       }                                                                     \
       tmpmse = thismse;                                                     \
-      tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit);  \
+      tmpmse +=                                                             \
+          mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit);  \
       if (tmpmse >= INT_MAX) {                                              \
         v = INT_MAX;                                                        \
       } else if ((v = (uint32_t)tmpmse) < besterr) {                        \
@@ -192,15 +193,16 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) {
 #define CHECK_BETTER(v, r, c)                                               \
   do {                                                                      \
     if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                 \
-      const MV mv = { r, c };                                               \
-      const MV ref_mv = { rr, rc };                                         \
+      const MV cb_mv = { r, c };                                            \
+      const MV cb_ref_mv = { rr, rc };                                      \
       if (second_pred == NULL)                                              \
         thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
                            src_stride, &sse);                               \
       else                                                                  \
         thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
                             src_stride, &sse, second_pred);                 \
-      if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) +  \
+      if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost,             \
+                           error_per_bit) +                                 \
                thismse) < besterr) {                                        \
         besterr = v;                                                        \
         br = r;                                                             \
@@ -686,13 +688,14 @@ static int accurate_sub_pel_search(
   do {                                                                      \
     if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                 \
       int64_t tmpmse;                                                       \
-      const MV mv = { r, c };                                               \
-      const MV ref_mv = { rr, rc };                                         \
-      thismse = accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z,  \
+      const MV cb_mv = { r, c };                                            \
+      const MV cb_ref_mv = { rr, rc };                                      \
+      thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \
                                         src_stride, y, y_stride, second_pred, \
                                         w, h, &sse);                        \
       tmpmse = thismse;                                                     \
-      tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit);  \
+      tmpmse +=                                                             \
+          mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit);  \
       if (tmpmse >= INT_MAX) {                                              \
         v = INT_MAX;                                                        \
       } else if ((v = (uint32_t)tmpmse) < besterr) {                        \
@@ -711,12 +714,13 @@ static int accurate_sub_pel_search(
 #define CHECK_BETTER1(v, r, c)                                              \
   do {                                                                      \
     if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                 \
-      const MV mv = { r, c };                                               \
-      const MV ref_mv = { rr, rc };                                         \
-      thismse = accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z,  \
+      const MV cb_mv = { r, c };                                            \
+      const MV cb_ref_mv = { rr, rc };                                      \
+      thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \
                                         src_stride, y, y_stride, second_pred, \
                                         w, h, &sse);                        \
-      if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) +  \
+      if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost,             \
+                           error_per_bit) +                                 \
                thismse) < besterr) {                                        \
         besterr = v;                                                        \
         br = r;                                                             \
@@ -980,16 +984,14 @@ static INLINE void calc_int_cost_list(const MACROBLOCK *x, const MV *ref_mv,
   const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 };
   int br = best_mv->row;
   int bc = best_mv->col;
-  MV this_mv;
+  const MV mv = { br, bc };
   int i;
   unsigned int sse;
 
-  this_mv.row = br;
-  this_mv.col = bc;
   cost_list[0] =
-      fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv),
+      fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &mv),
                  in_what->stride, &sse) +
-      mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+      mvsad_err_cost(x, &mv, &fcenter_mv, sadpb);
   if (check_bounds(&x->mv_limits, br, bc, 1)) {
     for (i = 0; i < 4; i++) {
       const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col };
@@ -1170,6 +1172,9 @@ static int vp9_pattern_search(
     } while (s--);
   }
 
+  best_mv->row = br;
+  best_mv->col = bc;
+
   // Returns the one-away integer pel sad values around the best as follows:
   // cost_list[0]: cost at the best integer pel
   // cost_list[1]: cost at delta {0, -1} (left)  from the best integer pel
@@ -1177,11 +1182,8 @@ static int vp9_pattern_search(
   // cost_list[3]: cost at delta { 0, 1} (right) from the best integer pel
   // cost_list[4]: cost at delta {-1, 0} (top)   from the best integer pel
   if (cost_list) {
-    const MV best_mv = { br, bc };
-    calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, &best_mv, cost_list);
+    calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, best_mv, cost_list);
   }
-  best_mv->row = br;
-  best_mv->col = bc;
   return bestsad;
 }
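A side note on the cb_mv/cb_ref_mv renames in the CHECK_BETTER* macros above: a function-like macro that declares a local named mv captures any use of mv inside its own arguments, because expansion is purely textual. The hypothetical snippet below shows the hazard the prefixed names avoid:

```c
#include <stdio.h>

typedef struct { int row, col; } MV;

/* Textual expansion means a local named `mv` here would shadow the
   caller's `mv` inside the macro body; `cb_mv` avoids the collision. */
#define CHECK_COST(r, c)                                 \
  do {                                                   \
    const MV cb_mv = { (r), (c) };                       \
    printf("checking %d,%d\n", cb_mv.row, cb_mv.col);    \
  } while (0)

int main(void) {
  const MV mv = { 5, 7 }; /* caller-scope motion vector */
  /* Had the macro's local been `mv`, these arguments would have read the
     macro's own, not-yet-initialized variable instead of this one. */
  CHECK_COST(mv.row + 1, mv.col - 1);
  return 0;
}
```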
@@ -2321,17 +2323,16 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
   // TODO(jingning): Implement integral projection functions for high bit-depth
   // setting and remove this part of code.
   if (xd->bd != 8) {
-    unsigned int this_sad;
+    const unsigned int sad = cpi->fn_ptr[bsize].sdf(
+        x->plane[0].src.buf, src_stride, xd->plane[0].pre[0].buf, ref_stride);
     tmp_mv->row = 0;
     tmp_mv->col = 0;
-    this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
-                                      xd->plane[0].pre[0].buf, ref_stride);
 
     if (scaled_ref_frame) {
       int i;
       for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
     }
-    return this_sad;
+    return sad;
   }
 #endif
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
index 76d545cd965f..05811bd828c3 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_rdopt.c
@@ -743,7 +743,7 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
   }
   if (x->block_tx_domain) {
     dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col,
-               tx_size, &dist, &sse, /*recon =*/0, sse_calc_done);
+               tx_size, &dist, &sse, /*out_recon=*/NULL, sse_calc_done);
   } else {
     const struct macroblock_plane *const p = &x->plane[plane];
     const int src_stride = p->src.stride;
@@ -1396,7 +1396,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
     mic->mode = mode;
 
     super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
-                    bsize, best_rd, /*recon = */ 0);
+                    bsize, best_rd, /*recon=*/NULL);
 
     if (this_rate_tokenonly == INT_MAX) continue;
@@ -1449,7 +1449,7 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate,
   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
     txfm_rd_in_plane(cpi, x, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd,
                      plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing,
-                     /*recon = */ 0);
+                     /*recon=*/NULL);
     if (pnrate == INT_MAX) {
       is_cost_valid = 0;
       break;
@@ -1854,6 +1854,52 @@ static INLINE int skip_iters(const int_mv iter_mvs[][2], int ite, int id) {
   return 0;
 }
 
+// Compares motion vector and mode rate of current mode and given mode.
+static INLINE int compare_mv_mode_rate(MV this_mv, MV mode_mv,
+                                       int this_mode_rate, int mode_rate,
+                                       int mv_thresh) {
+  const int mv_diff =
+      abs(mode_mv.col - this_mv.col) + abs(mode_mv.row - this_mv.row);
+  if (mv_diff <= mv_thresh && mode_rate < this_mode_rate) return 1;
+  return 0;
+}
+
+// Skips single reference inter modes NEARMV and ZEROMV based on motion vector
+// difference and mode rate.
+static INLINE int skip_single_mode_based_on_mode_rate(
+    int_mv (*mode_mv)[MAX_REF_FRAMES], int *single_mode_rate, int this_mode,
+    int ref0, int this_mode_rate, int best_mode_index) {
+  MV this_mv = mode_mv[this_mode][ref0].as_mv;
+  const int mv_thresh = 3;
+
+  // Pruning is not applicable for NEARESTMV or NEWMV modes.
+  if (this_mode == NEARESTMV || this_mode == NEWMV) return 0;
+  // Pruning is not done when the reference frame of the mode is the same as
+  // the best reference so far.
+  if (best_mode_index > 0 &&
+      ref0 == vp9_mode_order[best_mode_index].ref_frame[0])
+    return 0;
+
+  // Check absolute mv difference and mode rate of current mode w.r.t NEARESTMV
+  if (compare_mv_mode_rate(
+          this_mv, mode_mv[NEARESTMV][ref0].as_mv, this_mode_rate,
+          single_mode_rate[INTER_OFFSET(NEARESTMV)], mv_thresh))
+    return 1;
+
+  // Check absolute mv difference and mode rate of current mode w.r.t NEWMV
+  if (compare_mv_mode_rate(this_mv, mode_mv[NEWMV][ref0].as_mv, this_mode_rate,
+                           single_mode_rate[INTER_OFFSET(NEWMV)], mv_thresh))
+    return 1;
+
+  // Pruning w.r.t NEARMV is applicable only for ZEROMV mode
+  if (this_mode == NEARMV) return 0;
+  // Check absolute mv difference and mode rate of current mode w.r.t NEARMV
+  if (compare_mv_mode_rate(this_mv, mode_mv[NEARMV][ref0].as_mv, this_mode_rate,
+                           single_mode_rate[INTER_OFFSET(NEARMV)], mv_thresh))
+    return 1;
+  return 0;
+}
+
 #define NUM_ITERS 4
 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
                                 int_mv *frame_mv, int mi_row, int mi_col,
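skip_single_mode_based_on_mode_rate() prunes NEARMV and ZEROMV when their motion vector lands within a small neighbourhood (mv_thresh, measured as the sum of absolute row and column deltas) of an already-evaluated mode that was cheaper to signal. A standalone sketch of the core predicate, with the surrounding encoder types simplified away:

```c
#include <stdio.h>
#include <stdlib.h>

typedef struct { int row, col; } MV;

/* Same shape as the helper added in the hunk above. */
static int compare_mv_mode_rate(MV this_mv, MV mode_mv, int this_mode_rate,
                                int mode_rate, int mv_thresh) {
  const int mv_diff =
      abs(mode_mv.col - this_mv.col) + abs(mode_mv.row - this_mv.row);
  return (mv_diff <= mv_thresh && mode_rate < this_mode_rate) ? 1 : 0;
}

int main(void) {
  const MV near_mv = { 4, -2 };    /* hypothetical NEARMV motion vector */
  const MV nearest_mv = { 3, -1 }; /* hypothetical NEARESTMV, almost equal */
  /* NEARESTMV was cheaper to code (100 < 140) and lies within the
     mv_thresh = 3 neighbourhood, so NEARMV would be pruned. */
  printf("prune NEARMV: %d\n",
         compare_mv_mode_rate(near_mv, nearest_mv, 140, 100, 3));
  return 0;
}
```

The intuition: if two modes would produce nearly the same prediction, keeping only the one that costs fewer bits to signal loses little quality while skipping a full rate-distortion evaluation.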
@@ -2756,8 +2802,9 @@ static int64_t handle_inter_mode(
     struct buf_2d *recon, int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES],
     int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES],
     INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
-    int (*single_skippable)[MAX_REF_FRAMES], int64_t *psse,
-    const int64_t ref_best_rd, int64_t *mask_filter, int64_t filter_cache[]) {
+    int (*single_skippable)[MAX_REF_FRAMES], int *single_mode_rate,
+    int64_t *psse, const int64_t ref_best_rd, int64_t *mask_filter,
+    int64_t filter_cache[], int best_mode_index) {
   VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MODE_INFO *mi = xd->mi[0];
@@ -2914,6 +2961,15 @@ static int64_t handle_inter_mode(
     *rate2 += cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]);
   }
 
+  if (!is_comp_pred && cpi->sf.prune_single_mode_based_on_mv_diff_mode_rate) {
+    single_mode_rate[INTER_OFFSET(this_mode)] = *rate2;
+    // Prune NEARMV and ZEROMV modes based on motion vector difference and mode
+    // rate.
+    if (skip_single_mode_based_on_mode_rate(mode_mv, single_mode_rate,
+                                            this_mode, refs[0], *rate2,
+                                            best_mode_index))
+      return INT64_MAX;
+  }
   if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
       mi->mode != NEARESTMV)
     return INT64_MAX;
@@ -3380,6 +3436,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
   int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
   INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES];
   int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
+  int single_mode_rate[MAX_REF_FRAMES][INTER_MODES];
   int64_t best_rd = best_rd_so_far;
   int64_t best_pred_diff[REFERENCE_MODES];
   int64_t best_pred_rd[REFERENCE_MODES];
@@ -3578,6 +3635,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
     second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
     vp9_zero(x->sum_y_eobs);
+    comp_pred = second_ref_frame > INTRA_FRAME;
+    if (!comp_pred && ref_frame != INTRA_FRAME &&
+        sf->prune_single_mode_based_on_mv_diff_mode_rate)
+      single_mode_rate[ref_frame][INTER_OFFSET(this_mode)] = INT_MAX;
 
     if (is_rect_partition) {
       if (ctx->skip_ref_frame_mask & (1 << ref_frame)) continue;
@@ -3663,7 +3724,6 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
       if (this_mode == NEARMV || this_mode == ZEROMV) continue;
     }
 
-    comp_pred = second_ref_frame > INTRA_FRAME;
     if (comp_pred) {
       if (!cpi->allow_comp_inter_inter) continue;
@@ -3783,8 +3843,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
       this_rd = handle_inter_mode(
           cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv,
           recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv,
-          single_inter_filter, single_skippable, &total_sse, best_rd,
-          &mask_filter, filter_cache);
+          single_inter_filter, single_skippable,
+          &single_mode_rate[ref_frame][0], &total_sse, best_rd, &mask_filter,
+          filter_cache, best_mode_index);
 #if CONFIG_COLLECT_COMPONENT_TIMING
       end_timing(cpi, handle_inter_mode_time);
 #endif
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c
index 3e121b799fd6..0522d4ec9701 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.c
@@ -70,6 +70,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
   const int is_720p_or_larger = min_frame_size >= 720;
   const int is_1080p_or_larger = min_frame_size >= 1080;
   const int is_2160p_or_larger = min_frame_size >= 2160;
+  const int boosted = frame_is_boosted(cpi);
 
   // speed 0 features
   sf->partition_search_breakout_thr.dist = (1 << 20);
@@ -102,6 +103,13 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
     }
   }
 
+  if (!is_720p_or_larger) {
+    if (is_480p_or_larger)
+      sf->prune_single_mode_based_on_mv_diff_mode_rate = boosted ? 0 : 1;
+    else
+      sf->prune_single_mode_based_on_mv_diff_mode_rate = 1;
+  }
+
   if (speed >= 1) {
     sf->rd_ml_partition.search_early_termination = 0;
     sf->rd_ml_partition.search_breakout = 1;
@@ -926,6 +934,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) {
   sf->enhanced_full_pixel_motion_search = 1;
   sf->adaptive_pred_interp_filter = 0;
   sf->adaptive_mode_search = 0;
+  sf->prune_single_mode_based_on_mv_diff_mode_rate = 0;
   sf->cb_pred_filter_search = 0;
   sf->early_term_interp_search_plane_rd = 0;
   sf->cb_partition_search = 0;
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h
index d32bf09e4ee5..e267e55c4150 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_speed_features.h
@@ -417,6 +417,10 @@ typedef struct SPEED_FEATURES {
   // Adaptive prediction mode search
   int adaptive_mode_search;
 
+  // Prune NEARMV and ZEROMV single reference modes based on motion vector
+  // difference and mode rate
+  int prune_single_mode_based_on_mv_diff_mode_rate;
+
   // Chessboard pattern prediction for interp filter. Aggressiveness increases
   // with levels.
   // 0: disable
diff --git a/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c b/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c
index c60445cba506..83b6e5c99d03 100644
--- a/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c
+++ b/media/libvpx/libvpx/vp9/encoder/vp9_svc_layercontext.c
@@ -220,7 +220,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
       RATE_CONTROL *const lrc = &lc->rc;
 
       lc->spatial_layer_target_bandwidth = spatial_layer_target;
-      bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+      if (target_bandwidth != 0) {
+        bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+      }
       lrc->starting_buffer_level =
           (int64_t)(rc->starting_buffer_level * bitrate_alloc);
       lrc->optimal_buffer_level =
@@ -252,7 +254,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
       lc->target_bandwidth = oxcf->layer_target_bitrate[layer];
-      bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+      if (target_bandwidth != 0) {
+        bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth;
+      }
       // Update buffer-related quantities.
       lrc->starting_buffer_level =
           (int64_t)(rc->starting_buffer_level * bitrate_alloc);
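Both svc_layercontext hunks above guard the same expression: with an all-zero layer bitrate configuration, target_bandwidth can be 0, the float division then yields infinity (or NaN for 0/0), and that value poisons the buffer levels derived from bitrate_alloc. A minimal repro of the failure mode the guard prevents, assuming (as appears to be the case in this function) that bitrate_alloc keeps its earlier value when the guard is not taken:

```c
#include <stdio.h>

int main(void) {
  const long target_bandwidth = 0; /* e.g. every layer bitrate set to zero */
  const long lc_target = 100000;
  float bitrate_alloc = 1.0f;      /* previous/default allocation kept */
  if (target_bandwidth != 0) {
    bitrate_alloc = (float)lc_target / target_bandwidth;
  }
  /* Without the guard this would print "inf" and every buffer level
     multiplied by it would overflow the int64 casts downstream. */
  printf("bitrate_alloc = %f\n", bitrate_alloc);
  return 0;
}
```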
diff --git a/media/libvpx/libvpx/vp9/vp9_cx_iface.c b/media/libvpx/libvpx/vp9/vp9_cx_iface.c
index c417766a4ad4..4c7eaed72507 100644
--- a/media/libvpx/libvpx/vp9/vp9_cx_iface.c
+++ b/media/libvpx/libvpx/vp9/vp9_cx_iface.c
@@ -1372,22 +1372,13 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
         timebase_units_to_ticks(timestamp_ratio, pts + duration);
 
     res = image2yuvconfig(img, &sd);
-    if (sd.y_width != ctx->cfg.g_w || sd.y_height != ctx->cfg.g_h) {
-      /* from vpx_encoder.h for g_w/g_h:
-         "Note that the frames passed as input to the encoder must have this
-         resolution"
-      */
-      ctx->base.err_detail = "Invalid input frame resolution";
-      res = VPX_CODEC_INVALID_PARAM;
-    } else {
-      // Store the original flags in to the frame buffer. Will extract the
-      // key frame flag when we actually encode this frame.
-      if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
+    // Store the original flags into the frame buffer. Will extract the
+    // key frame flag when we actually encode this frame.
+    if (vp9_receive_raw_frame(cpi, flags | ctx->next_frame_flags, &sd,
                               dst_time_stamp, dst_end_time_stamp)) {
-        res = update_error_state(ctx, &cpi->common.error);
-      }
-      ctx->next_frame_flags = 0;
+      res = update_error_state(ctx, &cpi->common.error);
     }
+    ctx->next_frame_flags = 0;
   }
 
   cx_data = ctx->cx_data;
diff --git a/media/libvpx/libvpx/vp9/vp9_dx_iface.c b/media/libvpx/libvpx/vp9/vp9_dx_iface.c
index bdfe2179363c..20e71cc22712 100644
--- a/media/libvpx/libvpx/vp9/vp9_dx_iface.c
+++ b/media/libvpx/libvpx/vp9/vp9_dx_iface.c
@@ -348,7 +348,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
 
   // Initialize the decoder on the first frame.
   if (ctx->pbi == NULL) {
-    const vpx_codec_err_t res = init_decoder(ctx);
+    res = init_decoder(ctx);
     if (res != VPX_CODEC_OK) return res;
   }
 
@@ -367,7 +367,6 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
     for (i = 0; i < frame_count; ++i) {
       const uint8_t *data_start_copy = data_start;
       const uint32_t frame_size = frame_sizes[i];
-      vpx_codec_err_t res;
       if (data_start < data || frame_size > (uint32_t)(data_end - data_start)) {
         set_error_detail(ctx, "Invalid frame size in index");
         return VPX_CODEC_CORRUPT_FRAME;
@@ -382,8 +381,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
     const uint8_t *const data_end = data + data_sz;
     while (data_start < data_end) {
       const uint32_t frame_size = (uint32_t)(data_end - data_start);
-      const vpx_codec_err_t res =
-          decode_one(ctx, &data_start, frame_size, user_priv, deadline);
+      res = decode_one(ctx, &data_start, frame_size, user_priv, deadline);
       if (res != VPX_CODEC_OK) return res;
 
       // Account for suboptimal termination by the encoder.
diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c
index 503900915d7a..235cb5b9968a 100644
--- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c
+++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c
@@ -12,23 +12,22 @@
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
+#include "sum_neon.h"
 #include "vpx/vpx_integer.h"
 
 //------------------------------------------------------------------------------
 // DC 4x4
 
-static INLINE uint16x4_t dc_sum_4(const uint16_t *ref) {
+static INLINE uint16_t dc_sum_4(const uint16_t *ref) {
   const uint16x4_t ref_u16 = vld1_u16(ref);
-  const uint16x4_t p0 = vpadd_u16(ref_u16, ref_u16);
-  return vpadd_u16(p0, p0);
+  return horizontal_add_uint16x4(ref_u16);
 }
 
 static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
                                 const uint16x4_t dc) {
-  const uint16x4_t dc_dup = vdup_lane_u16(dc, 0);
   int i;
   for (i = 0; i < 4; ++i, dst += stride) {
-    vst1_u16(dst, dc_dup);
+    vst1_u16(dst, dc);
   }
 }
 
@@ -37,21 +36,17 @@ void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *left, int bd) {
   const uint16x4_t a = vld1_u16(above);
   const uint16x4_t l = vld1_u16(left);
-  uint16x4_t sum;
-  uint16x4_t dc;
+  const uint16_t sum = horizontal_add_uint16x4(vadd_u16(a, l));
+  const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 3);
   (void)bd;
-  sum = vadd_u16(a, l);
-  sum = vpadd_u16(sum, sum);
-  sum = vpadd_u16(sum, sum);
-  dc = vrshr_n_u16(sum, 3);
   dc_store_4x4(dst, stride, dc);
 }
 
 void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
-  const uint16x4_t sum = dc_sum_4(left);
-  const uint16x4_t dc = vrshr_n_u16(sum, 2);
+  const uint16_t sum = dc_sum_4(left);
+  const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2);
   (void)above;
   (void)bd;
   dc_store_4x4(dst, stride, dc);
@@ -60,8 +55,8 @@ void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
-  const uint16x4_t sum = dc_sum_4(above);
-  const uint16x4_t dc = vrshr_n_u16(sum, 2);
+  const uint16_t sum = dc_sum_4(above);
+  const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2);
   (void)left;
   (void)bd;
   dc_store_4x4(dst, stride, dc);
@@ -79,19 +74,16 @@ void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
 //------------------------------------------------------------------------------
 // DC 8x8
 
-static INLINE uint16x4_t dc_sum_8(const uint16_t *ref) {
+static INLINE uint16_t dc_sum_8(const uint16_t *ref) {
   const uint16x8_t ref_u16 = vld1q_u16(ref);
-  uint16x4_t sum = vadd_u16(vget_low_u16(ref_u16), vget_high_u16(ref_u16));
-  sum = vpadd_u16(sum, sum);
-  return vpadd_u16(sum, sum);
+  return horizontal_add_uint16x8(ref_u16);
 }
 
 static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
-                                const uint16x4_t dc) {
-  const uint16x8_t dc_dup = vdupq_lane_u16(dc, 0);
+                                const uint16x8_t dc) {
   int i;
   for (i = 0; i < 8; ++i, dst += stride) {
-    vst1q_u16(dst, dc_dup);
+    vst1q_u16(dst, dc);
   }
 }
 
@@ -101,20 +93,17 @@ void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
   const uint16x8_t above_u16 = vld1q_u16(above);
   const uint16x8_t left_u16 = vld1q_u16(left);
   const uint16x8_t p0 = vaddq_u16(above_u16, left_u16);
-  uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
-  uint16x4_t dc;
+  const uint16_t sum = horizontal_add_uint16x8(p0);
+  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4);
   (void)bd;
-  sum = vpadd_u16(sum, sum);
-  sum = vpadd_u16(sum, sum);
-  dc = vrshr_n_u16(sum, 4);
   dc_store_8x8(dst, stride, dc);
 }
 
 void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
-  const uint16x4_t sum = dc_sum_8(left);
-  const uint16x4_t dc = vrshr_n_u16(sum, 3);
+  const uint16_t sum = dc_sum_8(left);
+  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3);
   (void)above;
   (void)bd;
   dc_store_8x8(dst, stride, dc);
@@ -123,8 +112,8 @@ void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
-  const uint16x4_t sum = dc_sum_8(above);
-  const uint16x4_t dc = vrshr_n_u16(sum, 3);
+  const uint16_t sum = dc_sum_8(above);
+  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3);
   (void)left;
   (void)bd;
   dc_store_8x8(dst, stride, dc);
@@ -133,7 +122,7 @@ void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
 void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
-  const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
+  const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1));
   (void)above;
   (void)left;
   dc_store_8x8(dst, stride, dc);
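Throughout this file the refactor replaces open-coded vpadd_u16 pairwise-add chains with the shared horizontal-add helpers from sum_neon.h. Roughly what such a helper looks like (a sketch in the spirit of that header; the real implementation may differ in details), with the result widened to 32 bits so even a full 32-sample row of 12-bit pixels cannot overflow:

```c
#include <arm_neon.h>
#include <stdint.h>

static inline uint32_t horizontal_add_uint16x8(const uint16x8_t a) {
#if defined(__aarch64__)
  return vaddlvq_u16(a); /* single across-vector reduction on AArch64 */
#else
  const uint32x4_t b = vpaddlq_u16(a); /* 8x16 -> 4x32 widening pairwise add */
  const uint64x2_t c = vpaddlq_u32(b); /* 4x32 -> 2x64 widening pairwise add */
  const uint64x1_t d = vadd_u64(vget_low_u64(c), vget_high_u64(c));
  return (uint32_t)vget_lane_u64(d, 0);
#endif
}
```

Centralising the reduction also lets the predictors broadcast the scalar back with vdupq_n_u16 once, instead of carrying a partially reduced vector through the rounding shift as the old code did.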
+static INLINE uint16_t dc_sum_16(const uint16_t *ref) { + const uint16x8_t ref_u16_0 = vld1q_u16(ref + 0); + const uint16x8_t ref_u16_1 = vld1q_u16(ref + 8); + const uint16x8_t p0 = vaddq_u16(ref_u16_0, ref_u16_1); + return horizontal_add_uint16x8(p0); } static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, - const uint16x4_t dc) { - uint16x8x2_t dc_dup; + const uint16x8_t dc) { int i; - dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); for (i = 0; i < 16; ++i, dst += stride) { - vst2q_u16(dst, dc_dup); + vst1q_u16(dst + 0, dc); + vst1q_u16(dst + 8, dc); } } void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x2_t a = vld2q_u16(above); - const uint16x8x2_t l = vld2q_u16(left); - const uint16x8_t pa = vaddq_u16(a.val[0], a.val[1]); - const uint16x8_t pl = vaddq_u16(l.val[0], l.val[1]); + const uint16x8_t a0 = vld1q_u16(above + 0); + const uint16x8_t a1 = vld1q_u16(above + 8); + const uint16x8_t l0 = vld1q_u16(left + 0); + const uint16x8_t l1 = vld1q_u16(left + 8); + const uint16x8_t pa = vaddq_u16(a0, a1); + const uint16x8_t pl = vaddq_u16(l0, l1); const uint16x8_t pal0 = vaddq_u16(pa, pl); - uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); - uint32x2_t sum; - uint16x4_t dc; + const uint32_t sum = horizontal_add_uint16x8(pal0); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); (void)bd; - pal1 = vpadd_u16(pal1, pal1); - sum = vpaddl_u16(pal1); - dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); dc_store_16x16(dst, stride, dc); } void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_16(left); - const uint16x4_t dc = vrshr_n_u16(sum, 4); + const uint16_t sum = dc_sum_16(left); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); (void)above; (void)bd; dc_store_16x16(dst, stride, dc); @@ -191,8 +176,8 @@ void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_16(above); - const uint16x4_t dc = vrshr_n_u16(sum, 4); + const uint16_t sum = dc_sum_16(above); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); (void)left; (void)bd; dc_store_16x16(dst, stride, dc); @@ -201,7 +186,7 @@ void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); (void)above; (void)left; dc_store_16x16(dst, stride, dc); @@ -210,56 +195,58 @@ void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 32x32 -static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) { - const uint16x8x4_t r = vld4q_u16(ref); - const uint16x8_t p0 = vaddq_u16(r.val[0], r.val[1]); - const uint16x8_t p1 = vaddq_u16(r.val[2], r.val[3]); +static INLINE uint32_t dc_sum_32(const uint16_t *ref) { + const uint16x8_t r0 = vld1q_u16(ref + 0); + const uint16x8_t r1 = vld1q_u16(ref + 8); + const uint16x8_t r2 = vld1q_u16(ref + 16); + const uint16x8_t r3 = vld1q_u16(ref + 24); + const uint16x8_t p0 = vaddq_u16(r0, r1); + const uint16x8_t p1 = 
vaddq_u16(r2, r3); const uint16x8_t p2 = vaddq_u16(p0, p1); - uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - sum = vpadd_u16(sum, sum); - return vpaddl_u16(sum); + return horizontal_add_uint16x8(p2); } static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, - const uint16x4_t dc) { - uint16x8x2_t dc_dup; + const uint16x8_t dc) { int i; - dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); - for (i = 0; i < 32; ++i) { - vst2q_u16(dst, dc_dup); - dst += 16; - vst2q_u16(dst, dc_dup); - dst += stride - 16; + vst1q_u16(dst + 0, dc); + vst1q_u16(dst + 8, dc); + vst1q_u16(dst + 16, dc); + vst1q_u16(dst + 24, dc); + dst += stride; } } void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x4_t a = vld4q_u16(above); - const uint16x8x4_t l = vld4q_u16(left); - const uint16x8_t pa0 = vaddq_u16(a.val[0], a.val[1]); - const uint16x8_t pa1 = vaddq_u16(a.val[2], a.val[3]); - const uint16x8_t pl0 = vaddq_u16(l.val[0], l.val[1]); - const uint16x8_t pl1 = vaddq_u16(l.val[2], l.val[3]); + const uint16x8_t a0 = vld1q_u16(above + 0); + const uint16x8_t a1 = vld1q_u16(above + 8); + const uint16x8_t a2 = vld1q_u16(above + 16); + const uint16x8_t a3 = vld1q_u16(above + 24); + const uint16x8_t l0 = vld1q_u16(left + 0); + const uint16x8_t l1 = vld1q_u16(left + 8); + const uint16x8_t l2 = vld1q_u16(left + 16); + const uint16x8_t l3 = vld1q_u16(left + 24); + const uint16x8_t pa0 = vaddq_u16(a0, a1); + const uint16x8_t pa1 = vaddq_u16(a2, a3); + const uint16x8_t pl0 = vaddq_u16(l0, l1); + const uint16x8_t pl1 = vaddq_u16(l2, l3); const uint16x8_t pa = vaddq_u16(pa0, pa1); const uint16x8_t pl = vaddq_u16(pl0, pl1); const uint16x8_t pal0 = vaddq_u16(pa, pl); - const uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); - uint32x2_t sum = vpaddl_u16(pal1); - uint16x4_t dc; + const uint32_t sum = horizontal_add_uint16x8(pal0); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 6), 0); (void)bd; - sum = vpadd_u32(sum, sum); - dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 6)); dc_store_32x32(dst, stride, dc); } void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint32x2_t sum = dc_sum_32(left); - const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + const uint32_t sum = dc_sum_32(left); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); (void)above; (void)bd; dc_store_32x32(dst, stride, dc); @@ -268,8 +255,8 @@ void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint32x2_t sum = dc_sum_32(above); - const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + const uint32_t sum = dc_sum_32(above); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); (void)left; (void)bd; dc_store_32x32(dst, stride, dc); @@ -278,7 +265,7 @@ void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); (void)above; (void)left; dc_store_32x32(dst, stride, dc); @@ -289,166 +276,179 @@ void 
vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t ABCDEFGH = vld1q_u16(above); - const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1); - const uint16x8_t CDEFGH00 = vld1q_u16(above + 2); - const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00); - const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0); - const uint16x4_t avg2_low = vget_low_u16(avg2); - const uint16x4_t avg2_high = vget_high_u16(avg2); - const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1); - const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2); - const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3); + uint16x8_t a0, a1, a2, d0; + uint16_t a7; (void)left; (void)bd; - vst1_u16(dst, avg2_low); - dst += stride; - vst1_u16(dst, r1); - dst += stride; - vst1_u16(dst, r2); - dst += stride; - vst1_u16(dst, r3); - vst1q_lane_u16(dst + 3, ABCDEFGH, 7); -} -static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride, - const uint16x8_t above_right, uint16x8_t *row) { - *row = vextq_u16(*row, above_right, 1); - vst1q_u16(*dst, *row); - *dst += stride; + a0 = vld1q_u16(above); + a7 = above[7]; + + // [ above[1], ..., above[6], x, x ] + a1 = vextq_u16(a0, a0, 1); + // [ above[2], ..., above[7], x, x ] + a2 = vextq_u16(a0, a0, 2); + + // d0[0] = AVG3(above[0], above[1], above[2]); + // ... + // d0[5] = AVG3(above[5], above[6], above[7]); + // d0[6] = x (don't care) + // d0[7] = x (don't care) + d0 = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + + // We want: + // stride=0 [ d0[0], d0[1], d0[2], d0[3] ] + // stride=1 [ d0[1], d0[2], d0[3], d0[4] ] + // stride=2 [ d0[2], d0[3], d0[4], d0[5] ] + // stride=2 [ d0[3], d0[4], d0[5], above[7] ] + vst1_u16(dst + 0 * stride, vget_low_u16(d0)); + vst1_u16(dst + 1 * stride, vget_low_u16(vextq_u16(d0, d0, 1))); + vst1_u16(dst + 2 * stride, vget_low_u16(vextq_u16(d0, d0, 2))); + vst1_u16(dst + 3 * stride, vget_low_u16(vextq_u16(d0, d0, 3))); + + // We stored d0[6] above, so fixup into above[7]. + dst[3 * stride + 3] = a7; } void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t A0 = vld1q_u16(above); - const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3); - const uint16x8_t A1 = vld1q_u16(above + 1); - const uint16x8_t A2 = vld1q_u16(above + 2); - const uint16x8_t avg1 = vhaddq_u16(A0, A2); - uint16x8_t row = vrhaddq_u16(avg1, A1); + uint16x8_t ax0, a0, a1, a7, d0; (void)left; (void)bd; - vst1q_u16(dst, row); - dst += stride; - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - vst1q_u16(dst, above_right); -} + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_dup_u16(above + 7); -static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride, - const uint16x8_t above_right, uint16x8_t *row_0, - uint16x8_t *row_1) { - *row_0 = vextq_u16(*row_0, *row_1, 1); - *row_1 = vextq_u16(*row_1, above_right, 1); - vst1q_u16(*dst, *row_0); - *dst += 8; - vst1q_u16(*dst, *row_1); - *dst += stride - 8; + // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can + // shift in above[7] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... 
, above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[7] = AVG3(above[6], above[7], above[8]); + d0 = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[7]. + vst1q_u16(dst + 0 * stride, vextq_u16(d0, a7, 1)); + vst1q_u16(dst + 1 * stride, vextq_u16(d0, a7, 2)); + vst1q_u16(dst + 2 * stride, vextq_u16(d0, a7, 3)); + vst1q_u16(dst + 3 * stride, vextq_u16(d0, a7, 4)); + vst1q_u16(dst + 4 * stride, vextq_u16(d0, a7, 5)); + vst1q_u16(dst + 5 * stride, vextq_u16(d0, a7, 6)); + vst1q_u16(dst + 6 * stride, vextq_u16(d0, a7, 7)); + vst1q_u16(dst + 7 * stride, a7); } void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t A0_0 = vld1q_u16(above); - const uint16x8_t A0_1 = vld1q_u16(above + 8); - const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3); - const uint16x8_t A1_0 = vld1q_u16(above + 1); - const uint16x8_t A1_1 = vld1q_u16(above + 9); - const uint16x8_t A2_0 = vld1q_u16(above + 2); - const uint16x8_t A2_1 = vld1q_u16(above + 10); - const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); - const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); - uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); - uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); + uint16x8_t ax0, a0, a1, a7, a8, a9, a15, d0[2]; (void)left; (void)bd; - vst1q_u16(dst, row_0); - vst1q_u16(dst + 8, row_1); - dst += stride; - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - vst1q_u16(dst, above_right); - vst1q_u16(dst + 8, above_right); + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a15 = vld1q_dup_u16(above + 15); + + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + // We have one unused lane here to leave room to shift in above[15] in the + // last lane: + // d0[0][0] = x (don't care) + // d0[0][1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[0][7] = AVG3(above[6], above[7], above[8]); + // d0[1][0] = AVG3(above[7], above[8], above[9]); + // ... + // d0[1][7] = AVG3(above[14], above[15], above[16]); + d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8); + + // Incrementally shift in duplicates of above[15]. 
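Written out in scalar form, the fill that the 8x8 stores above and the 16x16 stores that follow produce is simple; the helper below is an illustrative sketch (not part of the patch), with AVG3 being libvpx's usual rounded three-tap average. Like the NEON paths, the last AVG3 diagonal reads one sample past above[bs - 1].

#include <stddef.h>
#include <stdint.h>

#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)

// Sketch of the bs x bs d45 fill for bs = 8 or 16: AVG3 along the
// 45-degree diagonal while input remains, then back-fill each row with
// the duplicated top-right sample above[bs - 1].
static void highbd_d45_model(uint16_t *dst, ptrdiff_t stride, int bs,
                             const uint16_t *above) {
  int r, c;
  for (r = 0; r < bs; ++r) {
    for (c = 0; c < bs; ++c) {
      dst[r * stride + c] =
          (r + c < bs - 1)
              ? (uint16_t)AVG3(above[r + c], above[r + c + 1],
                               above[r + c + 2])
              : above[bs - 1];
    }
  }
}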
+ vst1q_u16(dst + 0 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d0[1], a15, 1)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d0[1], a15, 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], a15, 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d0[1], a15, 4)); + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], a15, 5)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d0[1], a15, 6)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], a15, 7)); + vst1q_u16(dst + 7 * stride + 0, d0[1]); + vst1q_u16(dst + 7 * stride + 8, a15); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[1], a15, 1)); + vst1q_u16(dst + 8 * stride + 8, a15); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d0[1], a15, 2)); + vst1q_u16(dst + 9 * stride + 8, a15); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[1], a15, 3)); + vst1q_u16(dst + 10 * stride + 8, a15); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d0[1], a15, 4)); + vst1q_u16(dst + 11 * stride + 8, a15); + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[1], a15, 5)); + vst1q_u16(dst + 12 * stride + 8, a15); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d0[1], a15, 6)); + vst1q_u16(dst + 13 * stride + 8, a15); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[1], a15, 7)); + vst1q_u16(dst + 14 * stride + 8, a15); + vst1q_u16(dst + 15 * stride + 0, a15); + vst1q_u16(dst + 15 * stride + 8, a15); } void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t A0_0 = vld1q_u16(above); - const uint16x8_t A0_1 = vld1q_u16(above + 8); - const uint16x8_t A0_2 = vld1q_u16(above + 16); - const uint16x8_t A0_3 = vld1q_u16(above + 24); - const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3); - const uint16x8_t A1_0 = vld1q_u16(above + 1); - const uint16x8_t A1_1 = vld1q_u16(above + 9); - const uint16x8_t A1_2 = vld1q_u16(above + 17); - const uint16x8_t A1_3 = vld1q_u16(above + 25); - const uint16x8_t A2_0 = vld1q_u16(above + 2); - const uint16x8_t A2_1 = vld1q_u16(above + 10); - const uint16x8_t A2_2 = vld1q_u16(above + 18); - const uint16x8_t A2_3 = vld1q_u16(above + 26); - const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); - const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); - const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2); - const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3); - uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); - uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); - uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2); - uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3); + uint16x8_t ax0, a0, a1, a7, a8, a9, a15, a16, a17, a23, a24, a25, a31, d0[4]; int i; (void)left; (void)bd; - vst1q_u16(dst, row_0); - dst += 8; - vst1q_u16(dst, row_1); - dst += 8; - vst1q_u16(dst, row_2); - dst += 8; - vst1q_u16(dst, row_3); - dst += stride - 24; + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a17 = vld1q_u16(above + 17); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + a25 = vld1q_u16(above + 25); + a31 = vld1q_dup_u16(above + 
31); - for (i = 0; i < 30; ++i) { - row_0 = vextq_u16(row_0, row_1, 1); - row_1 = vextq_u16(row_1, row_2, 1); - row_2 = vextq_u16(row_2, row_3, 1); - row_3 = vextq_u16(row_3, above_right, 1); - vst1q_u16(dst, row_0); - dst += 8; - vst1q_u16(dst, row_1); - dst += 8; - vst1q_u16(dst, row_2); - dst += 8; - vst1q_u16(dst, row_3); - dst += stride - 24; + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8); + d0[2] = vrhaddq_u16(vhaddq_u16(a15, a17), a16); + d0[3] = vrhaddq_u16(vhaddq_u16(a23, a25), a24); + + for (i = 0; i < 32; ++i) { + d0[0] = vextq_u16(d0[0], d0[1], 1); + d0[1] = vextq_u16(d0[1], d0[2], 1); + d0[2] = vextq_u16(d0[2], d0[3], 1); + d0[3] = vextq_u16(d0[3], a31, 1); + vst1q_u16(dst + 0, d0[0]); + vst1q_u16(dst + 8, d0[1]); + vst1q_u16(dst + 16, d0[2]); + vst1q_u16(dst + 24, d0[3]); + dst += stride; } - - vst1q_u16(dst, above_right); - dst += 8; - vst1q_u16(dst, above_right); - dst += 8; - vst1q_u16(dst, above_right); - dst += 8; - vst1q_u16(dst, above_right); } // ----------------------------------------------------------------------------- @@ -2155,30 +2155,36 @@ void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x2_t row = vld2q_u16(above); + const uint16x8_t row0 = vld1q_u16(above + 0); + const uint16x8_t row1 = vld1q_u16(above + 8); int i; (void)left; (void)bd; - for (i = 0; i < 16; i++, dst += stride) { - vst2q_u16(dst, row); + for (i = 0; i < 16; i++) { + vst1q_u16(dst + 0, row0); + vst1q_u16(dst + 8, row1); + dst += stride; } } void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x2_t row0 = vld2q_u16(above); - const uint16x8x2_t row1 = vld2q_u16(above + 16); + const uint16x8_t row0 = vld1q_u16(above + 0); + const uint16x8_t row1 = vld1q_u16(above + 8); + const uint16x8_t row2 = vld1q_u16(above + 16); + const uint16x8_t row3 = vld1q_u16(above + 24); int i; (void)left; (void)bd; for (i = 0; i < 32; i++) { - vst2q_u16(dst, row0); - dst += 16; - vst2q_u16(dst, row1); - dst += stride - 16; + vst1q_u16(dst + 0, row0); + vst1q_u16(dst + 8, row1); + vst1q_u16(dst + 16, row2); + vst1q_u16(dst + 24, row3); + dst += stride; } } diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c index f731d38cc1a3..280d2087f7a0 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad4d_neon.c @@ -48,12 +48,6 @@ static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride, vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); } -static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref, - uint32x4_t *const sad_sum) { - uint16x8_t abs_diff = vabdq_u16(src, ref); - *sad_sum = vpadalq_u16(*sad_sum, abs_diff); -} - static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4], int ref_stride, uint32_t res[4], @@ -64,21 +58,32 @@ static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride, const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; + uint16x8_t sum[4] = { vdupq_n_u16(0), 
vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32[4]; int i = 0; do { uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride); - sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]); - sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]); - sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]); - sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]); + sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride)); + sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride)); + sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride)); + sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride)); } while (++i < h); - vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); + sum_u32[0] = vpaddlq_u16(sum[0]); + sum_u32[1] = vpaddlq_u16(sum[1]); + sum_u32[2] = vpaddlq_u16(sum[2]); + sum_u32[3] = vpaddlq_u16(sum[3]); + vst1q_u32(res, horizontal_add_4d_uint32x4(sum_u32)); +} + +static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref, + uint32x4_t *const sad_sum) { + uint16x8_t abs_diff = vabdq_u16(src, ref); + *sad_sum = vpadalq_u16(*sad_sum, abs_diff); } static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr, diff --git a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c index 90971f6009ec..813710040b84 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/highbd_sad_neon.c @@ -44,20 +44,19 @@ static INLINE uint32_t highbd_sad8xh_neon(const uint8_t *src_ptr, int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - uint32x4_t sum = vdupq_n_u32(0); + uint16x8_t sum = vdupq_n_u16(0); int i = h; do { uint16x8_t s = vld1q_u16(src16_ptr); uint16x8_t r = vld1q_u16(ref16_ptr); - uint16x8_t diff = vabdq_u16(s, r); - sum = vpadalq_u16(sum, diff); + sum = vabaq_u16(sum, s, r); src16_ptr += src_stride; ref16_ptr += ref_stride; } while (--i != 0); - return horizontal_add_uint32x4(sum); + return horizontal_add_uint16x8(sum); } static INLINE uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr, diff --git a/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c index 892310f15171..4f909e493559 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/intrapred_neon.c @@ -13,51 +13,46 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "mem_neon.h" +#include "sum_neon.h" #include "vpx/vpx_integer.h" //------------------------------------------------------------------------------ // DC 4x4 -static INLINE uint16x4_t dc_sum_4(const uint8_t *ref) { - const uint8x8_t ref_u8 = vld1_u8(ref); - const uint16x4_t p0 = vpaddl_u8(ref_u8); - return vpadd_u16(p0, p0); +static INLINE uint16_t dc_sum_4(const uint8_t *ref) { + return horizontal_add_uint8x4(load_unaligned_u8_4x1(ref)); } static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride, const uint8x8_t dc) { - const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); int i; for (i = 0; i < 4; ++i, dst += stride) { - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc_dup), 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc), 0); } } void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t a = vld1_u8(above); - const uint8x8_t l = vld1_u8(left); - const uint16x8_t al = vaddl_u8(a, l); - uint16x4_t sum; - uint8x8_t dc; 
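Stepping back to the highbd_sad8xhx4d_neon and highbd_sad8xh_neon rewrites above: the narrower uint16x8_t accumulator is safe because each lane adds only one absolute difference per row. A worked bound, assuming (as in vpx) that the tallest 8-wide block is 8x16 and the deepest input is 12-bit:

#include <assert.h>
#include <stdint.h>

int main(void) {
  const uint32_t max_abs_diff = (1 << 12) - 1; /* 4095 per 12-bit sample */
  const uint32_t max_rows = 16; /* 8x16 is the tallest 8-wide block */
  /* 16 * 4095 = 65520 <= 65535, so a 16-bit lane cannot wrap. */
  assert(max_abs_diff * max_rows <= UINT16_MAX);
  return 0;
}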
- sum = vpadd_u16(vget_low_u16(al), vget_low_u16(al)); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + const uint8x8_t a = load_unaligned_u8_4x1(above); + const uint8x8_t l = load_unaligned_u8_4x1(left); + const uint16x4_t al = vget_low_u16(vaddl_u8(a, l)); + const uint16_t sum = horizontal_add_uint16x4(al); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); dc_store_4x4(dst, stride, dc); } void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_4(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); + const uint16_t sum = dc_sum_4(left); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2); (void)above; dc_store_4x4(dst, stride, dc); } void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_4(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); + const uint16_t sum = dc_sum_4(above); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2); (void)left; dc_store_4x4(dst, stride, dc); } @@ -73,19 +68,15 @@ void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 8x8 -static INLINE uint16x4_t dc_sum_8(const uint8_t *ref) { - const uint8x8_t ref_u8 = vld1_u8(ref); - uint16x4_t sum = vpaddl_u8(ref_u8); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_8(const uint8_t *ref) { + return horizontal_add_uint8x8(vld1_u8(ref)); } static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride, const uint8x8_t dc) { - const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); int i; for (i = 0; i < 8; ++i, dst += stride) { - vst1_u8(dst, dc_dup); + vst1_u8(dst, dc); } } @@ -93,28 +84,24 @@ void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t above_u8 = vld1_u8(above); const uint8x8_t left_u8 = vld1_u8(left); - const uint8x16_t above_and_left = vcombine_u8(above_u8, left_u8); - const uint16x8_t p0 = vpaddlq_u8(above_and_left); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - uint8x8_t dc; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + const uint16x8_t al = vaddl_u8(above_u8, left_u8); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 4); dc_store_8x8(dst, stride, dc); } void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_8(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + const uint16_t sum = dc_sum_8(left); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); (void)above; dc_store_8x8(dst, stride, dc); } void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_8(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + const uint16_t sum = dc_sum_8(above); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); (void)left; dc_store_8x8(dst, stride, dc); } @@ -130,20 +117,15 @@ void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 16x16 -static INLINE uint16x4_t dc_sum_16(const uint8_t *ref) { - const 
uint8x16_t ref_u8 = vld1q_u8(ref); - const uint16x8_t p0 = vpaddlq_u8(ref_u8); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_16(const uint8_t *ref) { + return horizontal_add_uint8x16(vld1q_u8(ref)); } static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride, - const uint8x8_t dc) { - const uint8x16_t dc_dup = vdupq_lane_u8(dc, 0); + const uint8x16_t dc) { int i; for (i = 0; i < 16; ++i, dst += stride) { - vst1q_u8(dst, dc_dup); + vst1q_u8(dst + 0, dc); } } @@ -151,22 +133,19 @@ void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t ref0 = vld1q_u8(above); const uint8x16_t ref1 = vld1q_u8(left); - const uint16x8_t p0 = vpaddlq_u8(ref0); - const uint16x8_t p1 = vpaddlq_u8(ref1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - uint8x8_t dc; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + const uint16x8_t a = vpaddlq_u8(ref0); + const uint16x8_t l = vpaddlq_u8(ref1); + const uint16x8_t al = vaddq_u16(a, l); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); dc_store_16x16(dst, stride, dc); } void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_16(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + const uint16_t sum = dc_sum_16(left); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0); (void)above; dc_store_16x16(dst, stride, dc); } @@ -174,8 +153,8 @@ void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_16(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + const uint16_t sum = dc_sum_16(above); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0); (void)left; dc_store_16x16(dst, stride, dc); } @@ -183,7 +162,7 @@ void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t dc = vdup_n_u8(0x80); + const uint8x16_t dc = vdupq_n_u8(0x80); (void)above; (void)left; dc_store_16x16(dst, stride, dc); @@ -192,51 +171,41 @@ void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 32x32 -static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) { - const uint8x16x2_t r = vld2q_u8(ref); - const uint16x8_t p0 = vpaddlq_u8(r.val[0]); - const uint16x8_t p1 = vpaddlq_u8(r.val[1]); - const uint16x8_t p2 = vaddq_u16(p0, p1); - uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_32(const uint8_t *ref) { + const uint8x16_t r0 = vld1q_u8(ref + 0); + const uint8x16_t r1 = vld1q_u8(ref + 16); + const uint16x8_t r01 = vaddq_u16(vpaddlq_u8(r0), vpaddlq_u8(r1)); + return horizontal_add_uint16x8(r01); } static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride, - const uint8x8_t dc) { - uint8x16x2_t dc_dup; + const uint8x16_t dc) { int i; - 
dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u8(dc, 0); for (i = 0; i < 32; ++i, dst += stride) { - vst2q_u8(dst, dc_dup); + vst1q_u8(dst + 0, dc); + vst1q_u8(dst + 16, dc); } } void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16x2_t a = vld2q_u8(above); - const uint8x16x2_t l = vld2q_u8(left); - const uint16x8_t pa0 = vpaddlq_u8(a.val[0]); - const uint16x8_t pl0 = vpaddlq_u8(l.val[0]); - const uint16x8_t pa1 = vpaddlq_u8(a.val[1]); - const uint16x8_t pl1 = vpaddlq_u8(l.val[1]); - const uint16x8_t pa = vaddq_u16(pa0, pa1); - const uint16x8_t pl = vaddq_u16(pl0, pl1); - const uint16x8_t pal = vaddq_u16(pa, pl); - uint16x4_t sum = vadd_u16(vget_low_u16(pal), vget_high_u16(pal)); - uint8x8_t dc; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 6)); + const uint8x16_t a0 = vld1q_u8(above + 0); + const uint8x16_t a1 = vld1q_u8(above + 16); + const uint8x16_t l0 = vld1q_u8(left + 0); + const uint8x16_t l1 = vld1q_u8(left + 16); + const uint16x8_t a01 = vaddq_u16(vpaddlq_u8(a0), vpaddlq_u8(a1)); + const uint16x8_t l01 = vaddq_u16(vpaddlq_u8(l0), vpaddlq_u8(l1)); + const uint16x8_t al = vaddq_u16(a01, l01); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 6), 0); dc_store_32x32(dst, stride, dc); } void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_32(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + const uint16_t sum = dc_sum_32(left); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); (void)above; dc_store_32x32(dst, stride, dc); } @@ -244,8 +213,8 @@ void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_32(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + const uint16_t sum = dc_sum_32(above); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); (void)left; dc_store_32x32(dst, stride, dc); } @@ -253,7 +222,7 @@ void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t dc = vdup_n_u8(0x80); + const uint8x16_t dc = vdupq_n_u8(0x80); (void)above; (void)left; dc_store_32x32(dst, stride, dc); @@ -263,123 +232,202 @@ void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t ABCDEFGH = vld1_u8(above); - const uint64x1_t A1 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 8); - const uint64x1_t A2 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 16); - const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); - const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); - const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00); - const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0); - const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); - const uint32x2_t r0 = vreinterpret_u32_u8(avg2); - const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); - const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); - const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); + uint8x8_t 
a0, a1, a2, d0; + uint8_t a7; (void)left; - vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); - vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); - vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); - vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); - vst1_lane_u8(dst + 3 * stride + 3, ABCDEFGH, 7); -} -static INLINE void d45_store_8(uint8_t **dst, const ptrdiff_t stride, - const uint8x8_t above_right, uint8x8_t *row) { - *row = vext_u8(*row, above_right, 1); - vst1_u8(*dst, *row); - *dst += stride; + a0 = vld1_u8(above); + a7 = above[7]; + + // [ above[1], ..., above[6], x, x ] + a1 = vext_u8(a0, a0, 1); + // [ above[2], ..., above[7], x, x ] + a2 = vext_u8(a0, a0, 2); + + // d0[0] = AVG3(above[0], above[1], above[2]); + // ... + // d0[5] = AVG3(above[5], above[6], above[7]); + // d0[6] = x (don't care) + // d0[7] = x (don't care) + d0 = vrhadd_u8(vhadd_u8(a0, a2), a1); + + // We want: + // stride=0 [ d0[0], d0[1], d0[2], d0[3] ] + // stride=1 [ d0[1], d0[2], d0[3], d0[4] ] + // stride=2 [ d0[2], d0[3], d0[4], d0[5] ] + // stride=3 [ d0[3], d0[4], d0[5], above[7] ] + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, vext_u8(d0, d0, 1)); + store_u8_4x1(dst + 2 * stride, vext_u8(d0, d0, 2)); + store_u8_4x1(dst + 3 * stride, vext_u8(d0, d0, 3)); + + // We stored d0[6] above, so fixup into above[7]. + dst[3 * stride + 3] = a7; } void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t A0 = vld1_u8(above); - const uint8x8_t above_right = vdup_lane_u8(A0, 7); - const uint8x8_t A1 = vext_u8(A0, above_right, 1); - const uint8x8_t A2 = vext_u8(A0, above_right, 2); - const uint8x8_t avg1 = vhadd_u8(A0, A2); - uint8x8_t row = vrhadd_u8(avg1, A1); + uint8x8_t ax0, a0, a1, a7, d0; (void)left; - vst1_u8(dst, row); - dst += stride; - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - vst1_u8(dst, above_right); -} + a0 = vld1_u8(above + 0); + a1 = vld1_u8(above + 1); + a7 = vld1_dup_u8(above + 7); -static INLINE void d45_store_16(uint8_t **dst, const ptrdiff_t stride, - const uint8x16_t above_right, uint8x16_t *row) { - *row = vextq_u8(*row, above_right, 1); - vst1q_u8(*dst, *row); - *dst += stride; + // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can + // shift in above[7] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[6] ] + ax0 = vext_u8(a0, a0, 7); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[7] = AVG3(above[6], above[7], above[8]); + d0 = vrhadd_u8(vhadd_u8(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[7]. 
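Each vext_u8(d0, a7, r + 1) in the stores that follow takes an 8-lane window at offset r + 1 from the concatenation of d0 with eight copies of above[7]; a scalar sketch of one output row (hypothetical helper, for illustration only):

#include <stdint.h>

// Row r of the 8x8 block, as produced by vext_u8(d0, a7, r + 1): lanes
// that run past the end of d0 come from the above[7] broadcast.
static void d45_row_model(uint8_t row[8], const uint8_t d0[8], uint8_t a7,
                          int r) {
  int c;
  for (c = 0; c < 8; ++c) {
    row[c] = (r + 1 + c < 8) ? d0[r + 1 + c] : a7;
  }
}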
+ vst1_u8(dst + 0 * stride, vext_u8(d0, a7, 1)); + vst1_u8(dst + 1 * stride, vext_u8(d0, a7, 2)); + vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 3)); + vst1_u8(dst + 3 * stride, vext_u8(d0, a7, 4)); + vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 5)); + vst1_u8(dst + 5 * stride, vext_u8(d0, a7, 6)); + vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 7)); + vst1_u8(dst + 7 * stride, a7); } void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0 = vld1q_u8(above); - const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0), 7); - const uint8x16_t A1 = vextq_u8(A0, above_right, 1); - const uint8x16_t A2 = vextq_u8(A0, above_right, 2); - const uint8x16_t avg1 = vhaddq_u8(A0, A2); - uint8x16_t row = vrhaddq_u8(avg1, A1); + uint8x16_t ax0, a0, a1, a15, d0; (void)left; - vst1q_u8(dst, row); - dst += stride; - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - vst1q_u8(dst, above_right); + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a15 = vld1q_dup_u8(above + 15); + + // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can + // shift in above[15] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[14] ] + ax0 = vextq_u8(a0, a0, 15); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[15] = AVG3(above[14], above[15], above[16]); + d0 = vrhaddq_u8(vhaddq_u8(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[15]. 
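The "earlier ext" being undone here is vextq_u8(a0, a0, 15), which acts as a rotate when both operands are the same vector; a small standalone check of the lane layout (assumes an ARM target with NEON available):

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

int main(void) {
  uint8_t in[16], out[16];
  uint8x16_t a, ax0;
  int i;
  for (i = 0; i < 16; ++i) in[i] = (uint8_t)i;
  a = vld1q_u8(in);
  // [ a[15], a[0], a[1], ..., a[14] ]: lane 0 is the don't-care padding,
  // lanes 1..15 hold a[0..14].
  ax0 = vextq_u8(a, a, 15);
  vst1q_u8(out, ax0);
  assert(out[0] == 15 && out[1] == 0 && out[15] == 14);
  return 0;
}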
+ vst1q_u8(dst + 0 * stride, vextq_u8(d0, a15, 1)); + vst1q_u8(dst + 1 * stride, vextq_u8(d0, a15, 2)); + vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 3)); + vst1q_u8(dst + 3 * stride, vextq_u8(d0, a15, 4)); + vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 5)); + vst1q_u8(dst + 5 * stride, vextq_u8(d0, a15, 6)); + vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 7)); + vst1q_u8(dst + 7 * stride, vextq_u8(d0, a15, 8)); + vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 9)); + vst1q_u8(dst + 9 * stride, vextq_u8(d0, a15, 10)); + vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(d0, a15, 12)); + vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 13)); + vst1q_u8(dst + 13 * stride, vextq_u8(d0, a15, 14)); + vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 15)); + vst1q_u8(dst + 15 * stride, a15); } void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0_0 = vld1q_u8(above); - const uint8x16_t A0_1 = vld1q_u8(above + 16); - const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0_1), 7); - const uint8x16_t A1_0 = vld1q_u8(above + 1); - const uint8x16_t A1_1 = vld1q_u8(above + 17); - const uint8x16_t A2_0 = vld1q_u8(above + 2); - const uint8x16_t A2_1 = vld1q_u8(above + 18); - const uint8x16_t avg_0 = vhaddq_u8(A0_0, A2_0); - const uint8x16_t avg_1 = vhaddq_u8(A0_1, A2_1); - uint8x16_t row_0 = vrhaddq_u8(avg_0, A1_0); - uint8x16_t row_1 = vrhaddq_u8(avg_1, A1_1); - int i; + uint8x16_t ax0, a0, a1, a15, a16, a17, a31, d0[2]; (void)left; - vst1q_u8(dst, row_0); - dst += 16; - vst1q_u8(dst, row_1); - dst += stride - 16; + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + a17 = vld1q_u8(above + 17); + a31 = vld1q_dup_u8(above + 31); - for (i = 0; i < 30; ++i) { - row_0 = vextq_u8(row_0, row_1, 1); - row_1 = vextq_u8(row_1, above_right, 1); - vst1q_u8(dst, row_0); - dst += 16; - vst1q_u8(dst, row_1); - dst += stride - 16; - } + // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can + // shift in above[31] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[14] ] + ax0 = vextq_u8(a0, a0, 15); - vst1q_u8(dst, above_right); - dst += 16; - vst1q_u8(dst, row_1); + // d0[0][0] = x (don't care) + // d0[0][1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[0][15] = AVG3(above[14], above[15], above[16]); + d0[0] = vrhaddq_u8(vhaddq_u8(ax0, a1), a0); + d0[1] = vrhaddq_u8(vhaddq_u8(a15, a17), a16); + + // Undo the earlier ext, incrementally shift in duplicates of above[31]. 
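Row r of the 32-wide block below is a 32-lane window at offset r + 1 into the concatenation of d0[0], d0[1] and the above[31] broadcast. The vext lane count must be a compile-time immediate, so the stores are fully unrolled here (the highbd 32x32 path instead shifts everything along by one lane inside a loop). A scalar sketch of one row, treating the two q-registers as one flat 32-entry array (hypothetical helper):

#include <stdint.h>

// d0 is the 32 AVG3 lanes viewed as a flat array (index 0 unused),
// a31 is above[31].
static void d45_32_row_model(uint8_t row[32], const uint8_t d0[32],
                             uint8_t a31, int r) {
  int c;
  for (c = 0; c < 32; ++c) {
    row[c] = (r + 1 + c < 32) ? d0[r + 1 + c] : a31;
  }
}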
+ vst1q_u8(dst + 0 * stride + 0, vextq_u8(d0[0], d0[1], 1)); + vst1q_u8(dst + 0 * stride + 16, vextq_u8(d0[1], a31, 1)); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(d0[0], d0[1], 2)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(d0[1], a31, 2)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0[0], d0[1], 3)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0[1], a31, 3)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d0[0], d0[1], 4)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d0[1], a31, 4)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0[0], d0[1], 5)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0[1], a31, 5)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d0[0], d0[1], 6)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d0[1], a31, 6)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0[0], d0[1], 7)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0[1], a31, 7)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d0[0], d0[1], 8)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d0[1], a31, 8)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0[0], d0[1], 9)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0[1], a31, 9)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d0[0], d0[1], 10)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d0[1], a31, 10)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0[0], d0[1], 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0[1], a31, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d0[0], d0[1], 12)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d0[1], a31, 12)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0[0], d0[1], 13)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0[1], a31, 13)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d0[0], d0[1], 14)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d0[1], a31, 14)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0[0], d0[1], 15)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0[1], a31, 15)); + vst1q_u8(dst + 15 * stride + 0, d0[1]); + vst1q_u8(dst + 15 * stride + 16, a31); + + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0[1], a31, 1)); + vst1q_u8(dst + 16 * stride + 16, a31); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d0[1], a31, 2)); + vst1q_u8(dst + 17 * stride + 16, a31); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0[1], a31, 3)); + vst1q_u8(dst + 18 * stride + 16, a31); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d0[1], a31, 4)); + vst1q_u8(dst + 19 * stride + 16, a31); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0[1], a31, 5)); + vst1q_u8(dst + 20 * stride + 16, a31); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d0[1], a31, 6)); + vst1q_u8(dst + 21 * stride + 16, a31); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0[1], a31, 7)); + vst1q_u8(dst + 22 * stride + 16, a31); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d0[1], a31, 8)); + vst1q_u8(dst + 23 * stride + 16, a31); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0[1], a31, 9)); + vst1q_u8(dst + 24 * stride + 16, a31); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d0[1], a31, 10)); + vst1q_u8(dst + 25 * stride + 16, a31); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0[1], a31, 11)); + vst1q_u8(dst + 26 * stride + 16, a31); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d0[1], a31, 12)); + vst1q_u8(dst + 27 * stride + 16, a31); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0[1], a31, 13)); + vst1q_u8(dst + 28 * stride + 16, a31); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d0[1], a31, 14)); + vst1q_u8(dst + 29 * stride + 16, a31); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(d0[1], a31, 15)); + vst1q_u8(dst + 30 * stride + 16, a31); + vst1q_u8(dst + 31 * stride + 0, a31); + vst1q_u8(dst + 31 * stride + 16, a31); } // 
----------------------------------------------------------------------------- @@ -420,12 +468,16 @@ void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, vst1_u8(dst + 0 * stride, d0); vst1_u8(dst + 1 * stride, d1); - vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 1)); - vst1_u8(dst + 3 * stride, vext_u8(d1, a7, 1)); - vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 2)); - vst1_u8(dst + 5 * stride, vext_u8(d1, a7, 2)); - vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 3)); - vst1_u8(dst + 7 * stride, vext_u8(d1, a7, 3)); + + d0 = vext_u8(d0, d0, 7); + d1 = vext_u8(d1, d1, 7); + + vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 2)); + vst1_u8(dst + 3 * stride, vext_u8(d1, a7, 2)); + vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 3)); + vst1_u8(dst + 5 * stride, vext_u8(d1, a7, 3)); + vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 4)); + vst1_u8(dst + 7 * stride, vext_u8(d1, a7, 4)); } void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, @@ -443,20 +495,24 @@ void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, vst1q_u8(dst + 0 * stride, d0); vst1q_u8(dst + 1 * stride, d1); - vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 1)); - vst1q_u8(dst + 3 * stride, vextq_u8(d1, a15, 1)); - vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 2)); - vst1q_u8(dst + 5 * stride, vextq_u8(d1, a15, 2)); - vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 3)); - vst1q_u8(dst + 7 * stride, vextq_u8(d1, a15, 3)); - vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 4)); - vst1q_u8(dst + 9 * stride, vextq_u8(d1, a15, 4)); - vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 5)); - vst1q_u8(dst + 11 * stride, vextq_u8(d1, a15, 5)); - vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 6)); - vst1q_u8(dst + 13 * stride, vextq_u8(d1, a15, 6)); - vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 7)); - vst1q_u8(dst + 15 * stride, vextq_u8(d1, a15, 7)); + + d0 = vextq_u8(d0, d0, 15); + d1 = vextq_u8(d1, d1, 15); + + vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 2)); + vst1q_u8(dst + 3 * stride, vextq_u8(d1, a15, 2)); + vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 3)); + vst1q_u8(dst + 5 * stride, vextq_u8(d1, a15, 3)); + vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 4)); + vst1q_u8(dst + 7 * stride, vextq_u8(d1, a15, 4)); + vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 5)); + vst1q_u8(dst + 9 * stride, vextq_u8(d1, a15, 5)); + vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 6)); + vst1q_u8(dst + 11 * stride, vextq_u8(d1, a15, 6)); + vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 7)); + vst1q_u8(dst + 13 * stride, vextq_u8(d1, a15, 7)); + vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 8)); + vst1q_u8(dst + 15 * stride, vextq_u8(d1, a15, 8)); } void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, @@ -481,66 +537,72 @@ void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, vst1q_u8(dst + 0 * stride + 16, d0_hi); vst1q_u8(dst + 1 * stride + 0, d1_lo); vst1q_u8(dst + 1 * stride + 16, d1_hi); - vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0_lo, d0_hi, 1)); - vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_hi, a31, 1)); - vst1q_u8(dst + 3 * stride + 0, vextq_u8(d1_lo, d1_hi, 1)); - vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_hi, a31, 1)); - vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0_lo, d0_hi, 2)); - vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_hi, a31, 2)); - vst1q_u8(dst + 5 * stride + 0, vextq_u8(d1_lo, d1_hi, 2)); - vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_hi, a31, 2)); - vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0_lo, d0_hi, 3)); - vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_hi, a31, 3)); - vst1q_u8(dst + 7 * stride + 0, vextq_u8(d1_lo, 
d1_hi, 3)); - vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_hi, a31, 3)); - vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0_lo, d0_hi, 4)); - vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_hi, a31, 4)); - vst1q_u8(dst + 9 * stride + 0, vextq_u8(d1_lo, d1_hi, 4)); - vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_hi, a31, 4)); - vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0_lo, d0_hi, 5)); - vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_hi, a31, 5)); - vst1q_u8(dst + 11 * stride + 0, vextq_u8(d1_lo, d1_hi, 5)); - vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_hi, a31, 5)); - vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0_lo, d0_hi, 6)); - vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_hi, a31, 6)); - vst1q_u8(dst + 13 * stride + 0, vextq_u8(d1_lo, d1_hi, 6)); - vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_hi, a31, 6)); - vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0_lo, d0_hi, 7)); - vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_hi, a31, 7)); - vst1q_u8(dst + 15 * stride + 0, vextq_u8(d1_lo, d1_hi, 7)); - vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_hi, a31, 7)); - vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0_lo, d0_hi, 8)); - vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_hi, a31, 8)); - vst1q_u8(dst + 17 * stride + 0, vextq_u8(d1_lo, d1_hi, 8)); - vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_hi, a31, 8)); - vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0_lo, d0_hi, 9)); - vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_hi, a31, 9)); - vst1q_u8(dst + 19 * stride + 0, vextq_u8(d1_lo, d1_hi, 9)); - vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_hi, a31, 9)); - vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0_lo, d0_hi, 10)); - vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_hi, a31, 10)); - vst1q_u8(dst + 21 * stride + 0, vextq_u8(d1_lo, d1_hi, 10)); - vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_hi, a31, 10)); - vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0_lo, d0_hi, 11)); - vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_hi, a31, 11)); - vst1q_u8(dst + 23 * stride + 0, vextq_u8(d1_lo, d1_hi, 11)); - vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_hi, a31, 11)); - vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0_lo, d0_hi, 12)); - vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_hi, a31, 12)); - vst1q_u8(dst + 25 * stride + 0, vextq_u8(d1_lo, d1_hi, 12)); - vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_hi, a31, 12)); - vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0_lo, d0_hi, 13)); - vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_hi, a31, 13)); - vst1q_u8(dst + 27 * stride + 0, vextq_u8(d1_lo, d1_hi, 13)); - vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_hi, a31, 13)); - vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0_lo, d0_hi, 14)); - vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_hi, a31, 14)); - vst1q_u8(dst + 29 * stride + 0, vextq_u8(d1_lo, d1_hi, 14)); - vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_hi, a31, 14)); - vst1q_u8(dst + 30 * stride + 0, vextq_u8(d0_lo, d0_hi, 15)); - vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_hi, a31, 15)); - vst1q_u8(dst + 31 * stride + 0, vextq_u8(d1_lo, d1_hi, 15)); - vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_hi, a31, 15)); + + d0_hi = vextq_u8(d0_lo, d0_hi, 15); + d0_lo = vextq_u8(d0_lo, d0_lo, 15); + d1_hi = vextq_u8(d1_lo, d1_hi, 15); + d1_lo = vextq_u8(d1_lo, d1_lo, 15); + + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0_lo, d0_hi, 2)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_hi, a31, 2)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d1_lo, d1_hi, 2)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_hi, a31, 2)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0_lo, d0_hi, 3)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_hi, 
a31, 3)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_hi, a31, 3)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0_lo, d0_hi, 4)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_hi, a31, 4)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d1_lo, d1_hi, 4)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_hi, a31, 4)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0_lo, d0_hi, 5)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_hi, a31, 5)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_hi, a31, 5)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0_lo, d0_hi, 6)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_hi, a31, 6)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d1_lo, d1_hi, 6)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_hi, a31, 6)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0_lo, d0_hi, 7)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_hi, a31, 7)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_hi, a31, 7)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0_lo, d0_hi, 8)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_hi, a31, 8)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(d1_lo, d1_hi, 8)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_hi, a31, 8)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0_lo, d0_hi, 9)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_hi, a31, 9)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_hi, a31, 9)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0_lo, d0_hi, 10)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_hi, a31, 10)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d1_lo, d1_hi, 10)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_hi, a31, 10)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0_lo, d0_hi, 11)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_hi, a31, 11)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_hi, a31, 11)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0_lo, d0_hi, 12)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_hi, a31, 12)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d1_lo, d1_hi, 12)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_hi, a31, 12)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0_lo, d0_hi, 13)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_hi, a31, 13)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_hi, a31, 13)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0_lo, d0_hi, 14)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_hi, a31, 14)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d1_lo, d1_hi, 14)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_hi, a31, 14)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0_lo, d0_hi, 15)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_hi, a31, 15)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_hi, a31, 15)); + vst1q_u8(dst + 30 * stride + 0, d0_hi); + vst1q_u8(dst + 30 * stride + 16, a31); + vst1q_u8(dst + 31 * stride + 0, d1_hi); + vst1q_u8(dst + 31 * stride + 16, a31); } // ----------------------------------------------------------------------------- @@ -804,22 +866,14 @@ void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8x8_t L3210 = vrev64_u8(L0123); const uint8x8_t L3210XA012 = vext_u8(L3210, XA0123, 4); const uint8x8_t 
L210XA0123 = vext_u8(L3210, XA0123, 5); - const uint8x8_t L10XA0123_ = - vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(L210XA0123), 8)); + const uint8x8_t L10XA0123_ = vext_u8(L210XA0123, L210XA0123, 1); const uint8x8_t avg1 = vhadd_u8(L10XA0123_, L3210XA012); const uint8x8_t avg2 = vrhadd_u8(avg1, L210XA0123); - const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); - const uint32x2_t r3 = vreinterpret_u32_u8(avg2); - const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); - const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); - const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); - vst1_lane_u32((uint32_t *)dst, r0, 0); - dst += stride; - vst1_lane_u32((uint32_t *)dst, r1, 0); - dst += stride; - vst1_lane_u32((uint32_t *)dst, r2, 0); - dst += stride; - vst1_lane_u32((uint32_t *)dst, r3, 0); + + store_u8_4x1(dst + 0 * stride, vext_u8(avg2, avg2, 3)); + store_u8_4x1(dst + 1 * stride, vext_u8(avg2, avg2, 2)); + store_u8_4x1(dst + 2 * stride, vext_u8(avg2, avg2, 1)); + store_u8_4x1(dst + 3 * stride, avg2); } void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, @@ -836,31 +890,15 @@ void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t L543210XA01234567_ = vcombine_u8(L543210XA0, A1234567_); const uint8x16_t avg = vhaddq_u8(L76543210XA0123456, L543210XA01234567_); const uint8x16_t row = vrhaddq_u8(avg, L6543210XA01234567); - const uint8x8_t row_0 = vget_low_u8(row); - const uint8x8_t row_1 = vget_high_u8(row); - const uint8x8_t r0 = vext_u8(row_0, row_1, 7); - const uint8x8_t r1 = vext_u8(row_0, row_1, 6); - const uint8x8_t r2 = vext_u8(row_0, row_1, 5); - const uint8x8_t r3 = vext_u8(row_0, row_1, 4); - const uint8x8_t r4 = vext_u8(row_0, row_1, 3); - const uint8x8_t r5 = vext_u8(row_0, row_1, 2); - const uint8x8_t r6 = vext_u8(row_0, row_1, 1); - vst1_u8(dst, r0); - dst += stride; - vst1_u8(dst, r1); - dst += stride; - vst1_u8(dst, r2); - dst += stride; - vst1_u8(dst, r3); - dst += stride; - vst1_u8(dst, r4); - dst += stride; - vst1_u8(dst, r5); - dst += stride; - vst1_u8(dst, r6); - dst += stride; - vst1_u8(dst, row_0); + vst1_u8(dst + 0 * stride, vget_low_u8(vextq_u8(row, row, 7))); + vst1_u8(dst + 1 * stride, vget_low_u8(vextq_u8(row, row, 6))); + vst1_u8(dst + 2 * stride, vget_low_u8(vextq_u8(row, row, 5))); + vst1_u8(dst + 3 * stride, vget_low_u8(vextq_u8(row, row, 4))); + vst1_u8(dst + 4 * stride, vget_low_u8(vextq_u8(row, row, 3))); + vst1_u8(dst + 5 * stride, vget_low_u8(vextq_u8(row, row, 2))); + vst1_u8(dst + 6 * stride, vget_low_u8(vextq_u8(row, row, 1))); + vst1_u8(dst + 7 * stride, vget_low_u8(row)); } static INLINE void d135_store_16x8( @@ -903,6 +941,7 @@ void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t avg_1 = vhaddq_u8(XA0123456789abcde, A123456789abcdef_); const uint8x16_t row_0 = vrhaddq_u8(avg_0, Ledcba9876543210X); const uint8x16_t row_1 = vrhaddq_u8(avg_1, A0123456789abcdef); + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15); const uint8x16_t r_1 = vextq_u8(row_0, row_1, 14); const uint8x16_t r_2 = vextq_u8(row_0, row_1, 13); @@ -910,7 +949,7 @@ void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t r_4 = vextq_u8(row_0, row_1, 11); const uint8x16_t r_5 = vextq_u8(row_0, row_1, 10); const uint8x16_t r_6 = vextq_u8(row_0, row_1, 9); - const uint8x16_t r_7 = vcombine_u8(vget_high_u8(row_0), vget_low_u8(row_1)); + const uint8x16_t r_7 = vextq_u8(row_0, row_1, 8); const uint8x16_t r_8 = vextq_u8(row_0, row_1, 7); 
const uint8x16_t r_9 = vextq_u8(row_0, row_1, 6); const uint8x16_t r_a = vextq_u8(row_0, row_1, 5); diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c index 95095739394f..6ad6c9621455 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c +++ b/media/libvpx/libvpx/vpx_dsp/arm/sad4d_neon.c @@ -140,53 +140,43 @@ static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t res[4], int h) { - int h_tmp = h > 64 ? 64 : h; + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + int i = 0; - vst1q_u32(res, vdupq_n_u32(0)); - do { - uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; + uint8x16_t s0, s1, s2, s3; - do { - uint8x16_t s0, s1, s2, s3; + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); - s0 = vld1q_u8(src + i * src_stride); - sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); - sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); - sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); - sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); - s1 = vld1q_u8(src + i * src_stride + 16); - sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); - sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); - sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); - sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + s2 = vld1q_u8(src + i * src_stride + 32); + sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); + sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); + sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); + sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); - s2 = vld1q_u8(src + i * src_stride + 32); - sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); - sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); - sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); - sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); + s3 = vld1q_u8(src + i * src_stride + 48); + sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); + sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); + sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); + sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); - s3 = vld1q_u8(src + i * src_stride + 48); - sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); - sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); - sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride 
+ 48), &sum_hi[2]); - sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); - - i++; - } while (i < h_tmp); - - res[0] += horizontal_long_add_uint16x8(sum_lo[0], sum_hi[0]); - res[1] += horizontal_long_add_uint16x8(sum_lo[1], sum_hi[1]); - res[2] += horizontal_long_add_uint16x8(sum_lo[2], sum_hi[2]); - res[3] += horizontal_long_add_uint16x8(sum_lo[3], sum_hi[3]); - - h_tmp += 64; + i++; } while (i < h); + + vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi)); } static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, @@ -216,10 +206,7 @@ static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, i++; } while (i < h); - res[0] = horizontal_long_add_uint16x8(sum_lo[0], sum_hi[0]); - res[1] = horizontal_long_add_uint16x8(sum_lo[1], sum_hi[1]); - res[2] = horizontal_long_add_uint16x8(sum_lo[2], sum_hi[2]); - res[3] = horizontal_long_add_uint16x8(sum_lo[3], sum_hi[3]); + vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi)); } static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, @@ -239,10 +226,7 @@ static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, i++; } while (i < h); - res[0] = horizontal_add_uint16x8(sum[0]); - res[1] = horizontal_add_uint16x8(sum[1]); - res[2] = horizontal_add_uint16x8(sum[2]); - res[3] = horizontal_add_uint16x8(sum[3]); + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); } #endif // defined(__ARM_FEATURE_DOTPROD) @@ -270,10 +254,7 @@ static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride, i++; } while (i < h); - res[0] = horizontal_add_uint16x8(sum[0]); - res[1] = horizontal_add_uint16x8(sum[1]); - res[2] = horizontal_add_uint16x8(sum[2]); - res[3] = horizontal_add_uint16x8(sum[3]); + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); } static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride, @@ -298,10 +279,7 @@ static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride, i += 2; } while (i < h); - res[0] = horizontal_add_uint16x8(sum[0]); - res[1] = horizontal_add_uint16x8(sum[1]); - res[2] = horizontal_add_uint16x8(sum[2]); - res[3] = horizontal_add_uint16x8(sum[3]); + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); } #define SAD_WXH_4D_NEON(w, h) \ diff --git a/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h b/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h index 8291f072963c..a0c72f92ceb7 100644 --- a/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h +++ b/media/libvpx/libvpx/vpx_dsp/arm/sum_neon.h @@ -16,6 +16,49 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) { +#if defined(__aarch64__) + return vaddlv_u8(a); +#else + const uint16x4_t b = vpaddl_u8(a); + const uint16x4_t c = vpadd_u16(b, b); + return vget_lane_u16(c, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) { +#if defined(__aarch64__) + return vaddlv_u8(a); +#else + const uint16x4_t b = vpaddl_u8(a); + const uint16x4_t c = vpadd_u16(b, b); + const uint16x4_t d = vpadd_u16(c, c); + return vget_lane_u16(d, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) { +#if defined(__aarch64__) + return vaddlvq_u8(a); +#else + const uint16x8_t b = vpaddlq_u8(a); + const uint16x4_t c = vadd_u16(vget_low_u16(b), vget_high_u16(b)); + const uint16x4_t d = vpadd_u16(c, c); + const uint16x4_t e = vpadd_u16(d, d); + return vget_lane_u16(e, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) { +#if defined(__aarch64__) + return 
diff --git a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 49bc9a630981..d63be5fb8fed 100644
--- a/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/media/libvpx/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1123,13 +1123,13 @@ add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int src_st
   specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x16 sse2 neon msa mmi vsx/;
+  specialize qw/vpx_variance8x16 sse2 avx2 neon msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x8 sse2 neon msa mmi vsx lsx/;
+  specialize qw/vpx_variance8x8 sse2 avx2 neon msa mmi vsx lsx/;
 
 add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x4 sse2 neon msa mmi vsx/;
+  specialize qw/vpx_variance8x4 sse2 avx2 neon msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_variance4x8 sse2 neon msa mmi vsx/;
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c
index 35925d59082f..8305b9f20f04 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c
+++ b/media/libvpx/libvpx/vpx_dsp/x86/variance_avx2.c
@@ -98,6 +98,41 @@ static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
   return _mm256_add_epi32(sum_lo, sum_hi);
 }
 
+static INLINE void variance8_kernel_avx2(
+    const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+    const int ref_stride, __m256i *const sse, __m256i *const sum) {
+  __m128i src0, src1, ref0, ref1;
+  __m256i ss, rr, diff;
+
+  // 0 0 0.... 0 s07 s06 s05 s04 s03 s02 s01 s00
+  src0 = _mm_loadl_epi64((const __m128i *)(src + 0 * src_stride));
+
+  // 0 0 0.... 0 s17 s16 s15 s14 s13 s12 s11 s10
+  src1 = _mm_loadl_epi64((const __m128i *)(src + 1 * src_stride));
+
+  // s17 s16...s11 s10 s07 s06...s01 s00 (8bit)
+  src0 = _mm_unpacklo_epi64(src0, src1);
+
+  // s17 s16...s11 s10 s07 s06...s01 s00 (16 bit)
+  ss = _mm256_cvtepu8_epi16(src0);
+
+  // 0 0 0.... 0 r07 r06 r05 r04 r03 r02 r01 r00
+  ref0 = _mm_loadl_epi64((const __m128i *)(ref + 0 * ref_stride));
+
+  // 0 0 0.... 0 r17 r16 r15 r14 r13 r12 r11 r10
+  ref1 = _mm_loadl_epi64((const __m128i *)(ref + 1 * ref_stride));
+
+  // r17 r16...r11 r10 r07 r06...r01 r00 (8 bit)
+  ref0 = _mm_unpacklo_epi64(ref0, ref1);
+
+  // r17 r16...r11 r10 r07 r06...r01 r00 (16 bit)
+  rr = _mm256_cvtepu8_epi16(ref0);
+
+  diff = _mm256_sub_epi16(ss, rr);
+  *sse = _mm256_add_epi32(*sse, _mm256_madd_epi16(diff, diff));
+  *sum = _mm256_add_epi16(*sum, diff);
+}
+
 static INLINE void variance16_kernel_avx2(
     const uint8_t *const src, const int src_stride, const uint8_t *const ref,
     const int ref_stride, __m256i *const sse, __m256i *const sum) {
@@ -119,6 +154,21 @@ static INLINE void variance32_kernel_avx2(const uint8_t *const src,
   variance_kernel_avx2(s, r, sse, sum);
 }
 
+static INLINE void variance8_avx2(const uint8_t *src, const int src_stride,
+                                  const uint8_t *ref, const int ref_stride,
+                                  const int h, __m256i *const vsse,
+                                  __m256i *const vsum) {
+  int i;
+  *vsum = _mm256_setzero_si256();
+  *vsse = _mm256_setzero_si256();
+
+  for (i = 0; i < h; i += 2) {
+    variance8_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
+    src += 2 * src_stride;
+    ref += 2 * ref_stride;
+  }
+}
+
 static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
                                    const uint8_t *ref, const int ref_stride,
                                    const int h, __m256i *const vsse,
@@ -612,6 +662,36 @@ typedef void (*get_var_avx2)(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride,
                              unsigned int *sse, int *sum);
 
+unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride,
+                                  const uint8_t *ref_ptr, int ref_stride,
+                                  unsigned int *sse) {
+  __m256i vsse, vsum;
+  int sum;
+  variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum);
+  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+  return *sse - ((sum * sum) >> 5);
+}
+
+unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride,
+                                  const uint8_t *ref_ptr, int ref_stride,
+                                  unsigned int *sse) {
+  __m256i vsse, vsum;
+  int sum;
+  variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum);
+  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+  return *sse - ((sum * sum) >> 6);
+}
+
+unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *ref_ptr, int ref_stride,
+                                   unsigned int *sse) {
+  __m256i vsse, vsum;
+  int sum;
+  variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum);
+  variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum);
+  return *sse - ((sum * sum) >> 7);
+}
+
 unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride,
                                    const uint8_t *ref_ptr, int ref_stride,
                                    unsigned int *sse) {
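[Editor's note: the three wrappers above differ only in h and in the final shift, which is log2 of the pixel count N = 8 * h (8x4 -> >>5, 8x8 -> >>6, 8x16 -> >>7). A scalar sketch of the identity they implement, variance = sse - sum^2 / N (editor's illustration, not part of the patch):]

#include <stdint.h>

static unsigned int variance8xh_ref(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride, int h,
                                    unsigned int *sse) {
  int x, y;
  int sum = 0;
  unsigned int sse_acc = 0;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < 8; ++x) {
      const int diff = src[y * src_stride + x] - ref[y * ref_stride + x];
      sum += diff;
      sse_acc += (unsigned int)(diff * diff);
    }
  }
  *sse = sse_acc;
  /* 8 * h is a power of two, so dividing by it matches the >> shifts. */
  return sse_acc - (unsigned int)(((int64_t)sum * sum) / (8 * h));
}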
diff --git a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 37ef59f36c2c..9ff67bd301b8 100644
--- a/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/media/libvpx/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -240,7 +240,7 @@ static void vpx_filter_block1d8_h8_avx2(
 
   // For the remaining height.
   if (y > 0) {
-    const __m128i srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+    const __m128i src_reg_128 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
 
     f[0] = _mm256_castsi256_si128(f1[0]);
     f[1] = _mm256_castsi256_si128(f1[1]);
@@ -248,10 +248,10 @@ static void vpx_filter_block1d8_h8_avx2(
     f[3] = _mm256_castsi256_si128(f1[3]);
 
     // filter the source buffer
-    s[0] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]));
-    s[1] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]));
-    s[2] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]));
-    s[3] = _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]));
+    s[0] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[0]));
+    s[1] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[1]));
+    s[2] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[2]));
+    s[3] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[3]));
 
     s[0] = convolve8_8_ssse3(s, f);
 
     // Saturate 16bit value to 8bit.
@@ -1184,8 +1184,190 @@ static void vpx_filter_block1d4_h8_avx2(
   }
 }
 
+static void vpx_filter_block1d4_v8_avx2(
+    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+  __m256i f[4], ss[4];
+  __m256i r[8];
+  __m128i r1[10];
+  __m128i s[11];
+
+  unsigned int y = output_height;
+  // Multiply the size of the source stride by four
+  const ptrdiff_t src_stride = src_pitch << 2;
+  const ptrdiff_t out_stride = out_pitch << 2;
+
+  // The output_height is always a multiple of two.
+  assert(!(output_height & 0x01));
+
+  shuffle_filter_avx2(filter, f);
+
+  s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch));
+  s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch));
+  s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch));
+  s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch));
+  s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
+  s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
+  s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
+
+  // R1-0 xxxx .. . . x| r13 r12 r11 r10 r03 r02 r01 r00
+  r1[0] = _mm_unpacklo_epi32(s[0], s[1]);
+
+  // R2-1 xxxx .. . . x| r23 r22 r21 r20 r13 r12 r11 r10
+  r1[1] = _mm_unpacklo_epi32(s[1], s[2]);
+
+  // R3-2 xxxx .. . . x| r33 r32 r31 r30 r23 r22 r21 r20
+  r1[2] = _mm_unpacklo_epi32(s[2], s[3]);
+
+  // R4-3 xxxx .. . . x| r43 r42 r41 r40 r33 r32 r31 r30
+  r1[3] = _mm_unpacklo_epi32(s[3], s[4]);
+
+  // R5-4 xxxx .. . . x| r53 r52 r51 r50 r43 r42 r41 r40
+  r1[4] = _mm_unpacklo_epi32(s[4], s[5]);
+
+  // R6-5 xxxx .. . . x| r63 r62 r61 r60 r53 r52 r51 r50
+  r1[5] = _mm_unpacklo_epi32(s[5], s[6]);
+
+  // 00000000 r33 r32 r31 r30|r23 r22 r21 r20||00000000|r13 r12 r11 r10|r03 r02
+  // r01 r00
+  r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[0]), r1[2], 1);
+
+  // 00000000 r43 r42 r41 r40|r33 r32 r31 r30||00000000|r23 r22 r21 r20|r13 r12
+  // r11 r10
+  r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[1]), r1[3], 1);
+
+  // 00000000 r53 r52 r51 r50|r43 r42 r41 r40||00000000|r33 r32 r31 r30|r23 r22
+  // r21 r20
+  r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[2]), r1[4], 1);
+
+  // 00000000 r63 r62 r61 r60|r53 r52 r51 r50||00000000|r43 r42 r41 r40|r33 r32
+  // r31 r30
+  r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[3]), r1[5], 1);
+
+  // r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10
+  // r00|
+  ss[0] = _mm256_unpacklo_epi8(r[0], r[1]);
+
+  // r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30
+  // r20|
+  ss[1] = _mm256_unpacklo_epi8(r[2], r[3]);
+
+  // Process 4 rows at a time
+  while (y >= 4) {
+    s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+    s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+    s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch));
+    s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch));
+
+    // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60
+    r1[6] = _mm_unpacklo_epi32(s[6], s[7]);
+
+    // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70
+    r1[7] = _mm_unpacklo_epi32(s[7], s[8]);
+
+    // R9-8 xxxx .. . . x| r93 r92 r91 r90 r83 r82 r81 r80
+    r1[8] = _mm_unpacklo_epi32(s[8], s[9]);
+
+    // R10-9 xxxx .. . . x| r10-3 r10-2 r10-1 r10-0 r93 r92 r91 r90
+    r1[9] = _mm_unpacklo_epi32(s[9], s[10]);
+
+    // 00000000 r73 r72 r71 r70|r63 r62 r61 r60||00000000|r53 r52 r51 r50|r43
+    // r42 r41 r40
+    r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[4]), r1[6], 1);
+
+    // 00000000 r83 r82 r81 r80|r73 r72 r71 r70||00000000|r63 r62 r61 r60|r53
+    // r52 r51 r50
+    r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[5]), r1[7], 1);
+
+    // 00000000 r93 r92 r91 r90|r83 r82 r81 r80||00000000|r73 r72 r71 r70|r63
+    // r62 r61 r60
+    r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[6]), r1[8], 1);
+
+    // 00000000 r10-3 r10-2 r10-1 r10-0|r93 r92 r91 r90||00000000|r83 r82 r81
+    // r80|r73 r72 r71 r70
+    r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[7]), r1[9], 1);
+
+    // r83 r73....r80 r70|r73 r63....r70 r60||r63 r53....r60 r50|r53 r43....r50
+    // r40|
+    ss[2] = _mm256_unpacklo_epi8(r[4], r[5]);
+
+    // r10-3 r10-3....r10-0 r10-0|r93 r83....r90 r80||r83 r73....r80 r70|r73
+    // r63....r70 r60|
+    ss[3] = _mm256_unpacklo_epi8(r[6], r[7]);
+
+    ss[0] = convolve8_16_avx2(ss, f);
+
+    // r3 r2 r3 r2 r1 r0 r1 r0
+    ss[0] = _mm256_packus_epi16(ss[0], ss[0]);
+    src_ptr += src_stride;
+
+    mm256_storeu2_epi32((__m128i *const)output_ptr,
+                        (__m128i *const)(output_ptr + (2 * out_pitch)), ss);
+
+    ss[0] = _mm256_srli_si256(ss[0], 4);
+
+    mm256_storeu2_epi32((__m128i *const)(output_ptr + (1 * out_pitch)),
+                        (__m128i *const)(output_ptr + (3 * out_pitch)), ss);
+
+    output_ptr += out_stride;
+
+    ss[0] = ss[2];
+    ss[1] = ss[3];
+
+    s[6] = s[10];
+
+    r1[4] = r1[8];
+    r1[5] = r1[9];
+
+    y -= 4;
+  }
+
+  // Process 2 rows
+  if (y == 2) {
+    __m128i ss1[4], f1[4];
+
+    s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
+    s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
+
+    f1[0] = _mm256_castsi256_si128(f[0]);
+    f1[1] = _mm256_castsi256_si128(f[1]);
+    f1[2] = _mm256_castsi256_si128(f[2]);
+    f1[3] = _mm256_castsi256_si128(f[3]);
+
+    // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60
+    r1[6] = _mm_unpacklo_epi32(s[6], s[7]);
+
+    // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70
+    r1[7] = _mm_unpacklo_epi32(s[7], s[8]);
+
+    // r23 r13....r20 r10|r13 r03....r10 r00
+    ss1[0] = _mm256_castsi256_si128(ss[0]);
+
+    // r43 r33....r40 r30|r33 r23....r30 r20
+    ss1[1] = _mm256_castsi256_si128(ss[1]);
+
+    // r63 r53....r60 r50|r53 r43....r50 r40
+    ss1[2] = _mm_unpacklo_epi8(r1[4], r1[5]);
+
+    // r83 r73....r80 r70|r73 r63....r70 r60
+    ss1[3] = _mm_unpacklo_epi8(r1[6], r1[7]);
+
+    ss1[0] = convolve8_8_ssse3(ss1, f1);
+
+    // r1 r0 r1 r0
+    ss1[0] = _mm_packus_epi16(ss1[0], ss1[0]);
+
+    // Save first row 4 values
+    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]);
+    output_ptr += out_pitch;
+
+    ss1[0] = _mm_srli_si128(ss1[0], 4);
+
+    // Save second row 4 values
+    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]);
+  }
+}
+
 #if HAVE_AVX2 && HAVE_SSSE3
-filter8_1dfunction vpx_filter_block1d4_v8_ssse3;
 #if VPX_ARCH_X86_64
 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3;
 filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3;
@@ -1209,7 +1391,6 @@ filter8_1dfunction vpx_filter_block1d8_v2_ssse3;
 filter8_1dfunction vpx_filter_block1d8_h2_ssse3;
 filter8_1dfunction vpx_filter_block1d4_v2_ssse3;
 filter8_1dfunction vpx_filter_block1d4_h2_ssse3;
-#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3
 #define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3
 #define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3
 #define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3
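[Editor's note: vpx_filter_block1d4_v8_avx2 above replaces the SSSE3 fallback that the hunk following it un-defines. A scalar sketch of the 8-tap vertical filter it vectorizes, assuming libvpx's usual rounding (FILTER_BITS == 7, round half up, clamp to 8 bits) — editor's illustration, not part of the patch:]

#include <stddef.h>
#include <stdint.h>

static void filter_block1d4_v8_ref(const uint8_t *src, ptrdiff_t src_pitch,
                                   uint8_t *dst, ptrdiff_t dst_pitch,
                                   unsigned int height,
                                   const int16_t filter[8]) {
  unsigned int y;
  int x, k;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < 4; ++x) {
      int sum = 1 << 6; /* rounding term: 1 << (FILTER_BITS - 1) */
      for (k = 0; k < 8; ++k) {
        /* 8 taps applied down a column of the 4-wide block. */
        sum += filter[k] * src[(y + k) * src_pitch + x];
      }
      sum >>= 7; /* FILTER_BITS */
      if (sum < 0) sum = 0;
      if (sum > 255) sum = 255;
      dst[y * dst_pitch + x] = (uint8_t)sum;
    }
  }
}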
diff --git a/media/libvpx/moz.yaml b/media/libvpx/moz.yaml
index 110a7bc49bab..b0362992bde5 100644
--- a/media/libvpx/moz.yaml
+++ b/media/libvpx/moz.yaml
@@ -20,11 +20,11 @@ origin:
 
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: 5b05f6f3a01d7e25d0573b482245a2b8b0eb09bd (Fri Mar 24 18:04:19 2023).
+  release: 31b6d12892cebc57adccc62994f29ebbca828fa0 (Mon Apr 10 18:50:09 2023).
 
   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: 5b05f6f3a01d7e25d0573b482245a2b8b0eb09bd
+  revision: 31b6d12892cebc57adccc62994f29ebbca828fa0
 
   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/