diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c index 41b55c98520d..e001a1d70159 100644 --- a/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c +++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_avx2.c @@ -11,6 +11,7 @@ #include <immintrin.h> +#include "aom_ports/msvc.h" #include "./aom_dsp_rtcd.h" // ----------------------------------------------------------------------------- diff --git a/third_party/aom/aom_ports/msvc.h b/third_party/aom/aom_ports/msvc.h index 2d3ab9b6537d..5a41d29d2b0f 100644 --- a/third_party/aom/aom_ports/msvc.h +++ b/third_party/aom/aom_ports/msvc.h @@ -43,5 +43,25 @@ static INLINE long lroundf(float x) { } #endif // _MSC_VER < 1800 +#if HAVE_AVX +#include <immintrin.h> +// Note: +// The _mm256_insert_epi16 intrinsic is available from VS2017 onward. +// We define this macro for VS2015 and earlier. The +// intrinsics used here are in the VS2015 documentation: +// https://msdn.microsoft.com/en-us/library/hh977022.aspx +// Input parameters (each is parenthesized in the expansion): +// a: __m256i (expanded twice, so pass a side-effect-free expression), +// d: int16_t, +// indx: imm8 (0 - 15) +#if _MSC_VER <= 1900 +#define _mm256_insert_epi16(a, d, indx) \ + (_mm256_insertf128_si256( \ + (a), \ + _mm_insert_epi16(_mm256_extractf128_si256((a), (indx) >> 3), (d), (indx) % 8), \ + (indx) >> 3)) +#endif // _MSC_VER <= 1900 +#endif // HAVE_AVX + #endif // _MSC_VER #endif // AOM_PORTS_MSVC_H_ diff --git a/third_party/aom/av1/common/reconinter.c b/third_party/aom/av1/common/reconinter.c index d7e39b45ca31..a1a22a0af308 100644 --- a/third_party/aom/av1/common/reconinter.c +++ b/third_party/aom/av1/common/reconinter.c @@ -1728,9 +1728,9 @@ void av1_build_inter_predictors_sb(const AV1_COMMON *cm, MACROBLOCKD *xd, av1_build_inter_predictors_sbuv(cm, xd, mi_row, mi_col, ctx, bsize); } -void av1_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE], - BLOCK_SIZE bsize, const YV12_BUFFER_CONFIG *src, - int mi_row, int mi_col) { +void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize, + const YV12_BUFFER_CONFIG 
*src, int mi_row, + int mi_col) { const int widths[MAX_MB_PLANE] = { src->y_crop_width, src->uv_crop_width, src->uv_crop_width }; const int heights[MAX_MB_PLANE] = { src->y_crop_height, src->uv_crop_height, diff --git a/third_party/aom/av1/common/reconinter.h b/third_party/aom/av1/common/reconinter.h index fd69f9db37f6..0c33333397b1 100644 --- a/third_party/aom/av1/common/reconinter.h +++ b/third_party/aom/av1/common/reconinter.h @@ -446,9 +446,9 @@ static INLINE void setup_pred_plane(struct buf_2d *dst, BLOCK_SIZE bsize, dst->stride = stride; } -void av1_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE], - BLOCK_SIZE bsize, const YV12_BUFFER_CONFIG *src, - int mi_row, int mi_col); +void av1_setup_dst_planes(struct macroblockd_plane *planes, BLOCK_SIZE bsize, + const YV12_BUFFER_CONFIG *src, int mi_row, + int mi_col); void av1_setup_pre_planes(MACROBLOCKD *xd, int idx, const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col, diff --git a/third_party/aom/av1/common/thread_common.c b/third_party/aom/av1/common/thread_common.c index eec8629ff98d..4c9fa69625e8 100644 --- a/third_party/aom/av1/common/thread_common.c +++ b/third_party/aom/av1/common/thread_common.c @@ -86,7 +86,7 @@ static INLINE void sync_write(AV1LfSync *const lf_sync, int r, int c, #if !CONFIG_EXT_PARTITION_TYPES static INLINE enum lf_path get_loop_filter_path( - int y_only, struct macroblockd_plane planes[MAX_MB_PLANE]) { + int y_only, struct macroblockd_plane *planes) { if (y_only) return LF_PATH_444; else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1) @@ -98,7 +98,7 @@ static INLINE enum lf_path get_loop_filter_path( } static INLINE void loop_filter_block_plane_ver( - AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int plane, + AV1_COMMON *cm, struct macroblockd_plane *planes, int plane, MODE_INFO **mi, int mi_row, int mi_col, enum lf_path path, LOOP_FILTER_MASK *lfm) { if (plane == 0) { @@ -120,7 +120,7 @@ static INLINE void loop_filter_block_plane_ver( 
} static INLINE void loop_filter_block_plane_hor( - AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int plane, + AV1_COMMON *cm, struct macroblockd_plane *planes, int plane, MODE_INFO **mi, int mi_row, int mi_col, enum lf_path path, LOOP_FILTER_MASK *lfm) { if (plane == 0) { @@ -286,10 +286,9 @@ static int loop_filter_row_worker(AV1LfSync *const lf_sync, #endif // CONFIG_PARALLEL_DEBLOCKING static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, - struct macroblockd_plane planes[MAX_MB_PLANE], - int start, int stop, int y_only, - AVxWorker *workers, int nworkers, - AV1LfSync *lf_sync) { + struct macroblockd_plane *planes, int start, + int stop, int y_only, AVxWorker *workers, + int nworkers, AV1LfSync *lf_sync) { #if CONFIG_EXT_PARTITION printf( "STOPPING: This code has not been modified to work with the " @@ -415,7 +414,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, } void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, - struct macroblockd_plane planes[MAX_MB_PLANE], + struct macroblockd_plane *planes, int frame_filter_level, #if CONFIG_LOOPFILTER_LEVEL int frame_filter_level_r, diff --git a/third_party/aom/av1/common/thread_common.h b/third_party/aom/av1/common/thread_common.h index 6d118e60b157..7eddc662cee2 100644 --- a/third_party/aom/av1/common/thread_common.h +++ b/third_party/aom/av1/common/thread_common.h @@ -49,7 +49,7 @@ void av1_loop_filter_dealloc(AV1LfSync *lf_sync); // Multi-threaded loopfilter that uses the tile threads. 
void av1_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm, - struct macroblockd_plane planes[MAX_MB_PLANE], + struct macroblockd_plane *planes, int frame_filter_level, #if CONFIG_LOOPFILTER_LEVEL int frame_filter_level_r, diff --git a/third_party/aom/av1/common/x86/selfguided_sse4.c b/third_party/aom/av1/common/x86/selfguided_sse4.c index 4006b8518189..9de9177c1d09 100644 --- a/third_party/aom/av1/common/x86/selfguided_sse4.c +++ b/third_party/aom/av1/common/x86/selfguided_sse4.c @@ -10,9 +10,11 @@ av1_selfguided_restoration) */ static void calc_block(__m128i sum, __m128i sum_sq, __m128i n, - __m128i one_over_n, __m128i s, int bit_depth, int idx, - int32_t *A, int32_t *B) { + __m128i *one_over_n_, __m128i *s_, int bit_depth, + int idx, int32_t *A, int32_t *B) { __m128i a, b, p; + __m128i one_over_n = *one_over_n_; + __m128i s = *s_; #if CONFIG_HIGHBITDEPTH if (bit_depth > 8) { __m128i rounding_a = _mm_set1_epi32((1 << (2 * (bit_depth - 8))) >> 1); @@ -147,7 +149,7 @@ static void selfguided_restoration_1_h(int32_t *A, int32_t *B, int width, __m128i s = _mm_set_epi32( sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][2 * h - 1]); - calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride, A, + calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride, A, B); n = _mm_set1_epi32(3 * h); @@ -178,8 +180,8 @@ static void selfguided_restoration_1_h(int32_t *A, int32_t *B, int width, _mm_alignr_epi8(b2, b1, 8))); sum_sq_ = _mm_add_epi32(a1, _mm_add_epi32(_mm_alignr_epi8(a2, a1, 4), _mm_alignr_epi8(a2, a1, 8))); - calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j, - A, B); + calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, + i * buf_stride + j, A, B); } __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 3]); __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 3]); @@ -227,7 +229,7 @@ static 
void selfguided_restoration_1_h(int32_t *A, int32_t *B, int width, s = _mm_set_epi32( sgrproj_mtable[eps - 1][2 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1]); - calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j, + calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride + j, A, B); } } @@ -342,7 +344,7 @@ static void selfguided_restoration_2_h(int32_t *A, int32_t *B, int width, __m128i s = _mm_set_epi32( sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][4 * h - 1], sgrproj_mtable[eps - 1][3 * h - 1]); - calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride, A, + calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride, A, B); // Re-align a1 and b1 so that they start at index i * buf_stride + 2 @@ -372,8 +374,8 @@ static void selfguided_restoration_2_h(int32_t *A, int32_t *B, int width, _mm_alignr_epi8(a2, a1, 8))), _mm_add_epi32(_mm_alignr_epi8(a2, a1, 12), a2)); - calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j, - A, B); + calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, + i * buf_stride + j, A, B); } // If the width is not a multiple of 4, we need to reset j to width - 4 // and adjust a1, a2, b1, b2 so that the loop invariant above is maintained @@ -428,7 +430,7 @@ static void selfguided_restoration_2_h(int32_t *A, int32_t *B, int width, s = _mm_set_epi32( sgrproj_mtable[eps - 1][3 * h - 1], sgrproj_mtable[eps - 1][4 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1]); - calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j, + calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride + j, A, B); } } @@ -562,7 +564,7 @@ static void selfguided_restoration_3_h(int32_t *A, int32_t *B, int width, __m128i s = _mm_set_epi32( sgrproj_mtable[eps - 1][7 * h - 1], sgrproj_mtable[eps - 1][6 * h 
- 1], sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][4 * h - 1]); - calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride, A, + calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride, A, B); // Re-align a1 and b1 so that they start at index i * buf_stride + 1 @@ -599,8 +601,8 @@ static void selfguided_restoration_3_h(int32_t *A, int32_t *B, int width, _mm_add_epi32(_mm_add_epi32(a2, _mm_alignr_epi8(a3, a2, 4)), _mm_alignr_epi8(a3, a2, 8))); - calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j, - A, B); + calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, + i * buf_stride + j, A, B); } __m128i a3 = _mm_loadu_si128((__m128i *)&A[i * buf_stride + j + 1]); __m128i b3 = _mm_loadu_si128((__m128i *)&B[i * buf_stride + j + 1]); @@ -657,7 +659,7 @@ static void selfguided_restoration_3_h(int32_t *A, int32_t *B, int width, s = _mm_set_epi32( sgrproj_mtable[eps - 1][4 * h - 1], sgrproj_mtable[eps - 1][5 * h - 1], sgrproj_mtable[eps - 1][6 * h - 1], sgrproj_mtable[eps - 1][7 * h - 1]); - calc_block(sum_, sum_sq_, n, one_over_n, s, bit_depth, i * buf_stride + j, + calc_block(sum_, sum_sq_, n, &one_over_n, &s, bit_depth, i * buf_stride + j, A, B); } }