diff --git a/av1/common/loopfilter.c b/av1/common/loopfilter.c index eea05dca7..6820147b3 100644 --- a/av1/common/loopfilter.c +++ b/av1/common/loopfilter.c @@ -1193,14 +1193,16 @@ void av1_filter_block_plane_non420_ver(AV1_COMMON *cm, const int ss_y = plane->subsampling_y; const int row_step = 1 << ss_y; const int col_step = 1 << ss_x; + const int row_step_stride = cm->mi_stride * row_step; struct buf_2d *const dst = &plane->dst; uint8_t *const dst0 = dst->buf; unsigned int mask_16x16[MAX_MIB_SIZE] = { 0 }; unsigned int mask_8x8[MAX_MIB_SIZE] = { 0 }; unsigned int mask_4x4[MAX_MIB_SIZE] = { 0 }; unsigned int mask_4x4_int[MAX_MIB_SIZE] = { 0 }; - uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE]; + uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE] = { { 0 } }; int r, c; + MODE_INFO **tmp_mi = mib; for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) { unsigned int mask_16x16_c = 0; @@ -1210,7 +1212,7 @@ void av1_filter_block_plane_non420_ver(AV1_COMMON *cm, // Determine the vertical edges that need filtering for (c = 0; c < cm->mib_size && mi_col + c < cm->mi_cols; c += col_step) { - const MODE_INFO *mi = mib[c]; + const MODE_INFO *mi = tmp_mi[c]; const MB_MODE_INFO *mbmi = &mi[0].mbmi; const BLOCK_SIZE sb_type = mbmi->sb_type; const int skip_this = mbmi->skip && is_inter_block(mbmi); @@ -1243,8 +1245,12 @@ void av1_filter_block_plane_non420_ver(AV1_COMMON *cm, TX_SIZE tx_size_r = num_4x4_blocks_high_txsize_log2_lookup[tx_size]; int tx_size_mask = 0; + const int c_step = (c >> ss_x); + const int r_step = (r >> ss_y); + const int col_mask = 1 << c_step; + // Filter level can vary per MI - if (!(lfl[r][c >> ss_x] = get_filter_level(&cm->lf_info, mbmi))) continue; + if (!(lfl[r][c_step] = get_filter_level(&cm->lf_info, mbmi))) continue; if (txsize_sqr_up_map[tx_size] == TX_32X32) tx_size_mask = 3; @@ -1289,60 +1295,60 @@ void av1_filter_block_plane_non420_ver(AV1_COMMON *cm, // Build masks based on the transform size of each block // handle vertical mask if (tx_size_c 
== TX_32X32) { - if (!skip_this_c && ((c >> ss_x) & tx_size_mask) == 0) { + if (!skip_this_c && (c_step & tx_size_mask) == 0) { if (!skip_border_4x4_c) - mask_16x16_c |= 1 << (c >> ss_x); + mask_16x16_c |= col_mask; else - mask_8x8_c |= 1 << (c >> ss_x); + mask_8x8_c |= col_mask; } } else if (tx_size_c == TX_16X16) { - if (!skip_this_c && ((c >> ss_x) & tx_size_mask) == 0) { + if (!skip_this_c && (c_step & tx_size_mask) == 0) { if (!skip_border_4x4_c) - mask_16x16_c |= 1 << (c >> ss_x); + mask_16x16_c |= col_mask; else - mask_8x8_c |= 1 << (c >> ss_x); + mask_8x8_c |= col_mask; } } else { // force 8x8 filtering on 32x32 boundaries - if (!skip_this_c && ((c >> ss_x) & tx_size_mask) == 0) { + if (!skip_this_c && (c_step & tx_size_mask) == 0) { if (tx_size_c == TX_8X8 || ((c >> ss_x) & 3) == 0) - mask_8x8_c |= 1 << (c >> ss_x); + mask_8x8_c |= col_mask; else - mask_4x4_c |= 1 << (c >> ss_x); + mask_4x4_c |= col_mask; } if (!skip_this && tx_size_c < TX_8X8 && !skip_border_4x4_c && - ((c >> ss_x) & tx_size_mask) == 0) - mask_4x4_int[r] |= 1 << (c >> ss_x); + (c_step & tx_size_mask) == 0) + mask_4x4_int[r] |= col_mask; } // set horizontal mask if (tx_size_r == TX_32X32) { - if (!skip_this_r && ((r >> ss_y) & tx_size_mask) == 0) { + if (!skip_this_r && (r_step & tx_size_mask) == 0) { if (!skip_border_4x4_r) - mask_16x16[r] |= 1 << (c >> ss_x); + mask_16x16[r] |= col_mask; else - mask_8x8[r] |= 1 << (c >> ss_x); + mask_8x8[r] |= col_mask; } } else if (tx_size_r == TX_16X16) { - if (!skip_this_r && ((r >> ss_y) & tx_size_mask) == 0) { + if (!skip_this_r && (r_step & tx_size_mask) == 0) { if (!skip_border_4x4_r) - mask_16x16[r] |= 1 << (c >> ss_x); + mask_16x16[r] |= col_mask; else - mask_8x8[r] |= 1 << (c >> ss_x); + mask_8x8[r] |= col_mask; } } else { // force 8x8 filtering on 32x32 boundaries - if (!skip_this_r && ((r >> ss_y) & tx_size_mask) == 0) { - if (tx_size_r == TX_8X8 || ((r >> ss_y) & 3) == 0) - mask_8x8[r] |= 1 << (c >> ss_x); + if (!skip_this_r && (r_step & 
tx_size_mask) == 0) { + if (tx_size_r == TX_8X8 || (r_step & 3) == 0) + mask_8x8[r] |= col_mask; else - mask_4x4[r] |= 1 << (c >> ss_x); + mask_4x4[r] |= col_mask; } if (!skip_this && tx_size_r < TX_8X8 && !skip_border_4x4_c && ((r >> ss_y) & tx_size_mask) == 0) - mask_4x4_int[r] |= 1 << (c >> ss_x); + mask_4x4_int[r] |= col_mask; } } @@ -1367,7 +1373,7 @@ void av1_filter_block_plane_non420_ver(AV1_COMMON *cm, mask_4x4_int[r], &cm->lf_info, &lfl[r][0]); #endif // CONFIG_AOM_HIGHBITDEPTH dst->buf += MI_SIZE * dst->stride; - mib += row_step * cm->mi_stride; + tmp_mi += row_step_stride; } // Now do horizontal pass @@ -1376,9 +1382,13 @@ void av1_filter_block_plane_non420_ver(AV1_COMMON *cm, void av1_filter_block_plane_non420_hor(AV1_COMMON *cm, struct macroblockd_plane *plane, - int mi_row) { + MODE_INFO **mib, int mi_row, + int mi_col) { + const int ss_x = plane->subsampling_x; const int ss_y = plane->subsampling_y; const int row_step = 1 << ss_y; + const int col_step = 1 << ss_x; + const int row_step_stride = cm->mi_stride * row_step; struct buf_2d *const dst = &plane->dst; uint8_t *const dst0 = dst->buf; unsigned int mask_16x16[MAX_MIB_SIZE] = { 0 }; @@ -1386,8 +1396,156 @@ void av1_filter_block_plane_non420_hor(AV1_COMMON *cm, unsigned int mask_4x4[MAX_MIB_SIZE] = { 0 }; unsigned int mask_4x4_int[MAX_MIB_SIZE] = { 0 }; uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE]; - int r; + int r, c; + MODE_INFO **tmp_mi = mib; + for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) { + unsigned int mask_16x16_c = 0; + unsigned int mask_8x8_c = 0; + unsigned int mask_4x4_c = 0; + // Determine the vertical edges that need filtering + for (c = 0; c < cm->mib_size && mi_col + c < cm->mi_cols; c += col_step) { + const MODE_INFO *mi = tmp_mi[c]; + const MB_MODE_INFO *mbmi = &mi[0].mbmi; + const BLOCK_SIZE sb_type = mbmi->sb_type; + const int skip_this = mbmi->skip && is_inter_block(mbmi); + const int blk_row = r & (num_8x8_blocks_high_lookup[sb_type] - 1); + const int 
blk_col = c & (num_8x8_blocks_wide_lookup[sb_type] - 1); + + // left edge of current unit is block/partition edge -> no skip + const int block_edge_left = + (num_4x4_blocks_wide_lookup[sb_type] > 1) ? !blk_col : 1; + const int skip_this_c = skip_this && !block_edge_left; + // top edge of current unit is block/partition edge -> no skip + const int block_edge_above = + (num_4x4_blocks_high_lookup[sb_type] > 1) ? !blk_row : 1; + const int skip_this_r = skip_this && !block_edge_above; + +#if CONFIG_VAR_TX + TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV) + ? get_uv_tx_size(mbmi, plane) + : mbmi->tx_size; +#else + const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV) + ? get_uv_tx_size(mbmi, plane) + : mbmi->tx_size; +#endif + + const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1; + const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; + + TX_SIZE tx_size_c = num_4x4_blocks_wide_txsize_log2_lookup[tx_size]; + TX_SIZE tx_size_r = num_4x4_blocks_high_txsize_log2_lookup[tx_size]; + + int tx_size_mask = 0; + const int c_step = (c >> ss_x); + const int r_step = (r >> ss_y); + const int col_mask = 1 << c_step; + + // Filter level can vary per MI + if (!(lfl[r][c_step] = get_filter_level(&cm->lf_info, mbmi))) continue; + + if (txsize_sqr_up_map[tx_size] == TX_32X32) + tx_size_mask = 3; + else if (txsize_sqr_up_map[tx_size] == TX_16X16) + tx_size_mask = 1; + else + tx_size_mask = 0; + +#if CONFIG_VAR_TX + if (is_inter_block(mbmi) && !mbmi->skip) { +#if CONFIG_EXT_TX && CONFIG_RECT_TX + TX_SIZE mb_tx_size = is_rect_tx(mbmi->tx_size) + ? mbmi->tx_size + : mbmi->inter_tx_size[blk_row][blk_col]; +#else + TX_SIZE mb_tx_size = mbmi->inter_tx_size[blk_row][blk_col]; +#endif + tx_size = (plane->plane_type == PLANE_TYPE_UV) + ? 
uv_txsize_lookup[sb_type][mb_tx_size][ss_x][ss_y] + : mb_tx_size; + } + +#if CONFIG_EXT_TX && CONFIG_RECT_TX + tx_size_r = + AOMMIN(txsize_horz_map[tx_size], cm->above_txfm_context[mi_col + c]); + tx_size_c = AOMMIN(txsize_vert_map[tx_size], + cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK]); + + cm->above_txfm_context[mi_col + c] = txsize_horz_map[tx_size]; + cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK] = + txsize_vert_map[tx_size]; +#else + tx_size_r = AOMMIN(tx_size, cm->above_txfm_context[mi_col + c]); + tx_size_c = + AOMMIN(tx_size, cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK]); + + cm->above_txfm_context[mi_col + c] = tx_size; + cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK] = tx_size; +#endif +#endif + + // Build masks based on the transform size of each block + // handle vertical mask + if (tx_size_c == TX_32X32) { + if (!skip_this_c && (c_step & tx_size_mask) == 0) { + if (!skip_border_4x4_c) + mask_16x16_c |= col_mask; + else + mask_8x8_c |= col_mask; + } + } else if (tx_size_c == TX_16X16) { + if (!skip_this_c && (c_step & tx_size_mask) == 0) { + if (!skip_border_4x4_c) + mask_16x16_c |= col_mask; + else + mask_8x8_c |= col_mask; + } + } else { + // force 8x8 filtering on 32x32 boundaries + if (!skip_this_c && (c_step & tx_size_mask) == 0) { + if (tx_size_c == TX_8X8 || ((c >> ss_x) & 3) == 0) + mask_8x8_c |= col_mask; + else + mask_4x4_c |= col_mask; + } + + if (!skip_this && tx_size_c < TX_8X8 && !skip_border_4x4_c && + (c_step & tx_size_mask) == 0) + mask_4x4_int[r] |= col_mask; + } + + // set horizontal mask + if (tx_size_r == TX_32X32) { + if (!skip_this_r && (r_step & tx_size_mask) == 0) { + if (!skip_border_4x4_r) + mask_16x16[r] |= col_mask; + else + mask_8x8[r] |= col_mask; + } + } else if (tx_size_r == TX_16X16) { + if (!skip_this_r && (r_step & tx_size_mask) == 0) { + if (!skip_border_4x4_r) + mask_16x16[r] |= col_mask; + else + mask_8x8[r] |= col_mask; + } + } else { + // force 8x8 filtering on 32x32 boundaries + if 
(!skip_this_r && (r_step & tx_size_mask) == 0) { + if (tx_size_r == TX_8X8 || (r_step & 3) == 0) + mask_8x8[r] |= col_mask; + else + mask_4x4[r] |= col_mask; + } + + if (!skip_this && tx_size_r < TX_8X8 && !skip_border_4x4_c && + ((r >> ss_y) & tx_size_mask) == 0) + mask_4x4_int[r] |= col_mask; + } + } + tmp_mi += row_step_stride; + } for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) { const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1; const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r]; @@ -1543,6 +1701,7 @@ void av1_filter_block_plane_ss11_ver(AV1_COMMON *const cm, assert(plane->subsampling_x == 1 && plane->subsampling_y == 1); assert(plane->plane_type == PLANE_TYPE_UV); + memset(lfm->lfl_uv, 0, sizeof(lfm->lfl_uv)); // Vertical pass: do 2 rows at one time for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 4) { @@ -1593,13 +1752,23 @@ void av1_filter_block_plane_ss11_hor(AV1_COMMON *const cm, int mi_row, LOOP_FILTER_MASK *lfm) { struct buf_2d *const dst = &plane->dst; uint8_t *const dst0 = dst->buf; - int r; + int r, c; uint64_t mask_16x16 = lfm->above_uv[TX_16X16]; uint64_t mask_8x8 = lfm->above_uv[TX_8X8]; uint64_t mask_4x4 = lfm->above_uv[TX_4X4]; uint64_t mask_4x4_int = lfm->above_int_4x4_uv; assert(plane->subsampling_x == 1 && plane->subsampling_y == 1); + memset(lfm->lfl_uv, 0, sizeof(lfm->lfl_uv)); + + // re-populate the filter level for uv, same as the code for vertical + // filter in av1_filter_block_plane_ss11_ver + for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 4) { + for (c = 0; c < (cm->mib_size >> 1); c++) { + lfm->lfl_uv[r >> 1][c] = lfm->lfl_y[r][c << 1]; + lfm->lfl_uv[(r + 2) >> 1][c] = lfm->lfl_y[r + 2][c << 1]; + } + } for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 2) { const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1; @@ -1669,7 +1838,8 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, for (plane 
= 0; plane < num_planes; ++plane) { av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col, mi_row, mi_col); - av1_filter_block_plane_non420_hor(cm, &planes[plane], mi_row); + av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col, + mi_row, mi_col); } } } @@ -1735,7 +1905,8 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, &lfm); break; case LF_PATH_SLOW: - av1_filter_block_plane_non420_hor(cm, &planes[plane], mi_row); + av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col, + mi_row, mi_col); break; } } @@ -1767,7 +1938,8 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, case LF_PATH_SLOW: av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col, mi_row, mi_col); - av1_filter_block_plane_non420_hor(cm, &planes[plane], mi_row); + av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col, + mi_row, mi_col); break; } diff --git a/av1/common/loopfilter.h b/av1/common/loopfilter.h index cdc251208..0f7067290 100644 --- a/av1/common/loopfilter.h +++ b/av1/common/loopfilter.h @@ -118,7 +118,8 @@ void av1_filter_block_plane_non420_ver(struct AV1Common *cm, int mi_col); void av1_filter_block_plane_non420_hor(struct AV1Common *cm, struct macroblockd_plane *plane, - int mi_row); + MODE_INFO **mi_8x8, int mi_row, + int mi_col); void av1_loop_filter_init(struct AV1Common *cm); diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c index 11006715b..541db1d9c 100644 --- a/av1/common/thread_common.c +++ b/av1/common/thread_common.c @@ -113,8 +113,8 @@ static INLINE void loop_filter_block_plane_ver( av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, lfm); break; case LF_PATH_SLOW: - av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col, - mi_row, mi_col); + av1_filter_block_plane_non420_ver(cm, &planes[plane], mi, mi_row, + mi_col); break; } } @@ -122,7 +122,8 @@ static INLINE 
void loop_filter_block_plane_ver( static INLINE void loop_filter_block_plane_hor( AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int plane, - int mi_row, enum lf_path path, LOOP_FILTER_MASK *lfm) { + MODE_INFO **mi, int mi_row, int mi_col, enum lf_path path, + LOOP_FILTER_MASK *lfm) { if (plane == 0) { av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, lfm); } else { @@ -134,7 +135,8 @@ static INLINE void loop_filter_block_plane_hor( av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, lfm); break; case LF_PATH_SLOW: - av1_filter_block_plane_non420_hor(cm, &planes[plane], mi_row); + av1_filter_block_plane_non420_hor(cm, &planes[plane], mi, mi_row, + mi_col); break; } } @@ -171,8 +173,8 @@ static int loop_filter_ver_row_worker(AV1LfSync *const lf_sync, #else for (plane = 0; plane < num_planes; ++plane) - loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane, mi, - mi_row, mi_col, path, &lfm); + loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane, + mi + mi_col, mi_row, mi_col, path, &lfm); #endif } } @@ -212,11 +214,11 @@ static int loop_filter_hor_row_worker(AV1LfSync *const lf_sync, #if CONFIG_EXT_PARTITION_TYPES for (plane = 0; plane < num_planes; ++plane) av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane], - mi_row); + mi + mi_col, mi_row, mi_col); #else for (plane = 0; plane < num_planes; ++plane) - loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane, mi_row, - path, &lfm); + loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane, + mi + mi_col, mi_row, mi_col, path, &lfm); #endif sync_write(lf_sync, r, c, sb_cols); } @@ -264,17 +266,17 @@ static int loop_filter_row_worker(AV1LfSync *const lf_sync, av1_filter_block_plane_non420_ver(lf_data->cm, &lf_data->planes[plane], mi + mi_col, mi_row, mi_col); av1_filter_block_plane_non420_hor(lf_data->cm, &lf_data->planes[plane], - mi_row); + mi + mi_col, mi_row, mi_col); } #else av1_setup_mask(lf_data->cm, mi_row, mi_col, mi + 
mi_col, lf_data->cm->mi_stride, &lfm); for (plane = 0; plane < num_planes; ++plane) { - loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane, mi, - mi_row, mi_col, path, &lfm); - loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane, mi_row, - path, &lfm); + loop_filter_block_plane_ver(lf_data->cm, lf_data->planes, plane, + mi + mi_col, mi_row, mi_col, path, &lfm); + loop_filter_block_plane_hor(lf_data->cm, lf_data->planes, plane, + mi + mi_col, mi_row, mi_col, path, &lfm); } #endif // CONFIG_EXT_PARTITION_TYPES sync_write(lf_sync, r, c, sb_cols); diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c index 8ab549b84..f269865f8 100644 --- a/av1/decoder/decodeframe.c +++ b/av1/decoder/decodeframe.c @@ -2850,7 +2850,10 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, assert(mi_row > 0); -#if !CONFIG_VAR_TX +// When parallel deblocking is enabled, deblocking should not +// be interleaved with decoding. Instead, deblocking should be done +// after the entire frame is decoded. +#if !CONFIG_VAR_TX && !CONFIG_PARALLEL_DEBLOCKING // Loopfilter one tile row. if (cm->lf.filter_level && !cm->skip_loop_filter) { LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1; @@ -2873,18 +2876,28 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, winterface->execute(&pbi->lf_worker); } } +#endif // !CONFIG_VAR_TX && !CONFIG_PARALLEL_DEBLOCKING // After loopfiltering, the last 7 row pixels in each superblock row may // still be changed by the longest loopfilter of the next superblock row. if (cm->frame_parallel_decode) av1_frameworker_broadcast(pbi->cur_buf, mi_row << cm->mib_size_log2); -#endif // !CONFIG_VAR_TX } #if CONFIG_VAR_TX // Loopfilter the whole frame. av1_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb, cm->lf.filter_level, 0, 0); +#else +#if CONFIG_PARALLEL_DEBLOCKING + // Loopfilter all rows in the frame. 
+ if (cm->lf.filter_level && !cm->skip_loop_filter) { + LFWorkerData *const lf_data = (LFWorkerData *)pbi->lf_worker.data1; + winterface->sync(&pbi->lf_worker); + lf_data->start = 0; + lf_data->stop = cm->mi_rows; + winterface->execute(&pbi->lf_worker); + } #else // Loopfilter remaining rows in the frame. if (cm->lf.filter_level && !cm->skip_loop_filter) { @@ -2894,6 +2907,7 @@ static const uint8_t *decode_tiles(AV1Decoder *pbi, const uint8_t *data, lf_data->stop = cm->mi_rows; winterface->execute(&pbi->lf_worker); } +#endif // CONFIG_PARALLEL_DEBLOCKING #endif // CONFIG_VAR_TX if (cm->frame_parallel_decode) av1_frameworker_broadcast(pbi->cur_buf, INT_MAX);