diff --git a/aom_dsp/loopfilter.c b/aom_dsp/loopfilter.c index 74201746c..7ea1e6b89 100644 --- a/aom_dsp/loopfilter.c +++ b/aom_dsp/loopfilter.c @@ -149,10 +149,15 @@ void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { #if !CONFIG_PARALLEL_DEBLOCKING const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; @@ -179,10 +184,15 @@ void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0, void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { #if !CONFIG_PARALLEL_DEBLOCKING const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; @@ -229,10 +239,15 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat, void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; @@ -256,8 +271,13 @@ void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0, void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = @@ -390,10 +410,15 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int step = 4; +#else + int step = 8; +#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < step * count; ++i) { const uint8_t p7 = s[-8 * p], p6 = s[-7 * p], p5 = s[-6 * p], p4 = s[-5 * p], p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; @@ -436,7 +461,11 @@ void aom_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit, void aom_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1); +#else mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2); +#endif } static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, @@ -478,7 +507,11 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit, void aom_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4); +#else mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8); +#endif } void aom_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit, @@ -596,10 +629,15 @@ void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { #if !CONFIG_PARALLEL_DEBLOCKING const uint16_t p3 = s[-4 * p]; const uint16_t p2 = s[-3 * p]; @@ -636,10 +674,15 @@ void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { #if !CONFIG_PARALLEL_DEBLOCKING const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; @@ -689,10 +732,15 @@ void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; @@ -718,8 +766,13 @@ void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int count = 4; +#else + int count = 8; +#endif - for (i = 0; i < 8; ++i) { + for (i = 0; i < count; ++i) { const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1]; const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3]; const int8_t mask = @@ -813,10 +866,15 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p, const uint8_t *thresh, int count, int bd) { int i; +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + int step = 4; +#else + int step = 8; +#endif // loop filter designed to work using chars so that we can make maximum use // of 8 bit simd instructions. - for (i = 0; i < 8 * count; ++i) { + for (i = 0; i < step * count; ++i) { const uint16_t p3 = s[-4 * p]; const uint16_t p2 = s[-3 * p]; const uint16_t p1 = s[-2 * p]; @@ -852,7 +910,11 @@ void aom_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd); +#else highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd); +#endif } static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, @@ -888,13 +950,21 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p, void aom_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd); +#else highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd); +#endif } void aom_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd) { +#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4 + highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd); +#else highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd); +#endif } #endif // CONFIG_HIGHBITDEPTH diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c index af919a854..4b27ae93b 100644 --- a/av1/common/av1_loopfilter.c +++ b/av1/common/av1_loopfilter.c @@ -22,7 +22,7 @@ #include "av1/common/seg_common.h" -#define CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY 0 +#define PARALLEL_DEBLOCKING_15TAPLUMAONLY 1 // 64 bit masks for left transform size. Each 1 represents a position where // we should apply a loop filter across the left border of an 8x8 block @@ -1857,8 +1857,6 @@ void av1_filter_block_plane_ss11_hor(AV1_COMMON *const cm, dst->buf = dst0; } -#if !(CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES || \ - CONFIG_CB4X4) #if CONFIG_PARALLEL_DEBLOCKING typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR; static const uint32_t av1_prediction_masks[NUM_EDGE_DIRS][BLOCK_SIZES] = { @@ -2010,10 +2008,17 @@ static void set_lpf_parameters(AV1_DEBLOCKING_PARAMETERS *const pParams, // not sure if changes are required. assert(0 && "Not yet updated"); #endif // CONFIG_EXT_PARTITION + { const TX_SIZE ts = av1_get_transform_size(ppCurr[0], edgeDir, scaleHorz, scaleVert); +#if CONFIG_EXT_DELTA_Q + const uint32_t currLevel = + get_filter_level(cm, &cm->lf_info, &ppCurr[0]->mbmi); +#else const uint32_t currLevel = get_filter_level(&cm->lf_info, &ppCurr[0]->mbmi); +#endif // CONFIG_EXT_DELTA_Q + const int currSkipped = ppCurr[0]->mbmi.skip && is_inter_block(&ppCurr[0]->mbmi); const uint32_t coord = (VERT_EDGE == edgeDir) ? (x) : (y); @@ -2034,7 +2039,13 @@ static void set_lpf_parameters(AV1_DEBLOCKING_PARAMETERS *const pParams, const MODE_INFO *const pPrev = *(ppCurr - modeStep); const TX_SIZE pvTs = av1_get_transform_size(pPrev, edgeDir, scaleHorz, scaleVert); +#if CONFIG_EXT_DELTA_Q + const uint32_t pvLvl = + get_filter_level(cm, &cm->lf_info, &pPrev->mbmi); +#else const uint32_t pvLvl = get_filter_level(&cm->lf_info, &pPrev->mbmi); +#endif // CONFIG_EXT_DELTA_Q + const int pvSkip = pPrev->mbmi.skip && is_inter_block(&pPrev->mbmi); const int32_t puEdge = (coord & @@ -2046,7 +2057,7 @@ static void set_lpf_parameters(AV1_DEBLOCKING_PARAMETERS *const pParams, // if the current and the previous blocks are skipped, // deblock the edge if the edge belongs to a PU's edge only. if ((currLevel || pvLvl) && (!pvSkip || !currSkipped || puEdge)) { -#if CONFIG_PARALLEL_DEBLOCKING_15TAP || CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY +#if CONFIG_PARALLEL_DEBLOCKING_15TAP || PARALLEL_DEBLOCKING_15TAPLUMAONLY const TX_SIZE minTs = AOMMIN(ts, pvTs); if (TX_4X4 >= minTs) { pParams->filterLength = 4; @@ -2054,7 +2065,7 @@ static void set_lpf_parameters(AV1_DEBLOCKING_PARAMETERS *const pParams, pParams->filterLength = 8; } else { pParams->filterLength = 16; -#if CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY +#if PARALLEL_DEBLOCKING_15TAPLUMAONLY // No wide filtering for chroma plane if (scaleHorz || scaleVert) { pParams->filterLength = 8; @@ -2064,7 +2075,7 @@ static void set_lpf_parameters(AV1_DEBLOCKING_PARAMETERS *const pParams, #else pParams->filterLength = (TX_4X4 >= AOMMIN(ts, pvTs)) ? (4) : (8); -#endif // CONFIG_PARALLEL_DEBLOCKING_15TAP +#endif // CONFIG_PARALLEL_DEBLOCKING_15TAP || PARALLEL_DEBLOCKING_15TAPLUMAONLY // update the level if the current block is skipped, // but the previous one is not @@ -2072,10 +2083,14 @@ static void set_lpf_parameters(AV1_DEBLOCKING_PARAMETERS *const pParams, } } } + +#if !CONFIG_CB4X4 // prepare internal edge parameters if (currLevel && !currSkipped) { pParams->filterLengthInternal = (TX_4X4 >= ts) ? (4) : (0); } +#endif + // prepare common parameters if (pParams->filterLength || pParams->filterLengthInternal) { const loop_filter_thresh *const limits = cm->lf_info.lfthr + level; @@ -2093,15 +2108,21 @@ static void av1_filter_block_plane_vert(const AV1_COMMON *const cm, const ptrdiff_t modeStride, const uint32_t cuX, const uint32_t cuY) { + const int col_step = MI_SIZE >> MI_SIZE_LOG2; + const int row_step = MI_SIZE >> MI_SIZE_LOG2; const uint32_t scaleHorz = pPlane->subsampling_x; const uint32_t scaleVert = pPlane->subsampling_y; const uint32_t width = pPlane->dst.width; const uint32_t height = pPlane->dst.height; uint8_t *const pDst = pPlane->dst.buf; const int dstStride = pPlane->dst.stride; - for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += 1) { + for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += row_step) { uint8_t *p = pDst + y * MI_SIZE * dstStride; - for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += 1) { + for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += col_step) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will filter the vertical edge aligned with a 8x8 block. + // If 4x4 trasnform is used, it will then filter the internal edge + // aligned with a 4x4 block const MODE_INFO **const pCurr = ppModeInfo + (y << scaleVert) * modeStride + (x << scaleHorz); AV1_DEBLOCKING_PARAMETERS params; @@ -2112,31 +2133,59 @@ static void av1_filter_block_plane_vert(const AV1_COMMON *const cm, switch (params.filterLength) { // apply 4-tap filtering case 4: - aom_lpf_vertical_4(p, dstStride, params.mblim, params.lim, - params.hev_thr); +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) + aom_highbd_lpf_vertical_4_c(CONVERT_TO_SHORTPTR(p), dstStride, + params.mblim, params.lim, + params.hev_thr, cm->bit_depth); + else +#endif // CONFIG_HIGHBITDEPTH + aom_lpf_vertical_4_c(p, dstStride, params.mblim, params.lim, + params.hev_thr); break; // apply 8-tap filtering case 8: - aom_lpf_vertical_8(p, dstStride, params.mblim, params.lim, - params.hev_thr); +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) + aom_highbd_lpf_vertical_8_c(CONVERT_TO_SHORTPTR(p), dstStride, + params.mblim, params.lim, + params.hev_thr, cm->bit_depth); + else +#endif // CONFIG_HIGHBITDEPTH + aom_lpf_vertical_8_c(p, dstStride, params.mblim, params.lim, + params.hev_thr); break; -#if CONFIG_PARALLEL_DEBLOCKING_15TAP || CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY +#if CONFIG_PARALLEL_DEBLOCKING_15TAP || PARALLEL_DEBLOCKING_15TAPLUMAONLY // apply 16-tap filtering case 16: - aom_lpf_vertical_16(p, dstStride, params.mblim, params.lim, - params.hev_thr); +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) + aom_highbd_lpf_vertical_16_c(CONVERT_TO_SHORTPTR(p), dstStride, + params.mblim, params.lim, + params.hev_thr, cm->bit_depth); + else +#endif // CONFIG_HIGHBITDEPTH + aom_lpf_vertical_16_c(p, dstStride, params.mblim, params.lim, + params.hev_thr); break; -#endif // CONFIG_PARALLEL_DEBLOCKING_15TAP +#endif // CONFIG_PARALLEL_DEBLOCKING_15TAP || PARALLEL_DEBLOCKING_15TAPLUMAONLY // no filtering default: break; } // process the internal edge if (params.filterLengthInternal) { - aom_lpf_vertical_4(p + 4, dstStride, params.mblim, params.lim, - params.hev_thr); +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) + aom_highbd_lpf_vertical_4_c(CONVERT_TO_SHORTPTR(p + 4), dstStride, + params.mblim, params.lim, params.hev_thr, + cm->bit_depth); + else +#endif // CONFIG_HIGHBITDEPTH + aom_lpf_vertical_4_c(p + 4, dstStride, params.mblim, params.lim, + params.hev_thr); } // advance the destination pointer - p += 8; + p += MI_SIZE; } } } @@ -2147,15 +2196,21 @@ static void av1_filter_block_plane_horz(const AV1_COMMON *const cm, const ptrdiff_t modeStride, const uint32_t cuX, const uint32_t cuY) { + const int col_step = MI_SIZE >> MI_SIZE_LOG2; + const int row_step = MI_SIZE >> MI_SIZE_LOG2; const uint32_t scaleHorz = pPlane->subsampling_x; const uint32_t scaleVert = pPlane->subsampling_y; const uint32_t width = pPlane->dst.width; const uint32_t height = pPlane->dst.height; uint8_t *const pDst = pPlane->dst.buf; const int dstStride = pPlane->dst.stride; - for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += 1) { + for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += row_step) { uint8_t *p = pDst + y * MI_SIZE * dstStride; - for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += 1) { + for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += col_step) { + // inner loop always filter vertical edges in a MI block. If MI size + // is 8x8, it will first filter the vertical edge aligned with a 8x8 + // block. If 4x4 trasnform is used, it will then filter the internal + // edge aligned with a 4x4 block const MODE_INFO **const pCurr = ppModeInfo + (y << scaleVert) * modeStride + (x << scaleHorz); AV1_DEBLOCKING_PARAMETERS params; @@ -2166,45 +2221,74 @@ static void av1_filter_block_plane_horz(const AV1_COMMON *const cm, switch (params.filterLength) { // apply 4-tap filtering case 4: - aom_lpf_horizontal_4(p, dstStride, params.mblim, params.lim, - params.hev_thr); +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) + aom_highbd_lpf_horizontal_4_c(CONVERT_TO_SHORTPTR(p), dstStride, + params.mblim, params.lim, + params.hev_thr, cm->bit_depth); + else +#endif // CONFIG_HIGHBITDEPTH + aom_lpf_horizontal_4_c(p, dstStride, params.mblim, params.lim, + params.hev_thr); break; // apply 8-tap filtering case 8: - aom_lpf_horizontal_8(p, dstStride, params.mblim, params.lim, - params.hev_thr); +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) + aom_highbd_lpf_horizontal_8_c(CONVERT_TO_SHORTPTR(p), dstStride, + params.mblim, params.lim, + params.hev_thr, cm->bit_depth); + else +#endif // CONFIG_HIGHBITDEPTH + aom_lpf_horizontal_8_c(p, dstStride, params.mblim, params.lim, + params.hev_thr); break; -#if CONFIG_PARALLEL_DEBLOCKING_15TAP || CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY +#if CONFIG_PARALLEL_DEBLOCKING_15TAP || PARALLEL_DEBLOCKING_15TAPLUMAONLY // apply 16-tap filtering case 16: - aom_lpf_horizontal_edge_16(p, dstStride, params.mblim, params.lim, - params.hev_thr); +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) + aom_highbd_lpf_horizontal_edge_16_c( + CONVERT_TO_SHORTPTR(p), dstStride, params.mblim, params.lim, + params.hev_thr, cm->bit_depth); + else +#endif // CONFIG_HIGHBITDEPTH + aom_lpf_horizontal_edge_16_c(p, dstStride, params.mblim, params.lim, + params.hev_thr); break; -#endif // CONFIG_PARALLEL_DEBLOCKING_15TAP +#endif // CONFIG_PARALLEL_DEBLOCKING_15TAP || PARALLEL_DEBLOCKING_15TAPLUMAONLY // no filtering default: break; } // process the internal edge if (params.filterLengthInternal) { - aom_lpf_horizontal_4(p + 4 * dstStride, dstStride, params.mblim, - params.lim, params.hev_thr); +#if CONFIG_HIGHBITDEPTH + if (cm->use_highbitdepth) + aom_highbd_lpf_horizontal_4_c(CONVERT_TO_SHORTPTR(p + 4 * dstStride), + dstStride, params.mblim, params.lim, + params.hev_thr, cm->bit_depth); + else +#endif // CONFIG_HIGHBITDEPTH + aom_lpf_horizontal_4_c(p + 4 * dstStride, dstStride, params.mblim, + params.lim, params.hev_thr); } // advance the destination pointer - p += 8; + p += MI_SIZE; } } } #endif // CONFIG_PARALLEL_DEBLOCKING -#endif void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, struct macroblockd_plane planes[MAX_MB_PLANE], int start, int stop, int y_only) { -#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES || \ - CONFIG_CB4X4 const int num_planes = y_only ? 1 : MAX_MB_PLANE; int mi_row, mi_col; +#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES || \ + CONFIG_CB4X4 + +#if !CONFIG_PARALLEL_DEBLOCKING #if CONFIG_VAR_TX for (int i = 0; i < MAX_MB_PLANE; ++i) memset(cm->top_txfm_context[i], TX_32X32, cm->mi_cols << TX_UNIT_WIDE_LOG2); @@ -2229,27 +2313,17 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, } } } -#else // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES - const int num_planes = y_only ? 1 : MAX_MB_PLANE; - int mi_row, mi_col; -#if !CONFIG_PARALLEL_DEBLOCKING - enum lf_path path; - LOOP_FILTER_MASK lfm; +#else + +#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES + assert(0 && "Not yet updated. ToDo as next steps"); +#endif // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES - if (y_only) - path = LF_PATH_444; - else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1) - path = LF_PATH_420; - else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0) - path = LF_PATH_444; - else - path = LF_PATH_SLOW; -#endif -#if CONFIG_PARALLEL_DEBLOCKING for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col); + // filter all vertical edges in every 64x64 super block for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) { const int32_t scaleHorz = planes[planeIdx].subsampling_x; const int32_t scaleVert = planes[planeIdx].subsampling_y; @@ -2264,6 +2338,42 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col); + // filter all horizontal edges in every 64x64 super block + for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) { + const int32_t scaleHorz = planes[planeIdx].subsampling_x; + const int32_t scaleVert = planes[planeIdx].subsampling_y; + av1_filter_block_plane_horz( + cm, planes + planeIdx, (const MODE_INFO **)(mi + mi_col), + cm->mi_stride, (mi_col * MI_SIZE) >> scaleHorz, + (mi_row * MI_SIZE) >> scaleVert); + } + } + } +#endif // CONFIG_PARALLEL_DEBLOCKING + +#else // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES + +#if CONFIG_PARALLEL_DEBLOCKING + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { + MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { + av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col); + // filter all vertical edges in every 64x64 super block + for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) { + const int32_t scaleHorz = planes[planeIdx].subsampling_x; + const int32_t scaleVert = planes[planeIdx].subsampling_y; + av1_filter_block_plane_vert( + cm, planes + planeIdx, (const MODE_INFO **)(mi + mi_col), + cm->mi_stride, (mi_col * MI_SIZE) >> scaleHorz, + (mi_row * MI_SIZE) >> scaleVert); + } + } + } + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { + MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { + av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col); + // filter all horizontal edges in every 64x64 super block for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) { const int32_t scaleHorz = planes[planeIdx].subsampling_x; const int32_t scaleVert = planes[planeIdx].subsampling_y; @@ -2275,6 +2385,18 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm, } } #else // CONFIG_PARALLEL_DEBLOCKING + enum lf_path path; + LOOP_FILTER_MASK lfm; + + if (y_only) + path = LF_PATH_444; + else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1) + path = LF_PATH_420; + else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0) + path = LF_PATH_444; + else + path = LF_PATH_SLOW; + for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) { MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride; for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) { diff --git a/configure b/configure index 969f8f39c..7e9c62178 100755 --- a/configure +++ b/configure @@ -551,6 +551,11 @@ post_process_cmdline() { soft_enable accounting soft_enable inspection fi + if enabled parallel_deblocking_15tap && ! enabled parallel_deblocking; then + log_echo "parallel_deblocking_15tap dependes on parallel_deblocking, so" + log_echo "enabling parallel_deblocking" + soft_enable parallel_deblocking + fi } process_targets() {