Make CDEF handle 4:2:2 and 4:4:0 properly

Change-Id: I6570518c16d0d7707ddf4e8e8ad1b756eb5a7648
This commit is contained in:
Steinar Midtskogen 2017-04-07 09:24:02 +02:00, committed by Jean-Marc Valin
Parent 40c6ffc0f0
Commit 95a2f869c4
5 changed files: 134 additions and 72 deletions

View file

@ -176,7 +176,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
int nhsb, nvsb;
uint16_t src[OD_DERING_INBUF_SIZE];
uint16_t *linebuf[3];
uint16_t colbuf[3][(OD_BSIZE_MAX + 2 * OD_FILT_VBORDER) * OD_FILT_HBORDER];
uint16_t colbuf[3][(MAX_SB_SIZE + 2 * OD_FILT_VBORDER) * OD_FILT_HBORDER];
dering_list dlist[MAX_MIB_SIZE * MAX_MIB_SIZE];
unsigned char *row_dering, *prev_row_dering, *curr_row_dering;
int dering_count;
@ -185,7 +185,8 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
int stride;
int mi_wide_l2[3];
int mi_high_l2[3];
int dec[3];
int xdec[3];
int ydec[3];
int pli;
int dering_left;
int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
@ -201,11 +202,10 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
prev_row_dering = row_dering + 1;
curr_row_dering = prev_row_dering + nhsb + 2;
for (pli = 0; pli < nplanes; pli++) {
dec[pli] = xd->plane[pli].subsampling_x;
xdec[pli] = xd->plane[pli].subsampling_x;
ydec[pli] = xd->plane[pli].subsampling_y;
mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
// TODO(stemidts/jmvalin): We should use subsampling_y below but can't
// until we've properly fixed 4:2:2
mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
}
stride = (cm->mi_cols << MI_SIZE_LOG2) + 2 * OD_FILT_HBORDER;
for (pli = 0; pli < nplanes; pli++) {
@ -277,7 +277,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
curr_row_dering[sbc] = 1;
for (pli = 0; pli < nplanes; pli++) {
uint16_t dst[OD_BSIZE_MAX * OD_BSIZE_MAX];
uint16_t dst[MAX_SB_SIZE * MAX_SB_SIZE];
int coffset;
int rend, cend;
int clpf_damping = 3 - (pli != AOM_PLANE_Y) + (cm->base_qindex >> 6);
@ -409,8 +409,8 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
(sbc * MAX_MIB_SIZE << mi_wide_l2[pli])],
xd->plane[pli].dst.stride, dst,
&src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER],
dec[pli], dir, NULL, var, pli, dlist, dering_count, level,
clpf_strength, clpf_damping, coeff_shift, 0, 1);
xdec[pli], ydec[pli], dir, NULL, var, pli, dlist, dering_count,
level, clpf_strength, clpf_damping, coeff_shift, 0, 1);
} else {
#endif
od_dering(&xd->plane[pli]
@ -419,8 +419,9 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
(sbc * MAX_MIB_SIZE << mi_wide_l2[pli])],
xd->plane[pli].dst.stride, dst,
&src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER],
dec[pli], dir, NULL, var, pli, dlist, dering_count, level,
clpf_strength, clpf_damping, coeff_shift, 0, 0);
xdec[pli], ydec[pli], dir, NULL, var, pli, dlist,
dering_count, level, clpf_strength, clpf_damping,
coeff_shift, 0, 0);
#if CONFIG_AOM_HIGHBITDEPTH
}

View file

@ -216,19 +216,38 @@ void copy_dering_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
int bsize) {
int bi, bx, by;
if (bsize == 3) {
if (bsize == BLOCK_8X8) {
for (bi = 0; bi < dering_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
copy_8x8_16bit_to_16bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
&src[bi << (2 * 3)], 8);
&src[bi << (3 + 3)], 8);
}
} else if (bsize == BLOCK_4X8) {
for (bi = 0; bi < dering_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
copy_4x4_16bit_to_16bit(&dst[(by << 3) * dstride + (bx << 2)], dstride,
&src[bi << (3 + 2)], 4);
copy_4x4_16bit_to_16bit(&dst[((by << 3) + 4) * dstride + (bx << 2)],
dstride, &src[(bi << (3 + 2)) + 4 * 4], 4);
}
} else if (bsize == BLOCK_8X4) {
for (bi = 0; bi < dering_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
copy_4x4_16bit_to_16bit(&dst[(by << 2) * dstride + (bx << 3)], dstride,
&src[bi << (2 + 3)], 8);
copy_4x4_16bit_to_16bit(&dst[(by << 2) * dstride + (bx << 3) + 4],
dstride, &src[(bi << (2 + 3)) + 4], 8);
}
} else {
assert(bsize == BLOCK_4X4);
for (bi = 0; bi < dering_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
copy_4x4_16bit_to_16bit(&dst[(by << 2) * dstride + (bx << 2)], dstride,
&src[bi << (2 * 2)], 4);
&src[bi << (2 + 2)], 4);
}
}
}
@ -253,33 +272,52 @@ static void copy_dering_16bit_to_8bit(uint8_t *dst, int dstride,
const uint16_t *src, dering_list *dlist,
int dering_count, int bsize) {
int bi, bx, by;
if (bsize == 3) {
if (bsize == BLOCK_8X8) {
for (bi = 0; bi < dering_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
copy_8x8_16bit_to_8bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
&src[bi << 2 * bsize], 1 << bsize);
&src[bi << (3 + 3)], 8);
}
} else if (bsize == BLOCK_4X8) {
for (bi = 0; bi < dering_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
copy_4x4_16bit_to_8bit(&dst[(by << 3) * dstride + (bx << 2)], dstride,
&src[bi << (3 + 2)], 4);
copy_4x4_16bit_to_8bit(&dst[((by << 3) + 4) * dstride + (bx << 2)],
dstride, &src[(bi << (3 + 2)) + 4 * 4], 4);
}
} else if (bsize == BLOCK_8X4) {
for (bi = 0; bi < dering_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 3)], dstride,
&src[bi << (2 + 3)], 8);
copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 3) + 4], dstride,
&src[(bi << (2 + 3)) + 4], 8);
}
} else {
assert(bsize == BLOCK_4X4);
for (bi = 0; bi < dering_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
copy_4x4_16bit_to_8bit(&dst[(by << 2) * dstride + (bx << 2)], dstride,
&src[bi << 2 * bsize], 1 << bsize);
&src[bi << (2 * 2)], 4);
}
}
}
void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int *dirinit,
int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
dering_list *dlist, int dering_count, int level,
int ydec, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
int *dirinit, int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
int pli, dering_list *dlist, int dering_count, int level,
int clpf_strength, int clpf_damping, int coeff_shift,
int skip_dering, int hbd) {
int bi;
int bx;
int by;
int bsize;
int bsize, bsizex, bsizey;
int threshold = (level >> 1) << coeff_shift;
int dering_damping = 4 + !pli + (level & 1) + coeff_shift;
@ -288,11 +326,15 @@ void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
dering_damping = 3 + !pli + coeff_shift;
}
od_filter_dering_direction_func filter_dering_direction[OD_DERINGSIZES] = {
od_filter_dering_direction_func filter_dering_direction[] = {
od_filter_dering_direction_4x4, od_filter_dering_direction_8x8
};
clpf_damping += coeff_shift;
bsize = OD_DERING_SIZE_LOG2 - xdec;
bsize =
ydec ? (xdec ? BLOCK_4X4 : BLOCK_8X4) : (xdec ? BLOCK_4X8 : BLOCK_8X8);
bsizex = 3 - xdec;
bsizey = 3 - ydec;
if (!skip_dering) {
if (pli == 0) {
if (!dirinit || !*dirinit) {
@ -305,30 +347,19 @@ void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
}
if (dirinit) *dirinit = 1;
}
}
// Only run dering for a non-zero threshold. For 4:2:2 and 4:4:0 the
// threshold is expected to always be zero: the assert below only admits
// 8x8 and 4x4 blocks when dering runs. If we don't dering, we still need to
// eventually write something out in y[] later.
if (threshold != 0) {
assert(bsize == BLOCK_8X8 || bsize == BLOCK_4X4);
for (bi = 0; bi < dering_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
/* Deringing orthogonal to the direction uses a tighter threshold
because we want to be conservative. We've presumably already
achieved some deringing, so the amount of change is expected
to be low. Also, since we might be filtering across an edge, we
want to make sure not to blur it. That being said, we might want
to be a little bit more aggressive on pure horizontal/vertical
since the ringing there tends to be directional, so it doesn't
get removed by the directional filtering. */
(filter_dering_direction[bsize - OD_LOG_BSIZE0])(
&y[bi << 2 * bsize], 1 << bsize,
&in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)],
od_adjust_thresh(threshold, var[by][bx]), dir[by][bx],
dering_damping);
}
} else {
for (bi = 0; bi < dering_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
(filter_dering_direction[bsize - OD_LOG_BSIZE0])(
&y[bi << 2 * bsize], 1 << bsize,
&in[(by * OD_FILT_BSTRIDE << bsize) + (bx << bsize)], threshold,
(filter_dering_direction[bsize == BLOCK_8X8])(
&y[bi << (bsizex + bsizey)], 1 << bsizex,
&in[(by * OD_FILT_BSTRIDE << bsizey) + (bx << bsizex)],
pli ? threshold : od_adjust_thresh(threshold, var[by][bx]),
dir[by][bx], dering_damping);
}
}
@ -341,27 +372,28 @@ void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
for (bi = 0; bi < dering_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
int py = by << bsize;
int px = bx << bsize;
int py = by << bsizey;
int px = bx << bsizex;
if (!dst || hbd) {
// 16 bit destination if high bitdepth or 8 bit destination not given
(!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block_hbd
: aom_clpf_hblock_hbd)(
dst ? (uint16_t *)dst + py * dstride + px : &y[bi << 2 * bsize],
in + py * OD_FILT_BSTRIDE + px, dst && hbd ? dstride : 1 << bsize,
OD_FILT_BSTRIDE, 1 << bsize, 1 << bsize,
dst ? (uint16_t *)dst + py * dstride + px
: &y[bi << (bsizex + bsizey)],
in + py * OD_FILT_BSTRIDE + px, dst && hbd ? dstride : 1 << bsizex,
OD_FILT_BSTRIDE, 1 << bsizex, 1 << bsizey,
clpf_strength << coeff_shift, clpf_damping);
} else {
// Do clpf and write the result to an 8 bit destination
(!threshold || (dir[by][bx] < 4 && dir[by][bx]) ? aom_clpf_block
: aom_clpf_hblock)(
dst + py * dstride + px, in + py * OD_FILT_BSTRIDE + px, dstride,
OD_FILT_BSTRIDE, 1 << bsize, 1 << bsize,
OD_FILT_BSTRIDE, 1 << bsizex, 1 << bsizey,
clpf_strength << coeff_shift, clpf_damping);
}
}
} else {
} else if (threshold != 0) {
// No clpf, so copy instead
if (hbd) {
copy_dering_16bit_to_16bit((uint16_t *)dst, dstride, y, dlist,
@ -369,5 +401,19 @@ void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
} else {
copy_dering_16bit_to_8bit(dst, dstride, y, dlist, dering_count, bsize);
}
} else if (dirinit) {
// If we're here, both dering and clpf are off, and we still haven't written
// anything to y[] yet, so we just copy the input to y[]. This is necessary
// only for av1_cdef_search() and only av1_cdef_search() sets dirinit.
for (bi = 0; bi < dering_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
int iy, ix;
// TODO(stemidts/jmvalin): SIMD optimisations
for (iy = 0; iy < 1 << bsizey; iy++)
for (ix = 0; ix < 1 << bsizex; ix++)
y[(bi << (bsizex + bsizey)) + (iy << bsizex) + ix] =
in[((by << bsizey) + iy) * OD_FILT_BSTRIDE + (bx << bsizex) + ix];
}
}
}

View file

@ -14,23 +14,18 @@
#include "odintrin.h"
#define OD_DERINGSIZES (2)
#define OD_DERING_SIZE_LOG2 (3)
#define OD_DERING_NBLOCKS (OD_BSIZE_MAX / 8)
#define OD_DERING_NBLOCKS (MAX_SB_SIZE / 8)
/* We need to buffer three vertical lines. */
#define OD_FILT_VBORDER (3)
/* We only need to buffer three horizontal pixels too, but let's align to
16 bytes (8 x 16 bits) to make vectorization easier. */
#define OD_FILT_HBORDER (8)
#define OD_FILT_BSTRIDE \
ALIGN_POWER_OF_TWO(OD_BSIZE_MAX + 2 * OD_FILT_HBORDER, 3)
#define OD_FILT_BSTRIDE ALIGN_POWER_OF_TWO(MAX_SB_SIZE + 2 * OD_FILT_HBORDER, 3)
#define OD_DERING_VERY_LARGE (30000)
#define OD_DERING_INBUF_SIZE \
(OD_FILT_BSTRIDE * (OD_BSIZE_MAX + 2 * OD_FILT_VBORDER))
(OD_FILT_BSTRIDE * (MAX_SB_SIZE + 2 * OD_FILT_VBORDER))
extern const int OD_DIRECTION_OFFSETS_TABLE[8][3];
@ -48,9 +43,9 @@ void copy_dering_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
int bsize);
void od_dering(uint8_t *dst, int dstride, uint16_t *y, uint16_t *in, int xdec,
int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int *dirinit,
int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
dering_list *dlist, int dering_count, int level,
int ydec, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
int *dirinit, int var[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS],
int pli, dering_list *dlist, int dering_count, int level,
int clpf_strength, int clpf_damping, int coeff_shift,
int skip_dering, int hbd);
#endif

View file

@ -51,8 +51,6 @@ extern "C" {
#define OD_LOG_BSIZE0 (2)
/*There are 5 block sizes total (4x4, 8x8, 16x16, 32x32 and 64x64).*/
#define OD_NBSIZES (5)
/*The maximum length of the side of a block.*/
#define OD_BSIZE_MAX MAX_SB_SIZE
/*There are 4 transform sizes total in AV1 (4x4, 8x8, 16x16 and 32x32).*/
#define OD_TXSIZES TX_SIZES

View file

@ -232,18 +232,37 @@ uint64_t compute_dering_dist(uint16_t *dst, int dstride, uint16_t *src,
bx = dlist[bi].bx;
if (pli == 0) {
sum += dist_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
&src[bi << (2 * 3)], 8, coeff_shift);
&src[bi << (3 + 3)], 8, coeff_shift);
} else {
sum += mse_8x8_16bit(&dst[(by << 3) * dstride + (bx << 3)], dstride,
&src[bi << (2 * 3)], 8);
&src[bi << (3 + 3)], 8);
}
}
} else if (bsize == BLOCK_4X8) {
for (bi = 0; bi < dering_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
sum += mse_4x4_16bit(&dst[(by << 3) * dstride + (bx << 2)], dstride,
&src[bi << (3 + 2)], 4);
sum += mse_4x4_16bit(&dst[((by << 3) + 4) * dstride + (bx << 2)], dstride,
&src[(bi << (3 + 2)) + 4 * 4], 4);
}
} else if (bsize == BLOCK_8X4) {
for (bi = 0; bi < dering_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
sum += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 3)], dstride,
&src[bi << (2 + 3)], 8);
sum += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 3) + 4], dstride,
&src[(bi << (2 + 3)) + 4], 8);
}
} else {
assert(bsize == BLOCK_4X4);
for (bi = 0; bi < dering_count; bi++) {
by = dlist[bi].by;
bx = dlist[bi].bx;
sum += mse_4x4_16bit(&dst[(by << 2) * dstride + (bx << 2)], dstride,
&src[bi << (2 * 2)], 4);
&src[bi << (2 + 2)], 4);
}
}
return sum >> 2 * coeff_shift;
@ -262,7 +281,8 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
int bsize[3];
int mi_wide_l2[3];
int mi_high_l2[3];
int dec[3];
int xdec[3];
int ydec[3];
int pli;
int dering_count;
int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
@ -315,8 +335,10 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
32, sizeof(*src) * cm->mi_rows * cm->mi_cols * MI_SIZE * MI_SIZE);
ref_coeff[pli] = aom_memalign(
32, sizeof(*ref_coeff) * cm->mi_rows * cm->mi_cols * MI_SIZE * MI_SIZE);
dec[pli] = xd->plane[pli].subsampling_x;
bsize[pli] = dec[pli] ? BLOCK_4X4 : BLOCK_8X8;
xdec[pli] = xd->plane[pli].subsampling_x;
ydec[pli] = xd->plane[pli].subsampling_y;
bsize[pli] = ydec[pli] ? (xdec[pli] ? BLOCK_4X4 : BLOCK_8X4)
: (xdec[pli] ? BLOCK_4X8 : BLOCK_8X8);
stride[pli] = cm->mi_cols << MI_SIZE_LOG2;
mi_wide_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_x;
mi_high_l2[pli] = MI_SIZE_LOG2 - xd->plane[pli].subsampling_y;
@ -384,8 +406,8 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
(sbc * MAX_MIB_SIZE << mi_wide_l2[pli]) - xoff,
stride[pli], ysize, xsize);
od_dering(clpf_strength ? NULL : (uint8_t *)in, OD_FILT_BSTRIDE,
tmp_dst, in, dec[pli], dir, &dirinit, var, pli, dlist,
dering_count, threshold,
tmp_dst, in, xdec[pli], ydec[pli], dir, &dirinit, var, pli,
dlist, dering_count, threshold,
clpf_strength + (clpf_strength == 3), clpf_damping,
coeff_shift, clpf_strength != 0, 1);
curr_mse = compute_dering_dist(