CDEF encode buffering optimizations

Change-Id: I6b178d5ebf353bca98f18d8add2aa8b77e03cc4f
This commit is contained in:
Jean-Marc Valin 2017-03-24 15:20:05 -04:00, committed by Steinar Midtskogen
Parent 12ec6c6529
Commit deb1950bb3
1 changed file with 36 additions and 25 deletions

View file

@ -92,6 +92,19 @@ static double compute_dist(uint16_t *x, int xstride, uint16_t *y, int ystride,
return sum / (double)(1 << 2 * coeff_shift);
}
/* FIXME: SSE-optimize this. */
/* Copy a vsize x hsize rectangle of 16-bit samples out of a source plane,
   starting at row src_voffset / column src_hoffset, into a packed
   destination buffer.  Both strides are in units of uint16_t samples. */
static void copy_sb16_16(uint16_t *dst, int dstride, const uint16_t *src,
                         int src_voffset, int src_hoffset, int sstride,
                         int vsize, int hsize) {
  int row;
  const uint16_t *src_row = src + src_voffset * sstride + src_hoffset;
  uint16_t *dst_row = dst;
  /* Walk the rectangle one row at a time, advancing each pointer by its
     own stride, copying sample by sample. */
  for (row = 0; row < vsize; row++) {
    int col;
    for (col = 0; col < hsize; col++) dst_row[col] = src_row[col];
    src_row += sstride;
    dst_row += dstride;
  }
}
void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
AV1_COMMON *cm, MACROBLOCKD *xd) {
int r, c;
@ -181,7 +194,8 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
int nvb, nhb;
int gi;
int dirinit = 0;
DECLARE_ALIGNED(32, uint16_t, dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]);
DECLARE_ALIGNED(32, uint16_t,
dst[3][MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]);
DECLARE_ALIGNED(32, uint16_t,
tmp_dst[MAX_MIB_SIZE * MAX_MIB_SIZE * 8 * 8]);
nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
@ -189,24 +203,23 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
dering_count = sb_compute_dering_list(cm, sbr * MAX_MIB_SIZE,
sbc * MAX_MIB_SIZE, dlist);
if (dering_count == 0) continue;
for (pli = 0; pli < nplanes; pli++) {
/* Copy the dst buffer only once since it will always be written at
the same place. */
copy_sb16_16(dst[pli], MAX_MIB_SIZE << bsize[pli], src[pli],
sbr * MAX_MIB_SIZE << bsize[pli],
sbc * MAX_MIB_SIZE << bsize[pli], stride[pli],
nvb << bsize[pli], nhb << bsize[pli]);
}
for (gi = 0; gi < TOTAL_STRENGTHS; gi++) {
int threshold;
int clpf_strength;
DECLARE_ALIGNED(32, uint16_t, inbuf[OD_DERING_INBUF_SIZE]);
uint16_t *in;
int j;
level = dering_level_table[gi / CLPF_STRENGTHS];
threshold = level << coeff_shift;
for (pli = 0; pli < nplanes; pli++) {
if (pli > 0 && !chroma_dering) threshold = 0;
for (r = 0; r < nvb << bsize[pli]; r++) {
for (c = 0; c < nhb << bsize[pli]; c++) {
dst[(r * MAX_MIB_SIZE << bsize[pli]) + c] =
src[pli]
[((sbr * MAX_MIB_SIZE << bsize[pli]) + r) * stride[pli] +
(sbc * MAX_MIB_SIZE << bsize[pli]) + c];
}
}
in = inbuf + OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER;
/* We avoid filtering the pixels for which some of the pixels to
average
@ -214,27 +227,25 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
would add special cases for any future vectorization. */
for (i = 0; i < OD_DERING_INBUF_SIZE; i++)
inbuf[i] = OD_DERING_VERY_LARGE;
for (i = -OD_FILT_VBORDER * (sbr != 0);
i < (nvb << bsize[pli]) + OD_FILT_VBORDER * (sbr != nvsb - 1);
i++) {
for (j = -OD_FILT_HBORDER * (sbc != 0);
j < (nhb << bsize[pli]) + OD_FILT_HBORDER * (sbc != nhsb - 1);
j++) {
uint16_t *x;
x = &src[pli][(sbr * stride[pli] * MAX_MIB_SIZE << bsize[pli]) +
(sbc * MAX_MIB_SIZE << bsize[pli])];
in[i * OD_FILT_BSTRIDE + j] = x[i * stride[pli] + j];
}
}
int yoff = OD_FILT_VBORDER * (sbr != 0);
int xoff = OD_FILT_HBORDER * (sbc != 0);
int ysize =
(nvb << bsize[pli]) + OD_FILT_VBORDER * (sbr != nvsb - 1) + yoff;
int xsize =
(nhb << bsize[pli]) + OD_FILT_HBORDER * (sbc != nhsb - 1) + xoff;
copy_sb16_16(&in[(-yoff * OD_FILT_BSTRIDE - xoff)], OD_FILT_BSTRIDE,
src[pli], (sbr * MAX_MIB_SIZE << bsize[pli]) - yoff,
(sbc * MAX_MIB_SIZE << bsize[pli]) - xoff, stride[pli],
ysize, xsize);
clpf_strength = gi % CLPF_STRENGTHS;
od_dering(tmp_dst, in, dec[pli], dir, &dirinit, var, pli, dlist,
dering_count, threshold,
clpf_strength + (clpf_strength == 3), clpf_damping,
coeff_shift);
copy_dering_16bit_to_16bit(dst, MAX_MIB_SIZE << bsize[pli], tmp_dst,
dlist, dering_count, bsize[pli]);
copy_dering_16bit_to_16bit(dst[pli], MAX_MIB_SIZE << bsize[pli],
tmp_dst, dlist, dering_count, bsize[pli]);
mse[pli][sb_count][gi] = (int)compute_dist(
dst, MAX_MIB_SIZE << bsize[pli],
dst[pli], MAX_MIB_SIZE << bsize[pli],
&ref_coeff[pli][(sbr * stride[pli] * MAX_MIB_SIZE << bsize[pli]) +
(sbc * MAX_MIB_SIZE << bsize[pli])],
stride[pli], nhb, nvb, coeff_shift, bsize[pli]);