Remove boundary checks in CLPF

Change-Id: Icc93783f47fe7fe3aac395aadcc8bbc307dae1fb
This commit is contained in:
Steinar Midtskogen 2017-03-21 09:59:14 +01:00
Parent 6eca835fff
Commit d280a84554
8 changed files: 46 additions and 335 deletions

View file

@ -852,7 +852,7 @@ add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint
specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/; specialize qw/aom_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
if (aom_config("CONFIG_CDEF") eq "yes") { if (aom_config("CONFIG_CDEF") eq "yes") {
add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, unsigned int bd"; add_proto qw/void aom_clpf_block_hbd/, "const uint16_t *src, uint16_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, unsigned int bd";
if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") { if (aom_config("CONFIG_AOM_HIGHBITDEPTH") eq "yes") {
add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd, unsigned int dmp"; add_proto qw/void aom_clpf_detect_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int bd, unsigned int dmp";
add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd, unsigned int dmp"; add_proto qw/void aom_clpf_detect_multi_hbd/, "const uint16_t *rec, const uint16_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int bd, unsigned int dmp";
@ -867,7 +867,7 @@ if (aom_config("CONFIG_CDEF") eq "yes") {
if ($opts{config} !~ /libs-x86-win32-vs.*/) { if ($opts{config} !~ /libs-x86-win32-vs.*/) {
specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/; specialize qw/aom_clpf_block_hbd sse2 ssse3 sse4_1 neon/;
} }
add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, BOUNDARY_TYPE bt, unsigned int bd"; add_proto qw/void aom_clpf_block/, "const uint8_t *src, uint8_t *dst, int sstride, int dstride, int x0, int y0, int sizex, int sizey, unsigned int strength, unsigned int bd";
add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int dmp"; add_proto qw/void aom_clpf_detect/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum0, int *sum1, unsigned int strength, int size, unsigned int dmp";
add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int dmp"; add_proto qw/void aom_clpf_detect_multi/, "const uint8_t *rec, const uint8_t *org, int rstride, int ostride, int x0, int y0, int width, int height, int *sum, int size, unsigned int dmp";
# VS compiling for 32 bit targets does not support vector types in # VS compiling for 32 bit targets does not support vector types in

View file

@ -265,10 +265,12 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd,
int level, clpf_strength; int level, clpf_strength;
int nhb, nvb; int nhb, nvb;
int cstart = 0; int cstart = 0;
#if 0 // TODO(stemidts/jmvalin): Handle tile borders correctly
BOUNDARY_TYPE boundary_type = BOUNDARY_TYPE boundary_type =
cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride + cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
MAX_MIB_SIZE * sbc] MAX_MIB_SIZE * sbc]
->mbmi.boundary_info; ->mbmi.boundary_info;
#endif
if (!dering_left) cstart = -OD_FILT_HBORDER; if (!dering_left) cstart = -OD_FILT_HBORDER;
nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc); nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr); nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
@ -435,7 +437,7 @@ void av1_cdef_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm, MACROBLOCKD *xd,
od_dering(dst, od_dering(dst,
&src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER], &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER],
dec[pli], dir, pli, dlist, dering_count, threshold, dec[pli], dir, pli, dlist, dering_count, threshold,
clpf_strength, clpf_damping, coeff_shift, boundary_type); clpf_strength, clpf_damping, coeff_shift);
#if CONFIG_AOM_HIGHBITDEPTH #if CONFIG_AOM_HIGHBITDEPTH
if (cm->use_highbitdepth) { if (cm->use_highbitdepth) {
copy_dering_16bit_to_16bit( copy_dering_16bit_to_16bit(

View file

@ -33,25 +33,20 @@ int av1_clpf_sample(int X, int A, int B, int C, int D, int E, int F, int G,
void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride, void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizex, int sizey, int dstride, int x0, int y0, int sizex, int sizey,
unsigned int strength, BOUNDARY_TYPE bt, unsigned int strength, unsigned int damping) {
unsigned int damping) {
int x, y; int x, y;
const int xmin = x0 - !(bt & TILE_LEFT_BOUNDARY) * 2;
const int ymin = y0 - !(bt & TILE_ABOVE_BOUNDARY) * 2;
const int xmax = x0 + sizex + !(bt & TILE_RIGHT_BOUNDARY) * 2 - 1;
const int ymax = y0 + sizey + !(bt & TILE_BOTTOM_BOUNDARY) * 2 - 1;
for (y = y0; y < y0 + sizey; y++) { for (y = y0; y < y0 + sizey; y++) {
for (x = x0; x < x0 + sizex; x++) { for (x = x0; x < x0 + sizex; x++) {
const int X = src[y * sstride + x]; const int X = src[y * sstride + x];
const int A = src[AOMMAX(ymin, y - 2) * sstride + x]; const int A = src[(y - 2) * sstride + x];
const int B = src[AOMMAX(ymin, y - 1) * sstride + x]; const int B = src[(y - 1) * sstride + x];
const int C = src[y * sstride + AOMMAX(xmin, x - 2)]; const int C = src[y * sstride + x - 2];
const int D = src[y * sstride + AOMMAX(xmin, x - 1)]; const int D = src[y * sstride + x - 1];
const int E = src[y * sstride + AOMMIN(xmax, x + 1)]; const int E = src[y * sstride + x + 1];
const int F = src[y * sstride + AOMMIN(xmax, x + 2)]; const int F = src[y * sstride + x + 2];
const int G = src[AOMMIN(ymax, y + 1) * sstride + x]; const int G = src[(y + 1) * sstride + x];
const int H = src[AOMMIN(ymax, y + 2) * sstride + x]; const int H = src[(y + 2) * sstride + x];
const int delta = const int delta =
av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, damping); av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, damping);
dst[y * dstride + x] = X + delta; dst[y * dstride + x] = X + delta;
@ -63,25 +58,20 @@ void aom_clpf_block_c(const uint8_t *src, uint8_t *dst, int sstride,
// TODO(stemidts): Put under CONFIG_AOM_HIGHBITDEPTH if CDEF do 8 bit internally // TODO(stemidts): Put under CONFIG_AOM_HIGHBITDEPTH if CDEF do 8 bit internally
void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride, void aom_clpf_block_hbd_c(const uint16_t *src, uint16_t *dst, int sstride,
int dstride, int x0, int y0, int sizex, int sizey, int dstride, int x0, int y0, int sizex, int sizey,
unsigned int strength, BOUNDARY_TYPE bt, unsigned int strength, unsigned int damping) {
unsigned int damping) {
int x, y; int x, y;
const int xmin = x0 - !(bt & TILE_LEFT_BOUNDARY) * 2;
const int ymin = y0 - !(bt & TILE_ABOVE_BOUNDARY) * 2;
const int xmax = x0 + sizex + !(bt & TILE_RIGHT_BOUNDARY) * 2 - 1;
const int ymax = y0 + sizey + !(bt & TILE_BOTTOM_BOUNDARY) * 2 - 1;
for (y = y0; y < y0 + sizey; y++) { for (y = y0; y < y0 + sizey; y++) {
for (x = x0; x < x0 + sizex; x++) { for (x = x0; x < x0 + sizex; x++) {
const int X = src[y * sstride + x]; const int X = src[y * sstride + x];
const int A = src[AOMMAX(ymin, y - 2) * sstride + x]; const int A = src[(y - 2) * sstride + x];
const int B = src[AOMMAX(ymin, y - 1) * sstride + x]; const int B = src[(y - 1) * sstride + x];
const int C = src[y * sstride + AOMMAX(xmin, x - 2)]; const int C = src[y * sstride + x - 2];
const int D = src[y * sstride + AOMMAX(xmin, x - 1)]; const int D = src[y * sstride + x - 1];
const int E = src[y * sstride + AOMMIN(xmax, x + 1)]; const int E = src[y * sstride + x + 1];
const int F = src[y * sstride + AOMMIN(xmax, x + 2)]; const int F = src[y * sstride + x + 2];
const int G = src[AOMMIN(ymax, y + 1) * sstride + x]; const int G = src[(y + 1) * sstride + x];
const int H = src[AOMMIN(ymax, y + 2) * sstride + x]; const int H = src[(y + 2) * sstride + x];
const int delta = const int delta =
av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, damping); av1_clpf_sample(X, A, B, C, D, E, F, G, H, strength, damping);
dst[y * dstride + x] = X + delta; dst[y * dstride + x] = X + delta;

View file

@ -16,69 +16,6 @@
// Process blocks of width 8, two lines at a time, 8 bit. // Process blocks of width 8, two lines at a time, 8 bit.
static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride, static void clpf_block8(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizey,
BOUNDARY_TYPE bt, unsigned int strength,
unsigned int dmp) {
const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 2 : -1;
const int right = !(bt & TILE_RIGHT_BOUNDARY);
const int left = !(bt & TILE_LEFT_BOUNDARY);
const int top = bt & TILE_ABOVE_BOUNDARY ? 0 : -1;
DECLARE_ALIGNED(16, static const uint64_t,
c_shuff[]) = { 0x0504030201000000LL, 0x0d0c0b0a09080808LL };
DECLARE_ALIGNED(16, static const uint64_t,
d_shuff[]) = { 0x0605040302010000LL, 0x0e0d0c0b0a090808LL };
DECLARE_ALIGNED(16, static const uint64_t,
e_shuff[]) = { 0x0707060504030201LL, 0x0f0f0e0d0c0b0a09LL };
DECLARE_ALIGNED(16, static const uint64_t,
f_shuff[]) = { 0x0707070605040302LL, 0x0f0f0f0e0d0c0b0aLL };
int y;
dst += x0 + y0 * dstride;
src += x0 + y0 * sstride;
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + sstride);
const v64 l3 = v64_load_aligned(src - (y != top) * sstride);
const v64 l4 = v64_load_aligned(src + ((y != bottom) + 1) * sstride);
v128 o = v128_from_v64(l1, l2);
const v128 a =
v128_from_v64(v64_load_aligned(src - 2 * (y != top) * sstride), l3);
const v128 b = v128_from_v64(l3, l1);
const v128 g = v128_from_v64(l2, l4);
const v128 h = v128_from_v64(
l4, v64_load_aligned(src + (2 * (y != bottom) + 1) * sstride));
v128 c, d, e, f;
if (left) {
c = v128_from_v64(v64_load_unaligned(src - 2),
v64_load_unaligned(src - 2 + sstride));
d = v128_from_v64(v64_load_unaligned(src - 1),
v64_load_unaligned(src - 1 + sstride));
} else { // Left clipping
c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
}
if (right) {
e = v128_from_v64(v64_load_unaligned(src + 1),
v64_load_unaligned(src + 1 + sstride));
f = v128_from_v64(v64_load_unaligned(src + 2),
v64_load_unaligned(src + 2 + sstride));
} else { // Right clipping
e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
}
o = calc_delta(o, a, b, c, d, e, f, g, h, strength, dmp);
v64_store_aligned(dst, v128_high_v64(o));
v64_store_aligned(dst + dstride, v128_low_v64(o));
src += sstride * 2;
dst += dstride * 2;
}
}
// As above, but with no clipping tests
static void clpf_block8_noclip(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizey, int dstride, int x0, int y0, int sizey,
unsigned int strength, unsigned int dmp) { unsigned int strength, unsigned int dmp) {
int y; int y;
@ -115,84 +52,6 @@ static void clpf_block8_noclip(const uint8_t *src, uint8_t *dst, int sstride,
// Process blocks of width 4, four lines at a time, 8 bit. // Process blocks of width 4, four lines at a time, 8 bit.
static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride, static void clpf_block4(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizey,
BOUNDARY_TYPE bt, unsigned int strength,
unsigned int dmp) {
const int right = !(bt & TILE_RIGHT_BOUNDARY);
const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 4 : -1;
const int left = !(bt & TILE_LEFT_BOUNDARY);
const int top = bt & TILE_ABOVE_BOUNDARY ? 0 : -1;
DECLARE_ALIGNED(16, static const uint64_t,
c_shuff[]) = { 0x0504040401000000LL, 0x0d0c0c0c09080808LL };
DECLARE_ALIGNED(16, static const uint64_t,
d_shuff[]) = { 0x0605040402010000LL, 0x0e0d0c0c0a090808LL };
DECLARE_ALIGNED(16, static const uint64_t,
e_shuff[]) = { 0x0707060503030201LL, 0x0f0f0e0d0b0b0a09LL };
DECLARE_ALIGNED(16, static const uint64_t,
f_shuff[]) = { 0x0707070603030302LL, 0x0f0f0f0e0b0b0b0aLL };
int y;
dst += x0 + y0 * dstride;
src += x0 + y0 * sstride;
for (y = 0; y < sizey; y += 4) {
const uint32_t l0 = u32_load_aligned(src - 2 * (y != top) * sstride);
const uint32_t l1 = u32_load_aligned(src - (y != top) * sstride);
const uint32_t l2 = u32_load_aligned(src);
const uint32_t l3 = u32_load_aligned(src + sstride);
const uint32_t l4 = u32_load_aligned(src + 2 * sstride);
const uint32_t l5 = u32_load_aligned(src + 3 * sstride);
const uint32_t l6 = u32_load_aligned(src + ((y != bottom) + 3) * sstride);
const uint32_t l7 =
u32_load_aligned(src + (2 * (y != bottom) + 3) * sstride);
v128 o = v128_from_32(l2, l3, l4, l5);
const v128 a = v128_from_32(l0, l1, l2, l3);
const v128 b = v128_from_32(l1, l2, l3, l4);
const v128 g = v128_from_32(l3, l4, l5, l6);
const v128 h = v128_from_32(l4, l5, l6, l7);
v128 c, d, e, f;
if (left) {
c = v128_from_32(u32_load_unaligned(src - 2),
u32_load_unaligned(src + sstride - 2),
u32_load_unaligned(src + 2 * sstride - 2),
u32_load_unaligned(src + 3 * sstride - 2));
d = v128_from_32(u32_load_unaligned(src - 1),
u32_load_unaligned(src + sstride - 1),
u32_load_unaligned(src + 2 * sstride - 1),
u32_load_unaligned(src + 3 * sstride - 1));
} else { // Left clipping
c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
}
if (right) {
e = v128_from_32(u32_load_unaligned(src + 1),
u32_load_unaligned(src + sstride + 1),
u32_load_unaligned(src + 2 * sstride + 1),
u32_load_unaligned(src + 3 * sstride + 1));
f = v128_from_32(u32_load_unaligned(src + 2),
u32_load_unaligned(src + sstride + 2),
u32_load_unaligned(src + 2 * sstride + 2),
u32_load_unaligned(src + 3 * sstride + 2));
} else { // Right clipping
e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
}
o = calc_delta(o, a, b, c, d, e, f, g, h, strength, dmp);
u32_store_aligned(dst, v128_low_u32(v128_shr_n_byte(o, 12)));
u32_store_aligned(dst + dstride, v128_low_u32(v128_shr_n_byte(o, 8)));
u32_store_aligned(dst + 2 * dstride, v128_low_u32(v128_shr_n_byte(o, 4)));
u32_store_aligned(dst + 3 * dstride, v128_low_u32(o));
dst += 4 * dstride;
src += 4 * sstride;
}
}
// As above, but with no clipping tests
static void clpf_block4_noclip(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizey, int dstride, int x0, int y0, int sizey,
unsigned int strength, unsigned int dmp) { unsigned int strength, unsigned int dmp) {
int y; int y;
@ -246,20 +105,16 @@ static void clpf_block4_noclip(const uint8_t *src, uint8_t *dst, int sstride,
void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride, void SIMD_FUNC(aom_clpf_block)(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizex, int dstride, int x0, int y0, int sizex,
int sizey, unsigned int strength, int sizey, unsigned int strength,
BOUNDARY_TYPE bt, unsigned int dmp) { unsigned int dmp) {
if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) { if ((sizex != 4 && sizex != 8) || ((sizey & 3) && sizex == 4)) {
// Fallback to C for odd sizes: // Fallback to C for odd sizes:
// * block widths not 4 or 8 // * block widths not 4 or 8
// * block heights not a multiple of 4 if the block width is 4 // * block heights not a multiple of 4 if the block width is 4
aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, strength, aom_clpf_block_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, strength,
bt, dmp); dmp);
} else { } else {
if (bt) (sizex == 4 ? clpf_block4 : clpf_block8)(src, dst, sstride, dstride, x0, y0,
(sizex == 4 ? clpf_block4 : clpf_block8)(src, dst, sstride, dstride, x0, sizey, strength, dmp);
y0, sizey, bt, strength, dmp);
else
(sizex == 4 ? clpf_block4_noclip : clpf_block8_noclip)(
src, dst, sstride, dstride, x0, y0, sizey, strength, dmp);
} }
} }
@ -324,68 +179,6 @@ static void calc_delta_hbd8(v128 o, v128 a, v128 b, v128 c, v128 d, v128 e,
SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst, SIMD_INLINE void clpf_block_hbd4(const uint16_t *src, uint16_t *dst,
int sstride, int dstride, int x0, int y0, int sstride, int dstride, int x0, int y0,
int sizey, unsigned int strength, int sizey, unsigned int strength,
BOUNDARY_TYPE bt, unsigned int dmp) {
const int right = !(bt & TILE_RIGHT_BOUNDARY);
const int bottom = bt & TILE_BOTTOM_BOUNDARY ? sizey - 2 : -1;
const int left = !(bt & TILE_LEFT_BOUNDARY);
const int top = bt & TILE_ABOVE_BOUNDARY ? 0 : -1;
DECLARE_ALIGNED(16, static const uint64_t,
c_shuff[]) = { 0x0302010001000100LL, 0x0b0a090809080908LL };
DECLARE_ALIGNED(16, static const uint64_t,
d_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080908LL };
DECLARE_ALIGNED(16, static const uint64_t,
e_shuff[]) = { 0x0706070605040302LL, 0x0f0e0f0e0d0c0b0aLL };
DECLARE_ALIGNED(16, static const uint64_t,
f_shuff[]) = { 0x0706070607060504LL, 0x0f0e0f0e0f0e0d0cLL };
int y;
dst += x0 + y0 * dstride;
src += x0 + y0 * sstride;
for (y = 0; y < sizey; y += 2) {
const v64 l1 = v64_load_aligned(src);
const v64 l2 = v64_load_aligned(src + sstride);
const v64 l3 = v64_load_aligned(src - (y != top) * sstride);
const v64 l4 = v64_load_aligned(src + ((y != bottom) + 1) * sstride);
v128 o = v128_from_v64(l1, l2);
const v128 a =
v128_from_v64(v64_load_aligned(src - 2 * (y != top) * sstride), l3);
const v128 b = v128_from_v64(l3, l1);
const v128 g = v128_from_v64(l2, l4);
const v128 h = v128_from_v64(
l4, v64_load_aligned(src + (2 * (y != bottom) + 1) * sstride));
v128 c, d, e, f;
if (left) {
c = v128_from_v64(v64_load_unaligned(src - 2),
v64_load_unaligned(src - 2 + sstride));
d = v128_from_v64(v64_load_unaligned(src - 1),
v64_load_unaligned(src - 1 + sstride));
} else { // Left clipping
c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
}
if (right) {
e = v128_from_v64(v64_load_unaligned(src + 1),
v64_load_unaligned(src + 1 + sstride));
f = v128_from_v64(v64_load_unaligned(src + 2),
v64_load_unaligned(src + 2 + sstride));
} else { // Right clipping
e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
}
calc_delta_hbd4(o, a, b, c, d, e, f, g, h, dst, strength, dmp, dstride);
src += sstride * 2;
dst += dstride * 2;
}
}
// As above, but with no clipping tests
SIMD_INLINE void clpf_block_hbd4_noclip(const uint16_t *src, uint16_t *dst,
int sstride, int dstride, int x0,
int y0, int sizey,
unsigned int strength,
unsigned int dmp) { unsigned int dmp) {
int y; int y;
@ -420,62 +213,7 @@ SIMD_INLINE void clpf_block_hbd4_noclip(const uint16_t *src, uint16_t *dst,
// The most simple case. Start here if you need to understand the functions. // The most simple case. Start here if you need to understand the functions.
SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride, SIMD_INLINE void clpf_block_hbd(const uint16_t *src, uint16_t *dst, int sstride,
int dstride, int x0, int y0, int sizey, int dstride, int x0, int y0, int sizey,
unsigned int strength, BOUNDARY_TYPE bt, unsigned int strength, unsigned int dmp) {
unsigned int dmp) {
const int right = !(bt & TILE_RIGHT_BOUNDARY);
const int left = !(bt & TILE_LEFT_BOUNDARY);
const int ymin = -!(bt & TILE_ABOVE_BOUNDARY) * 2;
const int ymax = sizey + !(bt & TILE_BOTTOM_BOUNDARY) * 2 - 1;
DECLARE_ALIGNED(16, static const uint64_t,
c_shuff[]) = { 0x0302010001000100LL, 0x0b0a090807060504LL };
DECLARE_ALIGNED(16, static const uint64_t,
d_shuff[]) = { 0x0504030201000100LL, 0x0d0c0b0a09080706LL };
DECLARE_ALIGNED(16, static const uint64_t,
e_shuff[]) = { 0x0908070605040302LL, 0x0f0e0f0e0d0c0b0aLL };
DECLARE_ALIGNED(16, static const uint64_t,
f_shuff[]) = { 0x0b0a090807060504LL, 0x0f0e0f0e0f0e0d0cLL };
int y;
dst += x0 + y0 * dstride;
src += x0 + y0 * sstride;
// Read 8 set of pixels at a time. Clipping along upper and lower
// edges is handled by reading the upper or lower line twice.
// Clipping along the left and right edges is handled by shuffle
// instructions doing shift and pad.
for (y = 0; y < sizey; y++) {
const v128 o = v128_load_aligned(src + y * sstride);
const v128 a = v128_load_aligned(src + AOMMAX(ymin, y - 2) * sstride);
const v128 b = v128_load_aligned(src + AOMMAX(ymin, y - 1) * sstride);
const v128 g = v128_load_aligned(src + AOMMIN(ymax, y + 1) * sstride);
const v128 h = v128_load_aligned(src + AOMMIN(ymax, y + 2) * sstride);
v128 c, d, e, f;
if (left) {
c = v128_load_unaligned(src + y * sstride - 2);
d = v128_load_unaligned(src + y * sstride - 1);
} else { // Left clipping
c = v128_shuffle_8(o, v128_load_aligned(c_shuff));
d = v128_shuffle_8(o, v128_load_aligned(d_shuff));
}
if (right) {
e = v128_load_unaligned(src + y * sstride + 1);
f = v128_load_unaligned(src + y * sstride + 2);
} else { // Right clipping
e = v128_shuffle_8(o, v128_load_aligned(e_shuff));
f = v128_shuffle_8(o, v128_load_aligned(f_shuff));
}
calc_delta_hbd8(o, a, b, c, d, e, f, g, h, dst, strength, dmp);
dst += dstride;
}
}
// As above, but with no clipping tests
SIMD_INLINE void clpf_block_hbd_noclip(const uint16_t *src, uint16_t *dst,
int sstride, int dstride, int x0, int y0,
int sizey, unsigned int strength,
unsigned int dmp) {
int y; int y;
dst += x0 + y0 * dstride; dst += x0 + y0 * dstride;
@ -501,19 +239,15 @@ SIMD_INLINE void clpf_block_hbd_noclip(const uint16_t *src, uint16_t *dst,
void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst, void SIMD_FUNC(aom_clpf_block_hbd)(const uint16_t *src, uint16_t *dst,
int sstride, int dstride, int x0, int y0, int sstride, int dstride, int x0, int y0,
int sizex, int sizey, unsigned int strength, int sizex, int sizey, unsigned int strength,
BOUNDARY_TYPE bt, unsigned int dmp) { unsigned int dmp) {
if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) { if ((sizex != 4 && sizex != 8) || ((sizey & 1) && sizex == 4)) {
// Fallback to C for odd sizes: // Fallback to C for odd sizes:
// * block width not 4 or 8 // * block width not 4 or 8
// * block heights not a multiple of 2 if the block width is 4 // * block heights not a multiple of 2 if the block width is 4
aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey, aom_clpf_block_hbd_c(src, dst, sstride, dstride, x0, y0, sizex, sizey,
strength, bt, dmp); strength, dmp);
} else { } else {
if (bt)
(sizex == 4 ? clpf_block_hbd4 : clpf_block_hbd)( (sizex == 4 ? clpf_block_hbd4 : clpf_block_hbd)(
src, dst, sstride, dstride, x0, y0, sizey, strength, bt, dmp);
else
(sizex == 4 ? clpf_block_hbd4_noclip : clpf_block_hbd_noclip)(
src, dst, sstride, dstride, x0, y0, sizey, strength, dmp); src, dst, sstride, dstride, x0, y0, sizey, strength, dmp);
} }
} }

View file

@ -242,8 +242,7 @@ void copy_dering_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
void od_dering(uint16_t *y, uint16_t *in, int xdec, void od_dering(uint16_t *y, uint16_t *in, int xdec,
int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
dering_list *dlist, int dering_count, int threshold, dering_list *dlist, int dering_count, int threshold,
int clpf_strength, int clpf_damping, int coeff_shift, int clpf_strength, int clpf_damping, int coeff_shift) {
BOUNDARY_TYPE bt) {
int bi; int bi;
int bx; int bx;
int by; int by;
@ -286,17 +285,12 @@ void od_dering(uint16_t *y, uint16_t *in, int xdec,
copy_dering_16bit_to_16bit(in, OD_FILT_BSTRIDE, y, dlist, dering_count, copy_dering_16bit_to_16bit(in, OD_FILT_BSTRIDE, y, dlist, dering_count,
bsize); bsize);
for (bi = 0; bi < dering_count; bi++) { for (bi = 0; bi < dering_count; bi++) {
BOUNDARY_TYPE bt2 = 0;
by = dlist[bi].by; by = dlist[bi].by;
bx = dlist[bi].bx; bx = dlist[bi].bx;
// Prevent CLPF from reading across superblock boundaries
if (!by) bt2 |= TILE_ABOVE_BOUNDARY;
if (by == (1 << bsize) - 1) bt2 |= TILE_BOTTOM_BOUNDARY;
aom_clpf_block_hbd(in, &y[((bi - by) << 2 * bsize) - (bx << bsize)], aom_clpf_block_hbd(in, &y[((bi - by) << 2 * bsize) - (bx << bsize)],
OD_FILT_BSTRIDE, 1 << bsize, bx << bsize, by << bsize, OD_FILT_BSTRIDE, 1 << bsize, bx << bsize, by << bsize,
1 << bsize, 1 << bsize, clpf_strength << coeff_shift, 1 << bsize, 1 << bsize, clpf_strength << coeff_shift,
bt | bt2, clpf_damping + coeff_shift); clpf_damping + coeff_shift);
} }
} }

View file

@ -49,8 +49,7 @@ void copy_dering_16bit_to_16bit(uint16_t *dst, int dstride, uint16_t *src,
void od_dering(uint16_t *y, uint16_t *in, int xdec, void od_dering(uint16_t *y, uint16_t *in, int xdec,
int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli, int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS], int pli,
dering_list *dlist, int dering_count, int threshold, dering_list *dlist, int dering_count, int threshold,
int clpf_strength, int clpf_damping, int coeff_shift, int clpf_strength, int clpf_damping, int coeff_shift);
BOUNDARY_TYPE bt);
int od_filter_dering_direction_4x4_c(uint16_t *y, int ystride, int od_filter_dering_direction_4x4_c(uint16_t *y, int ystride,
const uint16_t *in, int threshold, const uint16_t *in, int threshold,
int dir); int dir);

View file

@ -140,7 +140,7 @@ void av1_cdef_search(YV12_BUFFER_CONFIG *frame, const YV12_BUFFER_CONFIG *ref,
} }
for (i = 0; i < CLPF_STRENGTHS; i++) { for (i = 0; i < CLPF_STRENGTHS; i++) {
od_dering(tmp_dst, in, 0, dir, 0, dlist, dering_count, threshold, od_dering(tmp_dst, in, 0, dir, 0, dlist, dering_count, threshold,
i + (i == 3), clpf_damping, coeff_shift, 0); i + (i == 3), clpf_damping, coeff_shift);
copy_dering_16bit_to_16bit(dst, MAX_MIB_SIZE << bsize[0], tmp_dst, copy_dering_16bit_to_16bit(dst, MAX_MIB_SIZE << bsize[0], tmp_dst,
dlist, dering_count, bsize[0]); dlist, dering_count, bsize[0]);
mse[sb_count][gi][i] = (int)compute_dist( mse[sb_count][gi][i] = (int)compute_dist(

View file

@ -28,8 +28,7 @@ namespace {
typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int sstride, typedef void (*clpf_block_t)(const uint8_t *src, uint8_t *dst, int sstride,
int dstride, int x0, int y0, int sizex, int sizey, int dstride, int x0, int y0, int sizex, int sizey,
unsigned int strength, BOUNDARY_TYPE bt, unsigned int strength, unsigned int bitdepth);
unsigned int bitdepth);
typedef std::tr1::tuple<clpf_block_t, clpf_block_t, int, int> typedef std::tr1::tuple<clpf_block_t, clpf_block_t, int, int>
clpf_block_param_t; clpf_block_param_t;
@ -59,7 +58,7 @@ typedef ClpfBlockTest ClpfSpeedTest;
typedef void (*clpf_block_hbd_t)(const uint16_t *src, uint16_t *dst, typedef void (*clpf_block_hbd_t)(const uint16_t *src, uint16_t *dst,
int sstride, int dstride, int x0, int y0, int sstride, int dstride, int x0, int y0,
int sizex, int sizey, unsigned int strength, int sizex, int sizey, unsigned int strength,
BOUNDARY_TYPE bt, unsigned int bitdepth); unsigned int bitdepth);
typedef std::tr1::tuple<clpf_block_hbd_t, clpf_block_hbd_t, int, int> typedef std::tr1::tuple<clpf_block_hbd_t, clpf_block_hbd_t, int, int>
clpf_block_hbd_param_t; clpf_block_hbd_param_t;
@ -91,12 +90,11 @@ template <typename pixel>
void test_clpf(int w, int h, int depth, int iterations, void test_clpf(int w, int h, int depth, int iterations,
void (*clpf)(const pixel *src, pixel *dst, int sstride, void (*clpf)(const pixel *src, pixel *dst, int sstride,
int dstride, int x0, int y0, int sizex, int sizey, int dstride, int x0, int y0, int sizex, int sizey,
unsigned int strength, BOUNDARY_TYPE bt, unsigned int strength, unsigned int bitdepth),
unsigned int bitdepth),
void (*ref_clpf)(const pixel *src, pixel *dst, int sstride, void (*ref_clpf)(const pixel *src, pixel *dst, int sstride,
int dstride, int x0, int y0, int sizex, int dstride, int x0, int y0, int sizex,
int sizey, unsigned int strength, int sizey, unsigned int strength,
BOUNDARY_TYPE bt, unsigned int bitdepth)) { unsigned int bitdepth)) {
const int size = 24; const int size = 24;
ACMRandom rnd(ACMRandom::DeterministicSeed()); ACMRandom rnd(ACMRandom::DeterministicSeed());
DECLARE_ALIGNED(16, pixel, s[size * size]); DECLARE_ALIGNED(16, pixel, s[size * size]);
@ -125,16 +123,11 @@ void test_clpf(int w, int h, int depth, int iterations,
for (xpos = 0; xpos < size && !error; xpos += w * !error) { for (xpos = 0; xpos < size && !error; xpos += w * !error) {
for (strength = depth - 8; strength < depth - 5 && !error; for (strength = depth - 8; strength < depth - 5 && !error;
strength += !error) { strength += !error) {
BOUNDARY_TYPE bt =
BOUNDARY_TYPE((TILE_LEFT_BOUNDARY & -(!xpos)) |
(TILE_ABOVE_BOUNDARY & -(!ypos)) |
(TILE_RIGHT_BOUNDARY & -(xpos + w == size)) |
(TILE_BOTTOM_BOUNDARY & -(ypos + h == size)));
ref_clpf(s, ref_d, size, size, xpos, ypos, w, h, 1 << strength, ref_clpf(s, ref_d, size, size, xpos, ypos, w, h, 1 << strength,
bt, depth); depth);
if (clpf != ref_clpf) if (clpf != ref_clpf)
ASM_REGISTER_STATE_CHECK(clpf(s, d, size, size, xpos, ypos, w, ASM_REGISTER_STATE_CHECK(clpf(s, d, size, size, xpos, ypos, w,
h, 1 << strength, bt, depth)); h, 1 << strength, depth));
if (ref_clpf != clpf) if (ref_clpf != clpf)
for (pos = 0; pos < size * size && !error; pos++) { for (pos = 0; pos < size * size && !error; pos++) {
error = ref_d[pos] != d[pos]; error = ref_d[pos] != d[pos];
@ -175,11 +168,10 @@ void test_clpf_speed(int w, int h, int depth, int iterations,
void (*clpf)(const pixel *src, pixel *dst, int sstride, void (*clpf)(const pixel *src, pixel *dst, int sstride,
int dstride, int x0, int y0, int sizex, int dstride, int x0, int y0, int sizex,
int sizey, unsigned int strength, int sizey, unsigned int strength,
BOUNDARY_TYPE bt, unsigned int bitdepth), unsigned int bitdepth),
void (*ref_clpf)(const pixel *src, pixel *dst, int sstride, void (*ref_clpf)(const pixel *src, pixel *dst, int sstride,
int dstride, int x0, int y0, int sizex, int dstride, int x0, int y0, int sizex,
int sizey, unsigned int strength, int sizey, unsigned int strength,
BOUNDARY_TYPE bt,
unsigned int bitdepth)) { unsigned int bitdepth)) {
aom_usec_timer ref_timer; aom_usec_timer ref_timer;
aom_usec_timer timer; aom_usec_timer timer;