implement combined parallel_deblocking experiment

The parallel_deblocking experiment is proposed jointly by Intel
and Microsoft. The following changes are implemented in this
experiment:

- deblocking filter order is changed to filter all vertical edges
  of the whole frame followed by filtering all horizontal edges
  of the whole frame

- filter length decision is made based on the transform block size
  on both sides of the edge. The block with the smaller transform size
  determines the final filter length.

- transform blocks on both sides of the edge are checked. Filtering of
  an edge can be skipped only when both blocks are skipped and they
  belong to the same prediction block.

- 15-tap filter and extended flat area detection are removed.

- special rule for handling 4x4 transform block on the super block
  boundary in VP9 is removed.

Change-Id: I1aa82c6b5335d47c2f73eec8fc8bee2c08a1cf74
This commit is contained in:
Ryan Lei 2017-02-09 13:05:42 -08:00
Родитель ab77e73b77
Коммит 392d0ff726
3 изменённых файлов: 537 добавлений и 48 удалений

Просмотреть файл

@ -30,8 +30,17 @@ static INLINE int16_t signed_char_clamp_high(int t, int bd) {
}
}
#endif
#if CONFIG_PARALLEL_DEBLOCKING
// Decides whether the 4-tap loop filter should be applied at one position,
// looking only at the two pixels on each side of the edge.
// Returns -1 (all bits set) when every test below passes and 0 otherwise:
//   |p1 - p0| <= limit
//   |q1 - q0| <= limit
//   |p0 - q0| * 2 + |p1 - q1| / 2 <= blimit
static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
                                  uint8_t p0, uint8_t q0, uint8_t q1) {
  // step between the two pixels on the p side / on the q side
  const int p_step_too_big = abs(p1 - p0) > limit;
  const int q_step_too_big = abs(q1 - q0) > limit;
  // combined step across the edge itself
  const int edge_too_big = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
  return (p_step_too_big || q_step_too_big || edge_too_big) ? 0 : -1;
}
#endif // CONFIG_PARALLEL_DEBLOCKING
static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
uint8_t q1, uint8_t q2, uint8_t q3) {
@ -118,10 +127,16 @@ void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8; ++i) {
#if !CONFIG_PARALLEL_DEBLOCKING
const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
const int8_t mask =
filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
#else // CONFIG_PARALLEL_DEBLOCKING
const uint8_t p1 = s[-2 * p], p0 = s[-p];
const uint8_t q0 = s[0 * p], q1 = s[1 * p];
const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
#endif // !CONFIG_PARALLEL_DEBLOCKING
filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
++s;
}
@ -142,10 +157,16 @@ void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8; ++i) {
#if !CONFIG_PARALLEL_DEBLOCKING
const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask =
filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
#else // CONFIG_PARALLEL_DEBLOCKING
const uint8_t p1 = s[-2], p0 = s[-1];
const uint8_t q0 = s[0], q1 = s[1];
const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
#endif // !CONFIG_PARALLEL_DEBLOCKING
filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
s += pitch;
}
@ -351,6 +372,21 @@ void aom_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
}
#if CONFIG_AOM_HIGHBITDEPTH
#if CONFIG_PARALLEL_DEBLOCKING
// High-bitdepth counterpart of the four-pixel filter mask.
// limit and blimit are given on the 8-bit scale and are shifted left by
// (bd - 8) before being compared against the sample differences.
// Returns -1 (all bits set) when every test below passes and 0 otherwise:
//   |p1 - p0| <= limit << (bd - 8)
//   |q1 - q0| <= limit << (bd - 8)
//   |p0 - q0| * 2 + |p1 - q1| / 2 <= blimit << (bd - 8)
static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
                                         uint16_t p1, uint16_t p0, uint16_t q0,
                                         uint16_t q1, int bd) {
  const int shift = bd - 8;
  const int16_t limit16 = (uint16_t)limit << shift;
  const int16_t blimit16 = (uint16_t)blimit << shift;

  if (abs(p1 - p0) > limit16) return 0;
  if (abs(q1 - q0) > limit16) return 0;
  if (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) return 0;
  return -1;
}
#endif // CONFIG_PARALLEL_DEBLOCKING
// Should we apply any filter at all: 11111111 yes, 00000000 no ?
static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
uint16_t p3, uint16_t p2, uint16_t p1,
@ -449,6 +485,7 @@ void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8; ++i) {
#if !CONFIG_PARALLEL_DEBLOCKING
const uint16_t p3 = s[-4 * p];
const uint16_t p2 = s[-3 * p];
const uint16_t p1 = s[-2 * p];
@ -459,6 +496,14 @@ void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
const uint16_t q3 = s[3 * p];
const int8_t mask =
highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
#else // CONFIG_PARALLEL_DEBLOCKING
const uint16_t p1 = s[-2 * p];
const uint16_t p0 = s[-p];
const uint16_t q0 = s[0 * p];
const uint16_t q1 = s[1 * p];
const int8_t mask =
highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
#endif // !CONFIG_PARALLEL_DEBLOCKING
highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
++s;
}
@ -480,10 +525,17 @@ void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
// loop filter designed to work using chars so that we can make maximum use
// of 8 bit simd instructions.
for (i = 0; i < 8; ++i) {
#if !CONFIG_PARALLEL_DEBLOCKING
const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
const int8_t mask =
highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
#else // CONFIG_PARALLEL_DEBLOCKING
const uint16_t p1 = s[-2], p0 = s[-1];
const uint16_t q0 = s[0], q1 = s[1];
const int8_t mask =
highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
#endif // !CONFIG_PARALLEL_DEBLOCKING
highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
s += pitch;
}

Просмотреть файл

@ -19,6 +19,40 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
}
#if CONFIG_PARALLEL_DEBLOCKING
// filter_mask and hev_mask
#define FILTER_HEV_MASK4 \
do { \
/* (abs(q1 - q0), abs(p1 - p0) */ \
__m128i flat = abs_diff(q1p1, q0p0); \
/* abs(p1 - q1), abs(p0 - q0) */ \
const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0); \
__m128i abs_p0q0, abs_p1q1; \
\
/* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */ \
hev = \
_mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
hev = _mm_cmpgt_epi16(hev, thresh); \
hev = _mm_packs_epi16(hev, hev); \
\
/* const int8_t mask = filter_mask2(*limit, *blimit, */ \
/* p1, p0, q0, q1); */ \
abs_p0q0 = \
_mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */ \
abs_p1q1 = \
_mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */ \
abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); \
abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */ \
/* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \
mask = _mm_adds_epu8(abs_p0q0, abs_p1q1); \
flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); \
mask = _mm_unpacklo_epi64(mask, flat); \
mask = _mm_subs_epu8(mask, limit); \
mask = _mm_cmpeq_epi8(mask, zero); \
mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); \
} while (0)
#endif // CONFIG_PARALLEL_DEBLOCKING
// filter_mask and hev_mask
#define FILTER_HEV_MASK \
do { \
@ -114,23 +148,34 @@ void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
const __m128i thresh =
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
const __m128i ff = _mm_cmpeq_epi8(zero, zero);
__m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
#if !CONFIG_PARALLEL_DEBLOCKING
__m128i p3p2, p2p1, q3q2, q2q1;
#endif // !CONFIG_PARALLEL_DEBLOCKING
__m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
__m128i mask, hev;
#if !CONFIG_PARALLEL_DEBLOCKING
p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
_mm_loadl_epi64((__m128i *)(s - 4 * p)));
#endif // !CONFIG_PARALLEL_DEBLOCKING
q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
_mm_loadl_epi64((__m128i *)(s + 1 * p)));
q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
_mm_loadl_epi64((__m128i *)(s + 0 * p)));
#if !CONFIG_PARALLEL_DEBLOCKING
q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
_mm_loadl_epi64((__m128i *)(s + 3 * p)));
#endif // !CONFIG_PARALLEL_DEBLOCKING
p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
#if !CONFIG_PARALLEL_DEBLOCKING
p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
#endif // !CONFIG_PARALLEL_DEBLOCKING
#if !CONFIG_PARALLEL_DEBLOCKING
FILTER_HEV_MASK;
#else // CONFIG_PARALLEL_DEBLOCKING
FILTER_HEV_MASK4;
#endif // !CONFIG_PARALLEL_DEBLOCKING
FILTER4;
_mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0)); // *op1
@ -150,7 +195,10 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
_mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
const __m128i ff = _mm_cmpeq_epi8(zero, zero);
__m128i x0, x1, x2, x3;
__m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
#if !CONFIG_PARALLEL_DEBLOCKING
__m128i p3p2, p2p1, q3q2, q2q1;
#endif // !CONFIG_PARALLEL_DEBLOCKING
__m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
__m128i mask, hev;
// 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
@ -174,29 +222,40 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
p1p0 = _mm_unpacklo_epi16(q1q0, x1);
// 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
x0 = _mm_unpacklo_epi16(x2, x3);
#if !CONFIG_PARALLEL_DEBLOCKING
// 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
p3p2 = _mm_unpacklo_epi32(p1p0, x0);
#endif // !CONFIG_PARALLEL_DEBLOCKING
// 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
p1p0 = _mm_unpackhi_epi32(p1p0, x0);
#if !CONFIG_PARALLEL_DEBLOCKING
p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8)); // swap lo and high
#endif // !CONFIG_PARALLEL_DEBLOCKING
p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8)); // swap lo and high
// 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
q1q0 = _mm_unpackhi_epi16(q1q0, x1);
// 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
x2 = _mm_unpackhi_epi16(x2, x3);
#if !CONFIG_PARALLEL_DEBLOCKING
// 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
q3q2 = _mm_unpackhi_epi32(q1q0, x2);
#endif // !CONFIG_PARALLEL_DEBLOCKING
// 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
q1q0 = _mm_unpacklo_epi32(q1q0, x2);
q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
#if !CONFIG_PARALLEL_DEBLOCKING
p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
#endif // !CONFIG_PARALLEL_DEBLOCKING
#if !CONFIG_PARALLEL_DEBLOCKING
FILTER_HEV_MASK;
#else // CONFIG_PARALLEL_DEBLOCKING
FILTER_HEV_MASK4;
#endif // !CONFIG_PARALLEL_DEBLOCKING
FILTER4;
// Transpose 8x4 to 4x8
@ -1395,18 +1454,23 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
_mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
_mm_load_si128((const __m128i *)_thresh1));
const __m128i zero = _mm_set1_epi16(0);
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
#if !CONFIG_PARALLEL_DEBLOCKING
__m128i p3, p2, q2, q3;
#endif // !CONFIG_PARALLEL_DEBLOCKING
__m128i p1, p0, q0, q1;
__m128i mask, hev, flat;
#if !CONFIG_PARALLEL_DEBLOCKING
p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
#endif // !CONFIG_PARALLEL_DEBLOCKING
p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
#if !CONFIG_PARALLEL_DEBLOCKING
q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
#endif // !CONFIG_PARALLEL_DEBLOCKING
// filter_mask and hev_mask
{
const __m128i abs_p1p0 =
@ -1419,8 +1483,9 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
_mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
__m128i abs_p1q1 =
_mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
#if !CONFIG_PARALLEL_DEBLOCKING
__m128i work;
#endif // !CONFIG_PARALLEL_DEBLOCKING
flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
hev = _mm_subs_epu8(flat, thresh);
hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
@ -1431,6 +1496,7 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
// mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
mask = _mm_max_epu8(flat, mask);
#if !CONFIG_PARALLEL_DEBLOCKING
// mask |= (abs(p1 - p0) > limit) * -1;
// mask |= (abs(q1 - q0) > limit) * -1;
work = _mm_max_epu8(
@ -1441,6 +1507,7 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
_mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
_mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
mask = _mm_max_epu8(work, mask);
#endif // !CONFIG_PARALLEL_DEBLOCKING
mask = _mm_subs_epu8(mask, limit);
mask = _mm_cmpeq_epi8(mask, zero);
}
@ -1584,6 +1651,51 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
_mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}
#if CONFIG_PARALLEL_DEBLOCKING
// Thin aliases naming SSE2 intrinsics after the instructions they map to.
#define movq(p) _mm_loadl_epi64((const __m128i *)(p))
#define punpcklbw(r0, r1) _mm_unpacklo_epi8(r0, r1)
#define punpcklwd(r0, r1) _mm_unpacklo_epi16(r0, r1)
#define punpckhwd(r0, r1) _mm_unpackhi_epi16(r0, r1)
#define pshufd(r, imm) _mm_shuffle_epi32(r, imm)

// Writes the low 32 bits of v to dst[0..3], least significant byte first
// (the byte order a 32-bit store produces on x86).
// The destination is addressed byte by byte because callers pass pointers
// such as `s - 2 + row * pitch`, which are not 4-byte aligned in general;
// storing through a cast uint32_t pointer there is undefined behaviour.
static INLINE void lpf_store_low_u32(uint8_t *dst, __m128i v) {
  const uint32_t w = (uint32_t)_mm_cvtsi128_si32(v);
  dst[0] = (uint8_t)(w);
  dst[1] = (uint8_t)(w >> 8);
  dst[2] = (uint8_t)(w >> 16);
  dst[3] = (uint8_t)(w >> 24);
}

// Kept as a macro for source compatibility; now alignment-safe and with
// fully parenthesised arguments.
#define movd(p, r) lpf_store_low_u32((uint8_t *)(p), (r))

// _mm_shuffle_epi32 selector that moves dword 1 to position 0, dword 2 to
// position 1, dword 3 to position 2 and dword 0 to position 3.
enum { ROTATE_DWORD_RIGHT = 0x39 };

// Transposes a block of 4 rows x 16 columns of bytes at pSrc into 16 rows
// x 4 columns at pDst:
//   pDst[c * dstStride + r] = pSrc[r * srcStride + c], r < 4, c < 16.
// Exactly 4 bytes are written per destination row. Neither pointer needs
// any particular alignment.
static INLINE void transpose16x4(uint8_t *pDst, const ptrdiff_t dstStride,
                                 const uint8_t *pSrc,
                                 const ptrdiff_t srcStride) {
  // two passes, each handling 8 source columns
  for (uint32_t idx = 0; idx < 2; idx += 1) {
    __m128i r0, r1, r2, r3;

    // load 8 bytes from each of the 4 source rows
    r0 = movq(pSrc);
    r1 = movq(pSrc + srcStride);
    r2 = movq(pSrc + srcStride * 2);
    r3 = movq(pSrc + srcStride * 3);

    // transpose: after these steps every dword holds one source column,
    // columns 0..3 in r0 and columns 4..7 in r1
    r0 = punpcklbw(r0, r1);
    r2 = punpcklbw(r2, r3);
    r1 = punpckhwd(r0, r2);
    r0 = punpcklwd(r0, r2);

    // store one dword per destination row, rotating the next one into the
    // low position each time
    movd(pDst, r0);
    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride, r0);
    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 2, r0);
    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 3, r0);

    movd(pDst + dstStride * 4, r1);
    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 5, r1);
    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 6, r1);
    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
    movd(pDst + dstStride * 7, r1);

    // advance the pointers: 8 destination rows, 8 source columns
    pDst += dstStride * 8;
    pSrc += 8;
  }
}
#endif // CONFIG_PARALLEL_DEBLOCKING
static INLINE void transpose(unsigned char *src[], int in_p,
unsigned char *dst[], int out_p,
int num_8x8_to_transpose) {
@ -1663,15 +1775,17 @@ void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
const uint8_t *blimit1, const uint8_t *limit1,
const uint8_t *thresh1) {
DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
#if !CONFIG_PARALLEL_DEBLOCKING
unsigned char *src[2];
unsigned char *dst[2];
#endif // !CONFIG_PARALLEL_DEBLOCKING
// Transpose 8x16
transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
// Loop filtering
aom_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
blimit1, limit1, thresh1);
#if !CONFIG_PARALLEL_DEBLOCKING
src[0] = t_dst;
src[1] = t_dst + 8;
dst[0] = s - 4;
@ -1679,6 +1793,9 @@ void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
// Transpose back
transpose(src, 16, dst, p, 2);
#else // CONFIG_PARALLEL_DEBLOCKING
transpose16x4(s - 2, p, t_dst + 16 * 2, 16);
#endif // !CONFIG_PARALLEL_DEBLOCKING
}
void aom_lpf_vertical_8_sse2(unsigned char *s, int p,

Просмотреть файл

@ -1938,6 +1938,345 @@ void av1_filter_block_plane_ss11_hor(AV1_COMMON *const cm,
dst->buf = dst0;
}
#if CONFIG_PARALLEL_DEBLOCKING
// Orientation of the block edge being deblocked. The values double as the
// first index of the per-direction edge mask tables.
typedef enum EDGE_DIR {
  VERT_EDGE = 0,  // vertical edges
  HORZ_EDGE = 1,  // horizontal edges
  NUM_EDGE_DIRS   // number of orientations (table dimension)
} EDGE_DIR;
// Prediction block edge masks, indexed [edge direction][block size].
// Every entry is (block dimension - 1): the first dimension in the block
// name for vertical edges, the second one for horizontal edges.
// set_lpf_parameters() treats a coordinate as lying on a prediction block
// edge when (coord & mask) == 0.
// The initializer order has to follow the declaration order of the block
// size enum, including the entries that exist only under CONFIG_CB4X4 or
// CONFIG_EXT_PARTITION.
static const uint32_t av1_prediction_masks[NUM_EDGE_DIRS][BLOCK_SIZES] = {
// mask for vertical edges filtering
{
#if CONFIG_CB4X4
2 - 1, // BLOCK_2X2
2 - 1, // BLOCK_2X4
4 - 1, // BLOCK_4X2
#endif // CONFIG_CB4X4
4 - 1, // BLOCK_4X4
4 - 1, // BLOCK_4X8
8 - 1, // BLOCK_8X4
8 - 1, // BLOCK_8X8
8 - 1, // BLOCK_8X16
16 - 1, // BLOCK_16X8
16 - 1, // BLOCK_16X16
16 - 1, // BLOCK_16X32
32 - 1, // BLOCK_32X16
32 - 1, // BLOCK_32X32
32 - 1, // BLOCK_32X64
64 - 1, // BLOCK_64X32
64 - 1, // BLOCK_64X64
#if CONFIG_EXT_PARTITION
64 - 1, // BLOCK_64X128
128 - 1, // BLOCK_128X64
128 - 1 // BLOCK_128X128
#endif // CONFIG_EXT_PARTITION
},
// mask for horizontal edges filtering
{
#if CONFIG_CB4X4
2 - 1, // BLOCK_2X2
4 - 1, // BLOCK_2X4
2 - 1, // BLOCK_4X2
#endif // CONFIG_CB4X4
4 - 1, // BLOCK_4X4
8 - 1, // BLOCK_4X8
4 - 1, // BLOCK_8X4
8 - 1, // BLOCK_8X8
16 - 1, // BLOCK_8X16
8 - 1, // BLOCK_16X8
16 - 1, // BLOCK_16X16
32 - 1, // BLOCK_16X32
16 - 1, // BLOCK_32X16
32 - 1, // BLOCK_32X32
64 - 1, // BLOCK_32X64
32 - 1, // BLOCK_64X32
64 - 1, // BLOCK_64X64
#if CONFIG_EXT_PARTITION
128 - 1, // BLOCK_64X128
64 - 1, // BLOCK_128X64
128 - 1 // BLOCK_128X128
#endif // CONFIG_EXT_PARTITION
},
};
// Transform block edge masks, indexed [edge direction][transform size].
// Every entry is (transform dimension - 1): the first dimension in the
// transform name for vertical edges, the second one for horizontal edges.
// set_lpf_parameters() treats a coordinate as lying on a transform block
// edge when (coord & mask) == 0.
// The initializer order has to follow the declaration order of the
// transform size enum, including the entries that exist only under
// CONFIG_CB4X4 or CONFIG_TX64X64.
static const uint32_t av1_transform_masks[NUM_EDGE_DIRS][TX_SIZES_ALL] = {
// mask for vertical edges filtering
{
#if CONFIG_CB4X4
2 - 1, // TX_2X2
#endif
4 - 1, // TX_4X4
8 - 1, // TX_8X8
16 - 1, // TX_16X16
32 - 1, // TX_32X32
#if CONFIG_TX64X64
64 - 1, // TX_64X64
#endif // CONFIG_TX64X64
4 - 1, // TX_4X8
8 - 1, // TX_8X4
8 - 1, // TX_8X16
16 - 1, // TX_16X8
16 - 1, // TX_16X32
32 - 1, // TX_32X16
4 - 1, // TX_4X16
16 - 1, // TX_16X4
8 - 1, // TX_8X32
32 - 1 // TX_32X8
},
// mask for horizontal edges filtering
{
#if CONFIG_CB4X4
2 - 1, // TX_2X2
#endif
4 - 1, // TX_4X4
8 - 1, // TX_8X8
16 - 1, // TX_16X16
32 - 1, // TX_32X32
#if CONFIG_TX64X64
64 - 1, // TX_64X64
#endif // CONFIG_TX64X64
8 - 1, // TX_4X8
4 - 1, // TX_8X4
16 - 1, // TX_8X16
8 - 1, // TX_16X8
32 - 1, // TX_16X32
16 - 1, // TX_32X16
16 - 1, // TX_4X16
4 - 1, // TX_16X4
32 - 1, // TX_8X32
8 - 1 // TX_32X8
}
};
// Returns the transform size that applies to the given block when
// filtering edges of direction edgeDir in a plane subsampled by
// (scaleHorz, scaleVert).
//   pCurr     - mode info of the block; its sb_type and tx_size are read
//   edgeDir   - VERT_EDGE or HORZ_EDGE
//   scaleHorz - horizontal subsampling of the plane (lookup index)
//   scaleVert - vertical subsampling of the plane (lookup index)
static TX_SIZE av1_get_transform_size(const MODE_INFO *const pCurr,
                                      const EDGE_DIR edgeDir,
                                      const uint32_t scaleHorz,
                                      const uint32_t scaleVert) {
  // For chrominance or non-square transforms the stored transform size has
  // to be converted: first to the size used in this plane ...
  const TX_SIZE planeTxSize =
      uv_txsize_lookup[pCurr->mbmi.sb_type][pCurr->mbmi.tx_size][scaleHorz]
                      [scaleVert];
  // ... then to the size in the direction that matters for this edge.
  return (VERT_EDGE == edgeDir) ? txsize_horz_map[planeTxSize]
                                : txsize_vert_map[planeTxSize];
}
// Filtering decision for one mode-info unit and one edge direction, as
// produced by set_lpf_parameters() and consumed by the
// av1_filter_block_plane_vert/horz loops.
typedef struct AV1_DEBLOCKING_PARAMETERS {
// length of the filter applied to the outer edge:
// 0 (no filtering), 4 or 8
uint32_t filterLength;
// length of the filter applied to the inner edge (4 pixels into the
// unit): 0 (no filtering) or 4
uint32_t filterLengthInternal;
// deblocking limits; they point into cm->lf_info.lfthr[] for the selected
// filter level and are assigned only when one of the lengths above is
// non-zero
const uint8_t *lim;
const uint8_t *mblim;
const uint8_t *hev_thr;
} AV1_DEBLOCKING_PARAMETERS;
// Decides how the edge at plane position (x, y) is deblocked and stores
// the decision in *pParams.
//   pParams   - output; both filter lengths are always written, the limit
//               pointers only when at least one length is non-zero
//   ppCurr    - position of the current block in the mode-info grid
//   modeStep  - distance, in grid entries, back to the neighbouring block
//               on the other side of the edge
//   cm        - common state; supplies the filter levels and thresholds
//   edgeDir   - VERT_EDGE or HORZ_EDGE
//   x, y      - position inside the plane, in pixels
//   width, height - plane dimensions, in pixels
//   scaleHorz, scaleVert - subsampling of the plane
static void set_lpf_parameters(AV1_DEBLOCKING_PARAMETERS *const pParams,
                               const MODE_INFO **const ppCurr,
                               const ptrdiff_t modeStep,
                               const AV1_COMMON *const cm,
                               const EDGE_DIR edgeDir, const uint32_t x,
                               const uint32_t y, const uint32_t width,
                               const uint32_t height, const uint32_t scaleHorz,
                               const uint32_t scaleVert) {
  // start from "no filtering" so that every early exit leaves a valid result
  pParams->filterLength = 0;
  pParams->filterLengthInternal = 0;

  // positions outside the plane are never deblocked
  if ((x >= width) || (y >= height)) return;

#if CONFIG_EXT_PARTITION
  // not sure if changes are required.
  assert(0 && "Not yet updated");
#endif  // CONFIG_EXT_PARTITION

  const MODE_INFO *const pCurrInfo = ppCurr[0];
  const TX_SIZE ts =
      av1_get_transform_size(pCurrInfo, edgeDir, scaleHorz, scaleVert);
  const uint32_t currLevel = get_filter_level(&cm->lf_info, &pCurrInfo->mbmi);
  const int currSkipped =
      pCurrInfo->mbmi.skip && is_inter_block(&pCurrInfo->mbmi);
  const uint32_t coord = (edgeDir == VERT_EDGE) ? x : y;
  uint32_t level = currLevel;

  // Nothing is set up at coordinate 0 - neither the outer edge nor the
  // internal one.
  // NOTE(review): this also suppresses the internal edge of blocks in the
  // first row/column of the plane; confirm that this is intended.
  if (0 == coord) return;

  // Outer edge: deblock it if it is an edge of a transform unit (and, when
  // tile boundary filtering is disabled, not a left/above tile boundary).
  int outerEdgeAllowed = 1;
#if CONFIG_LOOPFILTERING_ACROSS_TILES
  outerEdgeAllowed =
      !av1_disable_loopfilter_on_tile_boundary(cm) ||
      ((VERT_EDGE == edgeDir) &&
       (0 == (pCurrInfo->mbmi.boundary_info & TILE_LEFT_BOUNDARY))) ||
      ((HORZ_EDGE == edgeDir) &&
       (0 == (pCurrInfo->mbmi.boundary_info & TILE_ABOVE_BOUNDARY)));
#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
  if (outerEdgeAllowed && (0 == (coord & av1_transform_masks[edgeDir][ts]))) {
    const MODE_INFO *const pPrev = *(ppCurr - modeStep);
    const TX_SIZE pvTs =
        av1_get_transform_size(pPrev, edgeDir, scaleHorz, scaleVert);
    const uint32_t pvLvl = get_filter_level(&cm->lf_info, &pPrev->mbmi);
    const int pvSkip = pPrev->mbmi.skip && is_inter_block(&pPrev->mbmi);
    // NOTE(review): the ss_size_lookup result is used as a table index
    // without a range check; this assumes it is always below BLOCK_SIZES
    // for the block sizes and subsampling that reach here - confirm.
    const int puEdge =
        0 == (coord &
              av1_prediction_masks[edgeDir]
                                  [ss_size_lookup[pCurrInfo->mbmi.sb_type]
                                                 [scaleHorz][scaleVert]]);
    const int anyLevel = currLevel || pvLvl;
    // Two skipped blocks are left alone only when the edge between them is
    // not an edge of the prediction block.
    const int skippedInsidePu = pvSkip && currSkipped && !puEdge;

    if (anyLevel && !skippedInsidePu) {
      // the smaller transform on either side decides the filter length
      pParams->filterLength = (AOMMIN(ts, pvTs) <= TX_4X4) ? 4 : 8;
      // fall back to the neighbour's level when the current block has none
      level = currLevel ? currLevel : pvLvl;
    }
  }

  // Internal edge: only 4x4 transforms of a filtered, non-skipped block
  // have one (the length was reset to 0 above).
  if (currLevel && !currSkipped && (ts <= TX_4X4)) {
    pParams->filterLengthInternal = 4;
  }

  // Thresholds shared by the outer and the internal edge.
  if (pParams->filterLength || pParams->filterLengthInternal) {
    const loop_filter_thresh *const limits = &cm->lf_info.lfthr[level];
    pParams->lim = limits->lim;
    pParams->mblim = limits->mblim;
    pParams->hev_thr = limits->hev_thr;
  }
}
// Filters all vertical edges of one superblock in one plane.
//   cm         - common state, forwarded to set_lpf_parameters()
//   pPlane     - plane descriptor; dst.buf points at the superblock's
//                top-left pixel, dst.width/height bound the filtering
//   ppModeInfo - mode-info grid entry of the superblock's top-left block
//   modeStride - stride of the mode-info grid
//   cuX, cuY   - position of the superblock inside the plane, in pixels
// The superblock is walked in units of MI_SIZE x MI_SIZE plane pixels. For
// every unit the left (outer) edge and, when requested, the internal edge
// 4 pixels to its right are filtered.
// NOTE(review): only the 8-bit aom_lpf_* kernels are called; confirm
// whether high-bitdepth buffers can reach this path.
static void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
                                        const MACROBLOCKD_PLANE *const pPlane,
                                        const MODE_INFO **ppModeInfo,
                                        const ptrdiff_t modeStride,
                                        const uint32_t cuX,
                                        const uint32_t cuY) {
  const uint32_t scaleHorz = pPlane->subsampling_x;
  const uint32_t scaleVert = pPlane->subsampling_y;
  const uint32_t width = pPlane->dst.width;
  const uint32_t height = pPlane->dst.height;
  uint8_t *const pDst = pPlane->dst.buf;
  const int dstStride = pPlane->dst.stride;

  for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += 1) {
    uint8_t *p = pDst + y * MI_SIZE * dstStride;
    for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += 1) {
      // mode info is stored at luma resolution, so scale the plane
      // position back up to find the matching entry
      const MODE_INFO **const pCurr =
          ppModeInfo + (y << scaleVert) * modeStride + (x << scaleHorz);
      AV1_DEBLOCKING_PARAMETERS params;

      memset(&params, 0, sizeof(params));
      // the neighbour across a vertical edge is the entry to the left
      set_lpf_parameters(&params, pCurr, ((ptrdiff_t)1 << scaleHorz), cm,
                         VERT_EDGE, cuX + x * MI_SIZE, cuY + y * MI_SIZE, width,
                         height, scaleHorz, scaleVert);

      switch (params.filterLength) {
        // apply 4-tap filtering
        case 4:
          aom_lpf_vertical_4(p, dstStride, params.mblim, params.lim,
                             params.hev_thr);
          break;
        // apply 8-tap filtering
        case 8:
          aom_lpf_vertical_8(p, dstStride, params.mblim, params.lim,
                             params.hev_thr);
          break;
        // no filtering
        default: break;
      }

      // process the internal edge
      if (params.filterLengthInternal) {
        aom_lpf_vertical_4(p + 4, dstStride, params.mblim, params.lim,
                           params.hev_thr);
      }

      // Advance the destination pointer by one unit. This has to be
      // MI_SIZE, the same step that is used for the x coordinate and the
      // row offset above; a literal 8 is only right while MI_SIZE is 8.
      p += MI_SIZE;
    }
  }
}
// Filters all horizontal edges of one superblock in one plane.
//   cm         - common state, forwarded to set_lpf_parameters()
//   pPlane     - plane descriptor; dst.buf points at the superblock's
//                top-left pixel, dst.width/height bound the filtering
//   ppModeInfo - mode-info grid entry of the superblock's top-left block
//   modeStride - stride of the mode-info grid
//   cuX, cuY   - position of the superblock inside the plane, in pixels
// The superblock is walked in units of MI_SIZE x MI_SIZE plane pixels. For
// every unit the top (outer) edge and, when requested, the internal edge
// 4 rows below it are filtered.
// NOTE(review): only the 8-bit aom_lpf_* kernels are called; confirm
// whether high-bitdepth buffers can reach this path.
static void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
                                        const MACROBLOCKD_PLANE *const pPlane,
                                        const MODE_INFO **ppModeInfo,
                                        const ptrdiff_t modeStride,
                                        const uint32_t cuX,
                                        const uint32_t cuY) {
  const uint32_t scaleHorz = pPlane->subsampling_x;
  const uint32_t scaleVert = pPlane->subsampling_y;
  const uint32_t width = pPlane->dst.width;
  const uint32_t height = pPlane->dst.height;
  uint8_t *const pDst = pPlane->dst.buf;
  const int dstStride = pPlane->dst.stride;

  for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += 1) {
    uint8_t *p = pDst + y * MI_SIZE * dstStride;
    for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += 1) {
      // mode info is stored at luma resolution, so scale the plane
      // position back up to find the matching entry
      const MODE_INFO **const pCurr =
          ppModeInfo + (y << scaleVert) * modeStride + (x << scaleHorz);
      AV1_DEBLOCKING_PARAMETERS params;

      memset(&params, 0, sizeof(params));
      // the neighbour across a horizontal edge is the entry one row above
      set_lpf_parameters(&params, pCurr, (modeStride << scaleVert), cm,
                         HORZ_EDGE, cuX + x * MI_SIZE, cuY + y * MI_SIZE, width,
                         height, scaleHorz, scaleVert);

      switch (params.filterLength) {
        // apply 4-tap filtering
        case 4:
          aom_lpf_horizontal_4(p, dstStride, params.mblim, params.lim,
                               params.hev_thr);
          break;
        // apply 8-tap filtering
        case 8:
          aom_lpf_horizontal_8(p, dstStride, params.mblim, params.lim,
                               params.hev_thr);
          break;
        // no filtering
        default: break;
      }

      // process the internal edge
      if (params.filterLengthInternal) {
        aom_lpf_horizontal_4(p + 4 * dstStride, dstStride, params.mblim,
                             params.lim, params.hev_thr);
      }

      // Advance the destination pointer by one unit. This has to be
      // MI_SIZE, the same step that is used for the x coordinate and the
      // row offset above; a literal 8 is only right while MI_SIZE is 8.
      p += MI_SIZE;
    }
  }
}
#endif // CONFIG_PARALLEL_DEBLOCKING
void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
struct macroblockd_plane planes[MAX_MB_PLANE],
int start, int stop, int y_only) {
@ -1970,6 +2309,7 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
#else // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
const int num_planes = y_only ? 1 : MAX_MB_PLANE;
int mi_row, mi_col;
#if !CONFIG_PARALLEL_DEBLOCKING
enum lf_path path;
LOOP_FILTER_MASK lfm;
@ -1981,58 +2321,38 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
path = LF_PATH_444;
else
path = LF_PATH_SLOW;
#endif
#if CONFIG_PARALLEL_DEBLOCKING
for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
int plane;
av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
// TODO(JBB): Make setup_mask work for non 420.
av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) {
const int32_t scaleHorz = planes[planeIdx].subsampling_x;
const int32_t scaleVert = planes[planeIdx].subsampling_y;
av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, &lfm);
for (plane = 1; plane < num_planes; ++plane) {
switch (path) {
case LF_PATH_420:
av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, &lfm);
break;
case LF_PATH_444:
av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, &lfm);
break;
case LF_PATH_SLOW:
av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
mi_row, mi_col);
break;
}
av1_filter_block_plane_vert(
cm, planes + planeIdx, (const MODE_INFO **)(mi + mi_col),
cm->mi_stride, (mi_col * MI_SIZE) >> scaleHorz,
(mi_row * MI_SIZE) >> scaleVert);
}
}
}
for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
int plane;
av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
// TODO(JBB): Make setup_mask work for non 420.
av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) {
const int32_t scaleHorz = planes[planeIdx].subsampling_x;
const int32_t scaleVert = planes[planeIdx].subsampling_y;
av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, &lfm);
for (plane = 1; plane < num_planes; ++plane) {
switch (path) {
case LF_PATH_420:
av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, &lfm);
break;
case LF_PATH_444:
av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, &lfm);
break;
case LF_PATH_SLOW:
av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col,
mi_row, mi_col);
break;
}
av1_filter_block_plane_horz(
cm, planes + planeIdx, (const MODE_INFO **)(mi + mi_col),
cm->mi_stride, (mi_col * MI_SIZE) >> scaleHorz,
(mi_row * MI_SIZE) >> scaleVert);
}
}
}