implement combined parallel_deblocking experiment

The parallel_deblocking experiment is proposed jointly by Intel and Microsoft. The following changes are implemented in this experiment: - The deblocking filter order is changed to filter all vertical edges of the whole frame, followed by all horizontal edges of the whole frame. - The filter length decision is based on the transform block sizes on both sides of the edge; the block with the smaller transform size determines the final filter length. - The transform blocks on both sides of the edge are checked; filtering of an edge can be skipped only when both blocks are skipped and they belong to the same prediction block. - The 15-tap filter and the extended flat area detection are removed. - The special rule for handling 4x4 transform blocks on the super block boundary in VP9 is removed. Change-Id: I1aa82c6b5335d47c2f73eec8fc8bee2c08a1cf74
2017-02-09 13:05:42 -08:00 · 2017-02-09 13:05:42 -08:00 · 392d0ff726
--- a/aom_dsp/loopfilter.c
+++ b/aom_dsp/loopfilter.c
@ -30,8 +30,17 @@ static INLINE int16_t signed_char_clamp_high(int t, int bd) {
  }
 }
 #endif
-
+#if CONFIG_PARALLEL_DEBLOCKING
 // should we apply any filter at all: 11111111 yes, 00000000 no
+static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
+                                  uint8_t p0, uint8_t q0, uint8_t q1) {  // takes only p1..q1, unlike filter_mask (p3..q3)
+  int8_t mask = 0;  // turns into -1 (all ones) once any threshold is exceeded
+  mask |= (abs(p1 - p0) > limit) * -1;  // step between the two p samples
+  mask |= (abs(q1 - q0) > limit) * -1;  // step between the two q samples
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;  // across edge
+  return ~mask;  // inverted: -1 (11111111) = filter, 0 (00000000) = do not
+}
+#endif  // CONFIG_PARALLEL_DEBLOCKING
 static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
                                 uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
                                 uint8_t q1, uint8_t q2, uint8_t q3) {
@ -118,10 +127,16 @@ void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (i = 0; i < 8; ++i) {
+#if !CONFIG_PARALLEL_DEBLOCKING
    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+#else   // CONFIG_PARALLEL_DEBLOCKING
+    const uint8_t p1 = s[-2 * p], p0 = s[-p];
+    const uint8_t q0 = s[0 * p], q1 = s[1 * p];
+    const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
    filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
    ++s;
  }
@ -142,10 +157,16 @@ void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (i = 0; i < 8; ++i) {
+#if !CONFIG_PARALLEL_DEBLOCKING
    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const int8_t mask =
        filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+#else   // CONFIG_PARALLEL_DEBLOCKING
+    const uint8_t p1 = s[-2], p0 = s[-1];
+    const uint8_t q0 = s[0], q1 = s[1];
+    const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
    filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
    s += pitch;
  }
@ -351,6 +372,21 @@ void aom_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
 }

 #if CONFIG_AOM_HIGHBITDEPTH
+#if CONFIG_PARALLEL_DEBLOCKING
+// High-bitdepth p1..q1 filter mask. Should we apply any filter at all: 11111111 yes, 00000000 no.
+static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
+                                         uint16_t p1, uint16_t p0, uint16_t q0,
+                                         uint16_t q1, int bd) {  // bd: presumably the sample bit depth (>= 8) - TODO confirm
+  int8_t mask = 0;  // turns into -1 (all ones) once any threshold is exceeded
+  int16_t limit16 = (uint16_t)limit << (bd - 8);  // 8-bit limit shifted up by (bd - 8)
+  int16_t blimit16 = (uint16_t)blimit << (bd - 8);  // 8-bit blimit shifted up by (bd - 8)
+  mask |= (abs(p1 - p0) > limit16) * -1;  // step between the two p samples
+  mask |= (abs(q1 - q0) > limit16) * -1;  // step between the two q samples
+  mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;  // across edge
+  return ~mask;  // inverted: -1 = filter, 0 = do not
+}
+#endif  // CONFIG_PARALLEL_DEBLOCKING
+
 // Should we apply any filter at all: 11111111 yes, 00000000 no ?
 static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
                                        uint16_t p3, uint16_t p2, uint16_t p1,
@ -449,6 +485,7 @@ void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (i = 0; i < 8; ++i) {
+#if !CONFIG_PARALLEL_DEBLOCKING
    const uint16_t p3 = s[-4 * p];
    const uint16_t p2 = s[-3 * p];
    const uint16_t p1 = s[-2 * p];
@ -459,6 +496,14 @@ void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
    const uint16_t q3 = s[3 * p];
    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+#else   // CONFIG_PARALLEL_DEBLOCKING
+    const uint16_t p1 = s[-2 * p];
+    const uint16_t p0 = s[-p];
+    const uint16_t q0 = s[0 * p];
+    const uint16_t q1 = s[1 * p];
+    const int8_t mask =
+        highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
    highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
    ++s;
  }
@ -480,10 +525,17 @@ void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
  for (i = 0; i < 8; ++i) {
+#if !CONFIG_PARALLEL_DEBLOCKING
    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const int8_t mask =
        highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+#else   // CONFIG_PARALLEL_DEBLOCKING
+    const uint16_t p1 = s[-2], p0 = s[-1];
+    const uint16_t q0 = s[0], q1 = s[1];
+    const int8_t mask =
+        highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
    highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
    s += pitch;
  }
--- a/aom_dsp/x86/loopfilter_sse2.c
+++ b/aom_dsp/x86/loopfilter_sse2.c
@ -19,6 +19,40 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
  return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
 }

+#if CONFIG_PARALLEL_DEBLOCKING
+// filter_mask2 and hev_mask: assigns hev and mask using q1p1, q0p0, p1p0, q1q0, thresh, limit and zero from the enclosing scope
+#define FILTER_HEV_MASK4                                                      \
+  do {                                                                        \
+    /* abs(q1 - q0), abs(p1 - p0) */                                          \
+    __m128i flat = abs_diff(q1p1, q0p0);                                      \
+    /* abs(p1 - q1), abs(p0 - q0) */                                          \
+    const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
+    __m128i abs_p0q0, abs_p1q1;                                               \
+                                                                              \
+    /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
+    hev =                                                                     \
+        _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
+    hev = _mm_cmpgt_epi16(hev, thresh);                                       \
+    hev = _mm_packs_epi16(hev, hev);                                          \
+                                                                              \
+    /* const int8_t mask = filter_mask2(*limit, *blimit, */                   \
+    /*                                  p1, p0, q0, q1); */                   \
+    abs_p0q0 =                                                                \
+        _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
+    abs_p1q1 =                                                                \
+        _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
+    abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9); /* >>8 unpack, >>1 halve */       \
+    abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
+    /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
+    mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
+    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); /* max of the two */  \
+    mask = _mm_unpacklo_epi64(mask, flat); /* lo: edge sum, hi: max step */   \
+    mask = _mm_subs_epu8(mask, limit); /* 0 where value <= its limit */       \
+    mask = _mm_cmpeq_epi8(mask, zero);                                        \
+    mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8)); /* both must pass */ \
+  } while (0)
+#endif  // CONFIG_PARALLEL_DEBLOCKING
+
 // filter_mask and hev_mask
 #define FILTER_HEV_MASK                                                       \
  do {                                                                        \
@ -114,23 +148,34 @@ void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
  const __m128i thresh =
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
-  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
+#if !CONFIG_PARALLEL_DEBLOCKING
+  __m128i p3p2, p2p1, q3q2, q2q1;
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
+  __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
  __m128i mask, hev;
-
+#if !CONFIG_PARALLEL_DEBLOCKING
  p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
                            _mm_loadl_epi64((__m128i *)(s - 4 * p)));
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 0 * p)));
+#if !CONFIG_PARALLEL_DEBLOCKING
  q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
-  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+#if !CONFIG_PARALLEL_DEBLOCKING
+  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
-
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
+#if !CONFIG_PARALLEL_DEBLOCKING
  FILTER_HEV_MASK;
+#else   // CONFIG_PARALLEL_DEBLOCKING
+  FILTER_HEV_MASK4;
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
  FILTER4;

  _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
@ -150,7 +195,10 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
      _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
  const __m128i ff = _mm_cmpeq_epi8(zero, zero);
  __m128i x0, x1, x2, x3;
-  __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
+#if !CONFIG_PARALLEL_DEBLOCKING
+  __m128i p3p2, p2p1, q3q2, q2q1;
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
+  __m128i q1p1, q0p0, p1p0, q1q0, ps1ps0, qs1qs0;
  __m128i mask, hev;

  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
@ -174,29 +222,40 @@ void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
  p1p0 = _mm_unpacklo_epi16(q1q0, x1);
  // 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
  x0 = _mm_unpacklo_epi16(x2, x3);
+#if !CONFIG_PARALLEL_DEBLOCKING
  // 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  p3p2 = _mm_unpacklo_epi32(p1p0, x0);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
  // 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  p1p0 = _mm_unpackhi_epi32(p1p0, x0);
+#if !CONFIG_PARALLEL_DEBLOCKING
  p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
  p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high

  // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
  q1q0 = _mm_unpackhi_epi16(q1q0, x1);
  // 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
  x2 = _mm_unpackhi_epi16(x2, x3);
+#if !CONFIG_PARALLEL_DEBLOCKING
  // 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  q3q2 = _mm_unpackhi_epi32(q1q0, x2);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
  // 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  q1q0 = _mm_unpacklo_epi32(q1q0, x2);

  q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
  q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
  p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+#if !CONFIG_PARALLEL_DEBLOCKING
  p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
  q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
-
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
+#if !CONFIG_PARALLEL_DEBLOCKING
  FILTER_HEV_MASK;
+#else   // CONFIG_PARALLEL_DEBLOCKING
+  FILTER_HEV_MASK4;
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
  FILTER4;

  // Transpose 8x4 to 4x8
@ -1395,18 +1454,23 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
                         _mm_load_si128((const __m128i *)_thresh1));
  const __m128i zero = _mm_set1_epi16(0);
-  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+#if !CONFIG_PARALLEL_DEBLOCKING
+  __m128i p3, p2, q2, q3;
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
+  __m128i p1, p0, q0, q1;
  __m128i mask, hev, flat;
-
+#if !CONFIG_PARALLEL_DEBLOCKING
  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+#if !CONFIG_PARALLEL_DEBLOCKING
  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
-
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
  // filter_mask and hev_mask
  {
    const __m128i abs_p1p0 =
@ -1419,8 +1483,9 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+#if !CONFIG_PARALLEL_DEBLOCKING
    __m128i work;
-
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
@ -1431,6 +1496,7 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
+#if !CONFIG_PARALLEL_DEBLOCKING
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
@ -1441,6 +1507,7 @@ void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }
@ -1584,6 +1651,51 @@ static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
 }

+#if CONFIG_PARALLEL_DEBLOCKING
+#define movq(p) _mm_loadl_epi64((const __m128i *)(p))  // load 8 bytes from p
+#define punpcklbw(r0, r1) _mm_unpacklo_epi8(r0, r1)  // interleave low bytes
+#define punpcklwd(r0, r1) _mm_unpacklo_epi16(r0, r1)  // interleave low words
+#define punpckhwd(r0, r1) _mm_unpackhi_epi16(r0, r1)  // interleave high words
+#define movd(p, r) *((uint32_t *)(p)) = _mm_cvtsi128_si32(r)  // store the low dword of r at p
+#define pshufd(r, imm) _mm_shuffle_epi32(r, imm)  // permute dwords by imm
+enum { ROTATE_DWORD_RIGHT = 0x39 };  // pshufd selector: dword i <- dword (i + 1) % 4
+static INLINE void transpose16x4(uint8_t *pDst, const ptrdiff_t dstStride,
+                                 const uint8_t *pSrc,
+                                 const ptrdiff_t srcStride) {  // 4 rows x 16 bytes at pSrc -> 16 rows x 4 bytes at pDst
+  for (uint32_t idx = 0; idx < 2; idx += 1) {  // one 8-column half per pass
+    __m128i r0, r1, r2, r3;
+    // load data: 8 bytes from each of 4 consecutive source rows
+    r0 = movq(pSrc);
+    r1 = movq(pSrc + srcStride);
+    r2 = movq(pSrc + srcStride * 2);
+    r3 = movq(pSrc + srcStride * 3);
+    // transpose: interleave rows 0/1 and 2/3 bytewise, then wordwise
+    r0 = punpcklbw(r0, r1);
+    r2 = punpcklbw(r2, r3);
+    r1 = punpckhwd(r0, r2);  // source columns 4..7, one column per dword
+    r0 = punpcklwd(r0, r2);  // source columns 0..3, one column per dword
+    // store data: write the low dword, then rotate the next column into place
+    movd(pDst, r0);
+    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
+    movd(pDst + dstStride, r0);
+    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
+    movd(pDst + dstStride * 2, r0);
+    r0 = pshufd(r0, ROTATE_DWORD_RIGHT);
+    movd(pDst + dstStride * 3, r0);
+    movd(pDst + dstStride * 4, r1);
+    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
+    movd(pDst + dstStride * 5, r1);
+    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
+    movd(pDst + dstStride * 6, r1);
+    r1 = pshufd(r1, ROTATE_DWORD_RIGHT);
+    movd(pDst + dstStride * 7, r1);
+    // advance the pointers: 8 destination rows down, 8 source columns right
+    pDst += dstStride * 8;
+    pSrc += 8;
+  }
+}
+
+#endif  // CONFIG_PARALLEL_DEBLOCKING
 static INLINE void transpose(unsigned char *src[], int in_p,
                             unsigned char *dst[], int out_p,
                             int num_8x8_to_transpose) {
@ -1663,15 +1775,17 @@ void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
                                  const uint8_t *blimit1, const uint8_t *limit1,
                                  const uint8_t *thresh1) {
  DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+#if !CONFIG_PARALLEL_DEBLOCKING
  unsigned char *src[2];
  unsigned char *dst[2];
-
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
  // Transpose 8x16
  transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  aom_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
                                 blimit1, limit1, thresh1);
+#if !CONFIG_PARALLEL_DEBLOCKING
  src[0] = t_dst;
  src[1] = t_dst + 8;
  dst[0] = s - 4;
@ -1679,6 +1793,9 @@ void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,

  // Transpose back
  transpose(src, 16, dst, p, 2);
+#else  // CONFIG_PARALLEL_DEBLOCKING
+  transpose16x4(s - 2, p, t_dst + 16 * 2, 16);
+#endif  // !CONFIG_PARALLEL_DEBLOCKING
 }

 void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@ -1938,6 +1938,345 @@ void av1_filter_block_plane_ss11_hor(AV1_COMMON *const cm,
  dst->buf = dst0;
 }

+#if CONFIG_PARALLEL_DEBLOCKING
+
+typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR;  // first index of the mask tables below
+
+static const uint32_t av1_prediction_masks[NUM_EDGE_DIRS][BLOCK_SIZES] = {  // (block dimension - 1), ANDed with a coordinate to test alignment
+  // mask for vertical edges filtering: block width - 1
+  {
+#if CONFIG_CB4X4
+      2 - 1,   // BLOCK_2X2
+      2 - 1,   // BLOCK_2X4
+      4 - 1,   // BLOCK_4X2
+#endif         // CONFIG_CB4X4
+      4 - 1,   // BLOCK_4X4
+      4 - 1,   // BLOCK_4X8
+      8 - 1,   // BLOCK_8X4
+      8 - 1,   // BLOCK_8X8
+      8 - 1,   // BLOCK_8X16
+      16 - 1,  // BLOCK_16X8
+      16 - 1,  // BLOCK_16X16
+      16 - 1,  // BLOCK_16X32
+      32 - 1,  // BLOCK_32X16
+      32 - 1,  // BLOCK_32X32
+      32 - 1,  // BLOCK_32X64
+      64 - 1,  // BLOCK_64X32
+      64 - 1,  // BLOCK_64X64
+#if CONFIG_EXT_PARTITION
+      64 - 1,   // BLOCK_64X128
+      128 - 1,  // BLOCK_128X64
+      128 - 1   // BLOCK_128X128
+#endif          // CONFIG_EXT_PARTITION
+  },
+
+  // mask for horizontal edges filtering: block height - 1
+  {
+#if CONFIG_CB4X4
+      2 - 1,   // BLOCK_2X2
+      4 - 1,   // BLOCK_2X4
+      2 - 1,   // BLOCK_4X2
+#endif         // CONFIG_CB4X4
+      4 - 1,   // BLOCK_4X4
+      8 - 1,   // BLOCK_4X8
+      4 - 1,   // BLOCK_8X4
+      8 - 1,   // BLOCK_8X8
+      16 - 1,  // BLOCK_8X16
+      8 - 1,   // BLOCK_16X8
+      16 - 1,  // BLOCK_16X16
+      32 - 1,  // BLOCK_16X32
+      16 - 1,  // BLOCK_32X16
+      32 - 1,  // BLOCK_32X32
+      64 - 1,  // BLOCK_32X64
+      32 - 1,  // BLOCK_64X32
+      64 - 1,  // BLOCK_64X64
+#if CONFIG_EXT_PARTITION
+      128 - 1,  // BLOCK_64X128
+      64 - 1,   // BLOCK_128X64
+      128 - 1   // BLOCK_128X128
+#endif          // CONFIG_EXT_PARTITION
+  },
+};
+
+static const uint32_t av1_transform_masks[NUM_EDGE_DIRS][TX_SIZES_ALL] = {  // (transform dimension - 1), ANDed with a coordinate to test alignment
+  {  // VERT_EDGE row: transform width - 1
+#if CONFIG_CB4X4
+      2 - 1,  // TX_2X2
+#endif
+      4 - 1,   // TX_4X4
+      8 - 1,   // TX_8X8
+      16 - 1,  // TX_16X16
+      32 - 1,  // TX_32X32
+#if CONFIG_TX64X64
+      64 - 1,  // TX_64X64
+#endif         // CONFIG_TX64X64
+      4 - 1,   // TX_4X8
+      8 - 1,   // TX_8X4
+      8 - 1,   // TX_8X16
+      16 - 1,  // TX_16X8
+      16 - 1,  // TX_16X32
+      32 - 1,  // TX_32X16
+      4 - 1,   // TX_4X16
+      16 - 1,  // TX_16X4
+      8 - 1,   // TX_8X32
+      32 - 1   // TX_32X8
+  },
+
+  {  // HORZ_EDGE row: transform height - 1
+#if CONFIG_CB4X4
+      2 - 1,  // TX_2X2
+#endif
+      4 - 1,   // TX_4X4
+      8 - 1,   // TX_8X8
+      16 - 1,  // TX_16X16
+      32 - 1,  // TX_32X32
+#if CONFIG_TX64X64
+      64 - 1,  // TX_64X64
+#endif         // CONFIG_TX64X64
+      8 - 1,   // TX_4X8
+      4 - 1,   // TX_8X4
+      16 - 1,  // TX_8X16
+      8 - 1,   // TX_16X8
+      32 - 1,  // TX_16X32
+      16 - 1,  // TX_32X16
+      16 - 1,  // TX_4X16
+      4 - 1,   // TX_16X4
+      32 - 1,  // TX_8X32
+      8 - 1    // TX_32X8
+  }
+};
+
+static TX_SIZE av1_get_transform_size(const MODE_INFO *const pCurr,
+                                      const EDGE_DIR edgeDir,
+                                      const uint32_t scaleHorz,
+                                      const uint32_t scaleVert) {  // transform size of pCurr as seen along edgeDir
+  const BLOCK_SIZE bs = pCurr->mbmi.sb_type;
+  TX_SIZE txSize;
+
+  // for chrominance or a non-square transform, the block's transform size
+  // must be converted into the transform size in the filtering direction.
+
+  txSize = uv_txsize_lookup[bs][pCurr->mbmi.tx_size][scaleHorz][scaleVert];  // presumably the size after plane subsampling - TODO confirm
+
+  if (VERT_EDGE == edgeDir) {
+    txSize = txsize_horz_map[txSize];  // presumably the size along the horizontal axis - TODO confirm
+  } else {
+    txSize = txsize_vert_map[txSize];  // presumably the size along the vertical axis - TODO confirm
+  }
+
+  return txSize;
+}
+
+typedef struct AV1_DEBLOCKING_PARAMETERS {
+  // length of the filter applied to the outer edge: 0 (none), 4 or 8
+  uint32_t filterLength;
+  // length of the filter applied to the inner edge: 0 (none) or 4
+  uint32_t filterLengthInternal;
+
+  // deblocking limits; set_lpf_parameters fills them only when a length is non-zero
+  const uint8_t *lim;
+  const uint8_t *mblim;
+  const uint8_t *hev_thr;
+} AV1_DEBLOCKING_PARAMETERS;
+
+static void set_lpf_parameters(AV1_DEBLOCKING_PARAMETERS *const pParams,
+                               const MODE_INFO **const ppCurr,
+                               const ptrdiff_t modeStep,
+                               const AV1_COMMON *const cm,
+                               const EDGE_DIR edgeDir, const uint32_t x,
+                               const uint32_t y, const uint32_t width,
+                               const uint32_t height, const uint32_t scaleHorz,
+                               const uint32_t scaleVert) {  // decides filter lengths and thresholds for the edge at (x, y)
+  // reset to "no filtering" so every early exit leaves both lengths at 0
+  pParams->filterLength = 0;
+  pParams->filterLengthInternal = 0;
+
+  // position lies outside the width x height area: no deblocking is required
+  if ((width <= x) || (height <= y)) {
+    return;
+  }
+
+#if CONFIG_EXT_PARTITION
+  // not sure if changes are required.
+  assert(0 && "Not yet updated");
+#endif  // CONFIG_EXT_PARTITION
+
+  {
+    const TX_SIZE ts =
+        av1_get_transform_size(ppCurr[0], edgeDir, scaleHorz, scaleVert);
+    const uint32_t currLevel = get_filter_level(&cm->lf_info, &ppCurr[0]->mbmi);
+    const int currSkipped =
+        ppCurr[0]->mbmi.skip && is_inter_block(&ppCurr[0]->mbmi);  // skip flag only counts for inter blocks
+    const uint32_t coord = (VERT_EDGE == edgeDir) ? (x) : (y);  // position along the axis crossing the edge
+    uint32_t level = currLevel;
+
+    // prepare outer edge parameters. deblock the edge if it's an edge of a TU
+    if (coord) {  // coord == 0 has no previous block to filter against
+#if CONFIG_LOOPFILTERING_ACROSS_TILES
+      if (!av1_disable_loopfilter_on_tile_boundary(cm) ||
+          ((VERT_EDGE == edgeDir) &&
+           (0 == (ppCurr[0]->mbmi.boundary_info & TILE_LEFT_BOUNDARY))) ||
+          ((HORZ_EDGE == edgeDir) &&
+           (0 == (ppCurr[0]->mbmi.boundary_info & TILE_ABOVE_BOUNDARY))))
+#endif  // CONFIG_LOOPFILTERING_ACROSS_TILES
+      {
+        const int32_t tuEdge =
+            (coord & av1_transform_masks[edgeDir][ts]) ? (0) : (1);  // 1 when coord is a multiple of the transform size
+
+        if (tuEdge) {
+          const MODE_INFO *const pPrev = *(ppCurr - modeStep);  // block on the other side of the edge
+          const TX_SIZE pvTs =
+              av1_get_transform_size(pPrev, edgeDir, scaleHorz, scaleVert);
+          const uint32_t pvLvl = get_filter_level(&cm->lf_info, &pPrev->mbmi);
+          const int pvSkip = pPrev->mbmi.skip && is_inter_block(&pPrev->mbmi);
+          const int32_t puEdge =
+              (coord &
+               av1_prediction_masks[edgeDir]
+                                   [ss_size_lookup[ppCurr[0]->mbmi.sb_type]
+                                                  [scaleHorz][scaleVert]])
+                  ? (0)
+                  : (1);  // 1 when coord is a multiple of the prediction block size
+
+          // if the current and the previous blocks are skipped,
+          // deblock the edge if the edge belongs to a PU's edge only.
+          if ((currLevel || pvLvl) && (!pvSkip || !currSkipped || puEdge)) {
+            pParams->filterLength = (TX_4X4 >= AOMMIN(ts, pvTs)) ? (4) : (8);  // the smaller transform decides
+            // use the current block's level unless it is zero,
+            // in which case fall back to the previous block's level
+            level = (currLevel) ? (currLevel) : (pvLvl);
+          }
+        }
+      }
+
+      // prepare internal edge parameters (NOTE(review): inside if (coord), so never set at coord == 0 - confirm intended)
+      if (currLevel && !currSkipped) {
+        pParams->filterLengthInternal = (TX_4X4 >= ts) ? (4) : (0);  // only transforms up to TX_4X4 get an inner edge
+      }
+
+      // prepare common parameters: thresholds of the selected filter level
+      if (pParams->filterLength || pParams->filterLengthInternal) {
+        const loop_filter_thresh *const limits = cm->lf_info.lfthr + level;
+
+        pParams->lim = limits->lim;
+        pParams->mblim = limits->mblim;
+        pParams->hev_thr = limits->hev_thr;
+      }
+    }
+  }
+}
+
+static void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
+                                        const MACROBLOCKD_PLANE *const pPlane,
+                                        const MODE_INFO **ppModeInfo,
+                                        const ptrdiff_t modeStride,
+                                        const uint32_t cuX,
+                                        const uint32_t cuY) {  // filters the vertical edges of one plane starting at (cuX, cuY)
+  const uint32_t scaleHorz = pPlane->subsampling_x;
+  const uint32_t scaleVert = pPlane->subsampling_y;
+  const uint32_t width = pPlane->dst.width;
+  const uint32_t height = pPlane->dst.height;
+  uint8_t *const pDst = pPlane->dst.buf;
+  const int dstStride = pPlane->dst.stride;
+
+  for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += 1) {
+    uint8_t *p = pDst + y * MI_SIZE * dstStride;  // first pixel of row y * MI_SIZE
+
+    for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += 1) {
+      const MODE_INFO **const pCurr =
+          ppModeInfo + (y << scaleVert) * modeStride + (x << scaleHorz);  // plane indices shifted up by the subsampling to index the grid
+      AV1_DEBLOCKING_PARAMETERS params;
+      memset(&params, 0, sizeof(params));
+
+      set_lpf_parameters(&params, pCurr, ((ptrdiff_t)1 << scaleHorz), cm,
+                         VERT_EDGE, cuX + x * MI_SIZE, cuY + y * MI_SIZE, width,
+                         height, scaleHorz, scaleVert);  // the step reaches the grid entry to the left
+
+      switch (params.filterLength) {
+        // apply 4-tap filtering
+        case 4:
+          aom_lpf_vertical_4(p, dstStride, params.mblim, params.lim,
+                             params.hev_thr);
+          break;
+
+        // apply 8-tap filtering
+        case 8:
+          aom_lpf_vertical_8(p, dstStride, params.mblim, params.lim,
+                             params.hev_thr);
+          break;
+
+        // no filtering
+        default: break;
+      }
+
+      // process the internal edge, 4 pixels to the right of the outer one
+      if (params.filterLengthInternal) {
+        aom_lpf_vertical_4(p + 4, dstStride, params.mblim, params.lim,
+                           params.hev_thr);
+      }
+
+      // advance the destination pointer (NOTE(review): literal 8 while coordinates step by MI_SIZE - confirm they always agree)
+      p += 8;
+    }
+  }
+}
+
+static void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
+                                        const MACROBLOCKD_PLANE *const pPlane,
+                                        const MODE_INFO **ppModeInfo,
+                                        const ptrdiff_t modeStride,
+                                        const uint32_t cuX,
+                                        const uint32_t cuY) {  // filters the horizontal edges of one plane starting at (cuX, cuY)
+  const uint32_t scaleHorz = pPlane->subsampling_x;
+  const uint32_t scaleVert = pPlane->subsampling_y;
+  const uint32_t width = pPlane->dst.width;
+  const uint32_t height = pPlane->dst.height;
+  uint8_t *const pDst = pPlane->dst.buf;
+  const int dstStride = pPlane->dst.stride;
+
+  for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += 1) {
+    uint8_t *p = pDst + y * MI_SIZE * dstStride;  // first pixel of row y * MI_SIZE
+
+    for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += 1) {
+      const MODE_INFO **const pCurr =
+          ppModeInfo + (y << scaleVert) * modeStride + (x << scaleHorz);  // plane indices shifted up by the subsampling to index the grid
+      AV1_DEBLOCKING_PARAMETERS params;
+      memset(&params, 0, sizeof(params));
+
+      set_lpf_parameters(&params, pCurr, (modeStride << scaleVert), cm,
+                         HORZ_EDGE, cuX + x * MI_SIZE, cuY + y * MI_SIZE, width,
+                         height, scaleHorz, scaleVert);  // the step reaches the grid entry above
+
+      switch (params.filterLength) {
+        // apply 4-tap filtering
+        case 4:
+          aom_lpf_horizontal_4(p, dstStride, params.mblim, params.lim,
+                               params.hev_thr);
+          break;
+
+        // apply 8-tap filtering
+        case 8:
+          aom_lpf_horizontal_8(p, dstStride, params.mblim, params.lim,
+                               params.hev_thr);
+          break;
+
+        // no filtering
+        default: break;
+      }
+
+      // process the internal edge, 4 rows below the outer one
+      if (params.filterLengthInternal) {
+        aom_lpf_horizontal_4(p + 4 * dstStride, dstStride, params.mblim,
+                             params.lim, params.hev_thr);
+      }
+
+      // advance the destination pointer (NOTE(review): literal 8 while coordinates step by MI_SIZE - confirm they always agree)
+      p += 8;
+    }
+  }
+}
+
+#endif  // CONFIG_PARALLEL_DEBLOCKING
+
 void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
                          struct macroblockd_plane planes[MAX_MB_PLANE],
                          int start, int stop, int y_only) {
@ -1970,6 +2309,7 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
 #else  // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
  int mi_row, mi_col;
+#if !CONFIG_PARALLEL_DEBLOCKING
  enum lf_path path;
  LOOP_FILTER_MASK lfm;

@ -1981,58 +2321,38 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
    path = LF_PATH_444;
  else
    path = LF_PATH_SLOW;
+#endif
+
 #if CONFIG_PARALLEL_DEBLOCKING
  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
-      int plane;
-
      av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);

-      // TODO(JBB): Make setup_mask work for non 420.
-      av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+      for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) {
+        const int32_t scaleHorz = planes[planeIdx].subsampling_x;
+        const int32_t scaleVert = planes[planeIdx].subsampling_y;

-      av1_filter_block_plane_ss00_ver(cm, &planes[0], mi_row, &lfm);
-      for (plane = 1; plane < num_planes; ++plane) {
-        switch (path) {
-          case LF_PATH_420:
-            av1_filter_block_plane_ss11_ver(cm, &planes[plane], mi_row, &lfm);
-            break;
-          case LF_PATH_444:
-            av1_filter_block_plane_ss00_ver(cm, &planes[plane], mi_row, &lfm);
-            break;
-          case LF_PATH_SLOW:
-            av1_filter_block_plane_non420_ver(cm, &planes[plane], mi + mi_col,
-                                              mi_row, mi_col);
-            break;
-        }
+        av1_filter_block_plane_vert(
+            cm, planes + planeIdx, (const MODE_INFO **)(mi + mi_col),
+            cm->mi_stride, (mi_col * MI_SIZE) >> scaleHorz,
+            (mi_row * MI_SIZE) >> scaleVert);
      }
    }
  }
  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
-      int plane;
-
      av1_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);

-      // TODO(JBB): Make setup_mask work for non 420.
-      av1_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
+      for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) {
+        const int32_t scaleHorz = planes[planeIdx].subsampling_x;
+        const int32_t scaleVert = planes[planeIdx].subsampling_y;

-      av1_filter_block_plane_ss00_hor(cm, &planes[0], mi_row, &lfm);
-      for (plane = 1; plane < num_planes; ++plane) {
-        switch (path) {
-          case LF_PATH_420:
-            av1_filter_block_plane_ss11_hor(cm, &planes[plane], mi_row, &lfm);
-            break;
-          case LF_PATH_444:
-            av1_filter_block_plane_ss00_hor(cm, &planes[plane], mi_row, &lfm);
-            break;
-          case LF_PATH_SLOW:
-            av1_filter_block_plane_non420_hor(cm, &planes[plane], mi + mi_col,
-                                              mi_row, mi_col);
-            break;
-        }
+        av1_filter_block_plane_horz(
+            cm, planes + planeIdx, (const MODE_INFO **)(mi + mi_col),
+            cm->mi_stride, (mi_col * MI_SIZE) >> scaleHorz,
+            (mi_row * MI_SIZE) >> scaleVert);
      }
    }
  }