integrate parallel_deblocking with CB4x4

This change makes the parallel deblocking experiment work with CB4x4. The inner loop processes every 4x4 block. Change-Id: I86adb3d7b6d67a91ccc12aab29da9bfb8c522cf1
2017-05-23 18:28:51 -07:00 · 2017-05-23 18:28:51 -07:00 · 17905edfe0
--- a/aom_dsp/loopfilter.c
+++ b/aom_dsp/loopfilter.c
@ -149,10 +149,15 @@ void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
                            const uint8_t *blimit, const uint8_t *limit,
                            const uint8_t *thresh) {
  int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
-  for (i = 0; i < 8; ++i) {
+  for (i = 0; i < count; ++i) {
 #if !CONFIG_PARALLEL_DEBLOCKING
    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
@ -179,10 +184,15 @@ void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
 void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh) {
  int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
-  for (i = 0; i < 8; ++i) {
+  for (i = 0; i < count; ++i) {
 #if !CONFIG_PARALLEL_DEBLOCKING
    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
@ -229,10 +239,15 @@ static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
 void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
                            const uint8_t *limit, const uint8_t *thresh) {
  int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
-  for (i = 0; i < 8; ++i) {
+  for (i = 0; i < count; ++i) {
    const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
    const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];

@ -256,8 +271,13 @@ void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
 void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
                          const uint8_t *limit, const uint8_t *thresh) {
  int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif

-  for (i = 0; i < 8; ++i) {
+  for (i = 0; i < count; ++i) {
    const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const int8_t mask =
@ -390,10 +410,15 @@ static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh, int count) {
  int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int step = 4;
+#else
+  int step = 8;
+#endif

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < step * count; ++i) {
    const uint8_t p7 = s[-8 * p], p6 = s[-7 * p], p5 = s[-6 * p],
                  p4 = s[-5 * p], p3 = s[-4 * p], p2 = s[-3 * p],
                  p1 = s[-2 * p], p0 = s[-p];
@ -436,7 +461,11 @@ void aom_lpf_horizontal_edge_8_c(uint8_t *s, int p, const uint8_t *blimit,

 void aom_lpf_horizontal_edge_16_c(uint8_t *s, int p, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh) {
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
+#else
  mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2);
+#endif
 }

 static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
@ -478,7 +507,11 @@ static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,

 void aom_lpf_vertical_16_c(uint8_t *s, int p, const uint8_t *blimit,
                           const uint8_t *limit, const uint8_t *thresh) {
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4);
+#else
  mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8);
+#endif
 }

 void aom_lpf_vertical_16_dual_c(uint8_t *s, int p, const uint8_t *blimit,
@ -596,10 +629,15 @@ void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh, int bd) {
  int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
-  for (i = 0; i < 8; ++i) {
+  for (i = 0; i < count; ++i) {
 #if !CONFIG_PARALLEL_DEBLOCKING
    const uint16_t p3 = s[-4 * p];
    const uint16_t p2 = s[-3 * p];
@ -636,10 +674,15 @@ void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int bd) {
  int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
-  for (i = 0; i < 8; ++i) {
+  for (i = 0; i < count; ++i) {
 #if !CONFIG_PARALLEL_DEBLOCKING
    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
@ -689,10 +732,15 @@ void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
                                   const uint8_t *limit, const uint8_t *thresh,
                                   int bd) {
  int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
-  for (i = 0; i < 8; ++i) {
+  for (i = 0; i < count; ++i) {
    const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
    const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];

@ -718,8 +766,13 @@ void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
                                 const uint8_t *limit, const uint8_t *thresh,
                                 int bd) {
  int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int count = 4;
+#else
+  int count = 8;
+#endif

-  for (i = 0; i < 8; ++i) {
+  for (i = 0; i < count; ++i) {
    const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
    const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
    const int8_t mask =
@ -813,10 +866,15 @@ static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
                                            const uint8_t *thresh, int count,
                                            int bd) {
  int i;
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  int step = 4;
+#else
+  int step = 8;
+#endif

  // loop filter designed to work using chars so that we can make maximum use
  // of 8 bit simd instructions.
-  for (i = 0; i < 8 * count; ++i) {
+  for (i = 0; i < step * count; ++i) {
    const uint16_t p3 = s[-4 * p];
    const uint16_t p2 = s[-3 * p];
    const uint16_t p1 = s[-2 * p];
@ -852,7 +910,11 @@ void aom_highbd_lpf_horizontal_edge_16_c(uint16_t *s, int p,
                                         const uint8_t *blimit,
                                         const uint8_t *limit,
                                         const uint8_t *thresh, int bd) {
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1, bd);
+#else
  highbd_mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 2, bd);
+#endif
 }

 static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
@ -888,13 +950,21 @@ static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
 void aom_highbd_lpf_vertical_16_c(uint16_t *s, int p, const uint8_t *blimit,
                                  const uint8_t *limit, const uint8_t *thresh,
                                  int bd) {
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd);
+#else
  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
+#endif
 }

 void aom_highbd_lpf_vertical_16_dual_c(uint16_t *s, int p,
                                       const uint8_t *blimit,
                                       const uint8_t *limit,
                                       const uint8_t *thresh, int bd) {
+#if CONFIG_PARALLEL_DEBLOCKING && CONFIG_CB4X4
+  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 8, bd);
+#else
  highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 16, bd);
+#endif
 }
 #endif  // CONFIG_HIGHBITDEPTH
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@ -22,7 +22,7 @@

 #include "av1/common/seg_common.h"

-#define CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY 0
+#define PARALLEL_DEBLOCKING_15TAPLUMAONLY 1

 // 64 bit masks for left transform size. Each 1 represents a position where
 // we should apply a loop filter across the left border of an 8x8 block
@ -1857,8 +1857,6 @@ void av1_filter_block_plane_ss11_hor(AV1_COMMON *const cm,
  dst->buf = dst0;
 }

-#if !(CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES || \
-      CONFIG_CB4X4)
 #if CONFIG_PARALLEL_DEBLOCKING
 typedef enum EDGE_DIR { VERT_EDGE = 0, HORZ_EDGE = 1, NUM_EDGE_DIRS } EDGE_DIR;
 static const uint32_t av1_prediction_masks[NUM_EDGE_DIRS][BLOCK_SIZES] = {
@ -2010,10 +2008,17 @@ static void set_lpf_parameters(AV1_DEBLOCKING_PARAMETERS *const pParams,
  // not sure if changes are required.
  assert(0 && "Not yet updated");
 #endif  // CONFIG_EXT_PARTITION
+
  {
    const TX_SIZE ts =
        av1_get_transform_size(ppCurr[0], edgeDir, scaleHorz, scaleVert);
+#if CONFIG_EXT_DELTA_Q
+    const uint32_t currLevel =
+        get_filter_level(cm, &cm->lf_info, &ppCurr[0]->mbmi);
+#else
    const uint32_t currLevel = get_filter_level(&cm->lf_info, &ppCurr[0]->mbmi);
+#endif  // CONFIG_EXT_DELTA_Q
+
    const int currSkipped =
        ppCurr[0]->mbmi.skip && is_inter_block(&ppCurr[0]->mbmi);
    const uint32_t coord = (VERT_EDGE == edgeDir) ? (x) : (y);
@ -2034,7 +2039,13 @@ static void set_lpf_parameters(AV1_DEBLOCKING_PARAMETERS *const pParams,
          const MODE_INFO *const pPrev = *(ppCurr - modeStep);
          const TX_SIZE pvTs =
              av1_get_transform_size(pPrev, edgeDir, scaleHorz, scaleVert);
+#if CONFIG_EXT_DELTA_Q
+          const uint32_t pvLvl =
+              get_filter_level(cm, &cm->lf_info, &pPrev->mbmi);
+#else
          const uint32_t pvLvl = get_filter_level(&cm->lf_info, &pPrev->mbmi);
+#endif  // CONFIG_EXT_DELTA_Q
+
          const int pvSkip = pPrev->mbmi.skip && is_inter_block(&pPrev->mbmi);
          const int32_t puEdge =
              (coord &
@ -2046,7 +2057,7 @@ static void set_lpf_parameters(AV1_DEBLOCKING_PARAMETERS *const pParams,
          // if the current and the previous blocks are skipped,
          // deblock the edge if the edge belongs to a PU's edge only.
          if ((currLevel || pvLvl) && (!pvSkip || !currSkipped || puEdge)) {
-#if CONFIG_PARALLEL_DEBLOCKING_15TAP || CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY
+#if CONFIG_PARALLEL_DEBLOCKING_15TAP || PARALLEL_DEBLOCKING_15TAPLUMAONLY
            const TX_SIZE minTs = AOMMIN(ts, pvTs);
            if (TX_4X4 >= minTs) {
              pParams->filterLength = 4;
@ -2054,7 +2065,7 @@ static void set_lpf_parameters(AV1_DEBLOCKING_PARAMETERS *const pParams,
              pParams->filterLength = 8;
            } else {
              pParams->filterLength = 16;
-#if CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY
+#if PARALLEL_DEBLOCKING_15TAPLUMAONLY
              // No wide filtering for chroma plane
              if (scaleHorz || scaleVert) {
                pParams->filterLength = 8;
@ -2064,7 +2075,7 @@ static void set_lpf_parameters(AV1_DEBLOCKING_PARAMETERS *const pParams,
 #else
            pParams->filterLength = (TX_4X4 >= AOMMIN(ts, pvTs)) ? (4) : (8);

-#endif  // CONFIG_PARALLEL_DEBLOCKING_15TAP
+#endif  // CONFIG_PARALLEL_DEBLOCKING_15TAP || PARALLEL_DEBLOCKING_15TAPLUMAONLY

            // update the level if the current block is skipped,
            // but the previous one is not
@ -2072,10 +2083,14 @@ static void set_lpf_parameters(AV1_DEBLOCKING_PARAMETERS *const pParams,
          }
        }
      }
+
+#if !CONFIG_CB4X4
      // prepare internal edge parameters
      if (currLevel && !currSkipped) {
        pParams->filterLengthInternal = (TX_4X4 >= ts) ? (4) : (0);
      }
+#endif
+
      // prepare common parameters
      if (pParams->filterLength || pParams->filterLengthInternal) {
        const loop_filter_thresh *const limits = cm->lf_info.lfthr + level;
@ -2093,15 +2108,21 @@ static void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
                                        const ptrdiff_t modeStride,
                                        const uint32_t cuX,
                                        const uint32_t cuY) {
+  const int col_step = MI_SIZE >> MI_SIZE_LOG2;
+  const int row_step = MI_SIZE >> MI_SIZE_LOG2;
  const uint32_t scaleHorz = pPlane->subsampling_x;
  const uint32_t scaleVert = pPlane->subsampling_y;
  const uint32_t width = pPlane->dst.width;
  const uint32_t height = pPlane->dst.height;
  uint8_t *const pDst = pPlane->dst.buf;
  const int dstStride = pPlane->dst.stride;
-  for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += 1) {
+  for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += row_step) {
    uint8_t *p = pDst + y * MI_SIZE * dstStride;
-    for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += 1) {
+    for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += col_step) {
+      // The inner loop always filters the vertical edges of one MI block. If
+      // the MI size is 8x8, it filters the vertical edge aligned with the
+      // 8x8 block; if a 4x4 transform is used, it then filters the internal
+      // edge aligned with the 4x4 block.
      const MODE_INFO **const pCurr =
          ppModeInfo + (y << scaleVert) * modeStride + (x << scaleHorz);
      AV1_DEBLOCKING_PARAMETERS params;
@ -2112,31 +2133,59 @@ static void av1_filter_block_plane_vert(const AV1_COMMON *const cm,
      switch (params.filterLength) {
        // apply 4-tap filtering
        case 4:
-          aom_lpf_vertical_4(p, dstStride, params.mblim, params.lim,
-                             params.hev_thr);
+#if CONFIG_HIGHBITDEPTH
+          if (cm->use_highbitdepth)
+            aom_highbd_lpf_vertical_4_c(CONVERT_TO_SHORTPTR(p), dstStride,
+                                        params.mblim, params.lim,
+                                        params.hev_thr, cm->bit_depth);
+          else
+#endif  // CONFIG_HIGHBITDEPTH
+            aom_lpf_vertical_4_c(p, dstStride, params.mblim, params.lim,
+                                 params.hev_thr);
          break;
        // apply 8-tap filtering
        case 8:
-          aom_lpf_vertical_8(p, dstStride, params.mblim, params.lim,
-                             params.hev_thr);
+#if CONFIG_HIGHBITDEPTH
+          if (cm->use_highbitdepth)
+            aom_highbd_lpf_vertical_8_c(CONVERT_TO_SHORTPTR(p), dstStride,
+                                        params.mblim, params.lim,
+                                        params.hev_thr, cm->bit_depth);
+          else
+#endif  // CONFIG_HIGHBITDEPTH
+            aom_lpf_vertical_8_c(p, dstStride, params.mblim, params.lim,
+                                 params.hev_thr);
          break;
-#if CONFIG_PARALLEL_DEBLOCKING_15TAP || CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY
+#if CONFIG_PARALLEL_DEBLOCKING_15TAP || PARALLEL_DEBLOCKING_15TAPLUMAONLY
        // apply 16-tap filtering
        case 16:
-          aom_lpf_vertical_16(p, dstStride, params.mblim, params.lim,
-                              params.hev_thr);
+#if CONFIG_HIGHBITDEPTH
+          if (cm->use_highbitdepth)
+            aom_highbd_lpf_vertical_16_c(CONVERT_TO_SHORTPTR(p), dstStride,
+                                         params.mblim, params.lim,
+                                         params.hev_thr, cm->bit_depth);
+          else
+#endif  // CONFIG_HIGHBITDEPTH
+            aom_lpf_vertical_16_c(p, dstStride, params.mblim, params.lim,
+                                  params.hev_thr);
          break;
-#endif  // CONFIG_PARALLEL_DEBLOCKING_15TAP
+#endif  // CONFIG_PARALLEL_DEBLOCKING_15TAP || PARALLEL_DEBLOCKING_15TAPLUMAONLY
        // no filtering
        default: break;
      }
      // process the internal edge
      if (params.filterLengthInternal) {
-        aom_lpf_vertical_4(p + 4, dstStride, params.mblim, params.lim,
-                           params.hev_thr);
+#if CONFIG_HIGHBITDEPTH
+        if (cm->use_highbitdepth)
+          aom_highbd_lpf_vertical_4_c(CONVERT_TO_SHORTPTR(p + 4), dstStride,
+                                      params.mblim, params.lim, params.hev_thr,
+                                      cm->bit_depth);
+        else
+#endif  // CONFIG_HIGHBITDEPTH
+          aom_lpf_vertical_4_c(p + 4, dstStride, params.mblim, params.lim,
+                               params.hev_thr);
      }
      // advance the destination pointer
-      p += 8;
+      p += MI_SIZE;
    }
  }
 }
@ -2147,15 +2196,21 @@ static void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
                                        const ptrdiff_t modeStride,
                                        const uint32_t cuX,
                                        const uint32_t cuY) {
+  const int col_step = MI_SIZE >> MI_SIZE_LOG2;
+  const int row_step = MI_SIZE >> MI_SIZE_LOG2;
  const uint32_t scaleHorz = pPlane->subsampling_x;
  const uint32_t scaleVert = pPlane->subsampling_y;
  const uint32_t width = pPlane->dst.width;
  const uint32_t height = pPlane->dst.height;
  uint8_t *const pDst = pPlane->dst.buf;
  const int dstStride = pPlane->dst.stride;
-  for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += 1) {
+  for (int y = 0; y < (MAX_MIB_SIZE >> scaleVert); y += row_step) {
    uint8_t *p = pDst + y * MI_SIZE * dstStride;
-    for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += 1) {
+    for (int x = 0; x < (MAX_MIB_SIZE >> scaleHorz); x += col_step) {
+      // The inner loop always filters the horizontal edges of one MI block.
+      // If the MI size is 8x8, it first filters the horizontal edge aligned
+      // with the 8x8 block; if a 4x4 transform is used, it then filters the
+      // internal edge aligned with the 4x4 block.
      const MODE_INFO **const pCurr =
          ppModeInfo + (y << scaleVert) * modeStride + (x << scaleHorz);
      AV1_DEBLOCKING_PARAMETERS params;
@ -2166,45 +2221,74 @@ static void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
      switch (params.filterLength) {
        // apply 4-tap filtering
        case 4:
-          aom_lpf_horizontal_4(p, dstStride, params.mblim, params.lim,
-                               params.hev_thr);
+#if CONFIG_HIGHBITDEPTH
+          if (cm->use_highbitdepth)
+            aom_highbd_lpf_horizontal_4_c(CONVERT_TO_SHORTPTR(p), dstStride,
+                                          params.mblim, params.lim,
+                                          params.hev_thr, cm->bit_depth);
+          else
+#endif  // CONFIG_HIGHBITDEPTH
+            aom_lpf_horizontal_4_c(p, dstStride, params.mblim, params.lim,
+                                   params.hev_thr);
          break;
        // apply 8-tap filtering
        case 8:
-          aom_lpf_horizontal_8(p, dstStride, params.mblim, params.lim,
-                               params.hev_thr);
+#if CONFIG_HIGHBITDEPTH
+          if (cm->use_highbitdepth)
+            aom_highbd_lpf_horizontal_8_c(CONVERT_TO_SHORTPTR(p), dstStride,
+                                          params.mblim, params.lim,
+                                          params.hev_thr, cm->bit_depth);
+          else
+#endif  // CONFIG_HIGHBITDEPTH
+            aom_lpf_horizontal_8_c(p, dstStride, params.mblim, params.lim,
+                                   params.hev_thr);
          break;
-#if CONFIG_PARALLEL_DEBLOCKING_15TAP || CONFIG_PARALLEL_DEBLOCKING_15TAPLUMAONLY
+#if CONFIG_PARALLEL_DEBLOCKING_15TAP || PARALLEL_DEBLOCKING_15TAPLUMAONLY
        // apply 16-tap filtering
        case 16:
-          aom_lpf_horizontal_edge_16(p, dstStride, params.mblim, params.lim,
-                                     params.hev_thr);
+#if CONFIG_HIGHBITDEPTH
+          if (cm->use_highbitdepth)
+            aom_highbd_lpf_horizontal_edge_16_c(
+                CONVERT_TO_SHORTPTR(p), dstStride, params.mblim, params.lim,
+                params.hev_thr, cm->bit_depth);
+          else
+#endif  // CONFIG_HIGHBITDEPTH
+            aom_lpf_horizontal_edge_16_c(p, dstStride, params.mblim, params.lim,
+                                         params.hev_thr);
          break;
-#endif  // CONFIG_PARALLEL_DEBLOCKING_15TAP
+#endif  // CONFIG_PARALLEL_DEBLOCKING_15TAP || PARALLEL_DEBLOCKING_15TAPLUMAONLY
        // no filtering
        default: break;
      }
      // process the internal edge
      if (params.filterLengthInternal) {
-        aom_lpf_horizontal_4(p + 4 * dstStride, dstStride, params.mblim,
-                             params.lim, params.hev_thr);
+#if CONFIG_HIGHBITDEPTH
+        if (cm->use_highbitdepth)
+          aom_highbd_lpf_horizontal_4_c(CONVERT_TO_SHORTPTR(p + 4 * dstStride),
+                                        dstStride, params.mblim, params.lim,
+                                        params.hev_thr, cm->bit_depth);
+        else
+#endif  // CONFIG_HIGHBITDEPTH
+          aom_lpf_horizontal_4_c(p + 4 * dstStride, dstStride, params.mblim,
+                                 params.lim, params.hev_thr);
      }
      // advance the destination pointer
-      p += 8;
+      p += MI_SIZE;
    }
  }
 }
 #endif  // CONFIG_PARALLEL_DEBLOCKING
-#endif

 void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
                          struct macroblockd_plane planes[MAX_MB_PLANE],
                          int start, int stop, int y_only) {
-#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES || \
-    CONFIG_CB4X4
  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
  int mi_row, mi_col;

+#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES || \
+    CONFIG_CB4X4
+
+#if !CONFIG_PARALLEL_DEBLOCKING
 #if CONFIG_VAR_TX
  for (int i = 0; i < MAX_MB_PLANE; ++i)
    memset(cm->top_txfm_context[i], TX_32X32, cm->mi_cols << TX_UNIT_WIDE_LOG2);
@ -2229,27 +2313,17 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
      }
    }
  }
-#else  // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
-  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
-  int mi_row, mi_col;
-#if !CONFIG_PARALLEL_DEBLOCKING
-  enum lf_path path;
-  LOOP_FILTER_MASK lfm;
+#else
+
+#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
+  assert(0 && "Not yet updated. ToDo as next steps");
+#endif  // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES

-  if (y_only)
-    path = LF_PATH_444;
-  else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
-    path = LF_PATH_420;
-  else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
-    path = LF_PATH_444;
-  else
-    path = LF_PATH_SLOW;
-#endif
-#if CONFIG_PARALLEL_DEBLOCKING
  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
      av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
+      // filter all vertical edges in every 64x64 super block
      for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) {
        const int32_t scaleHorz = planes[planeIdx].subsampling_x;
        const int32_t scaleVert = planes[planeIdx].subsampling_y;
@ -2264,6 +2338,42 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
      av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
+      // filter all horizontal edges in every 64x64 super block
+      for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) {
+        const int32_t scaleHorz = planes[planeIdx].subsampling_x;
+        const int32_t scaleVert = planes[planeIdx].subsampling_y;
+        av1_filter_block_plane_horz(
+            cm, planes + planeIdx, (const MODE_INFO **)(mi + mi_col),
+            cm->mi_stride, (mi_col * MI_SIZE) >> scaleHorz,
+            (mi_row * MI_SIZE) >> scaleVert);
+      }
+    }
+  }
+#endif  // CONFIG_PARALLEL_DEBLOCKING
+
+#else  // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
+
+#if CONFIG_PARALLEL_DEBLOCKING
+  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+      av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
+      // filter all vertical edges in every 64x64 super block
+      for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) {
+        const int32_t scaleHorz = planes[planeIdx].subsampling_x;
+        const int32_t scaleVert = planes[planeIdx].subsampling_y;
+        av1_filter_block_plane_vert(
+            cm, planes + planeIdx, (const MODE_INFO **)(mi + mi_col),
+            cm->mi_stride, (mi_col * MI_SIZE) >> scaleHorz,
+            (mi_row * MI_SIZE) >> scaleVert);
+      }
+    }
+  }
+  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
+    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
+      av1_setup_dst_planes(planes, cm->sb_size, frame_buffer, mi_row, mi_col);
+      // filter all horizontal edges in every 64x64 super block
      for (int planeIdx = 0; planeIdx < num_planes; planeIdx += 1) {
        const int32_t scaleHorz = planes[planeIdx].subsampling_x;
        const int32_t scaleVert = planes[planeIdx].subsampling_y;
@ -2275,6 +2385,18 @@ void av1_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, AV1_COMMON *cm,
    }
  }
 #else   // CONFIG_PARALLEL_DEBLOCKING
+  enum lf_path path;
+  LOOP_FILTER_MASK lfm;
+
+  if (y_only)
+    path = LF_PATH_444;
+  else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
+    path = LF_PATH_420;
+  else if (planes[1].subsampling_y == 0 && planes[1].subsampling_x == 0)
+    path = LF_PATH_444;
+  else
+    path = LF_PATH_SLOW;
+
  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
--- a/5
+++ b/5
@ -551,6 +551,11 @@ post_process_cmdline() {
      soft_enable accounting
      soft_enable inspection
    fi
+    if enabled parallel_deblocking_15tap && ! enabled parallel_deblocking; then
+      log_echo "parallel_deblocking_15tap dependes on parallel_deblocking, so"
+      log_echo "enabling parallel_deblocking"
+      soft_enable parallel_deblocking
+    fi
 }

 process_targets() {