Apply the shorter-tap interpolation filter first in high-bitdepth mode

BD-rate varies within +/-0.04%.

Change-Id: I76f440c479d411c09ef39a19b46eb8dbc5330efb
2016-12-11 22:53:17 -08:00 · 2016-12-11 22:53:17 -08:00 · 9e963dc0ed
--- a/aom_dsp/x86/convolve.h
+++ b/aom_dsp/x86/convolve.h
@ -131,65 +131,65 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                       unsigned int output_height,
                                       const int16_t *filter, int bd);

-#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
-  void aom_highbd_convolve8_##name##_##opt(                               \
-      const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,           \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,       \
-      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {     \
-    if (step_q4 == 16 && filter[3] != 128) {                              \
-      uint16_t *src = CONVERT_TO_SHORTPTR(src8);                          \
-      uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                          \
-      if (filter[0] | filter[1] | filter[2]) {                            \
-        while (w >= 16) {                                                 \
-          aom_highbd_filter_block1d16_##dir##8_##avg##opt(                \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);     \
-          src += 16;                                                      \
-          dst += 16;                                                      \
-          w -= 16;                                                        \
-        }                                                                 \
-        while (w >= 8) {                                                  \
-          aom_highbd_filter_block1d8_##dir##8_##avg##opt(                 \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);     \
-          src += 8;                                                       \
-          dst += 8;                                                       \
-          w -= 8;                                                         \
-        }                                                                 \
-        while (w >= 4) {                                                  \
-          aom_highbd_filter_block1d4_##dir##8_##avg##opt(                 \
-              src_start, src_stride, dst, dst_stride, h, filter, bd);     \
-          src += 4;                                                       \
-          dst += 4;                                                       \
-          w -= 4;                                                         \
-        }                                                                 \
-      } else {                                                            \
-        while (w >= 16) {                                                 \
-          aom_highbd_filter_block1d16_##dir##2_##avg##opt(                \
-              src, src_stride, dst, dst_stride, h, filter, bd);           \
-          src += 16;                                                      \
-          dst += 16;                                                      \
-          w -= 16;                                                        \
-        }                                                                 \
-        while (w >= 8) {                                                  \
-          aom_highbd_filter_block1d8_##dir##2_##avg##opt(                 \
-              src, src_stride, dst, dst_stride, h, filter, bd);           \
-          src += 8;                                                       \
-          dst += 8;                                                       \
-          w -= 8;                                                         \
-        }                                                                 \
-        while (w >= 4) {                                                  \
-          aom_highbd_filter_block1d4_##dir##2_##avg##opt(                 \
-              src, src_stride, dst, dst_stride, h, filter, bd);           \
-          src += 4;                                                       \
-          dst += 4;                                                       \
-          w -= 4;                                                         \
-        }                                                                 \
-      }                                                                   \
-    }                                                                     \
-    if (w) {                                                              \
-      aom_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
-                                      filter_x, x_step_q4, filter_y,      \
-                                      y_step_q4, w, h, bd);               \
-    }                                                                     \
+#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt)  \
+  void aom_highbd_convolve8_##name##_##opt(                                \
+      const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,            \
+      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,        \
+      const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {      \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); /* declared here so the */  \
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); /* _c fallback sees them */ \
+    if (step_q4 == 16 && filter[3] != 128) { /* else all w goes to _c */   \
+      if (filter[0] | filter[1] | filter[2]) { /* 8-tap kernels */         \
+        while (w >= 16) {                                                  \
+          aom_highbd_filter_block1d16_##dir##8_##avg##opt(                 \
+              src_start, src_stride, dst, dst_stride, h, filter, bd);      \
+          src += 16;                                                       \
+          dst += 16;                                                       \
+          w -= 16;                                                         \
+        }                                                                  \
+        while (w >= 8) {                                                   \
+          aom_highbd_filter_block1d8_##dir##8_##avg##opt(                  \
+              src_start, src_stride, dst, dst_stride, h, filter, bd);      \
+          src += 8;                                                        \
+          dst += 8;                                                        \
+          w -= 8;                                                          \
+        }                                                                  \
+        while (w >= 4) {                                                   \
+          aom_highbd_filter_block1d4_##dir##8_##avg##opt(                  \
+              src_start, src_stride, dst, dst_stride, h, filter, bd);      \
+          src += 4;                                                        \
+          dst += 4;                                                        \
+          w -= 4;                                                          \
+        }                                                                  \
+      } else { /* taps 0..2 all zero: 2-tap kernels */                     \
+        while (w >= 16) {                                                  \
+          aom_highbd_filter_block1d16_##dir##2_##avg##opt(                 \
+              src, src_stride, dst, dst_stride, h, filter, bd);            \
+          src += 16;                                                       \
+          dst += 16;                                                       \
+          w -= 16;                                                         \
+        }                                                                  \
+        while (w >= 8) {                                                   \
+          aom_highbd_filter_block1d8_##dir##2_##avg##opt(                  \
+              src, src_stride, dst, dst_stride, h, filter, bd);            \
+          src += 8;                                                        \
+          dst += 8;                                                        \
+          w -= 8;                                                          \
+        }                                                                  \
+        while (w >= 4) {                                                   \
+          aom_highbd_filter_block1d4_##dir##2_##avg##opt(                  \
+              src, src_stride, dst, dst_stride, h, filter, bd);            \
+          src += 4;                                                        \
+          dst += 4;                                                        \
+          w -= 4;                                                          \
+        }                                                                  \
+      }                                                                    \
+    }                                                                      \
+    if (w) { /* remaining columns go to the _c version */                  \
+      aom_highbd_convolve8_##name##_c(                                     \
+          CONVERT_TO_BYTEPTR(src), src_stride, CONVERT_TO_BYTEPTR(dst),    \
+          dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \
+    }                                                                      \
  }

 #define HIGH_FUN_CONV_2D(avg, opt)                                            \
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@ -390,6 +390,7 @@ void av1_highbd_convolve_vert_facade(const uint8_t *src8, int src_stride,
                                     int avg, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
  if (filter_params.taps == SUBPEL_TAPS) {
    const int16_t *filter_y =
        av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
@ -451,44 +452,76 @@ void av1_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
                                    filter_params, subpel_y_q4, y_step_q4,
                                    ref_idx, bd);
  } else {
-    // temp's size is set to (maximum possible intermediate_height) *
-    // MAX_BLOCK_WIDTH
-    uint16_t temp[((((MAX_BLOCK_HEIGHT - 1) * MAX_STEP + 15) >> SUBPEL_BITS) +
-                   MAX_FILTER_TAP) *
-                  MAX_BLOCK_WIDTH];
+    // temp's size is set to a 256 aligned value to facilitate SIMD
+    // implementation. The value is greater than (maximum possible intermediate
+    // height or width) * MAX_SB_SIZE
+    DECLARE_ALIGNED(16, uint16_t,
+                    temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
    uint8_t *temp8 = CONVERT_TO_BYTEPTR(temp);
-    int temp_stride = MAX_BLOCK_WIDTH;
-
+    int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16);
+    int filter_size;
+    InterpFilterParams filter_params;
 #if CONFIG_DUAL_FILTER
    InterpFilterParams filter_params_x =
        av1_get_interp_filter_params(interp_filter[1 + 2 * ref_idx]);
    InterpFilterParams filter_params_y =
        av1_get_interp_filter_params(interp_filter[0 + 2 * ref_idx]);
-    InterpFilterParams filter_params = filter_params_x;
-    int filter_size = filter_params_y.taps;
-#else
-    InterpFilterParams filter_params =
-        av1_get_interp_filter_params(interp_filter);
-    int filter_size = filter_params.taps;
 #endif

-    int intermediate_height =
-        (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
-
-    av1_highbd_convolve_horiz_facade(src8 - src_stride * (filter_size / 2 - 1),
-                                     src_stride, temp8, temp_stride, w,
-                                     intermediate_height, filter_params,
-                                     subpel_x_q4, x_step_q4, 0, bd);
-
 #if CONFIG_DUAL_FILTER
-    filter_params = filter_params_y;
-#endif
-    filter_size = filter_params.taps;
-    assert(filter_params.taps <= MAX_FILTER_TAP);
+    if (filter_params_y.taps < filter_params_x.taps) {
+      int intermediate_width;
+      int temp_stride = max_intermediate_size;
+      filter_params = filter_params_y;
+      filter_size = filter_params_x.taps;
+      intermediate_width =
+          (((w - 1) * x_step_q4 + subpel_x_q4) >> SUBPEL_BITS) + filter_size;
+      assert(intermediate_width <= max_intermediate_size);

-    av1_highbd_convolve_vert_facade(
-        temp8 + temp_stride * (filter_size / 2 - 1), temp_stride, dst8,
-        dst_stride, w, h, filter_params, subpel_y_q4, y_step_q4, ref_idx, bd);
+      assert(filter_params.taps <= MAX_FILTER_TAP);
+
+      av1_highbd_convolve_vert_facade(
+          src8 - (filter_size / 2 - 1), src_stride, temp8, temp_stride,
+          intermediate_width, h, filter_params, subpel_y_q4, y_step_q4, 0, bd);
+
+      filter_params = filter_params_x;
+      assert(filter_params.taps <= MAX_FILTER_TAP);
+
+      av1_highbd_convolve_horiz_facade(
+          temp8 + (filter_size / 2 - 1), temp_stride, dst8, dst_stride, w, h,
+          filter_params, subpel_x_q4, x_step_q4, ref_idx, bd);
+    } else
+#endif  // CONFIG_DUAL_FILTER
+    {
+      int intermediate_height;
+      int temp_stride = MAX_SB_SIZE;
+#if CONFIG_DUAL_FILTER
+      filter_params = filter_params_x;
+      filter_size = filter_params_y.taps;
+#else
+      filter_params = av1_get_interp_filter_params(interp_filter);
+      filter_size = filter_params.taps;
+#endif
+      intermediate_height =
+          (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
+      assert(intermediate_height <= max_intermediate_size);
+      (void)max_intermediate_size;
+
+      av1_highbd_convolve_horiz_facade(
+          src8 - src_stride * (filter_size / 2 - 1), src_stride, temp8,
+          temp_stride, w, intermediate_height, filter_params, subpel_x_q4,
+          x_step_q4, 0, bd);
+
+#if CONFIG_DUAL_FILTER
+      filter_params = filter_params_y;
+#endif
+      filter_size = filter_params.taps;
+      assert(filter_params.taps <= MAX_FILTER_TAP);
+
+      av1_highbd_convolve_vert_facade(
+          temp8 + temp_stride * (filter_size / 2 - 1), temp_stride, dst8,
+          dst_stride, w, h, filter_params, subpel_y_q4, y_step_q4, ref_idx, bd);
+    }
  }
 }
 #endif  // CONFIG_AOM_HIGHBITDEPTH