More accurate chroma warping

Previously, the projected positions of chroma pixels effectively underwent double rounding, since we rounded both when calculating x4 and y4 and when calculating the filter index. Further, the two roundings were different: x4 and y4 used ROUND_POWER_OF_TWO_SIGNED, whereas the filter index uses ROUND_POWER_OF_TWO. It is slightly more accurate (and faster) to replace the first rounding with a shift; this is motivated by the fact that ROUND_POWER_OF_TWO(x >> a, b) == ROUND_POWER_OF_TWO(x, a + b) when >> is an arithmetic (flooring) shift. Note that the patch writes this step as an integer division by 4, which truncates toward zero and therefore differs from a shift when the numerator is negative and not a multiple of 4. Change-Id: Ia52b05745168d0aeb05f0af4c75ff33eee791d82
2017-05-05 11:18:14 +01:00 · 2017-05-05 11:18:14 +01:00 · f7a5ee536b
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@ -984,18 +984,16 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
    for (j = p_col; j < p_col + p_width; j += 8) {
      int32_t x4, y4, ix4, sx4, iy4, sy4;
      if (subsampling_x)
-        x4 = ROUND_POWER_OF_TWO_SIGNED(
-            mat[2] * 2 * (j + 4) + mat[3] * 2 * (i + 4) + mat[0] +
-                (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
-            1);
+        x4 = (mat[2] * 4 * (j + 4) + mat[3] * 4 * (i + 4) + mat[0] * 2 +
+              (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) /
+             4;
      else
        x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];

      if (subsampling_y)
-        y4 = ROUND_POWER_OF_TWO_SIGNED(
-            mat[4] * 2 * (j + 4) + mat[5] * 2 * (i + 4) + mat[1] +
-                (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
-            1);
+        y4 = (mat[4] * 4 * (j + 4) + mat[5] * 4 * (i + 4) + mat[1] * 2 +
+              (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) /
+             4;
      else
        y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];

@ -1229,18 +1227,16 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
    for (j = p_col; j < p_col + p_width; j += 8) {
      int32_t x4, y4, ix4, sx4, iy4, sy4;
      if (subsampling_x)
-        x4 = ROUND_POWER_OF_TWO_SIGNED(
-            mat[2] * 2 * (j + 4) + mat[3] * 2 * (i + 4) + mat[0] +
-                (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
-            1);
+        x4 = (mat[2] * 4 * (j + 4) + mat[3] * 4 * (i + 4) + mat[0] * 2 +
+              (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) /
+             4;
      else
        x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];

      if (subsampling_y)
-        y4 = ROUND_POWER_OF_TWO_SIGNED(
-            mat[4] * 2 * (j + 4) + mat[5] * 2 * (i + 4) + mat[1] +
-                (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
-            1);
+        y4 = (mat[4] * 4 * (j + 4) + mat[5] * 4 * (i + 4) + mat[1] * 2 +
+              (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) /
+             4;
      else
        y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];

--- a/av1/common/x86/highbd_warp_plane_ssse3.c
+++ b/av1/common/x86/highbd_warp_plane_ssse3.c
@ -50,18 +50,16 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,

      int32_t x4, y4, ix4, sx4, iy4, sy4;
      if (subsampling_x)
-        x4 = ROUND_POWER_OF_TWO_SIGNED(
-            mat[2] * 2 * dst_x + mat[3] * 2 * dst_y + mat[0] +
-                (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
-            1);
+        x4 = (mat[2] * 4 * dst_x + mat[3] * 4 * dst_y + mat[0] * 2 +
+              (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) /
+             4;
      else
        x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0];

      if (subsampling_y)
-        y4 = ROUND_POWER_OF_TWO_SIGNED(
-            mat[4] * 2 * dst_x + mat[5] * 2 * dst_y + mat[1] +
-                (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
-            1);
+        y4 = (mat[4] * 4 * dst_x + mat[5] * 4 * dst_y + mat[1] * 2 +
+              (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) /
+             4;
      else
        y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1];

--- a/av1/common/x86/warp_plane_sse2.c
+++ b/av1/common/x86/warp_plane_sse2.c
@ -45,18 +45,16 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,

      int32_t x4, y4, ix4, sx4, iy4, sy4;
      if (subsampling_x)
-        x4 = ROUND_POWER_OF_TWO_SIGNED(
-            mat[2] * 2 * dst_x + mat[3] * 2 * dst_y + mat[0] +
-                (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
-            1);
+        x4 = (mat[2] * 4 * dst_x + mat[3] * 4 * dst_y + mat[0] * 2 +
+              (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) /
+             4;
      else
        x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0];

      if (subsampling_y)
-        y4 = ROUND_POWER_OF_TWO_SIGNED(
-            mat[4] * 2 * dst_x + mat[5] * 2 * dst_y + mat[1] +
-                (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
-            1);
+        y4 = (mat[4] * 4 * dst_x + mat[5] * 4 * dst_y + mat[1] * 2 +
+              (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) /
+             4;
      else
        y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1];

--- a/av1/common/x86/warp_plane_ssse3.c
+++ b/av1/common/x86/warp_plane_ssse3.c
@ -232,18 +232,16 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,

      int32_t x4, y4, ix4, sx4, iy4, sy4;
      if (subsampling_x)
-        x4 = ROUND_POWER_OF_TWO_SIGNED(
-            mat[2] * 2 * dst_x + mat[3] * 2 * dst_y + mat[0] +
-                (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
-            1);
+        x4 = (mat[2] * 4 * dst_x + mat[3] * 4 * dst_y + mat[0] * 2 +
+              (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS))) /
+             4;
      else
        x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0];

      if (subsampling_y)
-        y4 = ROUND_POWER_OF_TWO_SIGNED(
-            mat[4] * 2 * dst_x + mat[5] * 2 * dst_y + mat[1] +
-                (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
-            1);
+        y4 = (mat[4] * 4 * dst_x + mat[5] * 4 * dst_y + mat[1] * 2 +
+              (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS))) /
+             4;
      else
        y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1];