Change warp filter to use one less precision bit
Change-Id: Idc7bb686f5751b0457c9f21daac0fa6f4865fd22
This commit is contained in:
Родитель
8feaaac8fd
Коммит
a77ec1c922
|
@ -701,12 +701,8 @@ static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
|
|||
8240, 8224, 8208, 8192,
|
||||
};
|
||||
|
||||
static INLINE int16_t saturate_int16(int32_t v) {
|
||||
if (v > 32767)
|
||||
return 32767;
|
||||
else if (v < -32768)
|
||||
return -32768;
|
||||
return v;
|
||||
static INLINE uint16_t saturate_uint(int32_t v, int bits) {
|
||||
return (uint16_t)clamp(v, 0, (1 << bits) - 1);
|
||||
}
|
||||
|
||||
#if CONFIG_WARPED_MOTION
|
||||
|
@ -1028,14 +1024,18 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
|
|||
if (ix4 <= -7) {
|
||||
for (l = 0; l < 8; ++l) {
|
||||
tmp[(k + 7) * 8 + l] =
|
||||
ref[iy * stride] *
|
||||
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
|
||||
(1 << (bd + WARPEDPIXEL_FILTER_BITS -
|
||||
HORSHEAR_REDUCE_PREC_BITS - 1)) +
|
||||
ref[iy * stride] * (1 << (WARPEDPIXEL_FILTER_BITS -
|
||||
HORSHEAR_REDUCE_PREC_BITS));
|
||||
}
|
||||
} else if (ix4 >= width + 6) {
|
||||
for (l = 0; l < 8; ++l) {
|
||||
tmp[(k + 7) * 8 + l] =
|
||||
ref[iy * stride + (width - 1)] *
|
||||
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
|
||||
tmp[(k + 7) * 8 + l] = (1 << (bd + WARPEDPIXEL_FILTER_BITS -
|
||||
HORSHEAR_REDUCE_PREC_BITS - 1)) +
|
||||
ref[iy * stride + (width - 1)] *
|
||||
(1 << (WARPEDPIXEL_FILTER_BITS -
|
||||
HORSHEAR_REDUCE_PREC_BITS));
|
||||
}
|
||||
} else {
|
||||
int sx = sx4 + beta * (k + 4);
|
||||
|
@ -1045,14 +1045,16 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
|
|||
const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
|
||||
WARPEDPIXEL_PREC_SHIFTS;
|
||||
const int16_t *coeffs = warped_filter[offs];
|
||||
int32_t sum = 0;
|
||||
int32_t sum = 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1);
|
||||
// assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
|
||||
for (m = 0; m < 8; ++m) {
|
||||
sum += ref[iy * stride + ix + m] * coeffs[m];
|
||||
}
|
||||
sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
|
||||
#if HORSHEAR_REDUCE_PREC_BITS >= 5
|
||||
tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
|
||||
tmp[(k + 7) * 8 + (l + 4)] =
|
||||
saturate_uint(sum, bd + WARPEDPIXEL_FILTER_BITS -
|
||||
HORSHEAR_REDUCE_PREC_BITS + 1);
|
||||
#else
|
||||
tmp[(k + 7) * 8 + (l + 4)] = sum;
|
||||
#endif
|
||||
|
@ -1070,7 +1072,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
|
|||
const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
|
||||
WARPEDPIXEL_PREC_SHIFTS;
|
||||
const int16_t *coeffs = warped_filter[offs];
|
||||
int32_t sum = 0;
|
||||
int32_t sum = -(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1));
|
||||
// assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
|
||||
for (m = 0; m < 8; ++m) {
|
||||
sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
|
||||
|
@ -1232,6 +1234,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
|
|||
int16_t delta) {
|
||||
int16_t tmp[15 * 8];
|
||||
int i, j, k, l, m;
|
||||
const int bd = 8;
|
||||
|
||||
/* Note: For this code to work, the left/right frame borders need to be
|
||||
extended by at least 13 pixels each. By the time we get here, other
|
||||
|
@ -1288,8 +1291,10 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
|
|||
// (once border extension is taken into account)
|
||||
for (l = 0; l < 8; ++l) {
|
||||
tmp[(k + 7) * 8 + l] =
|
||||
ref[iy * stride] *
|
||||
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
|
||||
(1 << (bd + WARPEDPIXEL_FILTER_BITS -
|
||||
HORSHEAR_REDUCE_PREC_BITS - 1)) +
|
||||
ref[iy * stride] * (1 << (WARPEDPIXEL_FILTER_BITS -
|
||||
HORSHEAR_REDUCE_PREC_BITS));
|
||||
}
|
||||
} else if (ix4 >= width + 6) {
|
||||
// In this case, the leftmost pixel sampled is in column
|
||||
|
@ -1297,9 +1302,11 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
|
|||
// will sample only from the rightmost column
|
||||
// (once border extension is taken into account)
|
||||
for (l = 0; l < 8; ++l) {
|
||||
tmp[(k + 7) * 8 + l] =
|
||||
ref[iy * stride + (width - 1)] *
|
||||
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
|
||||
tmp[(k + 7) * 8 + l] = (1 << (bd + WARPEDPIXEL_FILTER_BITS -
|
||||
HORSHEAR_REDUCE_PREC_BITS - 1)) +
|
||||
ref[iy * stride + (width - 1)] *
|
||||
(1 << (WARPEDPIXEL_FILTER_BITS -
|
||||
HORSHEAR_REDUCE_PREC_BITS));
|
||||
}
|
||||
} else {
|
||||
// If we get here, then
|
||||
|
@ -1317,13 +1324,15 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
|
|||
const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
|
||||
WARPEDPIXEL_PREC_SHIFTS;
|
||||
const int16_t *coeffs = warped_filter[offs];
|
||||
int32_t sum = 0;
|
||||
int32_t sum = 1 << (bd + WARPEDPIXEL_FILTER_BITS - 1);
|
||||
// assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
|
||||
for (m = 0; m < 8; ++m) {
|
||||
sum += ref[iy * stride + ix + m] * coeffs[m];
|
||||
}
|
||||
sum = ROUND_POWER_OF_TWO(sum, HORSHEAR_REDUCE_PREC_BITS);
|
||||
tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
|
||||
tmp[(k + 7) * 8 + (l + 4)] =
|
||||
saturate_uint(sum, bd + WARPEDPIXEL_FILTER_BITS -
|
||||
HORSHEAR_REDUCE_PREC_BITS + 1);
|
||||
sx += alpha;
|
||||
}
|
||||
}
|
||||
|
@ -1339,7 +1348,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
|
|||
const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
|
||||
WARPEDPIXEL_PREC_SHIFTS;
|
||||
const int16_t *coeffs = warped_filter[offs];
|
||||
int32_t sum = 0;
|
||||
int32_t sum = -(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1));
|
||||
// assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
|
||||
for (m = 0; m < 8; ++m) {
|
||||
sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
|
||||
|
|
|
@ -89,8 +89,10 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
|
|||
else if (iy > height - 1)
|
||||
iy = height - 1;
|
||||
tmp[k + 7] = _mm_set1_epi16(
|
||||
(1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
|
||||
1)) +
|
||||
ref[iy * stride] *
|
||||
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
|
||||
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
|
||||
}
|
||||
} else if (ix4 >= width + 6) {
|
||||
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
|
||||
|
@ -100,8 +102,10 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
|
|||
else if (iy > height - 1)
|
||||
iy = height - 1;
|
||||
tmp[k + 7] = _mm_set1_epi16(
|
||||
(1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
|
||||
1)) +
|
||||
ref[iy * stride + (width - 1)] *
|
||||
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
|
||||
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
|
||||
}
|
||||
} else {
|
||||
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
|
||||
|
@ -151,7 +155,8 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
|
|||
const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
|
||||
|
||||
const __m128i round_const =
|
||||
_mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
|
||||
_mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
|
||||
((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
|
||||
|
||||
// Calculate filtered results
|
||||
const __m128i res_0 = _mm_madd_epi16(src, coeff_0);
|
||||
|
@ -299,7 +304,8 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
|
|||
|
||||
// Round and pack into 8 bits
|
||||
const __m128i round_const =
|
||||
_mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
|
||||
_mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
|
||||
((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
|
||||
|
||||
const __m128i res_lo_round = _mm_srai_epi32(
|
||||
_mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
|
||||
|
|
|
@ -23,6 +23,7 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
|
|||
int16_t delta) {
|
||||
__m128i tmp[15];
|
||||
int i, j, k;
|
||||
const int bd = 8;
|
||||
|
||||
/* Note: For this code to work, the left/right frame borders need to be
|
||||
extended by at least 13 pixels each. By the time we get here, other
|
||||
|
@ -84,8 +85,10 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
|
|||
else if (iy > height - 1)
|
||||
iy = height - 1;
|
||||
tmp[k + 7] = _mm_set1_epi16(
|
||||
(1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
|
||||
1)) +
|
||||
ref[iy * stride] *
|
||||
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
|
||||
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
|
||||
}
|
||||
} else if (ix4 >= width + 6) {
|
||||
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
|
||||
|
@ -95,8 +98,10 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
|
|||
else if (iy > height - 1)
|
||||
iy = height - 1;
|
||||
tmp[k + 7] = _mm_set1_epi16(
|
||||
(1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
|
||||
1)) +
|
||||
ref[iy * stride + (width - 1)] *
|
||||
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
|
||||
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
|
||||
}
|
||||
} else {
|
||||
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
|
||||
|
@ -145,7 +150,8 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
|
|||
const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);
|
||||
|
||||
const __m128i round_const =
|
||||
_mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
|
||||
_mm_set1_epi32((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
|
||||
((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
|
||||
|
||||
// Calculate filtered results
|
||||
const __m128i src_0 = _mm_unpacklo_epi8(src, zero);
|
||||
|
@ -294,7 +300,8 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
|
|||
|
||||
// Round and pack into 8 bits
|
||||
const __m128i round_const =
|
||||
_mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
|
||||
_mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
|
||||
((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
|
||||
|
||||
const __m128i res_lo_round = _mm_srai_epi32(
|
||||
_mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
|
||||
|
|
|
@ -210,6 +210,7 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
|
|||
int16_t delta) {
|
||||
__m128i tmp[15];
|
||||
int i, j, k;
|
||||
const int bd = 8;
|
||||
|
||||
/* Note: For this code to work, the left/right frame borders need to be
|
||||
extended by at least 13 pixels each. By the time we get here, other
|
||||
|
@ -271,8 +272,10 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
|
|||
else if (iy > height - 1)
|
||||
iy = height - 1;
|
||||
tmp[k + 7] = _mm_set1_epi16(
|
||||
(1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
|
||||
1)) +
|
||||
ref[iy * stride] *
|
||||
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
|
||||
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
|
||||
}
|
||||
} else if (ix4 >= width + 6) {
|
||||
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
|
||||
|
@ -282,8 +285,10 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
|
|||
else if (iy > height - 1)
|
||||
iy = height - 1;
|
||||
tmp[k + 7] = _mm_set1_epi16(
|
||||
(1 << (bd + WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS -
|
||||
1)) +
|
||||
ref[iy * stride + (width - 1)] *
|
||||
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
|
||||
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
|
||||
}
|
||||
} else {
|
||||
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
|
||||
|
@ -365,7 +370,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
|
|||
const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff_57);
|
||||
|
||||
const __m128i round_const =
|
||||
_mm_set1_epi16((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);
|
||||
_mm_set1_epi16((1 << (bd + WARPEDPIXEL_FILTER_BITS - 1)) +
|
||||
((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1));
|
||||
|
||||
// Note: res_02 + res_46 and res_13 + res_57 are always in the range
|
||||
// [-6120, 32640]. This gives us enough room to add the rounding
|
||||
|
@ -374,12 +380,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
|
|||
_mm_add_epi16(_mm_add_epi16(res_02, res_46), round_const);
|
||||
const __m128i res_b = _mm_add_epi16(res_13, res_57);
|
||||
|
||||
// Calculate (res_a + res_b) >> 1 while avoiding overflow
|
||||
const __m128i t1 = _mm_and_si128(res_a, res_b);
|
||||
const __m128i t2 = _mm_srai_epi16(_mm_xor_si128(res_a, res_b), 1);
|
||||
|
||||
const __m128i res = _mm_srai_epi16(_mm_add_epi16(t1, t2),
|
||||
HORSHEAR_REDUCE_PREC_BITS - 1);
|
||||
const __m128i res = _mm_srli_epi16(_mm_add_epi16(res_a, res_b),
|
||||
HORSHEAR_REDUCE_PREC_BITS);
|
||||
tmp[k + 7] = res;
|
||||
}
|
||||
}
|
||||
|
@ -471,7 +473,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
|
|||
|
||||
// Round and pack into 8 bits
|
||||
const __m128i round_const =
|
||||
_mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);
|
||||
_mm_set1_epi32(-(1 << (bd + VERSHEAR_REDUCE_PREC_BITS - 1)) +
|
||||
((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1));
|
||||
|
||||
const __m128i res_lo_round = _mm_srai_epi32(
|
||||
_mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
|
||||
|
|
Загрузка…
Ссылка в новой задаче