Extra rounding to let hardware use narrower integers.

Change-Id: I175d6ff03f31a2e0d2fe7cd1c3852210d6e0ddf5
Sean Purser-Haskell 2017-05-03 16:50:07 -07:00 committed by Sean Purser-haskell
Parent f89056aa06
Commit 14b8112b42
5 changed files with 74 additions and 26 deletions
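
The whole change applies one idiom: round a fixed-point value to the nearest multiple of (1 << WARP_PARAM_REDUCE_BITS) so that its low bits become zero; with those bits known to be zero, downstream hardware can, per the commit message, carry the values in narrower integers. A minimal standalone sketch of that idiom follows. The macro bodies below mirror the usual libaom definitions from aom_dsp/aom_dsp_common.h and are reproduced only for illustration; they are not part of this patch.

#include <assert.h>

#define WARP_PARAM_REDUCE_BITS 6

// Round-to-nearest for a power-of-two divisor (assumed libaom-style macros).
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))
#define ROUND_POWER_OF_TWO_SIGNED(value, n) \
  (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
                 : ROUND_POWER_OF_TWO((value), (n)))

// Round v to the nearest multiple of (1 << bits); the low `bits` bits of the
// result are zero, which is what allows narrower integer arithmetic later on.
static int round_to_multiple(int v, int bits) {
  // The patch writes this as `ROUND_POWER_OF_TWO_SIGNED(v, bits) << bits`;
  // a multiply is used here only to keep this standalone example well defined
  // for negative values.
  return ROUND_POWER_OF_TWO_SIGNED(v, bits) * (1 << bits);
}

int main(void) {
  assert(round_to_multiple(100, WARP_PARAM_REDUCE_BITS) == 128);
  assert(round_to_multiple(-100, WARP_PARAM_REDUCE_BITS) == -128);
  assert((round_to_multiple(12345, WARP_PARAM_REDUCE_BITS) & 63) == 0);
  return 0;
}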

View file

@@ -58,6 +58,8 @@ typedef struct mv32 {
// Precision of filter taps
#define WARPEDPIXEL_FILTER_BITS 7
#define WARP_PARAM_REDUCE_BITS 6
// Precision bits reduction after horizontal shear
#define HORSHEAR_REDUCE_PREC_BITS 5
#define VERSHEAR_REDUCE_PREC_BITS \

View file

@@ -779,6 +779,15 @@ int get_shear_params(WarpedMotionParams *wm) {
INT16_MIN, INT16_MAX);
if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
return 0;
wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS)
<< WARP_PARAM_REDUCE_BITS;
wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS)
<< WARP_PARAM_REDUCE_BITS;
wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS)
<< WARP_PARAM_REDUCE_BITS;
wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS)
<< WARP_PARAM_REDUCE_BITS;
return 1;
}
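
With WARP_PARAM_REDUCE_BITS defined as 6 in the warp.h hunk above, this snaps alpha, beta, gamma and delta to the nearest multiple of 64 right after the shear check. For a hypothetical alpha of 100 (not a value taken from the patch):

ROUND_POWER_OF_TWO_SIGNED(100, 6) = (100 + 32) >> 6 = 2
2 << 6 = 128

so wm->alpha ends up at 128 with its low six bits clear.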
@@ -1002,6 +1011,14 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
iy4 = y4 >> WARPEDMODEL_PREC_BITS;
sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
sx4 += alpha * (-4) + beta * (-4);
sy4 += gamma * (-4) + delta * (-4);
sx4 = ROUND_POWER_OF_TWO_SIGNED(sx4, WARP_PARAM_REDUCE_BITS)
<< WARP_PARAM_REDUCE_BITS;
sy4 = ROUND_POWER_OF_TWO_SIGNED(sy4, WARP_PARAM_REDUCE_BITS)
<< WARP_PARAM_REDUCE_BITS;
// Horizontal filter
for (k = -7; k < 8; ++k) {
int iy = iy4 + k;
@@ -1023,7 +1040,7 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS));
}
} else {
int sx = sx4 + alpha * (-4) + beta * k;
int sx = sx4 + beta * (k + 4);
for (l = -4; l < 4; ++l) {
int ix = ix4 + l - 3;
@@ -1048,8 +1065,8 @@ void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
// Vertical filter
for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
int sy = sy4 + gamma * (-4) + delta * k;
for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
int sy = sy4 + delta * (k + 4);
for (l = -4; l < 4; ++l) {
uint16_t *p =
&pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
@@ -1245,6 +1262,14 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
iy4 = y4 >> WARPEDMODEL_PREC_BITS;
sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
sx4 += alpha * (-4) + beta * (-4);
sy4 += gamma * (-4) + delta * (-4);
sx4 = ROUND_POWER_OF_TWO_SIGNED(sx4, WARP_PARAM_REDUCE_BITS)
<< WARP_PARAM_REDUCE_BITS;
sy4 = ROUND_POWER_OF_TWO_SIGNED(sy4, WARP_PARAM_REDUCE_BITS)
<< WARP_PARAM_REDUCE_BITS;
// Horizontal filter
for (k = -7; k < 8; ++k) {
int iy = iy4 + k;
@@ -1281,7 +1306,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
// ix4 + 3 + 7 - 3 = ix4 + 7 <= width + 12
// So, assuming that border extension has been done, we
// don't need to explicitly clamp values.
int sx = sx4 + alpha * (-4) + beta * k;
int sx = sx4 + alpha * (4 - 4) + beta * (k + 4);
for (l = -4; l < 4; ++l) {
int ix = ix4 + l - 3;
@@ -1303,7 +1328,7 @@ void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
// Vertical filter
for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
int sy = sy4 + gamma * (-4) + delta * k;
int sy = sy4 + gamma * (4 - 4) + delta * (k + 4);
for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
uint8_t *p =
&pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
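
In both C warp functions the loop-index changes are pure reindexing on top of the new rounding: the alpha * (-4) + beta * (-4) and gamma * (-4) + delta * (-4) terms are folded into the per-block bases sx4 and sy4 once, and the per-row/per-column expressions are shifted by 4 to compensate. Ignoring the intermediate rounding of sx4:

old: sx = sx4 + alpha * (-4) + beta * k
new: sx4' = sx4 + alpha * (-4) + beta * (-4)
     sx   = sx4' + beta * (k + 4)
          = sx4 + alpha * (-4) + beta * (-4) + beta * k + beta * 4
          = sx4 + alpha * (-4) + beta * k

The same algebra holds for sy with gamma and delta.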

View file

@@ -68,6 +68,14 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
iy4 = y4 >> WARPEDMODEL_PREC_BITS;
sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
sx4 += alpha * (-4) + beta * (-4);
sy4 += gamma * (-4) + delta * (-4);
sx4 = ROUND_POWER_OF_TWO_SIGNED(sx4, WARP_PARAM_REDUCE_BITS)
<< WARP_PARAM_REDUCE_BITS;
sy4 = ROUND_POWER_OF_TWO_SIGNED(sy4, WARP_PARAM_REDUCE_BITS)
<< WARP_PARAM_REDUCE_BITS;
// Horizontal filter
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
int iy = iy4 + k;
@@ -88,10 +96,10 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
ref[iy * stride + (width - 1)] *
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
} else {
const int sx = sx4 + alpha * (-4) + beta * k +
// Include rounding and offset here
(1 << (WARPEDDIFF_PREC_BITS - 1)) +
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
int sx = sx4 + beta * (k + 4) +
// Include rounding and offset here
(1 << (WARPEDDIFF_PREC_BITS - 1)) +
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
// Load source pixels
const __m128i src =
@@ -195,9 +203,8 @@ void av1_highbd_warp_affine_ssse3(const int32_t *mat, const uint16_t *ref,
// Vertical filter
for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
const int sy = sy4 + gamma * (-4) + delta * k +
(1 << (WARPEDDIFF_PREC_BITS - 1)) +
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
int sy = sy4 + delta * (k + 4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
// Load from tmp and rearrange pairs of consecutive rows into the
// column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
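
As the existing "Include rounding and offset here" comment says, the SIMD paths also pre-add the round-to-nearest bias and the filter-table offset into sx and sy, so each tap's filter index becomes a single arithmetic shift. A hedged illustration of why that folding is valid (the names bias and offset are descriptive, not from the patch):

// bias   = (1 << (WARPEDDIFF_PREC_BITS - 1))
// offset = (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS)
// Adding a multiple of 2^WARPEDDIFF_PREC_BITS before an arithmetic right
// shift simply adds that multiple's quotient afterwards, so:
//   (sx + bias + offset) >> WARPEDDIFF_PREC_BITS
//     == ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS
// which matches the index the scalar code builds with a separate rounding
// step and offset add.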

View file

@@ -63,6 +63,14 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
iy4 = y4 >> WARPEDMODEL_PREC_BITS;
sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
sx4 += alpha * (-4) + beta * (-4);
sy4 += gamma * (-4) + delta * (-4);
sx4 = ROUND_POWER_OF_TWO_SIGNED(sx4, WARP_PARAM_REDUCE_BITS)
<< WARP_PARAM_REDUCE_BITS;
sy4 = ROUND_POWER_OF_TWO_SIGNED(sy4, WARP_PARAM_REDUCE_BITS)
<< WARP_PARAM_REDUCE_BITS;
// Horizontal filter
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
int iy = iy4 + k;
@@ -83,10 +91,10 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
ref[iy * stride + (width - 1)] *
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
} else {
const int sx = sx4 + alpha * (-4) + beta * k +
// Include rounding and offset here
(1 << (WARPEDDIFF_PREC_BITS - 1)) +
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
int sx = sx4 + beta * (k + 4) +
// Include rounding and offset here
(1 << (WARPEDDIFF_PREC_BITS - 1)) +
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
// Load source pixels
const __m128i zero = _mm_setzero_si128();
@@ -190,9 +198,8 @@ void av1_warp_affine_sse2(const int32_t *mat, const uint8_t *ref, int width,
// Vertical filter
for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
const int sy = sy4 + gamma * (-4) + delta * k +
(1 << (WARPEDDIFF_PREC_BITS - 1)) +
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
int sy = sy4 + delta * (k + 4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
// Load from tmp and rearrange pairs of consecutive rows into the
// column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7

View file

@@ -250,6 +250,14 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
iy4 = y4 >> WARPEDMODEL_PREC_BITS;
sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
sx4 += alpha * (-4) + beta * (-4);
sy4 += gamma * (-4) + delta * (-4);
sx4 = ROUND_POWER_OF_TWO_SIGNED(sx4, WARP_PARAM_REDUCE_BITS)
<< WARP_PARAM_REDUCE_BITS;
sy4 = ROUND_POWER_OF_TWO_SIGNED(sy4, WARP_PARAM_REDUCE_BITS)
<< WARP_PARAM_REDUCE_BITS;
// Horizontal filter
for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
int iy = iy4 + k;
@@ -270,10 +278,10 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
ref[iy * stride + (width - 1)] *
(1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
} else {
const int sx = sx4 + alpha * (-4) + beta * k +
// Include rounding and offset here
(1 << (WARPEDDIFF_PREC_BITS - 1)) +
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
int sx = sx4 + beta * (k + 4) +
// Include rounding and offset here
(1 << (WARPEDDIFF_PREC_BITS - 1)) +
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
// Load source pixels
const __m128i src =
@@ -367,9 +375,8 @@ void av1_warp_affine_ssse3(const int32_t *mat, const uint8_t *ref, int width,
// Vertical filter
for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
const int sy = sy4 + gamma * (-4) + delta * k +
(1 << (WARPEDDIFF_PREC_BITS - 1)) +
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
int sy = sy4 + delta * (k + 4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
(WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
// Load from tmp and rearrange pairs of consecutive rows into the
// column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
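
Taken together, the extra rounding is what backs the narrower-integer claim in the commit message: after get_shear_params and the per-block rounding above, sx4, sy4, alpha, beta, gamma and delta all have their low WARP_PARAM_REDUCE_BITS bits cleared, so every plain sx/sy offset built from them keeps that property. A small, hypothetical helper (not part of the patch) states the invariant:

// True when the low WARP_PARAM_REDUCE_BITS bits of v are zero.
static int warp_param_reduced(int v) {
  return (v & ((1 << WARP_PARAM_REDUCE_BITS) - 1)) == 0;
}
// If warp_param_reduced() holds for sx4, alpha and beta, it also holds for
// sx4 + alpha * l + beta * k for any integers l and k, because sums and
// integer multiples of values with zeroed low bits keep those bits zeroed.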