Improve rectangular transform accuracy
By adjusting the internal scaling and rounding in the transforms, we can reduce the maximum round-trip errors to:
* 8x16 and 16x8: 0 pixel values (i.e., transforms are exact)
* 16x32: 1 pixel value
* 32x16: 2 pixel values

Change-Id: I0ba691a8d27042dcf1dd5ae81568d07a92d68781
This commit is contained in:
Родитель
29c61068d7
Коммит
838a53d623
|
@ -1337,7 +1337,9 @@ void av1_fht8x16_c(const int16_t *input, tran_low_t *output, int stride,
|
||||||
for (i = 0; i < n2; ++i) {
|
for (i = 0; i < n2; ++i) {
|
||||||
for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n];
|
for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n];
|
||||||
ht.rows(temp_in, temp_out);
|
ht.rows(temp_in, temp_out);
|
||||||
for (j = 0; j < n; ++j) output[j + i * n] = temp_out[j] >> 2;
|
for (j = 0; j < n; ++j)
|
||||||
|
output[j + i * n] =
|
||||||
|
saturate_int16(temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
|
||||||
}
|
}
|
||||||
// Note: overall scale factor of transform is 8 times unitary
|
// Note: overall scale factor of transform is 8 times unitary
|
||||||
}
|
}
|
||||||
|
@ -1388,7 +1390,9 @@ void av1_fht16x8_c(const int16_t *input, tran_low_t *output, int stride,
|
||||||
for (i = 0; i < n; ++i) {
|
for (i = 0; i < n; ++i) {
|
||||||
for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
|
for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
|
||||||
ht.rows(temp_in, temp_out);
|
ht.rows(temp_in, temp_out);
|
||||||
for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j] >> 2;
|
for (j = 0; j < n2; ++j)
|
||||||
|
output[j + i * n2] =
|
||||||
|
saturate_int16(temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
|
||||||
}
|
}
|
||||||
// Note: overall scale factor of transform is 8 times unitary
|
// Note: overall scale factor of transform is 8 times unitary
|
||||||
}
|
}
|
||||||
|
@ -1429,16 +1433,20 @@ void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,
|
||||||
// Columns
|
// Columns
|
||||||
for (i = 0; i < n; ++i) {
|
for (i = 0; i < n; ++i) {
|
||||||
for (j = 0; j < n2; ++j)
|
for (j = 0; j < n2; ++j)
|
||||||
temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * Sqrt2);
|
temp_in[j] =
|
||||||
|
(tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
|
||||||
ht.cols(temp_in, temp_out);
|
ht.cols(temp_in, temp_out);
|
||||||
for (j = 0; j < n2; ++j) out[j * n + i] = temp_out[j];
|
for (j = 0; j < n2; ++j)
|
||||||
|
out[j * n + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Rows
|
// Rows
|
||||||
for (i = 0; i < n2; ++i) {
|
for (i = 0; i < n2; ++i) {
|
||||||
for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n];
|
for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n];
|
||||||
ht.rows(temp_in, temp_out);
|
ht.rows(temp_in, temp_out);
|
||||||
for (j = 0; j < n; ++j) output[j + i * n] = temp_out[j] >> 2;
|
for (j = 0; j < n; ++j)
|
||||||
|
output[j + i * n] =
|
||||||
|
saturate_int16(temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
|
||||||
}
|
}
|
||||||
// Note: overall scale factor of transform is 4 times unitary
|
// Note: overall scale factor of transform is 4 times unitary
|
||||||
}
|
}
|
||||||
|
@ -1479,16 +1487,20 @@ void av1_fht32x16_c(const int16_t *input, tran_low_t *output, int stride,
|
||||||
// Columns
|
// Columns
|
||||||
for (i = 0; i < n2; ++i) {
|
for (i = 0; i < n2; ++i) {
|
||||||
for (j = 0; j < n; ++j)
|
for (j = 0; j < n; ++j)
|
||||||
temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * Sqrt2);
|
temp_in[j] =
|
||||||
|
(tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
|
||||||
ht.cols(temp_in, temp_out);
|
ht.cols(temp_in, temp_out);
|
||||||
for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
|
for (j = 0; j < n; ++j)
|
||||||
|
out[j * n2 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Rows
|
// Rows
|
||||||
for (i = 0; i < n; ++i) {
|
for (i = 0; i < n; ++i) {
|
||||||
for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
|
for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
|
||||||
ht.rows(temp_in, temp_out);
|
ht.rows(temp_in, temp_out);
|
||||||
for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j] >> 2;
|
for (j = 0; j < n2; ++j)
|
||||||
|
output[j + i * n2] =
|
||||||
|
saturate_int16(temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
|
||||||
}
|
}
|
||||||
// Note: overall scale factor of transform is 4 times unitary
|
// Note: overall scale factor of transform is 4 times unitary
|
||||||
}
|
}
|
||||||
|
|
|
@ -796,14 +796,14 @@ static INLINE void right_shift_8x8(__m128i *res, const int bit) {
|
||||||
|
|
||||||
if (bit == 2) {
|
if (bit == 2) {
|
||||||
const __m128i const_rounding = _mm_set1_epi16(1);
|
const __m128i const_rounding = _mm_set1_epi16(1);
|
||||||
res[0] = _mm_add_epi16(res[0], const_rounding);
|
res[0] = _mm_adds_epi16(res[0], const_rounding);
|
||||||
res[1] = _mm_add_epi16(res[1], const_rounding);
|
res[1] = _mm_adds_epi16(res[1], const_rounding);
|
||||||
res[2] = _mm_add_epi16(res[2], const_rounding);
|
res[2] = _mm_adds_epi16(res[2], const_rounding);
|
||||||
res[3] = _mm_add_epi16(res[3], const_rounding);
|
res[3] = _mm_adds_epi16(res[3], const_rounding);
|
||||||
res[4] = _mm_add_epi16(res[4], const_rounding);
|
res[4] = _mm_adds_epi16(res[4], const_rounding);
|
||||||
res[5] = _mm_add_epi16(res[5], const_rounding);
|
res[5] = _mm_adds_epi16(res[5], const_rounding);
|
||||||
res[6] = _mm_add_epi16(res[6], const_rounding);
|
res[6] = _mm_adds_epi16(res[6], const_rounding);
|
||||||
res[7] = _mm_add_epi16(res[7], const_rounding);
|
res[7] = _mm_adds_epi16(res[7], const_rounding);
|
||||||
}
|
}
|
||||||
|
|
||||||
res[0] = _mm_sub_epi16(res[0], sign0);
|
res[0] = _mm_sub_epi16(res[0], sign0);
|
||||||
|
@ -3140,14 +3140,6 @@ static INLINE void load_buffer_8x16(const int16_t *input, __m128i *in,
|
||||||
scale_sqrt2_8x8_signed(in + 8);
|
scale_sqrt2_8x8_signed(in + 8);
|
||||||
}
|
}
|
||||||
|
|
||||||
static INLINE void right_shift(__m128i *in, int size, int bit) {
|
|
||||||
int i = 0;
|
|
||||||
while (i < size) {
|
|
||||||
in[i] = _mm_srai_epi16(in[i], bit);
|
|
||||||
i += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
|
void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
|
||||||
int tx_type) {
|
int tx_type) {
|
||||||
__m128i in[16];
|
__m128i in[16];
|
||||||
|
@ -3288,8 +3280,8 @@ void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
|
||||||
#endif
|
#endif
|
||||||
default: assert(0); break;
|
default: assert(0); break;
|
||||||
}
|
}
|
||||||
right_shift(t, 8, 2);
|
right_shift_8x8(t, 2);
|
||||||
right_shift(b, 8, 2);
|
right_shift_8x8(b, 2);
|
||||||
write_buffer_8x8(output, t, 8);
|
write_buffer_8x8(output, t, 8);
|
||||||
write_buffer_8x8(output + 64, b, 8);
|
write_buffer_8x8(output + 64, b, 8);
|
||||||
}
|
}
|
||||||
|
@ -3424,8 +3416,8 @@ void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride,
|
||||||
}
|
}
|
||||||
array_transpose_8x8(l, l);
|
array_transpose_8x8(l, l);
|
||||||
array_transpose_8x8(r, r);
|
array_transpose_8x8(r, r);
|
||||||
right_shift(l, 8, 2);
|
right_shift_8x8(l, 2);
|
||||||
right_shift(r, 8, 2);
|
right_shift_8x8(r, 2);
|
||||||
write_buffer_8x8(output, l, 16);
|
write_buffer_8x8(output, l, 16);
|
||||||
write_buffer_8x8(output + 8, r, 16);
|
write_buffer_8x8(output + 8, r, 16);
|
||||||
}
|
}
|
||||||
|
@ -3496,12 +3488,14 @@ static INLINE void load_buffer_16x32(const int16_t *input, __m128i *intl,
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0; i < 16; ++i) {
|
for (i = 0; i < 16; ++i) {
|
||||||
intl[i + 0] = _mm_load_si128((const __m128i *)(input + i * stride + 0));
|
intl[i] = _mm_slli_epi16(
|
||||||
intr[i + 0] = _mm_load_si128((const __m128i *)(input + i * stride + 8));
|
_mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
|
||||||
inbl[i + 0] =
|
intr[i] = _mm_slli_epi16(
|
||||||
_mm_load_si128((const __m128i *)(input + (i + 16) * stride + 0));
|
_mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
|
||||||
inbr[i + 0] =
|
inbl[i] = _mm_slli_epi16(
|
||||||
_mm_load_si128((const __m128i *)(input + (i + 16) * stride + 8));
|
_mm_load_si128((const __m128i *)(input + (i + 16) * stride + 0)), 2);
|
||||||
|
inbr[i] = _mm_slli_epi16(
|
||||||
|
_mm_load_si128((const __m128i *)(input + (i + 16) * stride + 8)), 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fliplr) {
|
if (fliplr) {
|
||||||
|
@ -3526,10 +3520,8 @@ static INLINE void write_buffer_16x32(tran_low_t *output, __m128i *restl,
|
||||||
__m128i *restr, __m128i *resbl,
|
__m128i *restr, __m128i *resbl,
|
||||||
__m128i *resbr) {
|
__m128i *resbr) {
|
||||||
int i;
|
int i;
|
||||||
right_shift(restl, 16, 2);
|
right_shift_16x16(restl, restr);
|
||||||
right_shift(restr, 16, 2);
|
right_shift_16x16(resbl, resbr);
|
||||||
right_shift(resbl, 16, 2);
|
|
||||||
right_shift(resbr, 16, 2);
|
|
||||||
for (i = 0; i < 16; ++i) {
|
for (i = 0; i < 16; ++i) {
|
||||||
store_output(&restl[i], output + i * 16 + 0);
|
store_output(&restl[i], output + i * 16 + 0);
|
||||||
store_output(&restr[i], output + i * 16 + 8);
|
store_output(&restr[i], output + i * 16 + 8);
|
||||||
|
@ -3551,24 +3543,32 @@ void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride,
|
||||||
case DCT_DCT:
|
case DCT_DCT:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
||||||
fdct32_16col(intl, intr, inbl, inbr);
|
fdct32_16col(intl, intr, inbl, inbr);
|
||||||
|
right_shift_16x16(intl, intr);
|
||||||
|
right_shift_16x16(inbl, inbr);
|
||||||
fdct16_sse2(intl, intr);
|
fdct16_sse2(intl, intr);
|
||||||
fdct16_sse2(inbl, inbr);
|
fdct16_sse2(inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case ADST_DCT:
|
case ADST_DCT:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
||||||
fhalfright32_16col(intl, intr, inbl, inbr);
|
fhalfright32_16col(intl, intr, inbl, inbr);
|
||||||
|
right_shift_16x16(intl, intr);
|
||||||
|
right_shift_16x16(inbl, inbr);
|
||||||
fdct16_sse2(intl, intr);
|
fdct16_sse2(intl, intr);
|
||||||
fdct16_sse2(inbl, inbr);
|
fdct16_sse2(inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case DCT_ADST:
|
case DCT_ADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
||||||
fdct32_16col(intl, intr, inbl, inbr);
|
fdct32_16col(intl, intr, inbl, inbr);
|
||||||
|
right_shift_16x16(intl, intr);
|
||||||
|
right_shift_16x16(inbl, inbr);
|
||||||
fadst16_sse2(intl, intr);
|
fadst16_sse2(intl, intr);
|
||||||
fadst16_sse2(inbl, inbr);
|
fadst16_sse2(inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case ADST_ADST:
|
case ADST_ADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
||||||
fhalfright32_16col(intl, intr, inbl, inbr);
|
fhalfright32_16col(intl, intr, inbl, inbr);
|
||||||
|
right_shift_16x16(intl, intr);
|
||||||
|
right_shift_16x16(inbl, inbr);
|
||||||
fadst16_sse2(intl, intr);
|
fadst16_sse2(intl, intr);
|
||||||
fadst16_sse2(inbl, inbr);
|
fadst16_sse2(inbl, inbr);
|
||||||
break;
|
break;
|
||||||
|
@ -3576,72 +3576,96 @@ void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride,
|
||||||
case FLIPADST_DCT:
|
case FLIPADST_DCT:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
|
||||||
fhalfright32_16col(intl, intr, inbl, inbr);
|
fhalfright32_16col(intl, intr, inbl, inbr);
|
||||||
|
right_shift_16x16(intl, intr);
|
||||||
|
right_shift_16x16(inbl, inbr);
|
||||||
fdct16_sse2(intl, intr);
|
fdct16_sse2(intl, intr);
|
||||||
fdct16_sse2(inbl, inbr);
|
fdct16_sse2(inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case DCT_FLIPADST:
|
case DCT_FLIPADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
|
||||||
fdct32_16col(intl, intr, inbl, inbr);
|
fdct32_16col(intl, intr, inbl, inbr);
|
||||||
|
right_shift_16x16(intl, intr);
|
||||||
|
right_shift_16x16(inbl, inbr);
|
||||||
fadst16_sse2(intl, intr);
|
fadst16_sse2(intl, intr);
|
||||||
fadst16_sse2(inbl, inbr);
|
fadst16_sse2(inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case FLIPADST_FLIPADST:
|
case FLIPADST_FLIPADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 1);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 1);
|
||||||
fhalfright32_16col(intl, intr, inbl, inbr);
|
fhalfright32_16col(intl, intr, inbl, inbr);
|
||||||
|
right_shift_16x16(intl, intr);
|
||||||
|
right_shift_16x16(inbl, inbr);
|
||||||
fadst16_sse2(intl, intr);
|
fadst16_sse2(intl, intr);
|
||||||
fadst16_sse2(inbl, inbr);
|
fadst16_sse2(inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case ADST_FLIPADST:
|
case ADST_FLIPADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
|
||||||
fhalfright32_16col(intl, intr, inbl, inbr);
|
fhalfright32_16col(intl, intr, inbl, inbr);
|
||||||
|
right_shift_16x16(intl, intr);
|
||||||
|
right_shift_16x16(inbl, inbr);
|
||||||
fadst16_sse2(intl, intr);
|
fadst16_sse2(intl, intr);
|
||||||
fadst16_sse2(inbl, inbr);
|
fadst16_sse2(inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case FLIPADST_ADST:
|
case FLIPADST_ADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
|
||||||
fhalfright32_16col(intl, intr, inbl, inbr);
|
fhalfright32_16col(intl, intr, inbl, inbr);
|
||||||
|
right_shift_16x16(intl, intr);
|
||||||
|
right_shift_16x16(inbl, inbr);
|
||||||
fadst16_sse2(intl, intr);
|
fadst16_sse2(intl, intr);
|
||||||
fadst16_sse2(inbl, inbr);
|
fadst16_sse2(inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case IDTX:
|
case IDTX:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
||||||
fidtx32_16col(intl, intr, inbl, inbr);
|
fidtx32_16col(intl, intr, inbl, inbr);
|
||||||
|
right_shift_16x16(intl, intr);
|
||||||
|
right_shift_16x16(inbl, inbr);
|
||||||
fidtx16_sse2(intl, intr);
|
fidtx16_sse2(intl, intr);
|
||||||
fidtx16_sse2(inbl, inbr);
|
fidtx16_sse2(inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case V_DCT:
|
case V_DCT:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
||||||
fdct32_16col(intl, intr, inbl, inbr);
|
fdct32_16col(intl, intr, inbl, inbr);
|
||||||
|
right_shift_16x16(intl, intr);
|
||||||
|
right_shift_16x16(inbl, inbr);
|
||||||
fidtx16_sse2(intl, intr);
|
fidtx16_sse2(intl, intr);
|
||||||
fidtx16_sse2(inbl, inbr);
|
fidtx16_sse2(inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case H_DCT:
|
case H_DCT:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
||||||
fidtx32_16col(intl, intr, inbl, inbr);
|
fidtx32_16col(intl, intr, inbl, inbr);
|
||||||
|
right_shift_16x16(intl, intr);
|
||||||
|
right_shift_16x16(inbl, inbr);
|
||||||
fdct16_sse2(intl, intr);
|
fdct16_sse2(intl, intr);
|
||||||
fdct16_sse2(inbl, inbr);
|
fdct16_sse2(inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case V_ADST:
|
case V_ADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
||||||
fhalfright32_16col(intl, intr, inbl, inbr);
|
fhalfright32_16col(intl, intr, inbl, inbr);
|
||||||
|
right_shift_16x16(intl, intr);
|
||||||
|
right_shift_16x16(inbl, inbr);
|
||||||
fidtx16_sse2(intl, intr);
|
fidtx16_sse2(intl, intr);
|
||||||
fidtx16_sse2(inbl, inbr);
|
fidtx16_sse2(inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case H_ADST:
|
case H_ADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
||||||
fidtx32_16col(intl, intr, inbl, inbr);
|
fidtx32_16col(intl, intr, inbl, inbr);
|
||||||
|
right_shift_16x16(intl, intr);
|
||||||
|
right_shift_16x16(inbl, inbr);
|
||||||
fadst16_sse2(intl, intr);
|
fadst16_sse2(intl, intr);
|
||||||
fadst16_sse2(inbl, inbr);
|
fadst16_sse2(inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case V_FLIPADST:
|
case V_FLIPADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
|
||||||
fhalfright32_16col(intl, intr, inbl, inbr);
|
fhalfright32_16col(intl, intr, inbl, inbr);
|
||||||
|
right_shift_16x16(intl, intr);
|
||||||
|
right_shift_16x16(inbl, inbr);
|
||||||
fidtx16_sse2(intl, intr);
|
fidtx16_sse2(intl, intr);
|
||||||
fidtx16_sse2(inbl, inbr);
|
fidtx16_sse2(inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case H_FLIPADST:
|
case H_FLIPADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
|
||||||
fidtx32_16col(intl, intr, inbl, inbr);
|
fidtx32_16col(intl, intr, inbl, inbr);
|
||||||
|
right_shift_16x16(intl, intr);
|
||||||
|
right_shift_16x16(inbl, inbr);
|
||||||
fadst16_sse2(intl, intr);
|
fadst16_sse2(intl, intr);
|
||||||
fadst16_sse2(inbl, inbr);
|
fadst16_sse2(inbl, inbr);
|
||||||
break;
|
break;
|
||||||
|
@ -3661,10 +3685,14 @@ static INLINE void load_buffer_32x16(const int16_t *input, __m128i *in0,
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0; i < 16; ++i) {
|
for (i = 0; i < 16; ++i) {
|
||||||
in0[i] = _mm_load_si128((const __m128i *)(input + i * stride + 0));
|
in0[i] = _mm_slli_epi16(
|
||||||
in1[i] = _mm_load_si128((const __m128i *)(input + i * stride + 8));
|
_mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
|
||||||
in2[i] = _mm_load_si128((const __m128i *)(input + i * stride + 16));
|
in1[i] = _mm_slli_epi16(
|
||||||
in3[i] = _mm_load_si128((const __m128i *)(input + i * stride + 24));
|
_mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
|
||||||
|
in2[i] = _mm_slli_epi16(
|
||||||
|
_mm_load_si128((const __m128i *)(input + i * stride + 16)), 2);
|
||||||
|
in3[i] = _mm_slli_epi16(
|
||||||
|
_mm_load_si128((const __m128i *)(input + i * stride + 24)), 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fliplr) {
|
if (fliplr) {
|
||||||
|
@ -3688,10 +3716,8 @@ static INLINE void write_buffer_32x16(tran_low_t *output, __m128i *res0,
|
||||||
__m128i *res1, __m128i *res2,
|
__m128i *res1, __m128i *res2,
|
||||||
__m128i *res3) {
|
__m128i *res3) {
|
||||||
int i;
|
int i;
|
||||||
right_shift(res0, 16, 2);
|
right_shift_16x16(res0, res1);
|
||||||
right_shift(res1, 16, 2);
|
right_shift_16x16(res2, res3);
|
||||||
right_shift(res2, 16, 2);
|
|
||||||
right_shift(res3, 16, 2);
|
|
||||||
for (i = 0; i < 16; ++i) {
|
for (i = 0; i < 16; ++i) {
|
||||||
store_output(&res0[i], output + i * 32 + 0);
|
store_output(&res0[i], output + i * 32 + 0);
|
||||||
store_output(&res1[i], output + i * 32 + 8);
|
store_output(&res1[i], output + i * 32 + 8);
|
||||||
|
@ -3709,21 +3735,29 @@ void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride,
|
||||||
case DCT_DCT:
|
case DCT_DCT:
|
||||||
fdct16_sse2(in0, in1);
|
fdct16_sse2(in0, in1);
|
||||||
fdct16_sse2(in2, in3);
|
fdct16_sse2(in2, in3);
|
||||||
|
right_shift_16x16(in0, in1);
|
||||||
|
right_shift_16x16(in2, in3);
|
||||||
fdct32_16col(in0, in1, in2, in3);
|
fdct32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case ADST_DCT:
|
case ADST_DCT:
|
||||||
fadst16_sse2(in0, in1);
|
fadst16_sse2(in0, in1);
|
||||||
fadst16_sse2(in2, in3);
|
fadst16_sse2(in2, in3);
|
||||||
|
right_shift_16x16(in0, in1);
|
||||||
|
right_shift_16x16(in2, in3);
|
||||||
fdct32_16col(in0, in1, in2, in3);
|
fdct32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case DCT_ADST:
|
case DCT_ADST:
|
||||||
fdct16_sse2(in0, in1);
|
fdct16_sse2(in0, in1);
|
||||||
fdct16_sse2(in2, in3);
|
fdct16_sse2(in2, in3);
|
||||||
|
right_shift_16x16(in0, in1);
|
||||||
|
right_shift_16x16(in2, in3);
|
||||||
fhalfright32_16col(in0, in1, in2, in3);
|
fhalfright32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case ADST_ADST:
|
case ADST_ADST:
|
||||||
fadst16_sse2(in0, in1);
|
fadst16_sse2(in0, in1);
|
||||||
fadst16_sse2(in2, in3);
|
fadst16_sse2(in2, in3);
|
||||||
|
right_shift_16x16(in0, in1);
|
||||||
|
right_shift_16x16(in2, in3);
|
||||||
fhalfright32_16col(in0, in1, in2, in3);
|
fhalfright32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
#if CONFIG_EXT_TX
|
#if CONFIG_EXT_TX
|
||||||
|
@ -3731,72 +3765,96 @@ void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride,
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
|
||||||
fadst16_sse2(in0, in1);
|
fadst16_sse2(in0, in1);
|
||||||
fadst16_sse2(in2, in3);
|
fadst16_sse2(in2, in3);
|
||||||
|
right_shift_16x16(in0, in1);
|
||||||
|
right_shift_16x16(in2, in3);
|
||||||
fdct32_16col(in0, in1, in2, in3);
|
fdct32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case DCT_FLIPADST:
|
case DCT_FLIPADST:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
|
||||||
fdct16_sse2(in0, in1);
|
fdct16_sse2(in0, in1);
|
||||||
fdct16_sse2(in2, in3);
|
fdct16_sse2(in2, in3);
|
||||||
|
right_shift_16x16(in0, in1);
|
||||||
|
right_shift_16x16(in2, in3);
|
||||||
fhalfright32_16col(in0, in1, in2, in3);
|
fhalfright32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case FLIPADST_FLIPADST:
|
case FLIPADST_FLIPADST:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 1);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 1);
|
||||||
fadst16_sse2(in0, in1);
|
fadst16_sse2(in0, in1);
|
||||||
fadst16_sse2(in2, in3);
|
fadst16_sse2(in2, in3);
|
||||||
|
right_shift_16x16(in0, in1);
|
||||||
|
right_shift_16x16(in2, in3);
|
||||||
fhalfright32_16col(in0, in1, in2, in3);
|
fhalfright32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case ADST_FLIPADST:
|
case ADST_FLIPADST:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
|
||||||
fadst16_sse2(in0, in1);
|
fadst16_sse2(in0, in1);
|
||||||
fadst16_sse2(in2, in3);
|
fadst16_sse2(in2, in3);
|
||||||
|
right_shift_16x16(in0, in1);
|
||||||
|
right_shift_16x16(in2, in3);
|
||||||
fhalfright32_16col(in0, in1, in2, in3);
|
fhalfright32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case FLIPADST_ADST:
|
case FLIPADST_ADST:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
|
||||||
fadst16_sse2(in0, in1);
|
fadst16_sse2(in0, in1);
|
||||||
fadst16_sse2(in2, in3);
|
fadst16_sse2(in2, in3);
|
||||||
|
right_shift_16x16(in0, in1);
|
||||||
|
right_shift_16x16(in2, in3);
|
||||||
fhalfright32_16col(in0, in1, in2, in3);
|
fhalfright32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case IDTX:
|
case IDTX:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
||||||
fidtx16_sse2(in0, in1);
|
fidtx16_sse2(in0, in1);
|
||||||
fidtx16_sse2(in2, in3);
|
fidtx16_sse2(in2, in3);
|
||||||
|
right_shift_16x16(in0, in1);
|
||||||
|
right_shift_16x16(in2, in3);
|
||||||
fidtx32_16col(in0, in1, in2, in3);
|
fidtx32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case V_DCT:
|
case V_DCT:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
||||||
fdct16_sse2(in0, in1);
|
fdct16_sse2(in0, in1);
|
||||||
fdct16_sse2(in2, in3);
|
fdct16_sse2(in2, in3);
|
||||||
|
right_shift_16x16(in0, in1);
|
||||||
|
right_shift_16x16(in2, in3);
|
||||||
fidtx32_16col(in0, in1, in2, in3);
|
fidtx32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case H_DCT:
|
case H_DCT:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
||||||
fidtx16_sse2(in0, in1);
|
fidtx16_sse2(in0, in1);
|
||||||
fidtx16_sse2(in2, in3);
|
fidtx16_sse2(in2, in3);
|
||||||
|
right_shift_16x16(in0, in1);
|
||||||
|
right_shift_16x16(in2, in3);
|
||||||
fdct32_16col(in0, in1, in2, in3);
|
fdct32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case V_ADST:
|
case V_ADST:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
||||||
fadst16_sse2(in0, in1);
|
fadst16_sse2(in0, in1);
|
||||||
fadst16_sse2(in2, in3);
|
fadst16_sse2(in2, in3);
|
||||||
|
right_shift_16x16(in0, in1);
|
||||||
|
right_shift_16x16(in2, in3);
|
||||||
fidtx32_16col(in0, in1, in2, in3);
|
fidtx32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case H_ADST:
|
case H_ADST:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
||||||
fidtx16_sse2(in0, in1);
|
fidtx16_sse2(in0, in1);
|
||||||
fidtx16_sse2(in2, in3);
|
fidtx16_sse2(in2, in3);
|
||||||
|
right_shift_16x16(in0, in1);
|
||||||
|
right_shift_16x16(in2, in3);
|
||||||
fhalfright32_16col(in0, in1, in2, in3);
|
fhalfright32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case V_FLIPADST:
|
case V_FLIPADST:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
|
||||||
fadst16_sse2(in0, in1);
|
fadst16_sse2(in0, in1);
|
||||||
fadst16_sse2(in2, in3);
|
fadst16_sse2(in2, in3);
|
||||||
|
right_shift_16x16(in0, in1);
|
||||||
|
right_shift_16x16(in2, in3);
|
||||||
fidtx32_16col(in0, in1, in2, in3);
|
fidtx32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case H_FLIPADST:
|
case H_FLIPADST:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
|
||||||
fidtx16_sse2(in0, in1);
|
fidtx16_sse2(in0, in1);
|
||||||
fidtx16_sse2(in2, in3);
|
fidtx16_sse2(in2, in3);
|
||||||
|
right_shift_16x16(in0, in1);
|
||||||
|
right_shift_16x16(in2, in3);
|
||||||
fhalfright32_16col(in0, in1, in2, in3);
|
fhalfright32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -69,11 +69,11 @@ class AV1Trans16x32HT : public libaom_test::TransformTestBase,
|
||||||
IhtFunc inv_txfm_;
|
IhtFunc inv_txfm_;
|
||||||
};
|
};
|
||||||
|
|
||||||
TEST_P(AV1Trans16x32HT, AccuracyCheck) { RunAccuracyCheck(48); }
|
TEST_P(AV1Trans16x32HT, AccuracyCheck) { RunAccuracyCheck(1); }
|
||||||
TEST_P(AV1Trans16x32HT, CoeffCheck) { RunCoeffCheck(); }
|
TEST_P(AV1Trans16x32HT, CoeffCheck) { RunCoeffCheck(); }
|
||||||
TEST_P(AV1Trans16x32HT, MemCheck) { RunMemCheck(); }
|
TEST_P(AV1Trans16x32HT, MemCheck) { RunMemCheck(); }
|
||||||
TEST_P(AV1Trans16x32HT, InvCoeffCheck) { RunInvCoeffCheck(); }
|
TEST_P(AV1Trans16x32HT, InvCoeffCheck) { RunInvCoeffCheck(); }
|
||||||
TEST_P(AV1Trans16x32HT, InvAccuracyCheck) { RunInvAccuracyCheck(9); }
|
TEST_P(AV1Trans16x32HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
|
||||||
|
|
||||||
using std::tr1::make_tuple;
|
using std::tr1::make_tuple;
|
||||||
const Ht16x32Param kArrayHt16x32Param_c[] = {
|
const Ht16x32Param kArrayHt16x32Param_c[] = {
|
||||||
|
|
|
@ -69,11 +69,11 @@ class AV1Trans16x8HT : public libaom_test::TransformTestBase,
|
||||||
IhtFunc inv_txfm_;
|
IhtFunc inv_txfm_;
|
||||||
};
|
};
|
||||||
|
|
||||||
TEST_P(AV1Trans16x8HT, AccuracyCheck) { RunAccuracyCheck(1); }
|
TEST_P(AV1Trans16x8HT, AccuracyCheck) { RunAccuracyCheck(0); }
|
||||||
TEST_P(AV1Trans16x8HT, CoeffCheck) { RunCoeffCheck(); }
|
TEST_P(AV1Trans16x8HT, CoeffCheck) { RunCoeffCheck(); }
|
||||||
TEST_P(AV1Trans16x8HT, MemCheck) { RunMemCheck(); }
|
TEST_P(AV1Trans16x8HT, MemCheck) { RunMemCheck(); }
|
||||||
TEST_P(AV1Trans16x8HT, InvCoeffCheck) { RunInvCoeffCheck(); }
|
TEST_P(AV1Trans16x8HT, InvCoeffCheck) { RunInvCoeffCheck(); }
|
||||||
TEST_P(AV1Trans16x8HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
|
TEST_P(AV1Trans16x8HT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
|
||||||
|
|
||||||
using std::tr1::make_tuple;
|
using std::tr1::make_tuple;
|
||||||
|
|
||||||
|
|
|
@ -70,10 +70,10 @@ class AV1Trans32x16HT : public libaom_test::TransformTestBase,
|
||||||
};
|
};
|
||||||
|
|
||||||
TEST_P(AV1Trans32x16HT, MemCheck) { RunMemCheck(); }
|
TEST_P(AV1Trans32x16HT, MemCheck) { RunMemCheck(); }
|
||||||
TEST_P(AV1Trans32x16HT, AccuracyCheck) { RunAccuracyCheck(43); }
|
TEST_P(AV1Trans32x16HT, AccuracyCheck) { RunAccuracyCheck(2); }
|
||||||
TEST_P(AV1Trans32x16HT, CoeffCheck) { RunCoeffCheck(); }
|
TEST_P(AV1Trans32x16HT, CoeffCheck) { RunCoeffCheck(); }
|
||||||
TEST_P(AV1Trans32x16HT, InvCoeffCheck) { RunInvCoeffCheck(); }
|
TEST_P(AV1Trans32x16HT, InvCoeffCheck) { RunInvCoeffCheck(); }
|
||||||
TEST_P(AV1Trans32x16HT, InvAccuracyCheck) { RunInvAccuracyCheck(9); }
|
TEST_P(AV1Trans32x16HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
|
||||||
|
|
||||||
using std::tr1::make_tuple;
|
using std::tr1::make_tuple;
|
||||||
const Ht32x16Param kArrayHt32x16Param_c[] = {
|
const Ht32x16Param kArrayHt32x16Param_c[] = {
|
||||||
|
|
|
@ -70,10 +70,10 @@ class AV1Trans8x16HT : public libaom_test::TransformTestBase,
|
||||||
};
|
};
|
||||||
|
|
||||||
TEST_P(AV1Trans8x16HT, MemCheck) { RunMemCheck(); }
|
TEST_P(AV1Trans8x16HT, MemCheck) { RunMemCheck(); }
|
||||||
TEST_P(AV1Trans8x16HT, AccuracyCheck) { RunAccuracyCheck(1); }
|
TEST_P(AV1Trans8x16HT, AccuracyCheck) { RunAccuracyCheck(0); }
|
||||||
TEST_P(AV1Trans8x16HT, CoeffCheck) { RunCoeffCheck(); }
|
TEST_P(AV1Trans8x16HT, CoeffCheck) { RunCoeffCheck(); }
|
||||||
TEST_P(AV1Trans8x16HT, InvCoeffCheck) { RunInvCoeffCheck(); }
|
TEST_P(AV1Trans8x16HT, InvCoeffCheck) { RunInvCoeffCheck(); }
|
||||||
TEST_P(AV1Trans8x16HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
|
TEST_P(AV1Trans8x16HT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
|
||||||
|
|
||||||
using std::tr1::make_tuple;
|
using std::tr1::make_tuple;
|
||||||
|
|
||||||
|
|
Загрузка…
Ссылка в новой задаче