Fix 16x32, 32x16 rectangular transform SSE2 to match C
- Turn on SSE2 unit tests Change-Id: I285771b04c0dec0501210fde570b9ac3cb9c4be0
This commit is contained in:
Родитель
ab9ecbabe8
Коммит
3b0b5f17eb
|
@ -92,14 +92,14 @@ static void fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
|
||||||
int diff_stride, TX_TYPE tx_type,
|
int diff_stride, TX_TYPE tx_type,
|
||||||
FWD_TXFM_OPT fwd_txfm_opt) {
|
FWD_TXFM_OPT fwd_txfm_opt) {
|
||||||
(void)fwd_txfm_opt;
|
(void)fwd_txfm_opt;
|
||||||
av1_fht16x32_c(src_diff, coeff, diff_stride, tx_type);
|
av1_fht16x32(src_diff, coeff, diff_stride, tx_type);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
|
static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
|
||||||
int diff_stride, TX_TYPE tx_type,
|
int diff_stride, TX_TYPE tx_type,
|
||||||
FWD_TXFM_OPT fwd_txfm_opt) {
|
FWD_TXFM_OPT fwd_txfm_opt) {
|
||||||
(void)fwd_txfm_opt;
|
(void)fwd_txfm_opt;
|
||||||
av1_fht32x16_c(src_diff, coeff, diff_stride, tx_type);
|
av1_fht32x16(src_diff, coeff, diff_stride, tx_type);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
|
static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
|
||||||
|
|
|
@ -3453,36 +3453,6 @@ static INLINE void fdct32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
|
||||||
array_transpose_16x16(bl, br);
|
array_transpose_16x16(bl, br);
|
||||||
}
|
}
|
||||||
|
|
||||||
static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
|
|
||||||
__m128i *br) {
|
|
||||||
__m128i tmpl[16], tmpr[16];
|
|
||||||
int i;
|
|
||||||
|
|
||||||
// Copy the bottom half of the input to temporary storage
|
|
||||||
for (i = 0; i < 16; ++i) {
|
|
||||||
tmpl[i] = bl[i];
|
|
||||||
tmpr[i] = br[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Generate the bottom half of the output
|
|
||||||
for (i = 0; i < 16; ++i) {
|
|
||||||
bl[i] = _mm_slli_epi16(tl[i], 2);
|
|
||||||
br[i] = _mm_slli_epi16(tr[i], 2);
|
|
||||||
}
|
|
||||||
array_transpose_16x16(bl, br);
|
|
||||||
|
|
||||||
// Copy the temporary storage back to the top half of the input
|
|
||||||
for (i = 0; i < 16; ++i) {
|
|
||||||
tl[i] = tmpl[i];
|
|
||||||
tr[i] = tmpr[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Generate the top half of the output
|
|
||||||
scale_sqrt2_8x16(tl);
|
|
||||||
scale_sqrt2_8x16(tr);
|
|
||||||
fdct16_sse2(tl, tr);
|
|
||||||
}
|
|
||||||
|
|
||||||
#if CONFIG_EXT_TX
|
#if CONFIG_EXT_TX
|
||||||
static INLINE void fidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
|
static INLINE void fidtx32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
|
||||||
__m128i *br) {
|
__m128i *br) {
|
||||||
|
@ -3541,8 +3511,6 @@ static INLINE void write_buffer_16x32(tran_low_t *output, __m128i *restl,
|
||||||
__m128i *restr, __m128i *resbl,
|
__m128i *restr, __m128i *resbl,
|
||||||
__m128i *resbr) {
|
__m128i *resbr) {
|
||||||
int i;
|
int i;
|
||||||
right_shift_16x16(restl, restr);
|
|
||||||
right_shift_16x16(resbl, resbr);
|
|
||||||
for (i = 0; i < 16; ++i) {
|
for (i = 0; i < 16; ++i) {
|
||||||
store_output(&restl[i], output + i * 16 + 0);
|
store_output(&restl[i], output + i * 16 + 0);
|
||||||
store_output(&restr[i], output + i * 16 + 8);
|
store_output(&restr[i], output + i * 16 + 8);
|
||||||
|
@ -3551,6 +3519,104 @@ static INLINE void write_buffer_16x32(tran_low_t *output, __m128i *restl,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static INLINE void round_signed_8x8(__m128i *in, const int bit) {
|
||||||
|
const __m128i rounding = _mm_set1_epi16((1 << bit) >> 1);
|
||||||
|
__m128i sign0 = _mm_srai_epi16(in[0], 15);
|
||||||
|
__m128i sign1 = _mm_srai_epi16(in[1], 15);
|
||||||
|
__m128i sign2 = _mm_srai_epi16(in[2], 15);
|
||||||
|
__m128i sign3 = _mm_srai_epi16(in[3], 15);
|
||||||
|
__m128i sign4 = _mm_srai_epi16(in[4], 15);
|
||||||
|
__m128i sign5 = _mm_srai_epi16(in[5], 15);
|
||||||
|
__m128i sign6 = _mm_srai_epi16(in[6], 15);
|
||||||
|
__m128i sign7 = _mm_srai_epi16(in[7], 15);
|
||||||
|
|
||||||
|
in[0] = _mm_add_epi16(_mm_add_epi16(in[0], rounding), sign0);
|
||||||
|
in[1] = _mm_add_epi16(_mm_add_epi16(in[1], rounding), sign1);
|
||||||
|
in[2] = _mm_add_epi16(_mm_add_epi16(in[2], rounding), sign2);
|
||||||
|
in[3] = _mm_add_epi16(_mm_add_epi16(in[3], rounding), sign3);
|
||||||
|
in[4] = _mm_add_epi16(_mm_add_epi16(in[4], rounding), sign4);
|
||||||
|
in[5] = _mm_add_epi16(_mm_add_epi16(in[5], rounding), sign5);
|
||||||
|
in[6] = _mm_add_epi16(_mm_add_epi16(in[6], rounding), sign6);
|
||||||
|
in[7] = _mm_add_epi16(_mm_add_epi16(in[7], rounding), sign7);
|
||||||
|
|
||||||
|
in[0] = _mm_srai_epi16(in[0], bit);
|
||||||
|
in[1] = _mm_srai_epi16(in[1], bit);
|
||||||
|
in[2] = _mm_srai_epi16(in[2], bit);
|
||||||
|
in[3] = _mm_srai_epi16(in[3], bit);
|
||||||
|
in[4] = _mm_srai_epi16(in[4], bit);
|
||||||
|
in[5] = _mm_srai_epi16(in[5], bit);
|
||||||
|
in[6] = _mm_srai_epi16(in[6], bit);
|
||||||
|
in[7] = _mm_srai_epi16(in[7], bit);
|
||||||
|
}
|
||||||
|
|
||||||
|
static INLINE void round_signed_16x16(__m128i *in0, __m128i *in1) {
|
||||||
|
const int bit = 4;
|
||||||
|
round_signed_8x8(in0, bit);
|
||||||
|
round_signed_8x8(in0 + 8, bit);
|
||||||
|
round_signed_8x8(in1, bit);
|
||||||
|
round_signed_8x8(in1 + 8, bit);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Note:
|
||||||
|
// suffix "t" indicates the transpose operation comes first
|
||||||
|
static void fdct16t_sse2(__m128i *in0, __m128i *in1) {
|
||||||
|
array_transpose_16x16(in0, in1);
|
||||||
|
fdct16_8col(in0);
|
||||||
|
fdct16_8col(in1);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void fadst16t_sse2(__m128i *in0, __m128i *in1) {
|
||||||
|
array_transpose_16x16(in0, in1);
|
||||||
|
fadst16_8col(in0);
|
||||||
|
fadst16_8col(in1);
|
||||||
|
}
|
||||||
|
|
||||||
|
static INLINE void fdct32t_16col(__m128i *tl, __m128i *tr, __m128i *bl,
|
||||||
|
__m128i *br) {
|
||||||
|
array_transpose_16x16(tl, tr);
|
||||||
|
array_transpose_16x16(bl, br);
|
||||||
|
fdct32_8col(tl, bl);
|
||||||
|
fdct32_8col(tr, br);
|
||||||
|
}
|
||||||
|
|
||||||
|
typedef enum transpose_indicator_ {
|
||||||
|
transpose,
|
||||||
|
no_transpose,
|
||||||
|
} transpose_indicator;
|
||||||
|
|
||||||
|
static INLINE void fhalfright32_16col(__m128i *tl, __m128i *tr, __m128i *bl,
|
||||||
|
__m128i *br, transpose_indicator t) {
|
||||||
|
__m128i tmpl[16], tmpr[16];
|
||||||
|
int i;
|
||||||
|
|
||||||
|
// Copy the bottom half of the input to temporary storage
|
||||||
|
for (i = 0; i < 16; ++i) {
|
||||||
|
tmpl[i] = bl[i];
|
||||||
|
tmpr[i] = br[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generate the bottom half of the output
|
||||||
|
for (i = 0; i < 16; ++i) {
|
||||||
|
bl[i] = _mm_slli_epi16(tl[i], 2);
|
||||||
|
br[i] = _mm_slli_epi16(tr[i], 2);
|
||||||
|
}
|
||||||
|
array_transpose_16x16(bl, br);
|
||||||
|
|
||||||
|
// Copy the temporary storage back to the top half of the input
|
||||||
|
for (i = 0; i < 16; ++i) {
|
||||||
|
tl[i] = tmpl[i];
|
||||||
|
tr[i] = tmpr[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Generate the top half of the output
|
||||||
|
scale_sqrt2_8x16(tl);
|
||||||
|
scale_sqrt2_8x16(tr);
|
||||||
|
if (t == transpose)
|
||||||
|
fdct16t_sse2(tl, tr);
|
||||||
|
else
|
||||||
|
fdct16_sse2(tl, tr);
|
||||||
|
}
|
||||||
|
|
||||||
// Note on data layout, for both this and the 32x16 transforms:
|
// Note on data layout, for both this and the 32x16 transforms:
|
||||||
// So that we can reuse the 16-element transforms easily,
|
// So that we can reuse the 16-element transforms easily,
|
||||||
// we want to split the input into 8x16 blocks.
|
// we want to split the input into 8x16 blocks.
|
||||||
|
@ -3563,132 +3629,132 @@ void av1_fht16x32_sse2(const int16_t *input, tran_low_t *output, int stride,
|
||||||
switch (tx_type) {
|
switch (tx_type) {
|
||||||
case DCT_DCT:
|
case DCT_DCT:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
||||||
fdct32_16col(intl, intr, inbl, inbr);
|
fdct16t_sse2(intl, intr);
|
||||||
right_shift_16x16(intl, intr);
|
fdct16t_sse2(inbl, inbr);
|
||||||
right_shift_16x16(inbl, inbr);
|
round_signed_16x16(intl, intr);
|
||||||
fdct16_sse2(intl, intr);
|
round_signed_16x16(inbl, inbr);
|
||||||
fdct16_sse2(inbl, inbr);
|
fdct32t_16col(intl, intr, inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case ADST_DCT:
|
case ADST_DCT:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
||||||
fhalfright32_16col(intl, intr, inbl, inbr);
|
fdct16t_sse2(intl, intr);
|
||||||
right_shift_16x16(intl, intr);
|
fdct16t_sse2(inbl, inbr);
|
||||||
right_shift_16x16(inbl, inbr);
|
round_signed_16x16(intl, intr);
|
||||||
fdct16_sse2(intl, intr);
|
round_signed_16x16(inbl, inbr);
|
||||||
fdct16_sse2(inbl, inbr);
|
fhalfright32_16col(intl, intr, inbl, inbr, transpose);
|
||||||
break;
|
break;
|
||||||
case DCT_ADST:
|
case DCT_ADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
||||||
fdct32_16col(intl, intr, inbl, inbr);
|
fadst16t_sse2(intl, intr);
|
||||||
right_shift_16x16(intl, intr);
|
fadst16t_sse2(inbl, inbr);
|
||||||
right_shift_16x16(inbl, inbr);
|
round_signed_16x16(intl, intr);
|
||||||
fadst16_sse2(intl, intr);
|
round_signed_16x16(inbl, inbr);
|
||||||
fadst16_sse2(inbl, inbr);
|
fdct32t_16col(intl, intr, inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case ADST_ADST:
|
case ADST_ADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
||||||
fhalfright32_16col(intl, intr, inbl, inbr);
|
fadst16t_sse2(intl, intr);
|
||||||
right_shift_16x16(intl, intr);
|
fadst16t_sse2(inbl, inbr);
|
||||||
right_shift_16x16(inbl, inbr);
|
round_signed_16x16(intl, intr);
|
||||||
fadst16_sse2(intl, intr);
|
round_signed_16x16(inbl, inbr);
|
||||||
fadst16_sse2(inbl, inbr);
|
fhalfright32_16col(intl, intr, inbl, inbr, transpose);
|
||||||
break;
|
break;
|
||||||
#if CONFIG_EXT_TX
|
#if CONFIG_EXT_TX
|
||||||
case FLIPADST_DCT:
|
case FLIPADST_DCT:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
|
||||||
fhalfright32_16col(intl, intr, inbl, inbr);
|
fdct16t_sse2(intl, intr);
|
||||||
right_shift_16x16(intl, intr);
|
fdct16t_sse2(inbl, inbr);
|
||||||
right_shift_16x16(inbl, inbr);
|
round_signed_16x16(intl, intr);
|
||||||
fdct16_sse2(intl, intr);
|
round_signed_16x16(inbl, inbr);
|
||||||
fdct16_sse2(inbl, inbr);
|
fhalfright32_16col(intl, intr, inbl, inbr, transpose);
|
||||||
break;
|
break;
|
||||||
case DCT_FLIPADST:
|
case DCT_FLIPADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
|
||||||
fdct32_16col(intl, intr, inbl, inbr);
|
fadst16t_sse2(intl, intr);
|
||||||
right_shift_16x16(intl, intr);
|
fadst16t_sse2(inbl, inbr);
|
||||||
right_shift_16x16(inbl, inbr);
|
round_signed_16x16(intl, intr);
|
||||||
fadst16_sse2(intl, intr);
|
round_signed_16x16(inbl, inbr);
|
||||||
fadst16_sse2(inbl, inbr);
|
fdct32t_16col(intl, intr, inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case FLIPADST_FLIPADST:
|
case FLIPADST_FLIPADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 1);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 1);
|
||||||
fhalfright32_16col(intl, intr, inbl, inbr);
|
fadst16t_sse2(intl, intr);
|
||||||
right_shift_16x16(intl, intr);
|
fadst16t_sse2(inbl, inbr);
|
||||||
right_shift_16x16(inbl, inbr);
|
round_signed_16x16(intl, intr);
|
||||||
fadst16_sse2(intl, intr);
|
round_signed_16x16(inbl, inbr);
|
||||||
fadst16_sse2(inbl, inbr);
|
fhalfright32_16col(intl, intr, inbl, inbr, transpose);
|
||||||
break;
|
break;
|
||||||
case ADST_FLIPADST:
|
case ADST_FLIPADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
|
||||||
fhalfright32_16col(intl, intr, inbl, inbr);
|
fadst16t_sse2(intl, intr);
|
||||||
right_shift_16x16(intl, intr);
|
fadst16t_sse2(inbl, inbr);
|
||||||
right_shift_16x16(inbl, inbr);
|
round_signed_16x16(intl, intr);
|
||||||
fadst16_sse2(intl, intr);
|
round_signed_16x16(inbl, inbr);
|
||||||
fadst16_sse2(inbl, inbr);
|
fhalfright32_16col(intl, intr, inbl, inbr, transpose);
|
||||||
break;
|
break;
|
||||||
case FLIPADST_ADST:
|
case FLIPADST_ADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
|
||||||
fhalfright32_16col(intl, intr, inbl, inbr);
|
fadst16t_sse2(intl, intr);
|
||||||
right_shift_16x16(intl, intr);
|
fadst16t_sse2(inbl, inbr);
|
||||||
right_shift_16x16(inbl, inbr);
|
round_signed_16x16(intl, intr);
|
||||||
fadst16_sse2(intl, intr);
|
round_signed_16x16(inbl, inbr);
|
||||||
fadst16_sse2(inbl, inbr);
|
fhalfright32_16col(intl, intr, inbl, inbr, transpose);
|
||||||
break;
|
break;
|
||||||
case IDTX:
|
case IDTX:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
||||||
fidtx32_16col(intl, intr, inbl, inbr);
|
|
||||||
right_shift_16x16(intl, intr);
|
|
||||||
right_shift_16x16(inbl, inbr);
|
|
||||||
fidtx16_sse2(intl, intr);
|
fidtx16_sse2(intl, intr);
|
||||||
fidtx16_sse2(inbl, inbr);
|
fidtx16_sse2(inbl, inbr);
|
||||||
|
round_signed_16x16(intl, intr);
|
||||||
|
round_signed_16x16(inbl, inbr);
|
||||||
|
fidtx32_16col(intl, intr, inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case V_DCT:
|
case V_DCT:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
||||||
fdct32_16col(intl, intr, inbl, inbr);
|
|
||||||
right_shift_16x16(intl, intr);
|
|
||||||
right_shift_16x16(inbl, inbr);
|
|
||||||
fidtx16_sse2(intl, intr);
|
fidtx16_sse2(intl, intr);
|
||||||
fidtx16_sse2(inbl, inbr);
|
fidtx16_sse2(inbl, inbr);
|
||||||
|
round_signed_16x16(intl, intr);
|
||||||
|
round_signed_16x16(inbl, inbr);
|
||||||
|
fdct32t_16col(intl, intr, inbl, inbr);
|
||||||
break;
|
break;
|
||||||
case H_DCT:
|
case H_DCT:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
||||||
|
fdct16t_sse2(intl, intr);
|
||||||
|
fdct16t_sse2(inbl, inbr);
|
||||||
|
round_signed_16x16(intl, intr);
|
||||||
|
round_signed_16x16(inbl, inbr);
|
||||||
fidtx32_16col(intl, intr, inbl, inbr);
|
fidtx32_16col(intl, intr, inbl, inbr);
|
||||||
right_shift_16x16(intl, intr);
|
|
||||||
right_shift_16x16(inbl, inbr);
|
|
||||||
fdct16_sse2(intl, intr);
|
|
||||||
fdct16_sse2(inbl, inbr);
|
|
||||||
break;
|
break;
|
||||||
case V_ADST:
|
case V_ADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
||||||
fhalfright32_16col(intl, intr, inbl, inbr);
|
|
||||||
right_shift_16x16(intl, intr);
|
|
||||||
right_shift_16x16(inbl, inbr);
|
|
||||||
fidtx16_sse2(intl, intr);
|
fidtx16_sse2(intl, intr);
|
||||||
fidtx16_sse2(inbl, inbr);
|
fidtx16_sse2(inbl, inbr);
|
||||||
|
round_signed_16x16(intl, intr);
|
||||||
|
round_signed_16x16(inbl, inbr);
|
||||||
|
fhalfright32_16col(intl, intr, inbl, inbr, transpose);
|
||||||
break;
|
break;
|
||||||
case H_ADST:
|
case H_ADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
|
||||||
|
fadst16t_sse2(intl, intr);
|
||||||
|
fadst16t_sse2(inbl, inbr);
|
||||||
|
round_signed_16x16(intl, intr);
|
||||||
|
round_signed_16x16(inbl, inbr);
|
||||||
fidtx32_16col(intl, intr, inbl, inbr);
|
fidtx32_16col(intl, intr, inbl, inbr);
|
||||||
right_shift_16x16(intl, intr);
|
|
||||||
right_shift_16x16(inbl, inbr);
|
|
||||||
fadst16_sse2(intl, intr);
|
|
||||||
fadst16_sse2(inbl, inbr);
|
|
||||||
break;
|
break;
|
||||||
case V_FLIPADST:
|
case V_FLIPADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
|
||||||
fhalfright32_16col(intl, intr, inbl, inbr);
|
|
||||||
right_shift_16x16(intl, intr);
|
|
||||||
right_shift_16x16(inbl, inbr);
|
|
||||||
fidtx16_sse2(intl, intr);
|
fidtx16_sse2(intl, intr);
|
||||||
fidtx16_sse2(inbl, inbr);
|
fidtx16_sse2(inbl, inbr);
|
||||||
|
round_signed_16x16(intl, intr);
|
||||||
|
round_signed_16x16(inbl, inbr);
|
||||||
|
fhalfright32_16col(intl, intr, inbl, inbr, transpose);
|
||||||
break;
|
break;
|
||||||
case H_FLIPADST:
|
case H_FLIPADST:
|
||||||
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
|
load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
|
||||||
|
fadst16t_sse2(intl, intr);
|
||||||
|
fadst16t_sse2(inbl, inbr);
|
||||||
|
round_signed_16x16(intl, intr);
|
||||||
|
round_signed_16x16(inbl, inbr);
|
||||||
fidtx32_16col(intl, intr, inbl, inbr);
|
fidtx32_16col(intl, intr, inbl, inbr);
|
||||||
right_shift_16x16(intl, intr);
|
|
||||||
right_shift_16x16(inbl, inbr);
|
|
||||||
fadst16_sse2(intl, intr);
|
|
||||||
fadst16_sse2(inbl, inbr);
|
|
||||||
break;
|
break;
|
||||||
#endif
|
#endif
|
||||||
default: assert(0); break;
|
default: assert(0); break;
|
||||||
|
@ -3737,8 +3803,6 @@ static INLINE void write_buffer_32x16(tran_low_t *output, __m128i *res0,
|
||||||
__m128i *res1, __m128i *res2,
|
__m128i *res1, __m128i *res2,
|
||||||
__m128i *res3) {
|
__m128i *res3) {
|
||||||
int i;
|
int i;
|
||||||
right_shift_16x16(res0, res1);
|
|
||||||
right_shift_16x16(res2, res3);
|
|
||||||
for (i = 0; i < 16; ++i) {
|
for (i = 0; i < 16; ++i) {
|
||||||
store_output(&res0[i], output + i * 32 + 0);
|
store_output(&res0[i], output + i * 32 + 0);
|
||||||
store_output(&res1[i], output + i * 32 + 8);
|
store_output(&res1[i], output + i * 32 + 8);
|
||||||
|
@ -3756,127 +3820,127 @@ void av1_fht32x16_sse2(const int16_t *input, tran_low_t *output, int stride,
|
||||||
case DCT_DCT:
|
case DCT_DCT:
|
||||||
fdct16_sse2(in0, in1);
|
fdct16_sse2(in0, in1);
|
||||||
fdct16_sse2(in2, in3);
|
fdct16_sse2(in2, in3);
|
||||||
right_shift_16x16(in0, in1);
|
round_signed_16x16(in0, in1);
|
||||||
right_shift_16x16(in2, in3);
|
round_signed_16x16(in2, in3);
|
||||||
fdct32_16col(in0, in1, in2, in3);
|
fdct32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case ADST_DCT:
|
case ADST_DCT:
|
||||||
fadst16_sse2(in0, in1);
|
fadst16_sse2(in0, in1);
|
||||||
fadst16_sse2(in2, in3);
|
fadst16_sse2(in2, in3);
|
||||||
right_shift_16x16(in0, in1);
|
round_signed_16x16(in0, in1);
|
||||||
right_shift_16x16(in2, in3);
|
round_signed_16x16(in2, in3);
|
||||||
fdct32_16col(in0, in1, in2, in3);
|
fdct32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case DCT_ADST:
|
case DCT_ADST:
|
||||||
fdct16_sse2(in0, in1);
|
fdct16_sse2(in0, in1);
|
||||||
fdct16_sse2(in2, in3);
|
fdct16_sse2(in2, in3);
|
||||||
right_shift_16x16(in0, in1);
|
round_signed_16x16(in0, in1);
|
||||||
right_shift_16x16(in2, in3);
|
round_signed_16x16(in2, in3);
|
||||||
fhalfright32_16col(in0, in1, in2, in3);
|
fhalfright32_16col(in0, in1, in2, in3, no_transpose);
|
||||||
break;
|
break;
|
||||||
case ADST_ADST:
|
case ADST_ADST:
|
||||||
fadst16_sse2(in0, in1);
|
fadst16_sse2(in0, in1);
|
||||||
fadst16_sse2(in2, in3);
|
fadst16_sse2(in2, in3);
|
||||||
right_shift_16x16(in0, in1);
|
round_signed_16x16(in0, in1);
|
||||||
right_shift_16x16(in2, in3);
|
round_signed_16x16(in2, in3);
|
||||||
fhalfright32_16col(in0, in1, in2, in3);
|
fhalfright32_16col(in0, in1, in2, in3, no_transpose);
|
||||||
break;
|
break;
|
||||||
#if CONFIG_EXT_TX
|
#if CONFIG_EXT_TX
|
||||||
case FLIPADST_DCT:
|
case FLIPADST_DCT:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
|
||||||
fadst16_sse2(in0, in1);
|
fadst16_sse2(in0, in1);
|
||||||
fadst16_sse2(in2, in3);
|
fadst16_sse2(in2, in3);
|
||||||
right_shift_16x16(in0, in1);
|
round_signed_16x16(in0, in1);
|
||||||
right_shift_16x16(in2, in3);
|
round_signed_16x16(in2, in3);
|
||||||
fdct32_16col(in0, in1, in2, in3);
|
fdct32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case DCT_FLIPADST:
|
case DCT_FLIPADST:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
|
||||||
fdct16_sse2(in0, in1);
|
fdct16_sse2(in0, in1);
|
||||||
fdct16_sse2(in2, in3);
|
fdct16_sse2(in2, in3);
|
||||||
right_shift_16x16(in0, in1);
|
round_signed_16x16(in0, in1);
|
||||||
right_shift_16x16(in2, in3);
|
round_signed_16x16(in2, in3);
|
||||||
fhalfright32_16col(in0, in1, in2, in3);
|
fhalfright32_16col(in0, in1, in2, in3, no_transpose);
|
||||||
break;
|
break;
|
||||||
case FLIPADST_FLIPADST:
|
case FLIPADST_FLIPADST:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 1);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 1);
|
||||||
fadst16_sse2(in0, in1);
|
fadst16_sse2(in0, in1);
|
||||||
fadst16_sse2(in2, in3);
|
fadst16_sse2(in2, in3);
|
||||||
right_shift_16x16(in0, in1);
|
round_signed_16x16(in0, in1);
|
||||||
right_shift_16x16(in2, in3);
|
round_signed_16x16(in2, in3);
|
||||||
fhalfright32_16col(in0, in1, in2, in3);
|
fhalfright32_16col(in0, in1, in2, in3, no_transpose);
|
||||||
break;
|
break;
|
||||||
case ADST_FLIPADST:
|
case ADST_FLIPADST:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
|
||||||
fadst16_sse2(in0, in1);
|
fadst16_sse2(in0, in1);
|
||||||
fadst16_sse2(in2, in3);
|
fadst16_sse2(in2, in3);
|
||||||
right_shift_16x16(in0, in1);
|
round_signed_16x16(in0, in1);
|
||||||
right_shift_16x16(in2, in3);
|
round_signed_16x16(in2, in3);
|
||||||
fhalfright32_16col(in0, in1, in2, in3);
|
fhalfright32_16col(in0, in1, in2, in3, no_transpose);
|
||||||
break;
|
break;
|
||||||
case FLIPADST_ADST:
|
case FLIPADST_ADST:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
|
||||||
fadst16_sse2(in0, in1);
|
fadst16_sse2(in0, in1);
|
||||||
fadst16_sse2(in2, in3);
|
fadst16_sse2(in2, in3);
|
||||||
right_shift_16x16(in0, in1);
|
round_signed_16x16(in0, in1);
|
||||||
right_shift_16x16(in2, in3);
|
round_signed_16x16(in2, in3);
|
||||||
fhalfright32_16col(in0, in1, in2, in3);
|
fhalfright32_16col(in0, in1, in2, in3, no_transpose);
|
||||||
break;
|
break;
|
||||||
case IDTX:
|
case IDTX:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
||||||
fidtx16_sse2(in0, in1);
|
fidtx16_sse2(in0, in1);
|
||||||
fidtx16_sse2(in2, in3);
|
fidtx16_sse2(in2, in3);
|
||||||
right_shift_16x16(in0, in1);
|
round_signed_16x16(in0, in1);
|
||||||
right_shift_16x16(in2, in3);
|
round_signed_16x16(in2, in3);
|
||||||
fidtx32_16col(in0, in1, in2, in3);
|
fidtx32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case V_DCT:
|
case V_DCT:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
||||||
fdct16_sse2(in0, in1);
|
fdct16_sse2(in0, in1);
|
||||||
fdct16_sse2(in2, in3);
|
fdct16_sse2(in2, in3);
|
||||||
right_shift_16x16(in0, in1);
|
round_signed_16x16(in0, in1);
|
||||||
right_shift_16x16(in2, in3);
|
round_signed_16x16(in2, in3);
|
||||||
fidtx32_16col(in0, in1, in2, in3);
|
fidtx32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case H_DCT:
|
case H_DCT:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
||||||
fidtx16_sse2(in0, in1);
|
fidtx16_sse2(in0, in1);
|
||||||
fidtx16_sse2(in2, in3);
|
fidtx16_sse2(in2, in3);
|
||||||
right_shift_16x16(in0, in1);
|
round_signed_16x16(in0, in1);
|
||||||
right_shift_16x16(in2, in3);
|
round_signed_16x16(in2, in3);
|
||||||
fdct32_16col(in0, in1, in2, in3);
|
fdct32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case V_ADST:
|
case V_ADST:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
||||||
fadst16_sse2(in0, in1);
|
fadst16_sse2(in0, in1);
|
||||||
fadst16_sse2(in2, in3);
|
fadst16_sse2(in2, in3);
|
||||||
right_shift_16x16(in0, in1);
|
round_signed_16x16(in0, in1);
|
||||||
right_shift_16x16(in2, in3);
|
round_signed_16x16(in2, in3);
|
||||||
fidtx32_16col(in0, in1, in2, in3);
|
fidtx32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case H_ADST:
|
case H_ADST:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
|
||||||
fidtx16_sse2(in0, in1);
|
fidtx16_sse2(in0, in1);
|
||||||
fidtx16_sse2(in2, in3);
|
fidtx16_sse2(in2, in3);
|
||||||
right_shift_16x16(in0, in1);
|
round_signed_16x16(in0, in1);
|
||||||
right_shift_16x16(in2, in3);
|
round_signed_16x16(in2, in3);
|
||||||
fhalfright32_16col(in0, in1, in2, in3);
|
fhalfright32_16col(in0, in1, in2, in3, no_transpose);
|
||||||
break;
|
break;
|
||||||
case V_FLIPADST:
|
case V_FLIPADST:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
|
||||||
fadst16_sse2(in0, in1);
|
fadst16_sse2(in0, in1);
|
||||||
fadst16_sse2(in2, in3);
|
fadst16_sse2(in2, in3);
|
||||||
right_shift_16x16(in0, in1);
|
round_signed_16x16(in0, in1);
|
||||||
right_shift_16x16(in2, in3);
|
round_signed_16x16(in2, in3);
|
||||||
fidtx32_16col(in0, in1, in2, in3);
|
fidtx32_16col(in0, in1, in2, in3);
|
||||||
break;
|
break;
|
||||||
case H_FLIPADST:
|
case H_FLIPADST:
|
||||||
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
|
load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
|
||||||
fidtx16_sse2(in0, in1);
|
fidtx16_sse2(in0, in1);
|
||||||
fidtx16_sse2(in2, in3);
|
fidtx16_sse2(in2, in3);
|
||||||
right_shift_16x16(in0, in1);
|
round_signed_16x16(in0, in1);
|
||||||
right_shift_16x16(in2, in3);
|
round_signed_16x16(in2, in3);
|
||||||
fhalfright32_16col(in0, in1, in2, in3);
|
fhalfright32_16col(in0, in1, in2, in3, no_transpose);
|
||||||
break;
|
break;
|
||||||
#endif
|
#endif
|
||||||
default: assert(0); break;
|
default: assert(0); break;
|
||||||
|
|
|
@ -137,7 +137,7 @@ const Ht16x32Param kArrayHt16x32Param_sse2[] = {
|
||||||
512)
|
512)
|
||||||
#endif // CONFIG_EXT_TX
|
#endif // CONFIG_EXT_TX
|
||||||
};
|
};
|
||||||
INSTANTIATE_TEST_CASE_P(DISABLED_SSE2, AV1Trans16x32HT,
|
INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans16x32HT,
|
||||||
::testing::ValuesIn(kArrayHt16x32Param_sse2));
|
::testing::ValuesIn(kArrayHt16x32Param_sse2));
|
||||||
#endif // HAVE_SSE2
|
#endif // HAVE_SSE2
|
||||||
|
|
||||||
|
|
|
@ -137,7 +137,7 @@ const Ht32x16Param kArrayHt32x16Param_sse2[] = {
|
||||||
512)
|
512)
|
||||||
#endif // CONFIG_EXT_TX
|
#endif // CONFIG_EXT_TX
|
||||||
};
|
};
|
||||||
INSTANTIATE_TEST_CASE_P(DISABLED_SSE2, AV1Trans32x16HT,
|
INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans32x16HT,
|
||||||
::testing::ValuesIn(kArrayHt32x16Param_sse2));
|
::testing::ValuesIn(kArrayHt32x16Param_sse2));
|
||||||
#endif // HAVE_SSE2
|
#endif // HAVE_SSE2
|
||||||
|
|
||||||
|
|
Загрузка…
Ссылка в новой задаче