SSE2 4x4 invserse ADST/DCT transform

Enable SSE2 4x4 inverse ADST/DCT transform. The runtime goes from
292 cycles down to 89 cycles. Running bus_cif at 2000 kbps, the
overall runtime of speed 0 goes from 301s to 295s (2% speed-up).

Change-Id: I24098136e7fee7ab2fbf1c11755bdf2ca37f3628
This commit is contained in:
Jingning Han 2013-07-10 12:11:09 -07:00
Родитель aedc7c59b1
Коммит 49b6302044
2 изменённых файлов: 150 добавлений и 1 удалений

Просмотреть файл

@ -325,7 +325,7 @@ prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int de
specialize vp9_short_idct10_32x32_add
prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
specialize vp9_short_iht4x4_add
specialize vp9_short_iht4x4_add sse2
prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
specialize vp9_short_iht8x8_add

Просмотреть файл

@ -241,6 +241,155 @@ void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
_mm_storel_epi64((__m128i *)output, in);
}
static INLINE void transpose_4x4(__m128i *res) {
const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
res[1] = _mm_unpackhi_epi64(res[0], res[0]);
res[3] = _mm_unpackhi_epi64(res[2], res[2]);
}
void idct4_1d_sse2(__m128i *in) {
const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[8], v[8];
transpose_4x4(in);
// stage 1
u[0] = _mm_unpacklo_epi16(in[0], in[2]);
u[1] = _mm_unpacklo_epi16(in[1], in[3]);
v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
u[0] = _mm_packs_epi32(v[0], v[2]);
u[1] = _mm_packs_epi32(v[1], v[3]);
u[2] = _mm_unpackhi_epi64(u[0], u[0]);
u[3] = _mm_unpackhi_epi64(u[1], u[1]);
// stage 2
in[0] = _mm_add_epi16(u[0], u[3]);
in[1] = _mm_add_epi16(u[1], u[2]);
in[2] = _mm_sub_epi16(u[1], u[2]);
in[3] = _mm_sub_epi16(u[0], u[3]);
}
void iadst4_1d_sse2(__m128i *in) {
const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9);
const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9);
const __m128i k__sinpi_p03_m04 = pair_set_epi16(sinpi_3_9, -sinpi_4_9);
const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
const __m128i kZero = _mm_set1_epi16(0);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[8], v[8], in7;
transpose_4x4(in);
in7 = _mm_add_epi16(in[0], in[3]);
in7 = _mm_sub_epi16(in7, in[2]);
u[0] = _mm_unpacklo_epi16(in[0], in[2]);
u[1] = _mm_unpacklo_epi16(in[1], in[3]);
u[2] = _mm_unpacklo_epi16(in7, kZero);
u[3] = _mm_unpacklo_epi16(in[1], kZero);
v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04); // s0 + s3
v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02); // s2 + s5
v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x2
v[3] = _mm_madd_epi16(u[0], k__sinpi_p02_m01); // s1 - s4
v[4] = _mm_madd_epi16(u[1], k__sinpi_p03_m04); // s2 - s6
v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s2
u[0] = _mm_add_epi32(v[0], v[1]);
u[1] = _mm_add_epi32(v[3], v[4]);
u[2] = v[2];
u[3] = _mm_add_epi32(u[0], u[1]);
u[4] = _mm_slli_epi32(v[5], 2);
u[5] = _mm_add_epi32(u[3], v[5]);
u[6] = _mm_sub_epi32(u[5], u[4]);
v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
in[0] = _mm_packs_epi32(u[0], u[2]);
in[1] = _mm_packs_epi32(u[1], u[3]);
in[2] = _mm_unpackhi_epi64(in[0], in[0]);
in[3] = _mm_unpackhi_epi64(in[1], in[1]);
}
void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
int tx_type) {
__m128i in[4];
const __m128i zero = _mm_setzero_si128();
const __m128i eight = _mm_set1_epi16(8);
in[0] = _mm_loadl_epi64((__m128i *)input);
in[1] = _mm_loadl_epi64((__m128i *)(input + 4));
in[2] = _mm_loadl_epi64((__m128i *)(input + 8));
in[3] = _mm_loadl_epi64((__m128i *)(input + 12));
switch (tx_type) {
case 0: // DCT_DCT
idct4_1d_sse2(in);
idct4_1d_sse2(in);
break;
case 1: // ADST_DCT
idct4_1d_sse2(in);
iadst4_1d_sse2(in);
break;
case 2: // DCT_ADST
iadst4_1d_sse2(in);
idct4_1d_sse2(in);
break;
case 3: // ADST_ADST
iadst4_1d_sse2(in);
iadst4_1d_sse2(in);
break;
default:
assert(0);
break;
}
// Final round and shift
in[0] = _mm_add_epi16(in[0], eight);
in[1] = _mm_add_epi16(in[1], eight);
in[2] = _mm_add_epi16(in[2], eight);
in[3] = _mm_add_epi16(in[3], eight);
in[0] = _mm_srai_epi16(in[0], 4);
in[1] = _mm_srai_epi16(in[1], 4);
in[2] = _mm_srai_epi16(in[2], 4);
in[3] = _mm_srai_epi16(in[3], 4);
RECON_AND_STORE4X4(dest, in[0]);
RECON_AND_STORE4X4(dest, in[1]);
RECON_AND_STORE4X4(dest, in[2]);
RECON_AND_STORE4X4(dest, in[3]);
}
#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7) \
{ \