Merge "Improve vp9_fdct4x4_sse2 (x1.2)"

This commit is contained in:
Yunqing Wang 2013-11-22 10:39:55 -08:00 коммит произвёл Gerrit Code Review
Родитель 16ad35f64e ec2dbdd107
Коммит 384089004d
1 изменённых файлов: 31 добавлений и 33 удалений

Просмотреть файл

@ -26,24 +26,25 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
// by constructing the 32 bit constant corresponding to that pair.
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
const __m128i kOne = _mm_set1_epi16(1);
__m128i in0, in1, in2, in3;
__m128i in0, in1;
// Load inputs.
{
in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
(input + 1 * stride)));
in1 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
in1 = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)
(input + 3 * stride)), in1);
// x = x << 4
in0 = _mm_slli_epi16(in0, 4);
in1 = _mm_slli_epi16(in1, 4);
in2 = _mm_slli_epi16(in2, 4);
in3 = _mm_slli_epi16(in3, 4);
// if (i == 0 && input[0]) input[0] += 1;
{
// The mask will only contain whether the first value is zero, all
@ -60,18 +61,18 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
// Do the two transform/transpose passes
for (pass = 0; pass < 2; ++pass) {
// Transform 1/2: Add/subtract
const __m128i r0 = _mm_add_epi16(in0, in3);
const __m128i r1 = _mm_add_epi16(in1, in2);
const __m128i r2 = _mm_sub_epi16(in1, in2);
const __m128i r3 = _mm_sub_epi16(in0, in3);
const __m128i r0 = _mm_add_epi16(in0, in1);
const __m128i r1 = _mm_sub_epi16(in0, in1);
const __m128i r2 = _mm_unpacklo_epi64(r0, r1);
const __m128i r3 = _mm_unpackhi_epi64(r0, r1);
// Transform 1/2: Interleave to do the multiply by constants which gets us
// into 32 bits.
const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
const __m128i t0 = _mm_unpacklo_epi16(r2, r3);
const __m128i t2 = _mm_unpackhi_epi16(r2, r3);
const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p08_p24);
const __m128i u6 = _mm_madd_epi16(t2, k__cospi_p24_m08);
const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
@ -90,24 +91,21 @@ void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
// 00 10 01 11 02 12 03 13
// 20 30 21 31 22 32 23 33
in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
in1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
in1 = _mm_shuffle_epi32(in1, 0x4E);
// 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1
// 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3
if (0 == pass) {
// Extract values in the high part for second pass as transform code
// only uses the first four values.
in1 = _mm_unpackhi_epi64(in0, in0);
in3 = _mm_unpackhi_epi64(in2, in2);
} else {
// Post-condition output and store it (v + 1) >> 2, taking advantage
// of the fact 1/3 are stored just after 0/2.
__m128i out01 = _mm_add_epi16(in0, kOne);
__m128i out23 = _mm_add_epi16(in2, kOne);
out01 = _mm_srai_epi16(out01, 2);
out23 = _mm_srai_epi16(out23, 2);
_mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
_mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
}
// 02 12 22 32 03 13 23 33 in1 contains 2 followed by 3
}
in1 = _mm_shuffle_epi32(in1, 0x4E);
// Post-condition output and store it (v + 1) >> 2, taking advantage
// of the fact 1/3 are stored just after 0/2.
{
__m128i out01 = _mm_add_epi16(in0, kOne);
__m128i out23 = _mm_add_epi16(in1, kOne);
out01 = _mm_srai_epi16(out01, 2);
out23 = _mm_srai_epi16(out23, 2);
_mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
_mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
}
}