Special handling of the DC-only inverse 8x8 2D-DCT
This commit adds a special-case path for the 8x8 inverse 2D-DCT that is used when the DC coefficient is the only one quantized to a non-zero value. For bus_cif at 2000 kbps, it provides about a 1% speed-up at speed 0. Change-Id: I2523222359eec26b144cf8fd4c63a4ad63b1b011
This commit is contained in:
Родитель
52256cdbca
Коммит
325e0aa650
|
@ -225,6 +225,19 @@ void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void vp9_short_idct8x8_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
|
||||||
|
int i, j;
|
||||||
|
int a1;
|
||||||
|
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
|
||||||
|
out = dct_const_round_shift(out * cospi_16_64);
|
||||||
|
a1 = ROUND_POWER_OF_TWO(out, 5);
|
||||||
|
for (j = 0; j < 8; ++j) {
|
||||||
|
for (i = 0; i < 8; ++i)
|
||||||
|
dest[i] = clip_pixel(dest[i] + a1);
|
||||||
|
dest += dest_stride;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void iadst4_1d(int16_t *input, int16_t *output) {
|
static void iadst4_1d(int16_t *input, int16_t *output) {
|
||||||
int s0, s1, s2, s3, s4, s5, s6, s7;
|
int s0, s1, s2, s3, s4, s5, s6, s7;
|
||||||
|
|
||||||
|
@ -433,12 +446,6 @@ void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void vp9_short_idct1_8x8_c(int16_t *input, int16_t *output) {
|
|
||||||
int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
|
|
||||||
out = dct_const_round_shift(out * cospi_16_64);
|
|
||||||
output[0] = ROUND_POWER_OF_TWO(out, 5);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void idct16_1d(int16_t *input, int16_t *output) {
|
static void idct16_1d(int16_t *input, int16_t *output) {
|
||||||
int16_t step1[16], step2[16];
|
int16_t step1[16], step2[16];
|
||||||
int temp1, temp2;
|
int temp1, temp2;
|
||||||
|
|
|
@ -297,15 +297,15 @@ specialize vp9_short_idct4x4_1_add sse2
|
||||||
prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
|
prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
|
||||||
specialize vp9_short_idct4x4_add sse2
|
specialize vp9_short_idct4x4_add sse2
|
||||||
|
|
||||||
|
prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride"
|
||||||
|
specialize vp9_short_idct8x8_1_add sse2
|
||||||
|
|
||||||
prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
|
prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
|
||||||
specialize vp9_short_idct8x8_add sse2 neon
|
specialize vp9_short_idct8x8_add sse2 neon
|
||||||
|
|
||||||
prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
|
prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
|
||||||
specialize vp9_short_idct10_8x8_add sse2
|
specialize vp9_short_idct10_8x8_add sse2
|
||||||
|
|
||||||
prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
|
|
||||||
specialize vp9_short_idct1_8x8
|
|
||||||
|
|
||||||
prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
|
prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
|
||||||
specialize vp9_short_idct16x16_add sse2
|
specialize vp9_short_idct16x16_add sse2
|
||||||
|
|
||||||
|
|
|
@ -523,9 +523,9 @@ void vp9_short_iht4x4_add_sse2(int16_t *input, uint8_t *dest, int stride,
|
||||||
{ \
|
{ \
|
||||||
__m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
|
__m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
|
||||||
d0 = _mm_unpacklo_epi8(d0, zero); \
|
d0 = _mm_unpacklo_epi8(d0, zero); \
|
||||||
in_x = _mm_add_epi16(in_x, d0); \
|
d0 = _mm_add_epi16(in_x, d0); \
|
||||||
in_x = _mm_packus_epi16(in_x, in_x); \
|
d0 = _mm_packus_epi16(d0, d0); \
|
||||||
_mm_storel_epi64((__m128i *)(dest), in_x); \
|
_mm_storel_epi64((__m128i *)(dest), d0); \
|
||||||
dest += stride; \
|
dest += stride; \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -597,6 +597,27 @@ void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
|
||||||
RECON_AND_STORE(dest, in7);
|
RECON_AND_STORE(dest, in7);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void vp9_short_idct8x8_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {
|
||||||
|
__m128i dc_value;
|
||||||
|
const __m128i zero = _mm_setzero_si128();
|
||||||
|
int a;
|
||||||
|
|
||||||
|
a = dct_const_round_shift(input[0] * cospi_16_64);
|
||||||
|
a = dct_const_round_shift(a * cospi_16_64);
|
||||||
|
a = ROUND_POWER_OF_TWO(a, 5);
|
||||||
|
|
||||||
|
dc_value = _mm_set1_epi16(a);
|
||||||
|
|
||||||
|
RECON_AND_STORE(dest, dc_value);
|
||||||
|
RECON_AND_STORE(dest, dc_value);
|
||||||
|
RECON_AND_STORE(dest, dc_value);
|
||||||
|
RECON_AND_STORE(dest, dc_value);
|
||||||
|
RECON_AND_STORE(dest, dc_value);
|
||||||
|
RECON_AND_STORE(dest, dc_value);
|
||||||
|
RECON_AND_STORE(dest, dc_value);
|
||||||
|
RECON_AND_STORE(dest, dc_value);
|
||||||
|
}
|
||||||
|
|
||||||
// perform 8x8 transpose
|
// perform 8x8 transpose
|
||||||
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
|
static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
|
||||||
const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
|
const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
|
||||||
|
|
|
@ -93,15 +93,8 @@ void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
|
||||||
if (eob) {
|
if (eob) {
|
||||||
if (eob == 1) {
|
if (eob == 1) {
|
||||||
// DC only DCT coefficient
|
// DC only DCT coefficient
|
||||||
int16_t in = input[0];
|
vp9_short_idct8x8_1_add(input, dest, stride);
|
||||||
int16_t out;
|
|
||||||
|
|
||||||
// Note: the idct1 will need to be modified accordingly whenever
|
|
||||||
// vp9_short_idct8x8_c() is modified.
|
|
||||||
vp9_short_idct1_8x8_c(&in, &out);
|
|
||||||
input[0] = 0;
|
input[0] = 0;
|
||||||
|
|
||||||
vp9_add_constant_residual_8x8(out, dest, stride);
|
|
||||||
} else {
|
} else {
|
||||||
vp9_short_idct8x8_add(input, dest, stride);
|
vp9_short_idct8x8_add(input, dest, stride);
|
||||||
vpx_memset(input, 0, 128);
|
vpx_memset(input, 0, 128);
|
||||||
|
|
|
@ -47,6 +47,14 @@ static void inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob,
|
||||||
xd->inv_txm4x4_add(dqcoeff, dest, stride);
|
xd->inv_txm4x4_add(dqcoeff, dest, stride);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Chooses the 8x8 inverse DCT + add routine from the end-of-block position.
//
// When eob is greater than 1 the full vp9_short_idct8x8_add is used;
// otherwise (eob <= 1) the DC-only vp9_short_idct8x8_1_add is used.
// 'xd' is accepted but not referenced in this function.
static void inverse_transform_b_8x8_add(MACROBLOCKD *xd, int eob,
                                        int16_t *dqcoeff, uint8_t *dest,
                                        int stride) {
  if (eob > 1) {
    vp9_short_idct8x8_add(dqcoeff, dest, stride);
    return;
  }

  vp9_short_idct8x8_1_add(dqcoeff, dest, stride);
}
|
||||||
|
|
||||||
static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) {
|
static void subtract_plane(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int plane) {
|
||||||
struct macroblock_plane *const p = &x->plane[plane];
|
struct macroblock_plane *const p = &x->plane[plane];
|
||||||
|
@ -533,7 +541,8 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
|
||||||
vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride);
|
vp9_short_idct16x16_add(dqcoeff, dst, pd->dst.stride);
|
||||||
break;
|
break;
|
||||||
case TX_8X8:
|
case TX_8X8:
|
||||||
vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
|
inverse_transform_b_8x8_add(xd, pd->eobs[block], dqcoeff,
|
||||||
|
dst, pd->dst.stride);
|
||||||
break;
|
break;
|
||||||
case TX_4X4:
|
case TX_4X4:
|
||||||
// this is like vp9_short_idct4x4 but has a special case around eob<=1
|
// this is like vp9_short_idct4x4 but has a special case around eob<=1
|
||||||
|
@ -711,7 +720,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
|
||||||
pd->dequant, p->zbin_extra, eob, scan, iscan);
|
pd->dequant, p->zbin_extra, eob, scan, iscan);
|
||||||
if (!x->skip_encode && *eob) {
|
if (!x->skip_encode && *eob) {
|
||||||
if (tx_type == DCT_DCT)
|
if (tx_type == DCT_DCT)
|
||||||
vp9_short_idct8x8_add(dqcoeff, dst, pd->dst.stride);
|
inverse_transform_b_8x8_add(xd, *eob, dqcoeff, dst, pd->dst.stride);
|
||||||
else
|
else
|
||||||
vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type);
|
vp9_short_iht8x8_add(dqcoeff, dst, pd->dst.stride, tx_type);
|
||||||
}
|
}
|
||||||
|
@ -746,8 +755,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
|
||||||
// this is like vp9_short_idct4x4 but has a special case around eob<=1
|
// this is like vp9_short_idct4x4 but has a special case around eob<=1
|
||||||
// which is significant (not just an optimization) for the lossless
|
// which is significant (not just an optimization) for the lossless
|
||||||
// case.
|
// case.
|
||||||
inverse_transform_b_4x4_add(xd, *eob, dqcoeff,
|
inverse_transform_b_4x4_add(xd, *eob, dqcoeff, dst, pd->dst.stride);
|
||||||
dst, pd->dst.stride);
|
|
||||||
else
|
else
|
||||||
vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);
|
vp9_short_iht4x4_add(dqcoeff, dst, pd->dst.stride, tx_type);
|
||||||
}
|
}
|
||||||
|
|
Загрузка…
Ссылка в новой задаче