Skip adding zero signal to prediction with DC-only idct

If the DC-only idct yields zero, we can skip the steps that add a zero signal to the predicted signal. DC-only idct cases occur more frequently at lower bit rates. Similar changes can be made to the C versions of the high-bit-depth idct functions. Change-Id: I53af22904568f7043091710da70ca8299bf361c5
2017-03-30 11:58:20 -07:00 · 2017-03-30 11:58:20 -07:00 · 27acc47869
--- a/aom_dsp/inv_txfm.c
+++ b/aom_dsp/inv_txfm.c
@ -145,6 +145,8 @@ void aom_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 4);

+  if (a1 == 0) return;
+
  for (i = 0; i < 4; i++) {
    dest[0] = clip_pixel_add(dest[0], a1);
    dest[1] = clip_pixel_add(dest[1], a1);
@ -238,6 +240,7 @@ void aom_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 5);
+  if (a1 == 0) return;
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
@ -776,6 +779,7 @@ void aom_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 6);
+  if (a1 == 0) return;
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
@ -1245,6 +1249,7 @@ void aom_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 6);
+  if (a1 == 0) return;

  for (j = 0; j < 32; ++j) {
    for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
--- a/aom_dsp/x86/inv_txfm_sse2.c
+++ b/aom_dsp/x86/inv_txfm_sse2.c
@ -163,6 +163,8 @@ void aom_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

+  if (a == 0) return;
+
  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE4X4(dest + 0 * stride, dc_value);
@ -521,6 +523,8 @@ void aom_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 5);

+  if (a == 0) return;
+
  dc_value = _mm_set1_epi16(a);

  RECON_AND_STORE(dest + 0 * stride, dc_value);
@ -1291,6 +1295,8 @@ void aom_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

+  if (a == 0) return;
+
  dc_value = _mm_set1_epi16(a);

  for (i = 0; i < 16; ++i) {
@ -3437,6 +3443,8 @@ void aom_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 6);

+  if (a == 0) return;
+
  dc_value = _mm_set1_epi16(a);

  for (j = 0; j < 32; ++j) {