Use low-precision 32x32 fdct for encodemb at speed 1

The low-precision 32x32 fdct keeps all of its intermediate steps within 16-bit depth, which allows a faster SSE2 implementation at the expense of a larger round-trip error. Until now it was used only in the rate-distortion optimization search loop. Using the low-precision version in place of the high-precision one affects the compression performance by about 0.7% (derf, stdhd) at speed 0. At speed 1, it lowers the derf set by only 0.017%. Change-Id: I4e7d18fac5bea5317b91c8e7dabae143bc6b5c8b
2013-08-07 15:22:51 -07:00 · 2013-08-07 15:22:51 -07:00 · debb9c68c8
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@ -144,7 +144,7 @@ struct macroblock {
  int optimize;

  // indicate if it is in the rd search loop or encoding process
-  int rd_search;
+  int use_lp32x32fdct;
  int skip_encode;

  // Used to store sub partition's choices.
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@ -565,7 +565,7 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
  MACROBLOCK *const x = &cpi->mb;
  MACROBLOCKD *const xd = &x->e_mbd;

-  x->rd_search = 1;
+  x->use_lp32x32fdct = 1;

  if (bsize < BLOCK_8X8) {
    // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
@ -2546,7 +2546,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
  const int mis = cm->mode_info_stride;
  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
  const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  x->rd_search = 0;
+  x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
  x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame &&
                    xd->q_index < QIDX_SKIP_THRESH);
  if (x->skip_encode)
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@ -475,7 +475,7 @@ void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
      xoff = 32 * (block & twmask);
      yoff = 32 * (block >> twl);
      src_diff = p->src_diff + 4 * bw * yoff + xoff;
-      if (x->rd_search)
+      if (x->use_lp32x32fdct)
        vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
      else
        vp9_short_fdct32x32(src_diff, coeff, bw * 8);
@ -670,7 +670,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
                              dst, pd->dst.stride, dst, pd->dst.stride);
      vp9_subtract_block(32, 32, src_diff, bw * 4,
                         src, p->src.stride, dst, pd->dst.stride);
-      if (x->rd_search)
+      if (x->use_lp32x32fdct)
        vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
      else
        vp9_short_fdct32x32(src_diff, coeff, bw * 8);
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@ -723,6 +723,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
  sf->adaptive_rd_thresh = 0;
  sf->use_lastframe_partitioning = 0;
  sf->tx_size_search_method = USE_FULL_RD;
+  sf->use_lp32x32fdct = 0;
  sf->use_8tap_always = 0;
  sf->use_avoid_tested_higherror = 0;
  sf->reference_masking = 0;
@ -794,6 +795,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
        sf->use_uv_intra_rd_estimate = 1;
        sf->use_rd_breakout = 1;
        sf->skip_encode_sb = 1;
+        sf->use_lp32x32fdct = 1;
        sf->auto_mv_step_size = 1;

        sf->auto_min_max_partition_size = 1;
@ -825,6 +827,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
        sf->use_uv_intra_rd_estimate = 1;
        sf->use_rd_breakout = 1;
        sf->skip_encode_sb = 1;
+        sf->use_lp32x32fdct = 1;
        sf->using_small_partition_info = 1;
        sf->disable_splitmv =
            (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;
@ -848,6 +851,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
                                     FLAG_EARLY_TERMINATE;
        sf->use_rd_breakout = 1;
        sf->skip_encode_sb = 1;
+        sf->use_lp32x32fdct = 1;
        sf->disable_splitmv = 1;
        sf->auto_mv_step_size = 1;
        sf->search_method = BIGDIA;
@ -869,6 +873,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
                                     FLAG_SKIP_INTRA_LOWVAR |
                                     FLAG_EARLY_TERMINATE;
        sf->use_rd_breakout = 1;
+        sf->use_lp32x32fdct = 1;
        sf->optimize_coefficients = 0;
        sf->auto_mv_step_size = 1;
        // sf->reduce_first_step_size = 1;
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@ -257,6 +257,7 @@ typedef struct {
  int skip_encode_frame;
  int use_lastframe_partitioning;
  TX_SIZE_SEARCH_METHOD tx_size_search_method;
+  int use_lp32x32fdct;
  int use_8tap_always;
  int use_avoid_tested_higherror;
  int skip_lots_of_modes;