From debb9c68c8ea92b80627138f95de901cb39cf8dc Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Wed, 7 Aug 2013 15:22:51 -0700 Subject: [PATCH] Use low precision 32x32fdct for encodemb in speed1 The low precision 32x32 fdct keeps all the intermediate steps within 16-bit depth, which allows a faster SSE2 implementation at the expense of a larger round-trip error. Previously it was used only in the rate-distortion optimization search loop. Using the low precision version in place of the high precision one affects the compression performance by about 0.7% (derf, stdhd) at speed 0. For speed 1, it lowers the derf set by only 0.017%. Change-Id: I4e7d18fac5bea5317b91c8e7dabae143bc6b5c8b --- vp9/encoder/vp9_block.h | 2 +- vp9/encoder/vp9_encodeframe.c | 4 ++-- vp9/encoder/vp9_encodemb.c | 4 ++-- vp9/encoder/vp9_onyx_if.c | 5 +++++ vp9/encoder/vp9_onyx_int.h | 1 + 5 files changed, 11 insertions(+), 5 deletions(-) diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 3e377cf6f..790b3c22c 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -144,7 +144,7 @@ struct macroblock { int optimize; // indicate if it is in the rd search loop or encoding process - int rd_search; + int use_lp32x32fdct; int skip_encode; // Used to store sub partition's choices. 
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 82859c5d7..39ca5efc4 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -565,7 +565,7 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - x->rd_search = 1; + x->use_lp32x32fdct = 1; if (bsize < BLOCK_8X8) { // When ab_index = 0 all sub-blocks are handled, so for ab_index != 0 @@ -2546,7 +2546,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, const int mis = cm->mode_info_stride; const int mi_width = num_8x8_blocks_wide_lookup[bsize]; const int mi_height = num_8x8_blocks_high_lookup[bsize]; - x->rd_search = 0; + x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct; x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH); if (x->skip_encode) diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 40b0a4e5a..f43a28137 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -475,7 +475,7 @@ void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize, xoff = 32 * (block & twmask); yoff = 32 * (block >> twl); src_diff = p->src_diff + 4 * bw * yoff + xoff; - if (x->rd_search) + if (x->use_lp32x32fdct) vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8); else vp9_short_fdct32x32(src_diff, coeff, bw * 8); @@ -670,7 +670,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize, dst, pd->dst.stride, dst, pd->dst.stride); vp9_subtract_block(32, 32, src_diff, bw * 4, src, p->src.stride, dst, pd->dst.stride); - if (x->rd_search) + if (x->use_lp32x32fdct) vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8); else vp9_short_fdct32x32(src_diff, coeff, bw * 8); diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index cf5ae5252..9ad63efcb 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -723,6 +723,7 @@ void 
vp9_set_speed_features(VP9_COMP *cpi) { sf->adaptive_rd_thresh = 0; sf->use_lastframe_partitioning = 0; sf->tx_size_search_method = USE_FULL_RD; + sf->use_lp32x32fdct = 0; sf->use_8tap_always = 0; sf->use_avoid_tested_higherror = 0; sf->reference_masking = 0; @@ -794,6 +795,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->use_uv_intra_rd_estimate = 1; sf->use_rd_breakout = 1; sf->skip_encode_sb = 1; + sf->use_lp32x32fdct = 1; sf->auto_mv_step_size = 1; sf->auto_min_max_partition_size = 1; @@ -825,6 +827,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->use_uv_intra_rd_estimate = 1; sf->use_rd_breakout = 1; sf->skip_encode_sb = 1; + sf->use_lp32x32fdct = 1; sf->using_small_partition_info = 1; sf->disable_splitmv = (MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0; @@ -848,6 +851,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { FLAG_EARLY_TERMINATE; sf->use_rd_breakout = 1; sf->skip_encode_sb = 1; + sf->use_lp32x32fdct = 1; sf->disable_splitmv = 1; sf->auto_mv_step_size = 1; sf->search_method = BIGDIA; @@ -869,6 +873,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { FLAG_SKIP_INTRA_LOWVAR | FLAG_EARLY_TERMINATE; sf->use_rd_breakout = 1; + sf->use_lp32x32fdct = 1; sf->optimize_coefficients = 0; sf->auto_mv_step_size = 1; // sf->reduce_first_step_size = 1; diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index 124910769..fdc108135 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -257,6 +257,7 @@ typedef struct { int skip_encode_frame; int use_lastframe_partitioning; TX_SIZE_SEARCH_METHOD tx_size_search_method; + int use_lp32x32fdct; int use_8tap_always; int use_avoid_tested_higherror; int skip_lots_of_modes;