Use low precision 32x32fdct for encodemb in speed1

The low precision 32x32 fdct keeps all intermediate steps within
16-bit depth, allowing a faster SSE2 implementation at the expense
of a larger round-trip error. Until now it was used only in the
rate-distortion optimization search loop.

Using the low precision version in place of the high precision one
affects compression performance by about 0.7% (derf, stdhd) at
speed 0. At speed 1, it lowers the derf set result by only 0.017%.

Change-Id: I4e7d18fac5bea5317b91c8e7dabae143bc6b5c8b
This commit is contained in:
Jingning Han 2013-08-07 15:22:51 -07:00
Родитель 78182538d6
Коммит debb9c68c8
5 изменённых файлов: 11 добавлений и 5 удалений

Просмотреть файл

@ -144,7 +144,7 @@ struct macroblock {
int optimize;
// indicate if it is in the rd search loop or encoding process
int rd_search;
int use_lp32x32fdct;
int skip_encode;
// Used to store sub partition's choices.

Просмотреть файл

@ -565,7 +565,7 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
x->rd_search = 1;
x->use_lp32x32fdct = 1;
if (bsize < BLOCK_8X8) {
// When ab_index = 0 all sub-blocks are handled, so for ab_index != 0
@ -2546,7 +2546,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
const int mis = cm->mode_info_stride;
const int mi_width = num_8x8_blocks_wide_lookup[bsize];
const int mi_height = num_8x8_blocks_high_lookup[bsize];
x->rd_search = 0;
x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
x->skip_encode = (!output_enabled && cpi->sf.skip_encode_frame &&
xd->q_index < QIDX_SKIP_THRESH);
if (x->skip_encode)

Просмотреть файл

@ -475,7 +475,7 @@ void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
xoff = 32 * (block & twmask);
yoff = 32 * (block >> twl);
src_diff = p->src_diff + 4 * bw * yoff + xoff;
if (x->rd_search)
if (x->use_lp32x32fdct)
vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
else
vp9_short_fdct32x32(src_diff, coeff, bw * 8);
@ -670,7 +670,7 @@ void encode_block_intra(int plane, int block, BLOCK_SIZE_TYPE bsize,
dst, pd->dst.stride, dst, pd->dst.stride);
vp9_subtract_block(32, 32, src_diff, bw * 4,
src, p->src.stride, dst, pd->dst.stride);
if (x->rd_search)
if (x->use_lp32x32fdct)
vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8);
else
vp9_short_fdct32x32(src_diff, coeff, bw * 8);

Просмотреть файл

@ -723,6 +723,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->adaptive_rd_thresh = 0;
sf->use_lastframe_partitioning = 0;
sf->tx_size_search_method = USE_FULL_RD;
sf->use_lp32x32fdct = 0;
sf->use_8tap_always = 0;
sf->use_avoid_tested_higherror = 0;
sf->reference_masking = 0;
@ -794,6 +795,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->use_uv_intra_rd_estimate = 1;
sf->use_rd_breakout = 1;
sf->skip_encode_sb = 1;
sf->use_lp32x32fdct = 1;
sf->auto_mv_step_size = 1;
sf->auto_min_max_partition_size = 1;
@ -825,6 +827,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->use_uv_intra_rd_estimate = 1;
sf->use_rd_breakout = 1;
sf->skip_encode_sb = 1;
sf->use_lp32x32fdct = 1;
sf->using_small_partition_info = 1;
sf->disable_splitmv =
(MIN(cpi->common.width, cpi->common.height) >= 720)? 1 : 0;
@ -848,6 +851,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
FLAG_EARLY_TERMINATE;
sf->use_rd_breakout = 1;
sf->skip_encode_sb = 1;
sf->use_lp32x32fdct = 1;
sf->disable_splitmv = 1;
sf->auto_mv_step_size = 1;
sf->search_method = BIGDIA;
@ -869,6 +873,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
FLAG_SKIP_INTRA_LOWVAR |
FLAG_EARLY_TERMINATE;
sf->use_rd_breakout = 1;
sf->use_lp32x32fdct = 1;
sf->optimize_coefficients = 0;
sf->auto_mv_step_size = 1;
// sf->reduce_first_step_size = 1;

Просмотреть файл

@ -257,6 +257,7 @@ typedef struct {
int skip_encode_frame;
int use_lastframe_partitioning;
TX_SIZE_SEARCH_METHOD tx_size_search_method;
int use_lp32x32fdct;
int use_8tap_always;
int use_avoid_tested_higherror;
int skip_lots_of_modes;