Some minor optimizations for cost_coeffs().
Cycle timings for the first 3 frames of bus (speed 0) at 1500kbps: 4x4: 298 -> 234 cycles; 8x8: 1227 -> 878 cycles; 16x16: 23426 -> 18134 cycles; 32x32: 4906 -> 3664 cycles. Total encode time of the first 50 frames of bus @ 1500kbps (speed 0) goes from 3min0.7s to 2min51.6s, i.e. 5.3% faster. Change-Id: I68a0e1b530b0563b84a67342cca4b45146077e95
This commit is contained in:
Родитель
af660715c0
Коммит
91d223bd5c
|
@ -133,8 +133,7 @@ struct macroblock {
|
||||||
unsigned char *active_ptr;
|
unsigned char *active_ptr;
|
||||||
|
|
||||||
// note that token_costs is the cost when eob node is skipped
|
// note that token_costs is the cost when eob node is skipped
|
||||||
vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES];
|
vp9_coeff_count token_costs[TX_SIZE_MAX_SB][BLOCK_TYPES][2];
|
||||||
vp9_coeff_count token_costs_noskip[TX_SIZE_MAX_SB][BLOCK_TYPES];
|
|
||||||
|
|
||||||
int optimize;
|
int optimize;
|
||||||
|
|
||||||
|
|
|
@ -223,10 +223,10 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
|
||||||
pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
|
pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
|
||||||
pad, default_eob);
|
pad, default_eob);
|
||||||
rate0 +=
|
rate0 +=
|
||||||
mb->token_costs_noskip[tx_size][type][ref][band][pt]
|
mb->token_costs[tx_size][type][ref][0][band][pt]
|
||||||
[tokens[next][0].token];
|
[tokens[next][0].token];
|
||||||
rate1 +=
|
rate1 +=
|
||||||
mb->token_costs_noskip[tx_size][type][ref][band][pt]
|
mb->token_costs[tx_size][type][ref][0][band][pt]
|
||||||
[tokens[next][1].token];
|
[tokens[next][1].token];
|
||||||
}
|
}
|
||||||
UPDATE_RD_COST();
|
UPDATE_RD_COST();
|
||||||
|
@ -275,22 +275,14 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
|
||||||
if (t0 != DCT_EOB_TOKEN) {
|
if (t0 != DCT_EOB_TOKEN) {
|
||||||
pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
|
pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache,
|
||||||
pad, default_eob);
|
pad, default_eob);
|
||||||
if (!x)
|
rate0 += mb->token_costs[tx_size][type][ref][!x][band][pt]
|
||||||
rate0 += mb->token_costs[tx_size][type][ref][band][pt][
|
[tokens[next][0].token];
|
||||||
tokens[next][0].token];
|
|
||||||
else
|
|
||||||
rate0 += mb->token_costs_noskip[tx_size][type][ref][band][pt][
|
|
||||||
tokens[next][0].token];
|
|
||||||
}
|
}
|
||||||
if (t1 != DCT_EOB_TOKEN) {
|
if (t1 != DCT_EOB_TOKEN) {
|
||||||
pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache,
|
pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache,
|
||||||
pad, default_eob);
|
pad, default_eob);
|
||||||
if (!x)
|
rate1 += mb->token_costs[tx_size][type][ref][!x][band][pt]
|
||||||
rate1 += mb->token_costs[tx_size][type][ref][band][pt][
|
[tokens[next][1].token];
|
||||||
tokens[next][1].token];
|
|
||||||
else
|
|
||||||
rate1 += mb->token_costs_noskip[tx_size][type][ref][band][pt][
|
|
||||||
tokens[next][1].token];
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -322,12 +314,12 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
|
||||||
/* Update the cost of each path if we're past the EOB token. */
|
/* Update the cost of each path if we're past the EOB token. */
|
||||||
if (t0 != DCT_EOB_TOKEN) {
|
if (t0 != DCT_EOB_TOKEN) {
|
||||||
tokens[next][0].rate +=
|
tokens[next][0].rate +=
|
||||||
mb->token_costs[tx_size][type][ref][band][0][t0];
|
mb->token_costs[tx_size][type][ref][1][band][0][t0];
|
||||||
tokens[next][0].token = ZERO_TOKEN;
|
tokens[next][0].token = ZERO_TOKEN;
|
||||||
}
|
}
|
||||||
if (t1 != DCT_EOB_TOKEN) {
|
if (t1 != DCT_EOB_TOKEN) {
|
||||||
tokens[next][1].rate +=
|
tokens[next][1].rate +=
|
||||||
mb->token_costs[tx_size][type][ref][band][0][t1];
|
mb->token_costs[tx_size][type][ref][1][band][0][t1];
|
||||||
tokens[next][1].token = ZERO_TOKEN;
|
tokens[next][1].token = ZERO_TOKEN;
|
||||||
}
|
}
|
||||||
/* Don't update next, because we didn't add a new node. */
|
/* Don't update next, because we didn't add a new node. */
|
||||||
|
@ -343,8 +335,8 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb,
|
||||||
error1 = tokens[next][1].error;
|
error1 = tokens[next][1].error;
|
||||||
t0 = tokens[next][0].token;
|
t0 = tokens[next][0].token;
|
||||||
t1 = tokens[next][1].token;
|
t1 = tokens[next][1].token;
|
||||||
rate0 += mb->token_costs_noskip[tx_size][type][ref][band][pt][t0];
|
rate0 += mb->token_costs[tx_size][type][ref][0][band][pt][t0];
|
||||||
rate1 += mb->token_costs_noskip[tx_size][type][ref][band][pt][t1];
|
rate1 += mb->token_costs[tx_size][type][ref][0][band][pt][t1];
|
||||||
UPDATE_RD_COST();
|
UPDATE_RD_COST();
|
||||||
best = rd_cost1 < rd_cost0;
|
best = rd_cost1 < rd_cost0;
|
||||||
final_eob = i0 - 1;
|
final_eob = i0 - 1;
|
||||||
|
|
|
@ -116,8 +116,7 @@ static int rd_thresh_block_size_factor[BLOCK_SIZE_TYPES] =
|
||||||
#define MAX_RD_THRESH_FREQ_FACT 32
|
#define MAX_RD_THRESH_FREQ_FACT 32
|
||||||
#define MAX_RD_THRESH_FREQ_INC 1
|
#define MAX_RD_THRESH_FREQ_INC 1
|
||||||
|
|
||||||
static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES],
|
static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES][2],
|
||||||
vp9_coeff_count (*cnoskip)[BLOCK_TYPES],
|
|
||||||
vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
|
vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
|
||||||
int i, j, k, l;
|
int i, j, k, l;
|
||||||
TX_SIZE t;
|
TX_SIZE t;
|
||||||
|
@ -128,18 +127,18 @@ static void fill_token_costs(vp9_coeff_count (*c)[BLOCK_TYPES],
|
||||||
for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
|
for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
|
||||||
vp9_prob probs[ENTROPY_NODES];
|
vp9_prob probs[ENTROPY_NODES];
|
||||||
vp9_model_to_full_probs(p[t][i][j][k][l], probs);
|
vp9_model_to_full_probs(p[t][i][j][k][l], probs);
|
||||||
vp9_cost_tokens((int *)cnoskip[t][i][j][k][l], probs,
|
vp9_cost_tokens((int *)c[t][i][j][0][k][l], probs,
|
||||||
vp9_coef_tree);
|
vp9_coef_tree);
|
||||||
#if CONFIG_BALANCED_COEFTREE
|
#if CONFIG_BALANCED_COEFTREE
|
||||||
// Replace the eob node prob with a very small value so that the
|
// Replace the eob node prob with a very small value so that the
|
||||||
// cost approximately equals the cost without the eob node
|
// cost approximately equals the cost without the eob node
|
||||||
probs[1] = 1;
|
probs[1] = 1;
|
||||||
vp9_cost_tokens((int *)c[t][i][j][k][l], probs, vp9_coef_tree);
|
vp9_cost_tokens((int *)c[t][i][j][1][k][l], probs, vp9_coef_tree);
|
||||||
#else
|
#else
|
||||||
vp9_cost_tokens_skip((int *)c[t][i][j][k][l], probs,
|
vp9_cost_tokens_skip((int *)c[t][i][j][1][k][l], probs,
|
||||||
vp9_coef_tree);
|
vp9_coef_tree);
|
||||||
assert(c[t][i][j][k][l][DCT_EOB_TOKEN] ==
|
assert(c[t][i][j][0][k][l][DCT_EOB_TOKEN] ==
|
||||||
cnoskip[t][i][j][k][l][DCT_EOB_TOKEN]);
|
c[t][i][j][1][k][l][DCT_EOB_TOKEN]);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -260,9 +259,7 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fill_token_costs(cpi->mb.token_costs,
|
fill_token_costs(cpi->mb.token_costs, cpi->common.fc.coef_probs);
|
||||||
cpi->mb.token_costs_noskip,
|
|
||||||
cpi->common.fc.coef_probs);
|
|
||||||
|
|
||||||
for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
|
for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
|
||||||
vp9_cost_tokens(cpi->mb.partition_cost[i],
|
vp9_cost_tokens(cpi->mb.partition_cost[i],
|
||||||
|
@ -310,18 +307,13 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
|
||||||
int cost = 0, pad;
|
int cost = 0, pad;
|
||||||
const int *scan, *nb;
|
const int *scan, *nb;
|
||||||
const int eob = xd->plane[plane].eobs[block];
|
const int eob = xd->plane[plane].eobs[block];
|
||||||
const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff,
|
const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16);
|
||||||
block, 16);
|
|
||||||
const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
|
const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
|
||||||
unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
|
unsigned int (*token_costs)[COEF_BANDS][PREV_COEF_CONTEXTS]
|
||||||
mb->token_costs[tx_size][type][ref];
|
[MAX_ENTROPY_TOKENS] = mb->token_costs[tx_size][type][ref];
|
||||||
ENTROPY_CONTEXT above_ec, left_ec;
|
ENTROPY_CONTEXT above_ec, left_ec;
|
||||||
TX_TYPE tx_type = DCT_DCT;
|
TX_TYPE tx_type = DCT_DCT;
|
||||||
|
|
||||||
const int segment_id = xd->mode_info_context->mbmi.segment_id;
|
const int segment_id = xd->mode_info_context->mbmi.segment_id;
|
||||||
unsigned int (*token_costs_noskip)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
|
|
||||||
mb->token_costs_noskip[tx_size][type][ref];
|
|
||||||
|
|
||||||
int seg_eob, default_eob;
|
int seg_eob, default_eob;
|
||||||
uint8_t token_cache[1024];
|
uint8_t token_cache[1024];
|
||||||
const uint8_t * band_translate;
|
const uint8_t * band_translate;
|
||||||
|
@ -390,26 +382,38 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
|
||||||
if (eob < seg_eob)
|
if (eob < seg_eob)
|
||||||
assert(qcoeff_ptr[scan[eob]] == 0);
|
assert(qcoeff_ptr[scan[eob]] == 0);
|
||||||
|
|
||||||
{
|
if (eob == 0) {
|
||||||
for (c = 0; c < eob; c++) {
|
// single eob token
|
||||||
int v = qcoeff_ptr[scan[c]];
|
cost += token_costs[0][0][pt][DCT_EOB_TOKEN];
|
||||||
int t = vp9_dct_value_tokens_ptr[v].token;
|
} else {
|
||||||
int band = get_coef_band(band_translate, c);
|
int t, v, prev_rc = 0;
|
||||||
if (c)
|
|
||||||
pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
|
|
||||||
|
|
||||||
if (!c || token_cache[scan[c - 1]]) // do not skip eob
|
// dc token
|
||||||
cost += token_costs_noskip[band][pt][t] + vp9_dct_value_cost_ptr[v];
|
v = qcoeff_ptr[0];
|
||||||
else
|
t = vp9_dct_value_tokens_ptr[v].token;
|
||||||
cost += token_costs[band][pt][t] + vp9_dct_value_cost_ptr[v];
|
cost += token_costs[0][0][pt][t] + vp9_dct_value_cost_ptr[v];
|
||||||
token_cache[scan[c]] = vp9_pt_energy_class[t];
|
token_cache[0] = vp9_pt_energy_class[t];
|
||||||
}
|
|
||||||
if (c < seg_eob) {
|
// ac tokens
|
||||||
if (c)
|
for (c = 1; c < eob; c++) {
|
||||||
|
const int rc = scan[c];
|
||||||
|
int band = get_coef_band(band_translate, c);
|
||||||
|
|
||||||
|
v = qcoeff_ptr[rc];
|
||||||
|
t = vp9_dct_value_tokens_ptr[v].token;
|
||||||
pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
|
pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
|
||||||
cost += mb->token_costs_noskip[tx_size][type][ref]
|
// as an index at some level
|
||||||
[get_coef_band(band_translate, c)]
|
cost += token_costs[!token_cache[prev_rc]][band][pt][t] +
|
||||||
[pt][DCT_EOB_TOKEN];
|
vp9_dct_value_cost_ptr[v];
|
||||||
|
token_cache[rc] = vp9_pt_energy_class[t];
|
||||||
|
prev_rc = rc;
|
||||||
|
}
|
||||||
|
|
||||||
|
// eob token
|
||||||
|
if (c < seg_eob) {
|
||||||
|
pt = vp9_get_coef_context(scan, nb, pad, token_cache, c, default_eob);
|
||||||
|
cost += token_costs[0][get_coef_band(band_translate, c)][pt]
|
||||||
|
[DCT_EOB_TOKEN];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Загрузка…
Ссылка в новой задаче