Merge "Fast computation path for forward transform and quantization"
This commit is contained in:
Коммит
d5ae43318e
|
@@ -744,15 +744,27 @@ specialize qw/vp9_fht16x16 sse2/;

add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fwht4x4/, "$mmx_x86inc";

add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct4x4_1 sse2/;

add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct4x4 sse2 avx2/;

add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct8x8_1 sse2/;

add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct8x8 sse2 avx2/, "$ssse3_x86_64";

add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct16x16_1 sse2/;

add_proto qw/void vp9_fdct16x16/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct16x16 sse2/;

add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct32x32_1 sse2/;

add_proto qw/void vp9_fdct32x32/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct32x32 sse2 avx2/;
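For context (background, not part of the change): in vp9_rtcd_defs.pl, add_proto declares a function slot in the run-time CPU detection table, and specialize lists the optimized implementations eligible to fill it. For each new _1 kernel, the generated vp9_rtcd.h has roughly this shape (a sketch from our understanding of RTCD, not the literal generated file):

void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride);
void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride);
/* vp9_rtcd() points this at the sse2 version when SSE2 is detected: */
extern void (*vp9_fdct4x4_1)(const int16_t *input, int16_t *output, int stride);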
@@ -28,6 +28,7 @@ struct macroblock_plane {
  struct buf_2d src;

  // Quantizer settings
  int16_t *quant_fp;
  int16_t *quant;
  int16_t *quant_shift;
  int16_t *zbin;

@@ -105,6 +106,9 @@ struct macroblock {
  int use_lp32x32fdct;
  int skip_encode;

  // skip forward transform and quantization
  int skip_txfm;

  // Used to store sub partition's choices.
  MV pred_mv[MAX_REF_FRAMES];
@@ -33,6 +33,7 @@ typedef struct {
  int is_coded;
  int num_4x4_blk;
  int skip;
  int skip_txfm;
  int best_mode_index;
  int hybrid_pred_diff;
  int comp_pred_diff;
@@ -43,6 +43,17 @@ static void fdct4(const int16_t *input, int16_t *output) {
  output[3] = fdct_round_shift(temp2);
}

void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) {
  int r, c;
  int16_t sum = 0;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      sum += input[r * stride + c];

  output[0] = sum << 1;
  output[1] = 0;
}

void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose

@@ -240,6 +251,17 @@ static void fdct8(const int16_t *input, int16_t *output) {
  output[7] = fdct_round_shift(t3);
}

void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) {
  int r, c;
  int16_t sum = 0;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      sum += input[r * stride + c];

  output[0] = sum;
  output[1] = 0;
}

void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
  int i, j;
  int16_t intermediate[64];

@@ -311,6 +333,17 @@ void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
  }
}

void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) {
  int r, c;
  int16_t sum = 0;
  for (r = 0; r < 16; ++r)
    for (c = 0; c < 16; ++c)
      sum += input[r * stride + c];

  output[0] = sum >> 1;
  output[1] = 0;
}

void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose

@@ -1329,6 +1362,17 @@ static void fdct32(const int *input, int *output, int round) {
  output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
}

void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) {
  int r, c;
  int16_t sum = 0;
  for (r = 0; r < 32; ++r)
    for (c = 0; c < 32; ++c)
      sum += input[r * stride + c];

  output[0] = sum >> 3;
  output[1] = 0;
}

void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
  int i, j;
  int output[32 * 32];
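A note on the _1 variants added above: because the DC basis function of the DCT is constant, the DC coefficient is just the block sum times a size-dependent scale, and that scale mirrors the internal bit-depth management of the corresponding full transform (<<1, none, >>1, >>3 for 4x4 through 32x32; compare the final shifts in the SSE2 kernels later in this commit). A consolidated sketch, with a helper name of ours rather than the commit's:

#include <stdint.h>

/* Illustrative only: DC-only forward transform for an n x n block,
 * matching the per-size scales of the vp9_fdct*_1_c functions above. */
static int16_t dc_only_fdct(const int16_t *input, int stride, int n) {
  int r, c, sum = 0;
  for (r = 0; r < n; ++r)
    for (c = 0; c < n; ++c)
      sum += input[r * stride + c];
  if (n == 4) return (int16_t)(sum << 1);  /* vp9_fdct4x4_1   */
  if (n == 8) return (int16_t)sum;         /* vp9_fdct8x8_1   */
  if (n == 16) return (int16_t)(sum >> 1); /* vp9_fdct16x16_1 */
  return (int16_t)(sum >> 3);              /* vp9_fdct32x32_1 */
}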
@@ -1370,6 +1370,7 @@ static void update_state_rt(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
  }

  x->skip = ctx->skip;
  x->skip_txfm = mbmi->segment_id ? 0 : ctx->skip_txfm;
}

static void encode_b_rt(VP9_COMP *cpi, const TileInfo *const tile,

@@ -2613,6 +2614,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
                        &this_rate, &this_dist, bsize);
    ctx->mic.mbmi = xd->mi[0]->mbmi;
    ctx->skip_txfm = x->skip_txfm;

    if (this_rate != INT_MAX) {
      int pl = partition_plane_context(xd, mi_row, mi_col, bsize);

@@ -2699,6 +2701,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
                        &this_rate, &this_dist, subsize);

    pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
    pc_tree->horizontal[0].skip_txfm = x->skip_txfm;

    sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);

@@ -2708,6 +2711,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
                          &this_rate, &this_dist, subsize);

      pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
      pc_tree->horizontal[1].skip_txfm = x->skip_txfm;

      if (this_rate == INT_MAX) {
        sum_rd = INT64_MAX;

@@ -2737,12 +2741,14 @@ static void nonrd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
    nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col,
                        &this_rate, &this_dist, subsize);
    pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
    pc_tree->vertical[0].skip_txfm = x->skip_txfm;
    sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
    if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) {
      load_pred_mv(x, ctx);
      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms,
                          &this_rate, &this_dist, subsize);
      pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
      pc_tree->vertical[1].skip_txfm = x->skip_txfm;
      if (this_rate == INT_MAX) {
        sum_rd = INT64_MAX;
      } else {

@@ -2831,14 +2837,17 @@ static void nonrd_use_partition(VP9_COMP *cpi,
    case PARTITION_NONE:
      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize);
      pc_tree->none.mic.mbmi = xd->mi[0]->mbmi;
      pc_tree->none.skip_txfm = x->skip_txfm;
      break;
    case PARTITION_VERT:
      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize);
      pc_tree->vertical[0].mic.mbmi = xd->mi[0]->mbmi;
      pc_tree->vertical[0].skip_txfm = x->skip_txfm;
      if (mi_col + hbs < cm->mi_cols) {
        nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col + hbs,
                            &rate, &dist, subsize);
        pc_tree->vertical[1].mic.mbmi = xd->mi[0]->mbmi;
        pc_tree->vertical[1].skip_txfm = x->skip_txfm;
        if (rate != INT_MAX && dist != INT64_MAX &&
            *totrate != INT_MAX && *totdist != INT64_MAX) {
          *totrate += rate;

@@ -2849,10 +2858,12 @@ static void nonrd_use_partition(VP9_COMP *cpi,
    case PARTITION_HORZ:
      nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, totrate, totdist, subsize);
      pc_tree->horizontal[0].mic.mbmi = xd->mi[0]->mbmi;
      pc_tree->horizontal[0].skip_txfm = x->skip_txfm;
      if (mi_row + hbs < cm->mi_rows) {
        nonrd_pick_sb_modes(cpi, tile, mi_row + hbs, mi_col,
                            &rate, &dist, subsize);
        pc_tree->horizontal[1].mic.mbmi = xd->mi[0]->mbmi;
        pc_tree->horizontal[1].skip_txfm = x->skip_txfm;
        if (rate != INT_MAX && dist != INT64_MAX &&
            *totrate != INT_MAX && *totdist != INT64_MAX) {
          *totrate += rate;

@@ -3055,6 +3066,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
  init_encode_frame_mb_context(cpi);
  set_prev_mi(cm);

  x->skip_txfm = 0;
  if (sf->use_nonrd_pick_mode) {
    // Initialize internal buffer pointers for rtc coding, where non-RD
    // mode decision is used and hence no buffer pointer swap needed.
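The changes in this file all follow one pattern: x->skip_txfm is scratch output of the most recent mode search, so each partition candidate snapshots it into its PICK_MODE_CONTEXT alongside the winning mbmi, and update_state_rt() restores it when that candidate is finally encoded. An abridged sketch of the two halves (not verbatim commit code):

/* after evaluating a candidate: */
nonrd_pick_sb_modes(cpi, tile, mi_row, mi_col, &rate, &dist, subsize);
ctx->mic.mbmi = xd->mi[0]->mbmi;  /* keep the winning mode...           */
ctx->skip_txfm = x->skip_txfm;    /* ...and its transform-skip decision */

/* when that candidate is encoded, in update_state_rt():                */
x->skip_txfm = mbmi->segment_id ? 0 : ctx->skip_txfm;  /* segment opt-out */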
@@ -301,6 +301,52 @@ static INLINE void fdct32x32(int rd_transform,
    vp9_fdct32x32(src, dst, src_stride);
}

void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
  MACROBLOCKD *const xd = &x->e_mbd;
  const struct macroblock_plane *const p = &x->plane[plane];
  const struct macroblockd_plane *const pd = &xd->plane[plane];
  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
  uint16_t *const eob = &p->eobs[block];
  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
  int i, j;
  const int16_t *src_diff;

  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
  src_diff = &p->src_diff[4 * (j * diff_stride + i)];

  switch (tx_size) {
    case TX_32X32:
      vp9_fdct32x32_1(src_diff, coeff, diff_stride);
      vp9_quantize_dc_32x32(coeff, x->skip_block, p->round,
                            p->quant_fp[0], qcoeff, dqcoeff,
                            pd->dequant[0], eob);
      break;
    case TX_16X16:
      vp9_fdct16x16_1(src_diff, coeff, diff_stride);
      vp9_quantize_dc(coeff, x->skip_block, p->round,
                      p->quant_fp[0], qcoeff, dqcoeff,
                      pd->dequant[0], eob);
      break;
    case TX_8X8:
      vp9_fdct8x8_1(src_diff, coeff, diff_stride);
      vp9_quantize_dc(coeff, x->skip_block, p->round,
                      p->quant_fp[0], qcoeff, dqcoeff,
                      pd->dequant[0], eob);
      break;
    case TX_4X4:
      x->fwd_txm4x4(src_diff, coeff, diff_stride);
      vp9_quantize_dc(coeff, x->skip_block, p->round,
                      p->quant_fp[0], qcoeff, dqcoeff,
                      pd->dequant[0], eob);
      break;
    default:
      assert(0);
  }
}

void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
                     BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
  MACROBLOCKD *const xd = &x->e_mbd;

@@ -376,8 +422,19 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
    return;
  }

  if (x->skip_txfm == 0) {
    // full forward transform and quantization
    if (!x->skip_recode)
      vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
  } else if (x->skip_txfm == 2) {
    // fast path forward transform and quantization
    vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
  } else {
    // skip forward transform
    p->eobs[block] = 0;
    *a = *l = 0;
    return;
  }

  if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
    const int ctx = combine_entropy_contexts(*a, *l);
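encode_block() now dispatches on three integer states of x->skip_txfm, which are assigned in vp9_pickmode.c (further below). The commit uses bare integers; named constants along these lines (hypothetical, ours, not defined anywhere in the commit) summarize the meaning:

enum {
  TXFM_FULL = 0,     /* full forward transform + quantization            */
  TXFM_SKIP_ALL = 1, /* residual below the DC dead-zone: emit eob = 0    */
  TXFM_DC_ONLY = 2   /* fast path through vp9_xform_quant_fp(): compute  */
                     /* and quantize only the DC coefficient             */
};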
@@ -22,7 +22,8 @@ extern "C" {

void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);

void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
                        BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
                     BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
@@ -156,24 +156,28 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
  unsigned int sse;
  int rate;
  int64_t dist;

  struct macroblock_plane *const p = &x->plane[0];
  struct macroblockd_plane *const pd = &xd->plane[0];

  const int quant = pd->dequant[1];
  unsigned int var = cpi->fn_ptr[bsize].vf(p->src.buf, p->src.stride,
                                           pd->dst.buf, pd->dst.stride, &sse);

  *var_y = var;
  *sse_y = sse;

  if (sse < pd->dequant[0] * pd->dequant[0] >> 6)
    x->skip_txfm = 1;
  else if (var < quant * quant >> 6)
    x->skip_txfm = 2;
  else
    x->skip_txfm = 0;

  // TODO(jingning) This is a temporary solution to account for frames with
  // light changes. Need to customize the rate-distortion modeling for non-RD
  // mode decision.
  if ((sse >> 3) > var)
    sse = var;

  vp9_model_rd_from_var_lapndz(var + sse, 1 << num_pels_log2_lookup[bsize],
                               quant >> 3, &rate, &dist);
  *out_rate_sum = rate;
  *out_dist_sum = dist << 3;
}
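These two thresholds are the crux of the fast path: each is (quantizer_step / 8)^2, computed as step * step >> 6. A hedged standalone restatement (function name is ours, not the commit's):

/* Illustrative restatement of the decision in model_rd_for_sb_y():
 * 1 = skip transform entirely, 2 = DC-only fast path, 0 = full path. */
static int classify_skip_txfm(unsigned int sse, unsigned int var,
                              int dc_step, int ac_step) {
  if (sse < (unsigned int)((dc_step * dc_step) >> 6))
    return 1;  /* prediction error under the DC dead-zone: all-zero block */
  if (var < (unsigned int)((ac_step * ac_step) >> 6))
    return 2;  /* AC energy negligible: quantize only the DC coefficient  */
  return 0;    /* run the full transform and quantization                 */
}

For example, with an AC dequant step of 120, the DC-only path triggers whenever the prediction-residual variance is below 120 * 120 / 64 = 225.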
@@ -199,6 +203,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                             VP9_ALT_FLAG };
  int64_t best_rd = INT64_MAX;
  int64_t this_rd = INT64_MAX;
  int skip_txfm = 0;

  int rate = INT_MAX;
  int64_t dist = INT64_MAX;

@@ -341,6 +346,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
      if (cost < best_cost) {
        best_filter = filter;
        best_cost = cost;
        skip_txfm = x->skip_txfm;
      }
    }

@@ -349,6 +355,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
    dist = pf_dist[mbmi->interp_filter];
    var_y = pf_var[mbmi->interp_filter];
    sse_y = pf_sse[mbmi->interp_filter];
    x->skip_txfm = skip_txfm;
  } else {
    mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref;
    vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);

@@ -438,6 +445,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
      best_mode = this_mode;
      best_pred_filter = mbmi->interp_filter;
      best_ref_frame = ref_frame;
      skip_txfm = x->skip_txfm;
    }

    if (x->skip)

@@ -450,6 +458,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
  mbmi->ref_frame[0] = best_ref_frame;
  mbmi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int;
  xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
  x->skip_txfm = skip_txfm;

  // Perform intra prediction search, if the best SAD is above a certain
  // threshold.

@@ -474,6 +483,8 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
        mbmi->ref_frame[0] = INTRA_FRAME;
        mbmi->uv_mode = this_mode;
        mbmi->mv[0].as_int = INVALID_MV;
      } else {
        x->skip_txfm = skip_txfm;
      }
    }
  }
@@ -19,6 +19,50 @@
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_rdopt.h"

void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
                     const int16_t *round_ptr, const int16_t quant,
                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                     const int16_t dequant_ptr, uint16_t *eob_ptr) {
  int eob = -1;

  if (!skip_block) {
    const int rc = 0;
    const int coeff = coeff_ptr[rc];
    const int coeff_sign = (coeff >> 31);
    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

    int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
    tmp = (tmp * quant) >> 16;
    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr;
    if (tmp)
      eob = 0;
  }
  *eob_ptr = eob + 1;
}

void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
                           const int16_t *round_ptr, const int16_t quant,
                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                           const int16_t dequant_ptr, uint16_t *eob_ptr) {
  int eob = -1;

  if (!skip_block) {
    const int rc = 0;
    const int coeff = coeff_ptr[rc];
    const int coeff_sign = (coeff >> 31);
    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;

    int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
    tmp = (tmp * quant) >> 15;
    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2;
    if (tmp)
      eob = 0;
  }
  *eob_ptr = eob + 1;
}

void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
                      int skip_block,
                      const int16_t *zbin_ptr, const int16_t *round_ptr,
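The quant values consumed above are reciprocals in Q16 fixed point, set up by vp9_init_quantizer() below as (1 << 16) / quant, so the multiply-shift reproduces a truncating integer division by the step size without a divide per coefficient. A small self-contained check of the arithmetic (the step value is illustrative):

#include <assert.h>

int main(void) {
  const int step = 52;                   /* dequant step, illustrative      */
  const int quant_fp = (1 << 16) / step; /* = 1260, as in vp9_init_quantizer */
  const int tmp = 400;                   /* |coeff| + rounding, in range    */
  assert(((tmp * quant_fp) >> 16) == tmp / step);  /* both equal 7          */
  assert((tmp / step) * step == 364);    /* dequantized DC value            */
  return 0;
}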
@@ -167,6 +211,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
      quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q)
                     : vp9_ac_quant(q, 0);
      invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant);
      quants->y_quant_fp[q][i] = (1 << 16) / quant;
      quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
      quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
      cm->y_dequant[q][i] = quant;

@@ -176,6 +221,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
                     : vp9_ac_quant(q, cm->uv_ac_delta_q);
      invert_quant(&quants->uv_quant[q][i],
                   &quants->uv_quant_shift[q][i], quant);
      quants->uv_quant_fp[q][i] = (1 << 16) / quant;
      quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
      quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
      cm->uv_dequant[q][i] = quant;

@@ -193,12 +239,14 @@ void vp9_init_quantizer(VP9_COMP *cpi) {

    for (i = 2; i < 8; i++) {
      quants->y_quant[q][i] = quants->y_quant[q][1];
      quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
      quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
      quants->y_zbin[q][i] = quants->y_zbin[q][1];
      quants->y_round[q][i] = quants->y_round[q][1];
      cm->y_dequant[q][i] = cm->y_dequant[q][1];

      quants->uv_quant[q][i] = quants->uv_quant[q][1];
      quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];
      quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
      quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
      quants->uv_round[q][i] = quants->uv_round[q][1];

@@ -227,6 +275,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {

  // Y
  x->plane[0].quant = quants->y_quant[qindex];
  x->plane[0].quant_fp = quants->y_quant_fp[qindex];
  x->plane[0].quant_shift = quants->y_quant_shift[qindex];
  x->plane[0].zbin = quants->y_zbin[qindex];
  x->plane[0].round = quants->y_round[qindex];

@@ -236,6 +285,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
  // UV
  for (i = 1; i < 3; i++) {
    x->plane[i].quant = quants->uv_quant[qindex];
    x->plane[i].quant_fp = quants->uv_quant_fp[qindex];
    x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
    x->plane[i].zbin = quants->uv_zbin[qindex];
    x->plane[i].round = quants->uv_round[qindex];
@@ -24,6 +24,11 @@ typedef struct {
  DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
  DECLARE_ALIGNED(16, int16_t, y_round[QINDEX_RANGE][8]);

  // TODO(jingning): in progress of re-working the quantization. will decide
  // if we want to deprecate the current use of y_quant.
  DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
  DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]);

  DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
  DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
  DECLARE_ALIGNED(16, int16_t, uv_zbin[QINDEX_RANGE][8]);

@@ -37,6 +42,14 @@ typedef struct {
#endif
} QUANTS;

void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
                     const int16_t *round_ptr, const int16_t quant_ptr,
                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                     const int16_t dequant_ptr, uint16_t *eob_ptr);
void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
                           const int16_t *round_ptr, const int16_t quant_ptr,
                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                           const int16_t dequant_ptr, uint16_t *eob_ptr);
void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                const int16_t *scan, const int16_t *iscan);
@@ -12,6 +12,35 @@
#include "vp9/common/vp9_idct.h"  // for cospi constants
#include "vpx_ports/mem.h"

void vp9_fdct4x4_1_sse2(const int16_t *input, int16_t *output, int stride) {
  __m128i in0, in1;
  __m128i tmp;
  const __m128i zero = _mm_setzero_si128();
  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in1 = _mm_unpacklo_epi64(in1, _mm_loadl_epi64((const __m128i *)
                                (input + 2 * stride)));
  in0 = _mm_unpacklo_epi64(in0, _mm_loadl_epi64((const __m128i *)
                                (input + 3 * stride)));

  tmp = _mm_add_epi16(in0, in1);
  in0 = _mm_unpacklo_epi16(zero, tmp);
  in1 = _mm_unpackhi_epi16(zero, tmp);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(tmp, zero);
  in1 = _mm_unpackhi_epi32(tmp, zero);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(tmp, 8);

  in1 = _mm_add_epi32(tmp, in0);
  in0 = _mm_slli_epi32(in1, 1);
  _mm_store_si128((__m128i *)(output), in0);
}

void vp9_fdct4x4_sse2(const int16_t *input, int16_t *output, int stride) {
  // This 2D transform implements 4 vertical 1D transforms followed
  // by 4 horizontal 1D transforms. The multiplies and adds are as given
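The unpack-with-zero followed by an arithmetic right shift in the tail of vp9_fdct4x4_1_sse2 (and of the other _1 kernels below) is the standard SSE2 idiom for widening int16 lanes to int32 with sign extension, since _mm_cvtepi16_epi32 only arrived with SSE4.1. Isolated for clarity (helper name is ours):

#include <emmintrin.h>

/* Sign-extend the eight int16 lanes of v into two vectors of four int32s. */
static void widen_epi16(__m128i v, __m128i *lo, __m128i *hi) {
  const __m128i zero = _mm_setzero_si128();
  *lo = _mm_srai_epi32(_mm_unpacklo_epi16(zero, v), 16);  /* lanes 0-3 */
  *hi = _mm_srai_epi32(_mm_unpackhi_epi16(zero, v), 16);  /* lanes 4-7 */
}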
@@ -377,6 +406,46 @@ void vp9_fht4x4_sse2(const int16_t *input, int16_t *output,
  }
}

void vp9_fdct8x8_1_sse2(const int16_t *input, int16_t *output, int stride) {
  __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
  __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
  __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
  __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
  __m128i u0, u1, sum;

  u0 = _mm_add_epi16(in0, in1);
  u1 = _mm_add_epi16(in2, in3);

  in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
  in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
  in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
  in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));

  sum = _mm_add_epi16(u0, u1);

  in0 = _mm_add_epi16(in0, in1);
  in2 = _mm_add_epi16(in2, in3);
  sum = _mm_add_epi16(sum, in0);

  u0 = _mm_setzero_si128();
  sum = _mm_add_epi16(sum, in2);

  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(sum, 8);

  in1 = _mm_add_epi32(sum, in0);
  _mm_store_si128((__m128i *)(output), in1);
}

void vp9_fdct8x8_sse2(const int16_t *input, int16_t *output, int stride) {
  int pass;
  // Constants
@@ -1168,6 +1237,74 @@ void vp9_fht8x8_sse2(const int16_t *input, int16_t *output,
  }
}

void vp9_fdct16x16_1_sse2(const int16_t *input, int16_t *output, int stride) {
  __m128i in0, in1, in2, in3;
  __m128i u0, u1;
  __m128i sum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 2; ++i) {
    input += 8 * i;
    in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
    in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
    in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
    in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));

    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 4 * stride));
    in1 = _mm_load_si128((const __m128i *)(input + 5 * stride));
    in2 = _mm_load_si128((const __m128i *)(input + 6 * stride));
    in3 = _mm_load_si128((const __m128i *)(input + 7 * stride));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 8 * stride));
    in1 = _mm_load_si128((const __m128i *)(input + 9 * stride));
    in2 = _mm_load_si128((const __m128i *)(input + 10 * stride));
    in3 = _mm_load_si128((const __m128i *)(input + 11 * stride));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 12 * stride));
    in1 = _mm_load_si128((const __m128i *)(input + 13 * stride));
    in2 = _mm_load_si128((const __m128i *)(input + 14 * stride));
    in3 = _mm_load_si128((const __m128i *)(input + 15 * stride));

    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    sum = _mm_add_epi16(sum, u1);
  }

  u0 = _mm_setzero_si128();
  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(sum, 8);

  in1 = _mm_add_epi32(sum, in0);
  in1 = _mm_srai_epi32(in1, 1);
  _mm_store_si128((__m128i *)(output), in1);
}

void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
@@ -2680,6 +2817,77 @@ void vp9_fht16x16_sse2(const int16_t *input, int16_t *output,
  }
}

void vp9_fdct32x32_1_sse2(const int16_t *input, int16_t *output, int stride) {
  __m128i in0, in1, in2, in3;
  __m128i u0, u1;
  __m128i sum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 8; ++i) {
    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    in0 = _mm_load_si128((const __m128i *)(input + 0));
    in1 = _mm_load_si128((const __m128i *)(input + 8));
    in2 = _mm_load_si128((const __m128i *)(input + 16));
    in3 = _mm_load_si128((const __m128i *)(input + 24));

    input += stride;
    sum = _mm_add_epi16(sum, u1);
    u0 = _mm_add_epi16(in0, in1);
    u1 = _mm_add_epi16(in2, in3);
    sum = _mm_add_epi16(sum, u0);

    sum = _mm_add_epi16(sum, u1);
  }

  u0 = _mm_setzero_si128();
  in0 = _mm_unpacklo_epi16(u0, sum);
  in1 = _mm_unpackhi_epi16(u0, sum);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(sum, u0);
  in1 = _mm_unpackhi_epi32(sum, u0);

  sum = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(sum, 8);

  in1 = _mm_add_epi32(sum, in0);
  in1 = _mm_srai_epi32(in1, 3);
  _mm_store_si128((__m128i *)(output), in1);
}

#define FDCT32x32_2D vp9_fdct32x32_rd_sse2
#define FDCT32x32_HIGH_PRECISION 0
#include "vp9/encoder/x86/vp9_dct32x32_sse2.c"