Merge "Re-design quantization process"
This commit is contained in:
Коммит
82fd084b35
|
@ -714,6 +714,9 @@ specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
|
|||
add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
|
||||
specialize qw/vp9_subtract_block/, "$sse2_x86inc";
|
||||
|
||||
add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
|
||||
specialize qw/vp9_quantize_fp/, "$ssse3_x86_64";
|
||||
|
||||
add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
|
||||
specialize qw/vp9_quantize_b/, "$ssse3_x86_64";
|
||||
|
||||
|
|
|
@ -35,6 +35,7 @@ struct macroblock_plane {
|
|||
|
||||
// Quantizer settings
|
||||
int16_t *quant_fp;
|
||||
int16_t *round_fp;
|
||||
int16_t *quant;
|
||||
int16_t *quant_shift;
|
||||
int16_t *zbin;
|
||||
|
@ -110,6 +111,9 @@ struct macroblock {
|
|||
int use_lp32x32fdct;
|
||||
int skip_encode;
|
||||
|
||||
// use fast quantization process
|
||||
int quant_fp;
|
||||
|
||||
// skip forward transform and quantization
|
||||
int skip_txfm;
|
||||
|
||||
|
|
|
@ -3074,6 +3074,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
|
|||
init_encode_frame_mb_context(cpi);
|
||||
set_prev_mi(cm);
|
||||
|
||||
x->quant_fp = cpi->sf.use_quant_fp;
|
||||
x->skip_txfm = 0;
|
||||
if (sf->use_nonrd_pick_mode) {
|
||||
// Initialize internal buffer pointers for rtc coding, where non-RD
|
||||
|
|
|
@ -306,6 +306,56 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
|
|||
MACROBLOCKD *const xd = &x->e_mbd;
|
||||
const struct macroblock_plane *const p = &x->plane[plane];
|
||||
const struct macroblockd_plane *const pd = &xd->plane[plane];
|
||||
const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
|
||||
int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
|
||||
int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
|
||||
int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
|
||||
uint16_t *const eob = &p->eobs[block];
|
||||
const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
|
||||
int i, j;
|
||||
const int16_t *src_diff;
|
||||
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
|
||||
src_diff = &p->src_diff[4 * (j * diff_stride + i)];
|
||||
|
||||
switch (tx_size) {
|
||||
case TX_32X32:
|
||||
fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
|
||||
vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
|
||||
p->quant, p->quant_shift, qcoeff, dqcoeff,
|
||||
pd->dequant, p->zbin_extra, eob, scan_order->scan,
|
||||
scan_order->iscan);
|
||||
break;
|
||||
case TX_16X16:
|
||||
vp9_fdct16x16(src_diff, coeff, diff_stride);
|
||||
vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
|
||||
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
|
||||
pd->dequant, p->zbin_extra, eob,
|
||||
scan_order->scan, scan_order->iscan);
|
||||
break;
|
||||
case TX_8X8:
|
||||
vp9_fdct8x8(src_diff, coeff, diff_stride);
|
||||
vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
|
||||
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
|
||||
pd->dequant, p->zbin_extra, eob,
|
||||
scan_order->scan, scan_order->iscan);
|
||||
break;
|
||||
case TX_4X4:
|
||||
x->fwd_txm4x4(src_diff, coeff, diff_stride);
|
||||
vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
|
||||
p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
|
||||
pd->dequant, p->zbin_extra, eob,
|
||||
scan_order->scan, scan_order->iscan);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
|
||||
void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
|
||||
BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
|
||||
MACROBLOCKD *const xd = &x->e_mbd;
|
||||
const struct macroblock_plane *const p = &x->plane[plane];
|
||||
const struct macroblockd_plane *const pd = &xd->plane[plane];
|
||||
int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
|
||||
int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
|
||||
int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
|
||||
|
@ -424,11 +474,15 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
|
|||
|
||||
if (x->skip_txfm == 0) {
|
||||
// full forward transform and quantization
|
||||
if (!x->skip_recode)
|
||||
vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
|
||||
if (!x->skip_recode) {
|
||||
if (x->quant_fp)
|
||||
vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
|
||||
else
|
||||
vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
|
||||
}
|
||||
} else if (x->skip_txfm == 2) {
|
||||
// fast path forward transform and quantization
|
||||
vp9_xform_quant_fp(x, plane, block, plane_bsize, tx_size);
|
||||
vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
|
||||
} else {
|
||||
// skip forward transform
|
||||
p->eobs[block] = 0;
|
||||
|
|
|
@ -24,6 +24,8 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
|
|||
void vp9_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
|
||||
void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
|
||||
BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
|
||||
void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
|
||||
BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
|
||||
void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
|
||||
BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
|
||||
|
||||
|
|
|
@ -42,9 +42,9 @@ void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
|
|||
}
|
||||
|
||||
void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
|
||||
const int16_t *round_ptr, const int16_t quant,
|
||||
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
|
||||
const int16_t dequant_ptr, uint16_t *eob_ptr) {
|
||||
const int16_t *round_ptr, const int16_t quant,
|
||||
int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
|
||||
const int16_t dequant_ptr, uint16_t *eob_ptr) {
|
||||
int eob = -1;
|
||||
|
||||
if (!skip_block) {
|
||||
|
@ -63,6 +63,47 @@ void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
|
|||
*eob_ptr = eob + 1;
|
||||
}
|
||||
|
||||
/*
 * Fast-path ("fp") quantizer, plain C implementation.
 *
 * Zeroes `count` entries of qcoeff_ptr and dqcoeff_ptr. Unless skip_block is
 * set, it then visits the coefficients in `scan` order: the magnitude of each
 * coefficient has a rounding offset added, is passed through clamp() with
 * INT16_MIN / INT16_MAX bounds, multiplied by the quantizer and shifted right
 * by 16, after which the coefficient's sign is restored. The dequantized
 * value is the stored level times the dequantizer.
 *
 * round_ptr, quant_ptr and dequant_ptr are indexed with [0] for raster
 * position 0 and with [1] for every other position.
 *
 * *eob_ptr is set to one plus the last scan index that yielded a nonzero
 * level, or to 0 when there is none (including the skip_block case).
 *
 * zbin_ptr, quant_shift_ptr, zbin_oq_value and iscan are accepted but unused.
 */
void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count,
                       int skip_block,
                       const int16_t *zbin_ptr, const int16_t *round_ptr,
                       const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
                       int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                       const int16_t *dequant_ptr,
                       int zbin_oq_value, uint16_t *eob_ptr,
                       const int16_t *scan, const int16_t *iscan) {
  int last_nonzero = -1;
  int pos;

  // TODO(jingning) Decide the need of these arguments after the
  // quantization process is completed.
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)zbin_oq_value;
  (void)iscan;

  // Both output buffers start out cleared, whether or not the block is
  // skipped.
  vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
  vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));

  if (skip_block) {
    *eob_ptr = 0;
    return;
  }

  for (pos = 0; pos < count; ++pos) {
    const int raster = scan[pos];
    // 0 selects the first table entry, 1 the second.
    const int is_ac = raster != 0;
    const int value = coeff_ptr[raster];
    // All ones for a negative coefficient, zero otherwise.
    const int sign_mask = value >> 31;
    const int magnitude = (value ^ sign_mask) - sign_mask;
    const int rounded =
        clamp(magnitude + round_ptr[is_ac], INT16_MIN, INT16_MAX);
    const int level = (rounded * quant_ptr[is_ac]) >> 16;

    qcoeff_ptr[raster] = (level ^ sign_mask) - sign_mask;
    dqcoeff_ptr[raster] = qcoeff_ptr[raster] * dequant_ptr[is_ac];

    if (level)
      last_nonzero = pos;
  }

  *eob_ptr = last_nonzero + 1;
}
|
||||
|
||||
void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
|
||||
int skip_block,
|
||||
const int16_t *zbin_ptr, const int16_t *round_ptr,
|
||||
|
@ -207,11 +248,16 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
|
|||
const int qrounding_factor = q == 0 ? 64 : 48;
|
||||
|
||||
for (i = 0; i < 2; ++i) {
|
||||
int qrounding_factor_fp = i == 0 ? 48 : 42;
|
||||
if (q == 0)
|
||||
qrounding_factor_fp = 64;
|
||||
|
||||
// y
|
||||
quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q)
|
||||
: vp9_ac_quant(q, 0);
|
||||
invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant);
|
||||
quants->y_quant_fp[q][i] = (1 << 16) / quant;
|
||||
quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
|
||||
quants->y_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
|
||||
quants->y_round[q][i] = (qrounding_factor * quant) >> 7;
|
||||
cm->y_dequant[q][i] = quant;
|
||||
|
@ -222,6 +268,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
|
|||
invert_quant(&quants->uv_quant[q][i],
|
||||
&quants->uv_quant_shift[q][i], quant);
|
||||
quants->uv_quant_fp[q][i] = (1 << 16) / quant;
|
||||
quants->uv_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7;
|
||||
quants->uv_zbin[q][i] = ROUND_POWER_OF_TWO(qzbin_factor * quant, 7);
|
||||
quants->uv_round[q][i] = (qrounding_factor * quant) >> 7;
|
||||
cm->uv_dequant[q][i] = quant;
|
||||
|
@ -240,6 +287,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
|
|||
for (i = 2; i < 8; i++) {
|
||||
quants->y_quant[q][i] = quants->y_quant[q][1];
|
||||
quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
|
||||
quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
|
||||
quants->y_quant_shift[q][i] = quants->y_quant_shift[q][1];
|
||||
quants->y_zbin[q][i] = quants->y_zbin[q][1];
|
||||
quants->y_round[q][i] = quants->y_round[q][1];
|
||||
|
@ -247,6 +295,7 @@ void vp9_init_quantizer(VP9_COMP *cpi) {
|
|||
|
||||
quants->uv_quant[q][i] = quants->uv_quant[q][1];
|
||||
quants->uv_quant_fp[q][i] = quants->uv_quant_fp[q][1];
|
||||
quants->uv_round_fp[q][i] = quants->uv_round_fp[q][1];
|
||||
quants->uv_quant_shift[q][i] = quants->uv_quant_shift[q][1];
|
||||
quants->uv_zbin[q][i] = quants->uv_zbin[q][1];
|
||||
quants->uv_round[q][i] = quants->uv_round[q][1];
|
||||
|
@ -276,6 +325,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
|
|||
// Y
|
||||
x->plane[0].quant = quants->y_quant[qindex];
|
||||
x->plane[0].quant_fp = quants->y_quant_fp[qindex];
|
||||
x->plane[0].round_fp = quants->y_round_fp[qindex];
|
||||
x->plane[0].quant_shift = quants->y_quant_shift[qindex];
|
||||
x->plane[0].zbin = quants->y_zbin[qindex];
|
||||
x->plane[0].round = quants->y_round[qindex];
|
||||
|
@ -286,6 +336,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) {
|
|||
for (i = 1; i < 3; i++) {
|
||||
x->plane[i].quant = quants->uv_quant[qindex];
|
||||
x->plane[i].quant_fp = quants->uv_quant_fp[qindex];
|
||||
x->plane[i].round_fp = quants->uv_round_fp[qindex];
|
||||
x->plane[i].quant_shift = quants->uv_quant_shift[qindex];
|
||||
x->plane[i].zbin = quants->uv_zbin[qindex];
|
||||
x->plane[i].round = quants->uv_round[qindex];
|
||||
|
|
|
@ -28,6 +28,8 @@ typedef struct {
|
|||
// if we want to deprecate the current use of y_quant.
|
||||
DECLARE_ALIGNED(16, int16_t, y_quant_fp[QINDEX_RANGE][8]);
|
||||
DECLARE_ALIGNED(16, int16_t, uv_quant_fp[QINDEX_RANGE][8]);
|
||||
DECLARE_ALIGNED(16, int16_t, y_round_fp[QINDEX_RANGE][8]);
|
||||
DECLARE_ALIGNED(16, int16_t, uv_round_fp[QINDEX_RANGE][8]);
|
||||
|
||||
DECLARE_ALIGNED(16, int16_t, uv_quant[QINDEX_RANGE][8]);
|
||||
DECLARE_ALIGNED(16, int16_t, uv_quant_shift[QINDEX_RANGE][8]);
|
||||
|
|
|
@ -282,6 +282,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
|
|||
sf->elevate_newmv_thresh = 2000;
|
||||
}
|
||||
if (speed >= 7) {
|
||||
sf->use_quant_fp = cm->frame_type == KEY_FRAME ? 0 : 1;
|
||||
sf->lpf_pick = LPF_PICK_MINIMAL_LPF;
|
||||
sf->encode_breakout_thresh = (MIN(cm->width, cm->height) >= 720) ?
|
||||
800 : 300;
|
||||
|
@ -318,6 +319,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
|
|||
sf->use_lp32x32fdct = 0;
|
||||
sf->adaptive_motion_search = 0;
|
||||
sf->adaptive_pred_interp_filter = 0;
|
||||
sf->use_quant_fp = 0;
|
||||
sf->reference_masking = 0;
|
||||
sf->partition_search_type = SEARCH_PARTITION;
|
||||
sf->less_rectangular_check = 0;
|
||||
|
|
|
@ -284,6 +284,9 @@ typedef struct SPEED_FEATURES {
|
|||
// was selected, and 2 means we use 8 tap if no 8x8 filter mode was selected.
|
||||
int adaptive_pred_interp_filter;
|
||||
|
||||
// Fast quantization process path
|
||||
int use_quant_fp;
|
||||
|
||||
// Search through variable block partition types in non-RD mode decision
|
||||
// encoding process for RTC.
|
||||
int partition_check;
|
||||
|
|
|
@ -217,3 +217,185 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
|
|||
INIT_XMM ssse3
|
||||
QUANTIZE_FN b, 7
|
||||
QUANTIZE_FN b_32x32, 7
|
||||
|
||||
; QUANTIZE_FP name, num_gprs
;
; Emits quantize_<name>: the SIMD counterpart of the fast-path quantizer.
; Arguments, in order: coeff, ncoeff, skip, zbin, round, quant, shift, qcoeff,
; dqcoeff, dequant, zbin_oq, eob, scan, iscan.
;
; When skip is nonzero the .blank path zeroes qcoeff/dqcoeff and stores 0 to
; *eob. Otherwise each group of 16 coefficients is processed as
;   qcoeff  = sign(coeff) * ((|coeff| + round) * quant >> 16)
;   dqcoeff = qcoeff * dequant
; and *eob receives the maximum of (iscan[i] + 1) over lanes whose dequantized
; value is nonzero.
;
; The `%ifidn %1, b_32x32` sections are carried over as placeholders (see the
; TODO below); they are not assembled for the `fp` instantiation at the bottom.
%macro QUANTIZE_FP 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
                                eob, scan, iscan
  cmp                    dword skipm, 0
  jne .blank

  ; actual quantize loop - setup pointers, rounders, etc.
  movifnidn                   coeffq, coeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, dequantmp
  movifnidn                    zbinq, zbinmp
  movifnidn                   roundq, roundmp
  movifnidn                   quantq, quantmp
  mova                            m1, [roundq]             ; m1 = round
  mova                            m2, [quantq]             ; m2 = quant
%ifidn %1, b_32x32
; TODO(jingning) to be continued with 32x32 quantization process
  pcmpeqw                         m5, m5
  psrlw                           m5, 15
  paddw                           m0, m5
  paddw                           m1, m5
  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
%endif
  mova                            m3, [r2q]                ; m3 = dequant
  mov                             r3, qcoeffmp
  mov                             r4, dqcoeffmp
  mov                             r5, iscanmp
%ifidn %1, b_32x32
  psllw                           m4, 1
%endif
  pxor                            m5, m5                   ; m5 = dedicated zero
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
  ; Point every buffer at its end and count ncoeff up from -n to 0.
  lea                         coeffq, [  coeffq+ncoeffq*2]
  lea                         iscanq, [  iscanq+ncoeffq*2]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
  neg                        ncoeffq

  ; get DC and first 15 AC coeffs
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpeqw                         m7, m7                   ; m7 = all ones (-1 per word)
  pcmpeqw                        m12, m12                  ; m12 = all ones (-1 per word)

  paddsw                          m6, m1                   ; m6 += round
  punpckhqdq                      m1, m1                   ; broadcast high half of round
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
  punpckhqdq                      m2, m2                   ; broadcast high half of quant
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  psignw                          m8, m9                   ; m8 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  mova        [qcoeffq+ncoeffq*2+ 0], m8
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw                           m8, m8
  pabsw                          m13, m13
%endif
  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
  punpckhqdq                      m3, m3                   ; broadcast high half of dequant
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw                           m8, 1
  psrlw                          m13, 1
  psignw                          m8, m9
  psignw                         m13, m10
%endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m8
  mova       [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw                         m8, m5                   ; m8 = dqc[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]
  psubw                           m6, m7                   ; m6 = iscan[i] + 1
  psubw                          m11, m12                  ; m11 = iscan[i] + 1
  pandn                           m8, m6                   ; m8 = max(eob)
  pandn                          m13, m11                  ; m13 = max(eob)
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jz .accumulate_eob

.ac_only_loop:
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpeqw                         m7, m7                   ; m7 = all ones (-1 per word)
  pcmpeqw                        m12, m12                  ; m12 = all ones (-1 per word)
%ifidn %1, b_32x32
  pmovmskb                        r6, m7
  pmovmskb                        r2, m12
  or                              r6, r2
  jz .skip_iter
%endif
  paddsw                          m6, m1                   ; m6 += round
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  psignw                         m14, m9                   ; m14 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  mova        [qcoeffq+ncoeffq*2+ 0], m14
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw                          m14, m14
  pabsw                          m13, m13
%endif
  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw                          m14, 1
  psrlw                          m13, 1
  psignw                         m14, m9
  psignw                         m13, m10
%endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m14
  mova       [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw                        m14, m5                   ; m14 = dqc[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]
  psubw                           m6, m7                   ; m6 = iscan[i] + 1
  psubw                          m11, m12                  ; m11 = iscan[i] + 1
  pandn                          m14, m6                   ; m14 = max(eob)
  pandn                          m13, m11                  ; m13 = max(eob)
  pmaxsw                          m8, m14
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jl .ac_only_loop

%ifidn %1, b_32x32
  jmp .accumulate_eob
.skip_iter:
  mova        [qcoeffq+ncoeffq*2+ 0], m5
  mova        [qcoeffq+ncoeffq*2+16], m5
  mova       [dqcoeffq+ncoeffq*2+ 0], m5
  mova       [dqcoeffq+ncoeffq*2+16], m5
  add                        ncoeffq, mmsize
  jl .ac_only_loop
%endif

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov                             r2, eobmp
  pshufd                          m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0x1
  pmaxsw                          m8, m7
  pextrw                          r6, m8, 0
  ; eob is a 16-bit slot (the .blank path below also stores a word), so store
  ; only the low word; a full-register store would overwrite the bytes that
  ; follow it.
  mov                           [r2], r6w
  RET

  ; skip-block, i.e. just write all zeroes
.blank:
  mov                             r0, dqcoeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, qcoeffmp
  mov                             r3, eobmp
  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
  neg                        ncoeffq
  pxor                            m7, m7
.blank_loop:
  mova       [dqcoeffq+ncoeffq*2+ 0], m7
  mova       [dqcoeffq+ncoeffq*2+16], m7
  mova        [qcoeffq+ncoeffq*2+ 0], m7
  mova        [qcoeffq+ncoeffq*2+16], m7
  add                        ncoeffq, mmsize
  jl .blank_loop
  mov                    word [eobq], 0
  RET
%endmacro

INIT_XMM ssse3
QUANTIZE_FP fp, 7
|
||||
|
|
Загрузка…
Ссылка в новой задаче