989 строки
33 KiB
C
989 строки
33 KiB
C
/*
|
|
* Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
|
|
*
|
|
* This source code is subject to the terms of the BSD 2 Clause License and
|
|
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
|
* was not distributed with this source code in the LICENSE file, you can
|
|
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
|
* Media Patent License 1.0 was not distributed with this source code in the
|
|
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
|
*/
|
|
|
|
/* clang-format off */
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
# include "config.h"
|
|
#endif
|
|
|
|
#include <math.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include "aom_dsp/entcode.h"
|
|
#include "aom_dsp/entenc.h"
|
|
#include "av1/common/blockd.h"
|
|
#include "av1/common/odintrin.h"
|
|
#include "av1/common/partition.h"
|
|
#include "av1/common/pvq_state.h"
|
|
#include "av1/encoder/encodemb.h"
|
|
#include "av1/encoder/pvq_encoder.h"
|
|
#include "aom_ports/system_state.h"
|
|
|
|
/*Shift to ensure that the upper bound (i.e. for the max blocksize) of the
   dot-product of the 1st band of chroma with the luma ref doesn't overflow.*/
#define OD_CFL_FLIP_SHIFT (OD_LIMIT_BSIZE_MAX + 0)
|
|
|
|
/** Writes one symbol with an adaptive CDF, initializing the CDF on first use.
 *
 * A CDF whose first entry is still zero is treated as not yet set up and is
 * initialized with aom_cdf_init_q15_1D() before the symbol is written.
 *
 * @param [in,out] w       entropy writer
 * @param [in]     symb    symbol to write
 * @param [in,out] cdf     CDF used (and possibly initialized) for this symbol
 * @param [in]     nsymbs  number of symbols in the alphabet
 */
void aom_write_symbol_pvq(aom_writer *w, int symb, aom_cdf_prob *cdf,
    int nsymbs) {
  if (cdf[0] == 0) {
    aom_cdf_init_q15_1D(cdf, nsymbs, CDF_SIZE(nsymbs));
  }
  aom_write_symbol(w, symb, cdf, nsymbs);
}
|
|
|
|
static void aom_encode_pvq_codeword(aom_writer *w, od_pvq_codeword_ctx *adapt,
|
|
const od_coeff *in, int n, int k) {
|
|
int i;
|
|
aom_encode_band_pvq_splits(w, adapt, in, n, k, 0);
|
|
for (i = 0; i < n; i++) if (in[i]) aom_write_bit(w, in[i] < 0);
|
|
}
|
|
|
|
/* Computes 1/sqrt(i) using a table for small values. */
|
|
static double od_rsqrt_table(int i) {
|
|
static double table[16] = {
|
|
1.000000, 0.707107, 0.577350, 0.500000,
|
|
0.447214, 0.408248, 0.377964, 0.353553,
|
|
0.333333, 0.316228, 0.301511, 0.288675,
|
|
0.277350, 0.267261, 0.258199, 0.250000};
|
|
if (i <= 16) return table[i-1];
|
|
else return 1./sqrt(i);
|
|
}
|
|
|
|
/*Computes 1/sqrt(start+2*i+1) using a lookup table containing the results
   where 0 <= i < table_size.
  Indices at or beyond table_size fall back to od_rsqrt_table() on the
   integer-truncated argument.*/
static double od_custom_rsqrt_dynamic_table(const double* table,
    const int table_size, const double start, const int i) {
  if (i < table_size) return table[i];
  else return od_rsqrt_table((int)(start + 2*i + 1));
}
|
|
|
|
/*Fills tables used in od_custom_rsqrt_dynamic_table for a given start.
  Entry i receives od_rsqrt_table((int)(start + 2*i + 1)).*/
static void od_fill_dynamic_rsqrt_table(double *table, const int table_size,
    const double start) {
  int i;
  for (i = 0; i < table_size; i++) {
    table[i] = od_rsqrt_table((int)(start + 2*i + 1));
  }
}
|
|
|
|
/** Find the codepoint on the given PSphere closest to the desired
|
|
* vector. Double-precision PVQ search just to make sure our tests
|
|
* aren't limited by numerical accuracy.
|
|
*
|
|
* @param [in] xcoeff input vector to quantize (x in the math doc)
|
|
* @param [in] n number of dimensions
|
|
* @param [in] k number of pulses
|
|
* @param [out] ypulse optimal codevector found (y in the math doc)
|
|
* @param [out] g2 multiplier for the distortion (typically squared
|
|
* gain units)
|
|
* @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
|
|
* @param [in] prev_k number of pulses already in ypulse that we should
|
|
* reuse for the search (or 0 for a new search)
|
|
* @return cosine distance between x and y (between 0 and 1)
|
|
*/
|
|
double pvq_search_rdo_double_c(const od_val16 *xcoeff, int n, int k,
|
|
od_coeff *ypulse, double g2, double pvq_norm_lambda, int prev_k) {
|
|
int i, j;
|
|
double xy;
|
|
double yy;
|
|
/* TODO - This blows our 8kB stack space budget and should be fixed when
|
|
converting PVQ to fixed point. */
|
|
double x[MAXN];
|
|
double xx;
|
|
double lambda;
|
|
double norm_1;
|
|
int rdo_pulses;
|
|
double delta_rate;
|
|
xx = xy = yy = 0;
|
|
for (j = 0; j < n; j++) {
|
|
x[j] = fabs((float)xcoeff[j]);
|
|
xx += x[j]*x[j];
|
|
}
|
|
norm_1 = 1./sqrt(1e-30 + xx);
|
|
lambda = pvq_norm_lambda/(1e-30 + g2);
|
|
i = 0;
|
|
if (prev_k > 0 && prev_k <= k) {
|
|
/* We reuse pulses from a previous search so we don't have to search them
|
|
again. */
|
|
for (j = 0; j < n; j++) {
|
|
ypulse[j] = abs(ypulse[j]);
|
|
xy += x[j]*ypulse[j];
|
|
yy += ypulse[j]*ypulse[j];
|
|
i += ypulse[j];
|
|
}
|
|
}
|
|
else if (k > 2) {
|
|
double l1_norm;
|
|
double l1_inv;
|
|
l1_norm = 0;
|
|
for (j = 0; j < n; j++) l1_norm += x[j];
|
|
l1_inv = 1./OD_MAXF(l1_norm, 1e-100);
|
|
for (j = 0; j < n; j++) {
|
|
double tmp;
|
|
tmp = k*x[j]*l1_inv;
|
|
ypulse[j] = OD_MAXI(0, (int)floor(tmp));
|
|
xy += x[j]*ypulse[j];
|
|
yy += ypulse[j]*ypulse[j];
|
|
i += ypulse[j];
|
|
}
|
|
}
|
|
else OD_CLEAR(ypulse, n);
|
|
|
|
/* Only use RDO on the last few pulses. This not only saves CPU, but using
|
|
RDO on all pulses actually makes the results worse for reasons I don't
|
|
fully understand. */
|
|
rdo_pulses = 1 + k/4;
|
|
/* Rough assumption for now, the last position costs about 3 bits more than
|
|
the first. */
|
|
delta_rate = 3./n;
|
|
/* Search one pulse at a time */
|
|
for (; i < k - rdo_pulses; i++) {
|
|
int pos;
|
|
double best_xy;
|
|
double best_yy;
|
|
pos = 0;
|
|
best_xy = -10;
|
|
best_yy = 1;
|
|
for (j = 0; j < n; j++) {
|
|
double tmp_xy;
|
|
double tmp_yy;
|
|
tmp_xy = xy + x[j];
|
|
tmp_yy = yy + 2*ypulse[j] + 1;
|
|
tmp_xy *= tmp_xy;
|
|
if (j == 0 || tmp_xy*best_yy > best_xy*tmp_yy) {
|
|
best_xy = tmp_xy;
|
|
best_yy = tmp_yy;
|
|
pos = j;
|
|
}
|
|
}
|
|
xy = xy + x[pos];
|
|
yy = yy + 2*ypulse[pos] + 1;
|
|
ypulse[pos]++;
|
|
}
|
|
/* Search last pulses with RDO. Distortion is D = (x-y)^2 = x^2 - 2*x*y + y^2
|
|
and since x^2 and y^2 are constant, we just maximize x*y, plus a
|
|
lambda*rate term. Note that since x and y aren't normalized here,
|
|
we need to divide by sqrt(x^2)*sqrt(y^2). */
|
|
for (; i < k; i++) {
|
|
double rsqrt_table[4];
|
|
int rsqrt_table_size = 4;
|
|
int pos;
|
|
double best_cost;
|
|
pos = 0;
|
|
best_cost = -1e5;
|
|
/*Fill the small rsqrt lookup table with inputs relative to yy.
|
|
Specifically, the table of n values is filled with
|
|
rsqrt(yy + 1), rsqrt(yy + 2 + 1) .. rsqrt(yy + 2*(n-1) + 1).*/
|
|
od_fill_dynamic_rsqrt_table(rsqrt_table, rsqrt_table_size, yy);
|
|
for (j = 0; j < n; j++) {
|
|
double tmp_xy;
|
|
double tmp_yy;
|
|
tmp_xy = xy + x[j];
|
|
/*Calculate rsqrt(yy + 2*ypulse[j] + 1) using an optimized method.*/
|
|
tmp_yy = od_custom_rsqrt_dynamic_table(rsqrt_table, rsqrt_table_size,
|
|
yy, ypulse[j]);
|
|
tmp_xy = 2*tmp_xy*norm_1*tmp_yy - lambda*j*delta_rate;
|
|
if (j == 0 || tmp_xy > best_cost) {
|
|
best_cost = tmp_xy;
|
|
pos = j;
|
|
}
|
|
}
|
|
xy = xy + x[pos];
|
|
yy = yy + 2*ypulse[pos] + 1;
|
|
ypulse[pos]++;
|
|
}
|
|
for (i = 0; i < n; i++) {
|
|
if (xcoeff[i] < 0) ypulse[i] = -ypulse[i];
|
|
}
|
|
return xy/(1e-100 + sqrt(xx*yy));
|
|
}
|
|
|
|
/** Encodes the gain so that the return value increases with the
 * distance |x-ref|, so that we can encode a zero when x=ref. The
 * value x=0 is not covered because it is only allowed in the noref
 * case.
 *
 * Values below ref map to odd codes, values in [ref, 2*ref) map to even
 * codes, and values at or above 2*ref are simply shifted down by one.
 *
 * @param [in]      x      quantized gain to encode
 * @param [in]      ref    quantized gain of the reference
 * @return                 interleave-encoded quantized gain value
 */
static int neg_interleave(int x, int ref) {
  if (x < ref) return -2*(x - ref) - 1;
  else if (x < 2*ref) return 2*(x - ref);
  else return x-1;
}
|
|
|
|
/** Tests whether every entry of a coefficient vector is zero.
 *
 * @param [in]      x      vector to inspect
 * @param [in]      len    number of entries in x
 * @return                 1 if all entries are zero (also for len <= 0),
 *                         0 otherwise
 */
int od_vector_is_null(const od_coeff *x, int len) {
  int i;
  for (i = 0; i < len; i++) {
    if (x[i]) return 0;
  }
  return 1;
}
|
|
|
|
static double od_pvq_rate(int qg, int icgr, int theta, int ts,
|
|
const od_adapt_ctx *adapt, const od_coeff *y0, int k, int n, int speed) {
|
|
double rate;
|
|
if (k == 0) rate = 0;
|
|
else if (speed > 0) {
|
|
int i;
|
|
int sum;
|
|
double f;
|
|
/* Compute "center of mass" of the pulse vector. */
|
|
sum = 0;
|
|
for (i = 0; i < n - (theta != -1); i++) sum += i*abs(y0[i]);
|
|
f = sum/(double)(k*n);
|
|
/* Estimates the number of bits it will cost to encode K pulses in
|
|
N dimensions based on hand-tuned fit for bitrate vs K, N and
|
|
"center of mass". */
|
|
rate = (1 + .4*f)*n*OD_LOG2(1 + OD_MAXF(0, log(n*2*(1*f + .025))*k/n)) + 3;
|
|
}
|
|
else {
|
|
aom_writer w;
|
|
od_pvq_codeword_ctx cd;
|
|
int tell;
|
|
#if !CONFIG_ANS
|
|
od_ec_enc_init(&w.ec, 1000);
|
|
#else
|
|
# error "CONFIG_PVQ currently requires !CONFIG_ANS."
|
|
#endif
|
|
OD_COPY(&cd, &adapt->pvq.pvq_codeword_ctx, 1);
|
|
#if !CONFIG_ANS
|
|
tell = od_ec_enc_tell_frac(&w.ec);
|
|
#else
|
|
# error "CONFIG_PVQ currently requires !CONFIG_ANS."
|
|
#endif
|
|
aom_encode_pvq_codeword(&w, &cd, y0, n - (theta != -1), k);
|
|
#if !CONFIG_ANS
|
|
rate = (od_ec_enc_tell_frac(&w.ec)-tell)/8.;
|
|
od_ec_enc_clear(&w.ec);
|
|
#else
|
|
# error "CONFIG_PVQ currently requires !CONFIG_ANS."
|
|
#endif
|
|
}
|
|
if (qg > 0 && theta >= 0) {
|
|
/* Approximate cost of entropy-coding theta */
|
|
rate += .9*OD_LOG2(ts);
|
|
if (qg == icgr) rate -= .5;
|
|
}
|
|
return rate;
|
|
}
|
|
|
|
#define MAX_PVQ_ITEMS (20)
|
|
/* This stores the information about a PVQ search candidate, so we can sort
|
|
based on K. */
|
|
typedef struct {
|
|
int gain;
|
|
int k;
|
|
od_val32 qtheta;
|
|
int theta;
|
|
int ts;
|
|
od_val32 qcg;
|
|
} pvq_search_item;
|
|
|
|
/** Orders PVQ search candidates by ascending K, then by ascending gain.
 *
 * NOTE(review): the parameter types are not the `const void *` pair that
 * qsort() expects; the caller in this file casts the function pointer.
 *
 * @param [in]      a      first candidate
 * @param [in]      b      second candidate
 * @return                 negative, zero or positive as a sorts before,
 *                         equal to, or after b
 */
int items_compare(pvq_search_item *a, pvq_search_item *b) {
  /* Break ties in K with gain to ensure a stable sort.
     Otherwise, the order depends on qsort implementation. */
  return a->k == b->k ? a->gain - b->gain : a->k - b->k;
}
|
|
|
|
/** Perform PVQ quantization with prediction, trying several
|
|
* possible gains and angles. See draft-valin-videocodec-pvq and
|
|
* http://jmvalin.ca/slides/pvq.pdf for more details.
|
|
*
|
|
* @param [out] out coefficients after quantization
|
|
* @param [in] x0 coefficients before quantization
|
|
* @param [in] r0 reference, aka predicted coefficients
|
|
* @param [in] n number of dimensions
|
|
* @param [in] q0 quantization step size
|
|
* @param [out] y pulse vector (i.e. selected PVQ codevector)
|
|
* @param [out] itheta angle between input and reference (-1 if noref)
|
|
* @param [out] vk total number of pulses
|
|
* @param [in] beta per-band activity masking beta param
|
|
* @param [out] skip_diff distortion cost of skipping this block
|
|
* (accumulated)
|
|
* @param [in] is_keyframe whether we're encoding a keyframe
|
|
* @param [in] pli plane index
|
|
* @param [in] adapt probability adaptation context
|
|
* @param [in] qm QM with magnitude compensation
|
|
* @param [in] qm_inv Inverse of QM with magnitude compensation
|
|
* @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
|
|
* @param [in] speed Make search faster by making approximations
|
|
* @return gain index of the quatized gain
|
|
*/
|
|
static int pvq_theta(od_coeff *out, const od_coeff *x0, const od_coeff *r0,
|
|
int n, int q0, od_coeff *y, int *itheta, int *vk,
|
|
od_val16 beta, double *skip_diff, int is_keyframe, int pli,
|
|
const od_adapt_ctx *adapt, const int16_t *qm, const int16_t *qm_inv,
|
|
double pvq_norm_lambda, int speed) {
|
|
od_val32 g;
|
|
od_val32 gr;
|
|
od_coeff y_tmp[MAXN + 3];
|
|
int i;
|
|
/* Number of pulses. */
|
|
int k;
|
|
/* Companded gain of x and reference, normalized to q. */
|
|
od_val32 cg;
|
|
od_val32 cgr;
|
|
int icgr;
|
|
int qg;
|
|
/* Best RDO cost (D + lamdba*R) so far. */
|
|
double best_cost;
|
|
double dist0;
|
|
/* Distortion (D) that corresponds to the best RDO cost. */
|
|
double best_dist;
|
|
double dist;
|
|
/* Sign of Householder reflection. */
|
|
int s;
|
|
/* Dimension on which Householder reflects. */
|
|
int m;
|
|
od_val32 theta;
|
|
double corr;
|
|
int best_k;
|
|
od_val32 best_qtheta;
|
|
od_val32 gain_offset;
|
|
int noref;
|
|
double skip_dist;
|
|
int cfl_enabled;
|
|
int skip;
|
|
double gain_weight;
|
|
od_val16 x16[MAXN];
|
|
od_val16 r16[MAXN];
|
|
int xshift;
|
|
int rshift;
|
|
/* Give more weight to gain error when calculating the total distortion. */
|
|
gain_weight = 1.0;
|
|
OD_ASSERT(n > 1);
|
|
corr = 0;
|
|
#if !defined(OD_FLOAT_PVQ)
|
|
/* Shift needed to make x fit in 16 bits even after rotation.
|
|
This shift value is not normative (it can be changed without breaking
|
|
the bitstream) */
|
|
xshift = OD_MAXI(0, od_vector_log_mag(x0, n) - 15);
|
|
/* Shift needed to make the reference fit in 15 bits, so that the Householder
|
|
vector can fit in 16 bits.
|
|
This shift value *is* normative, and has to match the decoder. */
|
|
rshift = OD_MAXI(0, od_vector_log_mag(r0, n) - 14);
|
|
#else
|
|
xshift = 0;
|
|
rshift = 0;
|
|
#endif
|
|
for (i = 0; i < n; i++) {
|
|
#if defined(OD_FLOAT_PVQ)
|
|
/*This is slightly different from the original float PVQ code,
|
|
where the qm was applied in the accumulation in od_pvq_compute_gain and
|
|
the vectors were od_coeffs, not od_val16 (i.e. double).*/
|
|
x16[i] = x0[i]*(double)qm[i]*OD_QM_SCALE_1;
|
|
r16[i] = r0[i]*(double)qm[i]*OD_QM_SCALE_1;
|
|
#else
|
|
x16[i] = OD_SHR_ROUND(x0[i]*qm[i], OD_QM_SHIFT + xshift);
|
|
r16[i] = OD_SHR_ROUND(r0[i]*qm[i], OD_QM_SHIFT + rshift);
|
|
#endif
|
|
corr += OD_MULT16_16(x16[i], r16[i]);
|
|
}
|
|
cfl_enabled = is_keyframe && pli != 0 && !OD_DISABLE_CFL;
|
|
cg = od_pvq_compute_gain(x16, n, q0, &g, beta, xshift);
|
|
cgr = od_pvq_compute_gain(r16, n, q0, &gr, beta, rshift);
|
|
if (cfl_enabled) cgr = OD_CGAIN_SCALE;
|
|
/* gain_offset is meant to make sure one of the quantized gains has
|
|
exactly the same gain as the reference. */
|
|
#if defined(OD_FLOAT_PVQ)
|
|
icgr = (int)floor(.5 + cgr);
|
|
#else
|
|
icgr = OD_SHR_ROUND(cgr, OD_CGAIN_SHIFT);
|
|
#endif
|
|
gain_offset = cgr - OD_SHL(icgr, OD_CGAIN_SHIFT);
|
|
/* Start search with null case: gain=0, no pulse. */
|
|
qg = 0;
|
|
dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2;
|
|
best_dist = dist;
|
|
best_cost = dist + pvq_norm_lambda*od_pvq_rate(0, 0, -1, 0, adapt, NULL, 0,
|
|
n, speed);
|
|
noref = 1;
|
|
best_k = 0;
|
|
*itheta = -1;
|
|
OD_CLEAR(y, n);
|
|
best_qtheta = 0;
|
|
m = 0;
|
|
s = 1;
|
|
corr = corr/(1e-100 + g*(double)gr/OD_SHL(1, xshift + rshift));
|
|
corr = OD_MAXF(OD_MINF(corr, 1.), -1.);
|
|
if (is_keyframe) skip_dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2;
|
|
else {
|
|
skip_dist = gain_weight*(cg - cgr)*(cg - cgr)
|
|
+ cgr*(double)cg*(2 - 2*corr);
|
|
skip_dist *= OD_CGAIN_SCALE_2;
|
|
}
|
|
if (!is_keyframe) {
|
|
/* noref, gain=0 isn't allowed, but skip is allowed. */
|
|
od_val32 scgr;
|
|
scgr = OD_MAXF(0,gain_offset);
|
|
if (icgr == 0) {
|
|
best_dist = gain_weight*(cg - scgr)*(cg - scgr)
|
|
+ scgr*(double)cg*(2 - 2*corr);
|
|
best_dist *= OD_CGAIN_SCALE_2;
|
|
}
|
|
best_cost = best_dist + pvq_norm_lambda*od_pvq_rate(0, icgr, 0, 0, adapt,
|
|
NULL, 0, n, speed);
|
|
best_qtheta = 0;
|
|
*itheta = 0;
|
|
noref = 0;
|
|
}
|
|
dist0 = best_dist;
|
|
if (n <= OD_MAX_PVQ_SIZE && !od_vector_is_null(r0, n) && corr > 0) {
|
|
od_val16 xr[MAXN];
|
|
int gain_bound;
|
|
int prev_k;
|
|
pvq_search_item items[MAX_PVQ_ITEMS];
|
|
int idx;
|
|
int nitems;
|
|
double cos_dist;
|
|
idx = 0;
|
|
gain_bound = OD_SHR(cg - gain_offset, OD_CGAIN_SHIFT);
|
|
/* Perform theta search only if prediction is useful. */
|
|
theta = OD_ROUND32(OD_THETA_SCALE*acos(corr));
|
|
m = od_compute_householder(r16, n, gr, &s, rshift);
|
|
od_apply_householder(xr, x16, r16, n);
|
|
prev_k = 0;
|
|
for (i = m; i < n - 1; i++) xr[i] = xr[i + 1];
|
|
/* Compute all candidate PVQ searches within a reasonable range of gain
|
|
and theta. */
|
|
for (i = OD_MAXI(1, gain_bound - 1); i <= gain_bound + 1; i++) {
|
|
int j;
|
|
od_val32 qcg;
|
|
int ts;
|
|
int theta_lower;
|
|
int theta_upper;
|
|
/* Quantized companded gain */
|
|
qcg = OD_SHL(i, OD_CGAIN_SHIFT) + gain_offset;
|
|
/* Set angular resolution (in ra) to match the encoded gain */
|
|
ts = od_pvq_compute_max_theta(qcg, beta);
|
|
theta_lower = OD_MAXI(0, (int)floor(.5 +
|
|
theta*OD_THETA_SCALE_1*2/M_PI*ts) - 2);
|
|
theta_upper = OD_MINI(ts - 1, (int)ceil(theta*OD_THETA_SCALE_1*2/M_PI*ts));
|
|
/* Include the angles within a reasonable range. */
|
|
for (j = theta_lower; j <= theta_upper; j++) {
|
|
od_val32 qtheta;
|
|
qtheta = od_pvq_compute_theta(j, ts);
|
|
k = od_pvq_compute_k(qcg, j, 0, n, beta);
|
|
items[idx].gain = i;
|
|
items[idx].theta = j;
|
|
items[idx].k = k;
|
|
items[idx].qcg = qcg;
|
|
items[idx].qtheta = qtheta;
|
|
items[idx].ts = ts;
|
|
idx++;
|
|
OD_ASSERT(idx < MAX_PVQ_ITEMS);
|
|
}
|
|
}
|
|
nitems = idx;
|
|
cos_dist = 0;
|
|
/* Sort PVQ search candidates in ascending order of pulses K so that
|
|
we can reuse all the previously searched pulses across searches. */
|
|
qsort(items, nitems, sizeof(items[0]),
|
|
(int (*)(const void *, const void *))items_compare);
|
|
/* Search for the best gain/theta in order. */
|
|
for (idx = 0; idx < nitems; idx++) {
|
|
int j;
|
|
od_val32 qcg;
|
|
int ts;
|
|
double cost;
|
|
double dist_theta;
|
|
double sin_prod;
|
|
od_val32 qtheta;
|
|
/* Quantized companded gain */
|
|
qcg = items[idx].qcg;
|
|
i = items[idx].gain;
|
|
j = items[idx].theta;
|
|
/* Set angular resolution (in ra) to match the encoded gain */
|
|
ts = items[idx].ts;
|
|
/* Search for the best angle within a reasonable range. */
|
|
qtheta = items[idx].qtheta;
|
|
k = items[idx].k;
|
|
/* Compute the minimal possible distortion by not taking the PVQ
|
|
cos_dist into account. */
|
|
dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1;
|
|
dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta;
|
|
dist *= OD_CGAIN_SCALE_2;
|
|
/* If we have no hope of beating skip (including a 1-bit worst-case
|
|
penalty), stop now. */
|
|
if (dist > dist0 + 1.0*pvq_norm_lambda && k != 0) continue;
|
|
sin_prod = od_pvq_sin(theta)*OD_TRIG_SCALE_1*od_pvq_sin(qtheta)*
|
|
OD_TRIG_SCALE_1;
|
|
/* PVQ search, using a gain of qcg*cg*sin(theta)*sin(qtheta) since
|
|
that's the factor by which cos_dist is multiplied to get the
|
|
distortion metric. */
|
|
if (k == 0) {
|
|
cos_dist = 0;
|
|
OD_CLEAR(y_tmp, n-1);
|
|
}
|
|
else if (k != prev_k) {
|
|
cos_dist = pvq_search_rdo_double(xr, n - 1, k, y_tmp,
|
|
qcg*(double)cg*sin_prod*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k);
|
|
}
|
|
prev_k = k;
|
|
/* See Jmspeex' Journal of Dubious Theoretical Results. */
|
|
dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1
|
|
+ sin_prod*(2 - 2*cos_dist);
|
|
dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta;
|
|
dist *= OD_CGAIN_SCALE_2;
|
|
/* Do approximate RDO. */
|
|
cost = dist + pvq_norm_lambda*od_pvq_rate(i, icgr, j, ts, adapt, y_tmp,
|
|
k, n, speed);
|
|
if (cost < best_cost) {
|
|
best_cost = cost;
|
|
best_dist = dist;
|
|
qg = i;
|
|
best_k = k;
|
|
best_qtheta = qtheta;
|
|
*itheta = j;
|
|
noref = 0;
|
|
OD_COPY(y, y_tmp, n - 1);
|
|
}
|
|
}
|
|
}
|
|
/* Don't bother with no-reference version if there's a reasonable
|
|
correlation. */
|
|
if (n <= OD_MAX_PVQ_SIZE && (corr < .5
|
|
|| cg < (od_val32)(OD_SHL(2, OD_CGAIN_SHIFT)))) {
|
|
int gain_bound;
|
|
int prev_k;
|
|
gain_bound = OD_SHR(cg, OD_CGAIN_SHIFT);
|
|
prev_k = 0;
|
|
/* Search for the best gain (haven't determined reasonable range yet). */
|
|
for (i = OD_MAXI(1, gain_bound); i <= gain_bound + 1; i++) {
|
|
double cos_dist;
|
|
double cost;
|
|
od_val32 qcg;
|
|
qcg = OD_SHL(i, OD_CGAIN_SHIFT);
|
|
k = od_pvq_compute_k(qcg, -1, 1, n, beta);
|
|
/* Compute the minimal possible distortion by not taking the PVQ
|
|
cos_dist into account. */
|
|
dist = gain_weight*(qcg - cg)*(qcg - cg);
|
|
dist *= OD_CGAIN_SCALE_2;
|
|
if (dist > dist0 && k != 0) continue;
|
|
cos_dist = pvq_search_rdo_double(x16, n, k, y_tmp,
|
|
qcg*(double)cg*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k);
|
|
prev_k = k;
|
|
/* See Jmspeex' Journal of Dubious Theoretical Results. */
|
|
dist = gain_weight*(qcg - cg)*(qcg - cg)
|
|
+ qcg*(double)cg*(2 - 2*cos_dist);
|
|
dist *= OD_CGAIN_SCALE_2;
|
|
/* Do approximate RDO. */
|
|
cost = dist + pvq_norm_lambda*od_pvq_rate(i, 0, -1, 0, adapt, y_tmp, k,
|
|
n, speed);
|
|
if (cost <= best_cost) {
|
|
best_cost = cost;
|
|
best_dist = dist;
|
|
qg = i;
|
|
noref = 1;
|
|
best_k = k;
|
|
*itheta = -1;
|
|
OD_COPY(y, y_tmp, n);
|
|
}
|
|
}
|
|
}
|
|
k = best_k;
|
|
theta = best_qtheta;
|
|
skip = 0;
|
|
if (noref) {
|
|
if (qg == 0) skip = OD_PVQ_SKIP_ZERO;
|
|
}
|
|
else {
|
|
if (!is_keyframe && qg == 0) {
|
|
skip = (icgr ? OD_PVQ_SKIP_ZERO : OD_PVQ_SKIP_COPY);
|
|
}
|
|
if (qg == icgr && *itheta == 0 && !cfl_enabled) skip = OD_PVQ_SKIP_COPY;
|
|
}
|
|
/* Synthesize like the decoder would. */
|
|
if (skip) {
|
|
if (skip == OD_PVQ_SKIP_COPY) OD_COPY(out, r0, n);
|
|
else OD_CLEAR(out, n);
|
|
}
|
|
else {
|
|
if (noref) gain_offset = 0;
|
|
g = od_gain_expand(OD_SHL(qg, OD_CGAIN_SHIFT) + gain_offset, q0, beta);
|
|
od_pvq_synthesis_partial(out, y, r16, n, noref, g, theta, m, s,
|
|
qm_inv);
|
|
}
|
|
*vk = k;
|
|
*skip_diff += skip_dist - best_dist;
|
|
/* Encode gain differently depending on whether we use prediction or not.
|
|
Special encoding on inter frames where qg=0 is allowed for noref=0
|
|
but not noref=1.*/
|
|
if (is_keyframe) return noref ? qg : neg_interleave(qg, icgr);
|
|
else return noref ? qg - 1 : neg_interleave(qg + 1, icgr + 1);
|
|
}
|
|
|
|
/** Encodes a single vector of integers (eg, a partition within a
|
|
* coefficient block) using PVQ
|
|
*
|
|
* @param [in,out] w multi-symbol entropy encoder
|
|
* @param [in] qg quantized gain
|
|
* @param [in] theta quantized post-prediction theta
|
|
* @param [in] in coefficient vector to code
|
|
* @param [in] n number of coefficients in partition
|
|
* @param [in] k number of pulses in partition
|
|
* @param [in,out] model entropy encoder state
|
|
* @param [in,out] adapt adaptation context
|
|
* @param [in,out] exg ExQ16 expectation of gain value
|
|
* @param [in,out] ext ExQ16 expectation of theta value
|
|
* @param [in] cdf_ctx selects which cdf context to use
|
|
* @param [in] is_keyframe whether we're encoding a keyframe
|
|
* @param [in] code_skip whether the "skip rest" flag is allowed
|
|
* @param [in] skip_rest when set, we skip all higher bands
|
|
* @param [in] encode_flip whether we need to encode the CfL flip flag now
|
|
* @param [in] flip value of the CfL flip flag
|
|
*/
|
|
void pvq_encode_partition(aom_writer *w,
|
|
int qg,
|
|
int theta,
|
|
const od_coeff *in,
|
|
int n,
|
|
int k,
|
|
generic_encoder model[3],
|
|
od_adapt_ctx *adapt,
|
|
int *exg,
|
|
int *ext,
|
|
int cdf_ctx,
|
|
int is_keyframe,
|
|
int code_skip,
|
|
int skip_rest,
|
|
int encode_flip,
|
|
int flip) {
|
|
int noref;
|
|
int id;
|
|
noref = (theta == -1);
|
|
id = (qg > 0) + 2*OD_MINI(theta + 1,3) + 8*code_skip*skip_rest;
|
|
if (is_keyframe) {
|
|
OD_ASSERT(id != 8);
|
|
if (id >= 8) id--;
|
|
}
|
|
else {
|
|
OD_ASSERT(id != 10);
|
|
if (id >= 10) id--;
|
|
}
|
|
/* Jointly code gain, theta and noref for small values. Then we handle
|
|
larger gain and theta values. For noref, theta = -1. */
|
|
aom_write_symbol_pvq(w, id, &adapt->pvq.pvq_gaintheta_cdf[cdf_ctx][0],
|
|
8 + 7*code_skip);
|
|
if (encode_flip) {
|
|
/* We could eventually do some smarter entropy coding here, but it would
|
|
have to be good enough to overcome the overhead of the entropy coder.
|
|
An early attempt using a "toogle" flag with simple adaptation wasn't
|
|
worth the trouble. */
|
|
aom_write_bit(w, flip);
|
|
}
|
|
if (qg > 0) {
|
|
int tmp;
|
|
tmp = *exg;
|
|
generic_encode(w, &model[!noref], qg - 1, &tmp, 2);
|
|
OD_IIR_DIADIC(*exg, qg << 16, 2);
|
|
}
|
|
if (theta > 1) {
|
|
int tmp;
|
|
tmp = *ext;
|
|
generic_encode(w, &model[2], theta - 2, &tmp, 2);
|
|
OD_IIR_DIADIC(*ext, theta << 16, 2);
|
|
}
|
|
aom_encode_pvq_codeword(w, &adapt->pvq.pvq_codeword_ctx, in,
|
|
n - (theta != -1), k);
|
|
}
|
|
|
|
/** Quantizes a scalar with rate-distortion optimization (RDO)
|
|
* @param [in] x unquantized value
|
|
* @param [in] q quantization step size
|
|
* @param [in] delta0 rate increase for encoding a 1 instead of a 0
|
|
* @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
|
|
* @retval quantized value
|
|
*/
|
|
int od_rdo_quant(od_coeff x, int q, double delta0, double pvq_norm_lambda) {
|
|
int n;
|
|
/* Optimal quantization threshold is 1/2 + lambda*delta_rate/2. See
|
|
Jmspeex' Journal of Dubious Theoretical Results for details. */
|
|
n = OD_DIV_R0(abs(x), q);
|
|
if ((double)abs(x)/q < (double)n/2 + pvq_norm_lambda*delta0/(2*n)) {
|
|
return 0;
|
|
}
|
|
else {
|
|
return OD_DIV_R0(x, q);
|
|
}
|
|
}
|
|
|
|
/** Encode a coefficient block (excepting DC) using PVQ
|
|
*
|
|
* @param [in,out] enc daala encoder context
|
|
* @param [in] ref 'reference' (prediction) vector
|
|
* @param [in] in coefficient block to quantize and encode
|
|
* @param [out] out quantized coefficient block
|
|
* @param [in] q0 scale/quantizer
|
|
* @param [in] pli plane index
|
|
* @param [in] bs log of the block size minus two
|
|
* @param [in] beta per-band activity masking beta param
|
|
* @param [in] is_keyframe whether we're encoding a keyframe
|
|
* @param [in] qm QM with magnitude compensation
|
|
* @param [in] qm_inv Inverse of QM with magnitude compensation
|
|
* @param [in] speed Make search faster by making approximations
|
|
* @param [in] pvq_info If null, conisdered as RDO search mode
|
|
* @return Returns block skip info indicating whether DC/AC are coded.
|
|
* bit0: DC is coded, bit1: AC is coded (1 means coded)
|
|
*
|
|
*/
|
|
PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc,
|
|
od_coeff *ref,
|
|
const od_coeff *in,
|
|
od_coeff *out,
|
|
int q_dc,
|
|
int q_ac,
|
|
int pli,
|
|
int bs,
|
|
const od_val16 *beta,
|
|
int is_keyframe,
|
|
const int16_t *qm,
|
|
const int16_t *qm_inv,
|
|
int speed,
|
|
PVQ_INFO *pvq_info){
|
|
int theta[PVQ_MAX_PARTITIONS];
|
|
int qg[PVQ_MAX_PARTITIONS];
|
|
int k[PVQ_MAX_PARTITIONS];
|
|
od_coeff y[OD_TXSIZE_MAX*OD_TXSIZE_MAX];
|
|
int *exg;
|
|
int *ext;
|
|
int nb_bands;
|
|
int i;
|
|
const int *off;
|
|
int size[PVQ_MAX_PARTITIONS];
|
|
generic_encoder *model;
|
|
double skip_diff;
|
|
int tell;
|
|
uint16_t *skip_cdf;
|
|
od_rollback_buffer buf;
|
|
int dc_quant;
|
|
int flip;
|
|
int cfl_encoded;
|
|
int skip_rest;
|
|
int skip_dir;
|
|
int skip_theta_value;
|
|
const unsigned char *pvq_qm;
|
|
double dc_rate;
|
|
int use_masking;
|
|
PVQ_SKIP_TYPE ac_dc_coded;
|
|
|
|
aom_clear_system_state();
|
|
|
|
use_masking = enc->use_activity_masking;
|
|
|
|
if (use_masking)
|
|
pvq_qm = &enc->state.pvq_qm_q4[pli][0];
|
|
else
|
|
pvq_qm = 0;
|
|
|
|
exg = &enc->state.adapt->pvq.pvq_exg[pli][bs][0];
|
|
ext = enc->state.adapt->pvq.pvq_ext + bs*PVQ_MAX_PARTITIONS;
|
|
skip_cdf = enc->state.adapt->skip_cdf[2*bs + (pli != 0)];
|
|
model = enc->state.adapt->pvq.pvq_param_model;
|
|
nb_bands = OD_BAND_OFFSETS[bs][0];
|
|
off = &OD_BAND_OFFSETS[bs][1];
|
|
|
|
if (use_masking)
|
|
dc_quant = OD_MAXI(1, q_dc * pvq_qm[od_qm_get_index(bs, 0)] >> 4);
|
|
else
|
|
dc_quant = OD_MAXI(1, q_dc);
|
|
|
|
tell = 0;
|
|
for (i = 0; i < nb_bands; i++) size[i] = off[i+1] - off[i];
|
|
skip_diff = 0;
|
|
flip = 0;
|
|
/*If we are coding a chroma block of a keyframe, we are doing CfL.*/
|
|
if (pli != 0 && is_keyframe) {
|
|
od_val32 xy;
|
|
xy = 0;
|
|
/*Compute the dot-product of the first band of chroma with the luma ref.*/
|
|
for (i = off[0]; i < off[1]; i++) {
|
|
#if defined(OD_FLOAT_PVQ)
|
|
xy += ref[i]*(double)qm[i]*OD_QM_SCALE_1*
|
|
(double)in[i]*(double)qm[i]*OD_QM_SCALE_1;
|
|
#else
|
|
od_val32 rq;
|
|
od_val32 inq;
|
|
rq = ref[i]*qm[i];
|
|
inq = in[i]*qm[i];
|
|
xy += OD_SHR(rq*(int64_t)inq, OD_SHL(OD_QM_SHIFT + OD_CFL_FLIP_SHIFT,
|
|
1));
|
|
#endif
|
|
}
|
|
/*If cos(theta) < 0, then |theta| > pi/2 and we should negate the ref.*/
|
|
if (xy < 0) {
|
|
flip = 1;
|
|
for(i = off[0]; i < off[nb_bands]; i++) ref[i] = -ref[i];
|
|
}
|
|
}
|
|
for (i = 0; i < nb_bands; i++) {
|
|
int q;
|
|
|
|
if (use_masking)
|
|
q = OD_MAXI(1, q_ac * pvq_qm[od_qm_get_index(bs, i + 1)] >> 4);
|
|
else
|
|
q = OD_MAXI(1, q_ac);
|
|
|
|
qg[i] = pvq_theta(out + off[i], in + off[i], ref + off[i], size[i],
|
|
q, y + off[i], &theta[i], &k[i], beta[i], &skip_diff, is_keyframe,
|
|
pli, enc->state.adapt, qm + off[i], qm_inv + off[i],
|
|
enc->pvq_norm_lambda, speed);
|
|
}
|
|
od_encode_checkpoint(enc, &buf);
|
|
if (is_keyframe) out[0] = 0;
|
|
else {
|
|
int n;
|
|
n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant);
|
|
if (n == 0) {
|
|
out[0] = 0;
|
|
} else {
|
|
int tell2;
|
|
od_rollback_buffer dc_buf;
|
|
|
|
dc_rate = -OD_LOG2((double)(OD_ICDF(skip_cdf[3]) - OD_ICDF(skip_cdf[2]))/
|
|
(double)(OD_ICDF(skip_cdf[2]) - OD_ICDF(skip_cdf[1])));
|
|
dc_rate += 1;
|
|
|
|
#if !CONFIG_ANS
|
|
tell2 = od_ec_enc_tell_frac(&enc->w.ec);
|
|
#else
|
|
#error "CONFIG_PVQ currently requires !CONFIG_ANS."
|
|
#endif
|
|
od_encode_checkpoint(enc, &dc_buf);
|
|
generic_encode(&enc->w, &enc->state.adapt->model_dc[pli],
|
|
n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2);
|
|
#if !CONFIG_ANS
|
|
tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2;
|
|
#else
|
|
#error "CONFIG_PVQ currently requires !CONFIG_ANS."
|
|
#endif
|
|
dc_rate += tell2/8.0;
|
|
od_encode_rollback(enc, &dc_buf);
|
|
|
|
out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate,
|
|
enc->pvq_norm_lambda);
|
|
}
|
|
}
|
|
#if !CONFIG_ANS
|
|
tell = od_ec_enc_tell_frac(&enc->w.ec);
|
|
#else
|
|
#error "CONFIG_PVQ currently requires !CONFIG_ANS."
|
|
#endif
|
|
/* Code as if we're not skipping. */
|
|
aom_write_symbol(&enc->w, 2 + (out[0] != 0), skip_cdf, 4);
|
|
ac_dc_coded = AC_CODED + (out[0] != 0);
|
|
cfl_encoded = 0;
|
|
skip_rest = 1;
|
|
skip_theta_value = is_keyframe ? -1 : 0;
|
|
for (i = 1; i < nb_bands; i++) {
|
|
if (theta[i] != skip_theta_value || qg[i]) skip_rest = 0;
|
|
}
|
|
skip_dir = 0;
|
|
if (nb_bands > 1) {
|
|
for (i = 0; i < 3; i++) {
|
|
int j;
|
|
int tmp;
|
|
tmp = 1;
|
|
// ToDo(yaowu): figure out better stop condition without gcc warning.
|
|
for (j = i + 1; j < nb_bands && j < PVQ_MAX_PARTITIONS; j += 3) {
|
|
if (theta[j] != skip_theta_value || qg[j]) tmp = 0;
|
|
}
|
|
skip_dir |= tmp << i;
|
|
}
|
|
}
|
|
if (theta[0] == skip_theta_value && qg[0] == 0 && skip_rest) nb_bands = 0;
|
|
|
|
/* NOTE: There was no other better place to put this function. */
|
|
if (pvq_info)
|
|
av1_store_pvq_enc_info(pvq_info, qg, theta, k, y, nb_bands, off, size,
|
|
skip_rest, skip_dir, bs);
|
|
|
|
for (i = 0; i < nb_bands; i++) {
|
|
int encode_flip;
|
|
/* Encode CFL flip bit just after the first time it's used. */
|
|
encode_flip = pli != 0 && is_keyframe && theta[i] != -1 && !cfl_encoded;
|
|
if (i == 0 || (!skip_rest && !(skip_dir & (1 << ((i - 1)%3))))) {
|
|
pvq_encode_partition(&enc->w, qg[i], theta[i], y + off[i],
|
|
size[i], k[i], model, enc->state.adapt, exg + i, ext + i,
|
|
(pli != 0)*OD_TXSIZES*PVQ_MAX_PARTITIONS + bs*PVQ_MAX_PARTITIONS + i,
|
|
is_keyframe, i == 0 && (i < nb_bands - 1), skip_rest, encode_flip, flip);
|
|
}
|
|
if (i == 0 && !skip_rest && bs > 0) {
|
|
aom_write_symbol(&enc->w, skip_dir,
|
|
&enc->state.adapt->pvq.pvq_skip_dir_cdf[(pli != 0) + 2*(bs - 1)][0], 7);
|
|
}
|
|
if (encode_flip) cfl_encoded = 1;
|
|
}
|
|
#if !CONFIG_ANS
|
|
tell = od_ec_enc_tell_frac(&enc->w.ec) - tell;
|
|
#else
|
|
#error "CONFIG_PVQ currently requires !CONFIG_ANS."
|
|
#endif
|
|
/* Account for the rate of skipping the AC, based on the same DC decision
|
|
we made when trying to not skip AC. */
|
|
{
|
|
double skip_rate;
|
|
if (out[0] != 0) {
|
|
skip_rate = -OD_LOG2((OD_ICDF(skip_cdf[1]) - OD_ICDF(skip_cdf[0]))/
|
|
(double)OD_ICDF(skip_cdf[3]));
|
|
}
|
|
else {
|
|
skip_rate = -OD_LOG2(OD_ICDF(skip_cdf[0])/
|
|
(double)OD_ICDF(skip_cdf[3]));
|
|
}
|
|
tell -= (int)floor(.5+8*skip_rate);
|
|
}
|
|
if (nb_bands == 0 || skip_diff <= enc->pvq_norm_lambda/8*tell) {
|
|
if (is_keyframe) out[0] = 0;
|
|
else {
|
|
int n;
|
|
n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant);
|
|
if (n == 0) {
|
|
out[0] = 0;
|
|
} else {
|
|
int tell2;
|
|
od_rollback_buffer dc_buf;
|
|
|
|
dc_rate = -OD_LOG2((double)(OD_ICDF(skip_cdf[1]) - OD_ICDF(skip_cdf[0]))/
|
|
(double)OD_ICDF(skip_cdf[0]));
|
|
dc_rate += 1;
|
|
|
|
#if !CONFIG_ANS
|
|
tell2 = od_ec_enc_tell_frac(&enc->w.ec);
|
|
#else
|
|
#error "CONFIG_PVQ currently requires !CONFIG_ANS."
|
|
#endif
|
|
od_encode_checkpoint(enc, &dc_buf);
|
|
generic_encode(&enc->w, &enc->state.adapt->model_dc[pli],
|
|
n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2);
|
|
#if !CONFIG_ANS
|
|
tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2;
|
|
#else
|
|
#error "CONFIG_PVQ currently requires !CONFIG_ANS."
|
|
#endif
|
|
dc_rate += tell2/8.0;
|
|
od_encode_rollback(enc, &dc_buf);
|
|
|
|
out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate,
|
|
enc->pvq_norm_lambda);
|
|
}
|
|
}
|
|
/* We decide to skip, roll back everything as it was before. */
|
|
od_encode_rollback(enc, &buf);
|
|
aom_write_symbol(&enc->w, out[0] != 0, skip_cdf, 4);
|
|
ac_dc_coded = (out[0] != 0);
|
|
if (is_keyframe) for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = 0;
|
|
else for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = ref[i];
|
|
}
|
|
if (pvq_info)
|
|
pvq_info->ac_dc_coded = ac_dc_coded;
|
|
return ac_dc_coded;
|
|
}
|