Add SIMD code for PVQ search
This reduces the runtime profile of pvq_search_rdo_double from 37% to 15% and improves overall encoding speed when PVQ is enabled by ~40%. The SIMD code is not bit accurate with the C version and introduces a slight PSNR regression on AWCY: PSNR | PSNR Cb | PSNR Cr | PSNR HVS | SSIM | MS SSIM | CIEDE 2000 0.0607 | 0.1044 | N/A | 0.0126 | N/A | -0.0309 | N/A Change-Id: Ie22cebc62df2e72618305f2268668d79167860c6
This commit is contained in:
Родитель
47c7218942
Коммит
3a88de8f82
|
@ -120,6 +120,8 @@ AV1_COMMON_SRCS-yes += common/generic_code.h
|
|||
AV1_COMMON_SRCS-yes += common/pvq_state.c
|
||||
AV1_COMMON_SRCS-yes += common/pvq_state.h
|
||||
AV1_COMMON_SRCS-yes += common/laplace_tables.c
|
||||
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/pvq_sse4.c
|
||||
AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/pvq_sse4.h
|
||||
endif
|
||||
|
||||
ifneq ($(CONFIG_AOM_HIGHBITDEPTH),yes)
|
||||
|
|
|
@ -762,6 +762,13 @@ if (aom_config("CONFIG_CDEF") eq "yes") {
|
|||
specialize qw/od_filter_dering_direction_8x8 sse4_1/;
|
||||
}
|
||||
|
||||
# PVQ Functions
|
||||
|
||||
if (aom_config("CONFIG_PVQ") eq "yes") {
|
||||
add_proto qw/double pvq_search_rdo_double/, "const int16_t *xcoeff, int n, int k, int *ypulse, double g2, double pvq_norm_lambda, int prev_k";
|
||||
specialize qw/pvq_search_rdo_double sse4_1/;
|
||||
}
|
||||
|
||||
# WARPED_MOTION / GLOBAL_MOTION functions
|
||||
|
||||
if ((aom_config("CONFIG_WARPED_MOTION") eq "yes") ||
|
||||
|
|
|
@ -0,0 +1,250 @@
|
|||
/*
|
||||
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
|
||||
*
|
||||
* This source code is subject to the terms of the BSD 2 Clause License and
|
||||
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
* was not distributed with this source code in the LICENSE file, you can
|
||||
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
* Media Patent License 1.0 was not distributed with this source code in the
|
||||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
|
||||
#include <smmintrin.h>
|
||||
#include <emmintrin.h>
|
||||
#include <tmmintrin.h>
|
||||
#include <float.h>
|
||||
|
||||
#include "./av1_rtcd.h"
|
||||
#include "av1/common/x86/pvq_sse4.h"
|
||||
#include "../odintrin.h"
|
||||
#include "av1/common/pvq.h"
|
||||
|
||||
#define EPSILON 1e-15f
|
||||
|
||||
static __m128 horizontal_sum_ps(__m128 x) {
|
||||
x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
x = _mm_add_ps(x, _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1)));
|
||||
return x;
|
||||
}
|
||||
|
||||
static __m128i horizontal_sum_epi32(__m128i x) {
|
||||
x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
x = _mm_add_epi32(x, _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)));
|
||||
return x;
|
||||
}
|
||||
|
||||
static inline float rsqrtf(float x) {
|
||||
float y;
|
||||
_mm_store_ss(&y, _mm_rsqrt_ss(_mm_load_ss(&x)));
|
||||
return y;
|
||||
}
|
||||
|
||||
/** Find the codepoint on the given PSphere closest to the desired
|
||||
* vector. This is a float-precision PVQ search just to make sure
|
||||
* our tests aren't limited by numerical accuracy. It's close to the
|
||||
* pvq_search_rdo_double_c implementation, but is not bit accurate and
|
||||
* it performs slightly worse on PSNR. One reason is that this code runs
|
||||
* more RDO iterations than the C code. It also uses single precision
|
||||
* floating point math, whereas the C version uses double precision.
|
||||
*
|
||||
* @param [in] xcoeff input vector to quantize (x in the math doc)
|
||||
* @param [in] n number of dimensions
|
||||
* @param [in] k number of pulses
|
||||
* @param [out] ypulse optimal codevector found (y in the math doc)
|
||||
* @param [in] g2 multiplier for the distortion (typically squared
|
||||
* gain units)
|
||||
* @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
|
||||
* @param [in] prev_k number of pulses already in ypulse that we should
|
||||
* reuse for the search (or 0 for a new search)
|
||||
* @return cosine distance between x and y (between 0 and 1)
|
||||
*/
|
||||
double pvq_search_rdo_double_sse4_1(const int16_t *xcoeff, int n, int k,
|
||||
int *ypulse, double g2,
|
||||
double pvq_norm_lambda, int prev_k) {
|
||||
int i, j;
|
||||
int reuse_pulses = prev_k > 0 && prev_k <= k;
|
||||
/* TODO - This blows our 8kB stack space budget and should be fixed when
|
||||
converting PVQ to fixed point. */
|
||||
float xx = 0, xy = 0, yy = 0;
|
||||
float x[MAXN + 3];
|
||||
float y[MAXN + 3];
|
||||
float sign_y[MAXN + 3];
|
||||
for (i = 0; i < n; i++) {
|
||||
float tmp = (float)xcoeff[i];
|
||||
xx += tmp * tmp;
|
||||
x[i] = xcoeff[i];
|
||||
}
|
||||
|
||||
x[n] = x[n + 1] = x[n + 2] = 0;
|
||||
ypulse[n] = ypulse[n + 1] = ypulse[n + 2] = 0;
|
||||
|
||||
__m128 sums = _mm_setzero_ps();
|
||||
for (i = 0; i < n; i += 4) {
|
||||
__m128 x4 = _mm_loadu_ps(&x[i]);
|
||||
__m128 s4 = _mm_cmplt_ps(x4, _mm_setzero_ps());
|
||||
/* Save the sign, we'll put it back later. */
|
||||
_mm_storeu_ps(&sign_y[i], s4);
|
||||
/* Get rid of the sign. */
|
||||
x4 = _mm_andnot_ps(_mm_set_ps1(-0.f), x4);
|
||||
sums = _mm_add_ps(sums, x4);
|
||||
if (!reuse_pulses) {
|
||||
/* Clear y and ypulse in case we don't do the projection. */
|
||||
_mm_storeu_ps(&y[i], _mm_setzero_ps());
|
||||
_mm_storeu_si128((__m128i *)&ypulse[i], _mm_setzero_si128());
|
||||
}
|
||||
_mm_storeu_ps(&x[i], x4);
|
||||
}
|
||||
sums = horizontal_sum_ps(sums);
|
||||
int pulses_left = k;
|
||||
{
|
||||
__m128i pulses_sum;
|
||||
__m128 yy4, xy4;
|
||||
xy4 = yy4 = _mm_setzero_ps();
|
||||
pulses_sum = _mm_setzero_si128();
|
||||
if (reuse_pulses) {
|
||||
/* We reuse pulses from a previous search so we don't have to search them
|
||||
again. */
|
||||
for (j = 0; j < n; j += 4) {
|
||||
__m128 x4, y4;
|
||||
__m128i iy4;
|
||||
iy4 = _mm_abs_epi32(_mm_loadu_si128((__m128i *)&ypulse[j]));
|
||||
pulses_sum = _mm_add_epi32(pulses_sum, iy4);
|
||||
_mm_storeu_si128((__m128i *)&ypulse[j], iy4);
|
||||
y4 = _mm_cvtepi32_ps(iy4);
|
||||
x4 = _mm_loadu_ps(&x[j]);
|
||||
xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
|
||||
yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
|
||||
/* Double the y[] vector so we don't have to do it in the search loop.
|
||||
*/
|
||||
_mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
|
||||
}
|
||||
pulses_left -= _mm_cvtsi128_si32(horizontal_sum_epi32(pulses_sum));
|
||||
xy4 = horizontal_sum_ps(xy4);
|
||||
xy = _mm_cvtss_f32(xy4);
|
||||
yy4 = horizontal_sum_ps(yy4);
|
||||
yy = _mm_cvtss_f32(yy4);
|
||||
} else if (k > (n >> 1)) {
|
||||
/* Do a pre-search by projecting on the pyramid. */
|
||||
__m128 rcp4;
|
||||
float sum = _mm_cvtss_f32(sums);
|
||||
/* If x is too small, just replace it with a pulse at 0. This prevents
|
||||
infinities and NaNs from causing too many pulses to be allocated. Here,
|
||||
64 is an
|
||||
approximation of infinity. */
|
||||
if (sum <= EPSILON) {
|
||||
x[0] = 1.f;
|
||||
for (i = 1; i < n; i++) {
|
||||
x[i] = 0;
|
||||
}
|
||||
sums = _mm_set_ps1(1.f);
|
||||
}
|
||||
/* Using k + e with e < 1 guarantees we cannot get more than k pulses. */
|
||||
rcp4 = _mm_mul_ps(_mm_set_ps1((float)k + .8f), _mm_rcp_ps(sums));
|
||||
xy4 = yy4 = _mm_setzero_ps();
|
||||
pulses_sum = _mm_setzero_si128();
|
||||
for (j = 0; j < n; j += 4) {
|
||||
__m128 rx4, x4, y4;
|
||||
__m128i iy4;
|
||||
x4 = _mm_loadu_ps(&x[j]);
|
||||
rx4 = _mm_mul_ps(x4, rcp4);
|
||||
iy4 = _mm_cvttps_epi32(rx4);
|
||||
pulses_sum = _mm_add_epi32(pulses_sum, iy4);
|
||||
_mm_storeu_si128((__m128i *)&ypulse[j], iy4);
|
||||
y4 = _mm_cvtepi32_ps(iy4);
|
||||
xy4 = _mm_add_ps(xy4, _mm_mul_ps(x4, y4));
|
||||
yy4 = _mm_add_ps(yy4, _mm_mul_ps(y4, y4));
|
||||
/* Double the y[] vector so we don't have to do it in the search loop.
|
||||
*/
|
||||
_mm_storeu_ps(&y[j], _mm_add_ps(y4, y4));
|
||||
}
|
||||
pulses_left -= _mm_cvtsi128_si32(horizontal_sum_epi32(pulses_sum));
|
||||
xy = _mm_cvtss_f32(horizontal_sum_ps(xy4));
|
||||
yy = _mm_cvtss_f32(horizontal_sum_ps(yy4));
|
||||
}
|
||||
x[n] = x[n + 1] = x[n + 2] = -100;
|
||||
y[n] = y[n + 1] = y[n + 2] = 100;
|
||||
}
|
||||
|
||||
/* This should never happen. */
|
||||
OD_ASSERT(pulses_left <= n + 3);
|
||||
|
||||
float lambda_delta_rate[MAXN + 3];
|
||||
if (pulses_left) {
|
||||
/* Hoist lambda to avoid the multiply in the loop. */
|
||||
float lambda =
|
||||
0.5f * sqrtf(xx) * (float)pvq_norm_lambda / (FLT_MIN + (float)g2);
|
||||
float delta_rate = 3.f / n;
|
||||
__m128 count = _mm_set_ps(3, 2, 1, 0);
|
||||
for (i = 0; i < n; i += 4) {
|
||||
_mm_storeu_ps(&lambda_delta_rate[i],
|
||||
_mm_mul_ps(count, _mm_set_ps1(lambda * delta_rate)));
|
||||
count = _mm_add_ps(count, _mm_set_ps(4, 4, 4, 4));
|
||||
}
|
||||
}
|
||||
|
||||
for (i = 0; i < pulses_left; i++) {
|
||||
int best_id = 0;
|
||||
__m128 xy4, yy4;
|
||||
__m128 max, max2;
|
||||
__m128i count;
|
||||
__m128i pos;
|
||||
|
||||
/* The squared magnitude term gets added anyway, so we might as well
|
||||
add it outside the loop. */
|
||||
yy = yy + 1;
|
||||
xy4 = _mm_load1_ps(&xy);
|
||||
yy4 = _mm_load1_ps(&yy);
|
||||
max = _mm_setzero_ps();
|
||||
pos = _mm_setzero_si128();
|
||||
count = _mm_set_epi32(3, 2, 1, 0);
|
||||
for (j = 0; j < n; j += 4) {
|
||||
__m128 x4, y4, r4;
|
||||
x4 = _mm_loadu_ps(&x[j]);
|
||||
y4 = _mm_loadu_ps(&y[j]);
|
||||
x4 = _mm_add_ps(x4, xy4);
|
||||
y4 = _mm_add_ps(y4, yy4);
|
||||
y4 = _mm_rsqrt_ps(y4);
|
||||
r4 = _mm_mul_ps(x4, y4);
|
||||
/* Subtract lambda. */
|
||||
r4 = _mm_sub_ps(r4, _mm_loadu_ps(&lambda_delta_rate[j]));
|
||||
/* Update the index of the max. */
|
||||
pos = _mm_max_epi16(
|
||||
pos, _mm_and_si128(count, _mm_castps_si128(_mm_cmpgt_ps(r4, max))));
|
||||
/* Update the max. */
|
||||
max = _mm_max_ps(max, r4);
|
||||
/* Update the indices (+4) */
|
||||
count = _mm_add_epi32(count, _mm_set_epi32(4, 4, 4, 4));
|
||||
}
|
||||
/* Horizontal max. */
|
||||
max2 = _mm_max_ps(max, _mm_shuffle_ps(max, max, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
max2 =
|
||||
_mm_max_ps(max2, _mm_shuffle_ps(max2, max2, _MM_SHUFFLE(2, 3, 0, 1)));
|
||||
/* Now that max2 contains the max at all positions, look at which value(s)
|
||||
of the
|
||||
partial max is equal to the global max. */
|
||||
pos = _mm_and_si128(pos, _mm_castps_si128(_mm_cmpeq_ps(max, max2)));
|
||||
pos = _mm_max_epi16(pos, _mm_unpackhi_epi64(pos, pos));
|
||||
pos = _mm_max_epi16(pos, _mm_shufflelo_epi16(pos, _MM_SHUFFLE(1, 0, 3, 2)));
|
||||
best_id = _mm_cvtsi128_si32(pos);
|
||||
OD_ASSERT(best_id < n);
|
||||
/* Updating the sums of the new pulse(s) */
|
||||
xy = xy + x[best_id];
|
||||
/* We're multiplying y[j] by two so we don't have to do it here. */
|
||||
yy = yy + y[best_id];
|
||||
/* Only now that we've made the final choice, update y/ypulse. */
|
||||
/* Multiplying y[j] by 2 so we don't have to do it everywhere else. */
|
||||
y[best_id] += 2;
|
||||
ypulse[best_id]++;
|
||||
}
|
||||
|
||||
/* Put the original sign back. */
|
||||
for (i = 0; i < n; i += 4) {
|
||||
__m128i y4;
|
||||
__m128i s4;
|
||||
y4 = _mm_loadu_si128((__m128i *)&ypulse[i]);
|
||||
s4 = _mm_castps_si128(_mm_loadu_ps(&sign_y[i]));
|
||||
y4 = _mm_xor_si128(_mm_add_epi32(y4, s4), s4);
|
||||
_mm_storeu_si128((__m128i *)&ypulse[i], y4);
|
||||
}
|
||||
return xy * rsqrtf(xx * yy + FLT_MIN);
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
/*
|
||||
* Copyright (c) 2016, Alliance for Open Media. All rights reserved
|
||||
*
|
||||
* This source code is subject to the terms of the BSD 2 Clause License and
|
||||
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
|
||||
* was not distributed with this source code in the LICENSE file, you can
|
||||
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
|
||||
* Media Patent License 1.0 was not distributed with this source code in the
|
||||
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
|
||||
*/
|
||||
#ifndef AOM_COMMON_PVQ_X86_SSE4_H_
|
||||
#define AOM_COMMON_PVQ_X86_SSE4_H_
|
||||
#endif // AOM_COMMON_PVQ_X86_SSE4_H_
|
|
@ -82,7 +82,7 @@ static void od_fill_dynamic_rsqrt_table(double *table, const int table_size,
|
|||
* reuse for the search (or 0 for a new search)
|
||||
* @return cosine distance between x and y (between 0 and 1)
|
||||
*/
|
||||
static double pvq_search_rdo_double(const od_val16 *xcoeff, int n, int k,
|
||||
double pvq_search_rdo_double_c(const od_val16 *xcoeff, int n, int k,
|
||||
od_coeff *ypulse, double g2, double pvq_norm_lambda, int prev_k) {
|
||||
int i, j;
|
||||
double xy;
|
||||
|
@ -325,7 +325,7 @@ static int pvq_theta(od_coeff *out, const od_coeff *x0, const od_coeff *r0,
|
|||
const int16_t *qm_inv, double pvq_norm_lambda, int speed) {
|
||||
od_val32 g;
|
||||
od_val32 gr;
|
||||
od_coeff y_tmp[MAXN];
|
||||
od_coeff y_tmp[MAXN + 3];
|
||||
int i;
|
||||
/* Number of pulses. */
|
||||
int k;
|
||||
|
|
Загрузка…
Ссылка в новой задаче