vp8: Add temporal denoising for UV-channel.
A C version and an sse2 version, off by default. For the test clip used, the sse2 version improved performance by ~5.6%.

Change-Id: Ic2d815968849db51b9d62085d7a490d0e01574f6
Parent: c19046a795
Commit: 94ae0430d2
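For orientation, the new UV path uses the same per-pixel scheme as the existing luma filter: small differences against the motion-compensated running average are copied through, while larger ones nudge the source by a level-dependent step (base steps 3/4/6, increased for low motion or "increase denoising" blocks). A minimal scalar sketch of that per-pixel decision, written here only as an illustration (the helper name `denoise_pixel` and its parameters are not part of the patch):

#include <stdlib.h>

/* Illustrative only: per-pixel core of the temporal denoiser.
 * mc_avg = motion-compensated running-average value, sig = noisy source value,
 * returns the denoised value to store in the running average. */
static unsigned char denoise_pixel(unsigned char mc_avg, unsigned char sig,
                                   int shift_inc1, const int adj_val[3]) {
  int diff = mc_avg - sig;
  int absdiff = abs(diff);
  int adjustment;
  if (absdiff <= 3 + shift_inc1)  /* close enough: keep the running average */
    return mc_avg;
  if (absdiff <= 7)               /* level 0 adjustment */
    adjustment = adj_val[0];
  else if (absdiff <= 15)         /* level 1 adjustment */
    adjustment = adj_val[1];
  else                            /* level 2 adjustment */
    adjustment = adj_val[2];
  if (diff > 0)                   /* move the source toward the average, clamped */
    return (sig + adjustment > 255) ? 255 : (unsigned char)(sig + adjustment);
  else
    return (sig - adjustment < 0) ? 0 : (unsigned char)(sig - adjustment);
}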
@@ -554,6 +554,9 @@ $vp8_yv12_copy_partial_frame_neon_asm=vp8_yv12_copy_partial_frame_neon;
 if (vpx_config("CONFIG_TEMPORAL_DENOISING") eq "yes") {
     add_proto qw/int vp8_denoiser_filter/, "unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
     specialize qw/vp8_denoiser_filter sse2 neon/;
+    add_proto qw/int vp8_denoiser_filter_uv/, "unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising";
+    specialize qw/vp8_denoiser_filter_uv sse2/;
+
 }

 # End of encoder only functions
@@ -191,6 +191,148 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
     return FILTER_BLOCK;
 }
 
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
+                             int mc_avg_uv_stride,
+                             unsigned char *running_avg_uv,
+                             int avg_uv_stride,
+                             unsigned char *sig,
+                             int sig_stride,
+                             unsigned int motion_magnitude,
+                             int increase_denoising) {
+  unsigned char *running_avg_uv_start = running_avg_uv;
+  unsigned char *sig_start = sig;
+  int sum_diff_thresh;
+  int r, c;
+  int sum_diff = 0;
+  int sum_block = 0;
+  int adj_val[3] = {3, 4, 6};
+  int shift_inc1 = 0;
+  int shift_inc2 = 1;
+  /* If motion_magnitude is small, make the denoiser more aggressive by
+   * increasing the adjustment for each level. Add another increment for
+   * blocks that are labeled for increased denoising. */
+  if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) {
+    if (increase_denoising) {
+      shift_inc1 = 1;
+      shift_inc2 = 2;
+    }
+    adj_val[0] += shift_inc2;
+    adj_val[1] += shift_inc2;
+    adj_val[2] += shift_inc2;
+  }
+
+  // Avoid denoising the color signal if it is close to the average level.
+  for (r = 0; r < 8; ++r) {
+    for (c = 0; c < 8; ++c) {
+      sum_block += sig[c];
+    }
+    sig += sig_stride;
+  }
+  if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
+    return COPY_BLOCK;
+  }
+
+  sig -= sig_stride * 8;
+  for (r = 0; r < 8; ++r) {
+    for (c = 0; c < 8; ++c) {
+      int diff = 0;
+      int adjustment = 0;
+      int absdiff = 0;
+
+      diff = mc_running_avg_uv[c] - sig[c];
+      absdiff = abs(diff);
+
+      // When |diff| <= 3 + shift_inc1, use the pixel value from the
+      // last denoised raw.
+      if (absdiff <= 3 + shift_inc1) {
+        running_avg_uv[c] = mc_running_avg_uv[c];
+        sum_diff += diff;
+      } else {
+        if (absdiff >= 4 && absdiff <= 7)
+          adjustment = adj_val[0];
+        else if (absdiff >= 8 && absdiff <= 15)
+          adjustment = adj_val[1];
+        else
+          adjustment = adj_val[2];
+        if (diff > 0) {
+          if ((sig[c] + adjustment) > 255)
+            running_avg_uv[c] = 255;
+          else
+            running_avg_uv[c] = sig[c] + adjustment;
+          sum_diff += adjustment;
+        } else {
+          if ((sig[c] - adjustment) < 0)
+            running_avg_uv[c] = 0;
+          else
+            running_avg_uv[c] = sig[c] - adjustment;
+          sum_diff -= adjustment;
+        }
+      }
+    }
+    /* Update pointers for next iteration. */
+    sig += sig_stride;
+    mc_running_avg_uv += mc_avg_uv_stride;
+    running_avg_uv += avg_uv_stride;
+  }
+
+  sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
+  if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
+  if (abs(sum_diff) > sum_diff_thresh) {
+    // Before returning to copy the block (i.e., apply no denoising), check
+    // if we can still apply some (weaker) temporal filtering to this block
+    // that would otherwise not be denoised at all. The simplest option is to
+    // apply an additional adjustment to running_avg_uv to bring it closer to
+    // sig. The adjustment is capped by a maximum delta, and chosen such that
+    // in most cases the resulting sum_diff will be within the acceptable
+    // range given by sum_diff_thresh.
+
+    // The delta is set by the excess of the absolute pixel diff over the
+    // threshold.
+    int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+    // Only apply the adjustment for max delta up to 3.
+    if (delta < 4) {
+      sig -= sig_stride * 8;
+      mc_running_avg_uv -= mc_avg_uv_stride * 8;
+      running_avg_uv -= avg_uv_stride * 8;
+      for (r = 0; r < 8; ++r) {
+        for (c = 0; c < 8; ++c) {
+          int diff = mc_running_avg_uv[c] - sig[c];
+          int adjustment = abs(diff);
+          if (adjustment > delta)
+            adjustment = delta;
+          if (diff > 0) {
+            // Bring the denoised signal down.
+            if (running_avg_uv[c] - adjustment < 0)
+              running_avg_uv[c] = 0;
+            else
+              running_avg_uv[c] = running_avg_uv[c] - adjustment;
+            sum_diff -= adjustment;
+          } else if (diff < 0) {
+            // Bring the denoised signal up.
+            if (running_avg_uv[c] + adjustment > 255)
+              running_avg_uv[c] = 255;
+            else
+              running_avg_uv[c] = running_avg_uv[c] + adjustment;
+            sum_diff += adjustment;
+          }
+        }
+        // TODO(marpan): Check here if abs(sum_diff) has gone below the
+        // threshold sum_diff_thresh, and if so, exit the row loop early.
+        sig += sig_stride;
+        mc_running_avg_uv += mc_avg_uv_stride;
+        running_avg_uv += avg_uv_stride;
+      }
+      if (abs(sum_diff) > sum_diff_thresh)
+        return COPY_BLOCK;
+    } else {
+      return COPY_BLOCK;
+    }
+  }
+
+  vp8_copy_mem8x8(running_avg_uv_start, avg_uv_stride, sig_start,
+                  sig_stride);
+  return FILTER_BLOCK;
+}
+
 int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
                           int num_mb_rows, int num_mb_cols)
 {
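As a usage illustration (not part of the patch), the new C routine operates on one 8x8 chroma block per call. The driver below is hypothetical: the buffer contents, the packed strides, and the assumption that the prototype is visible via the encoder's denoising header (vp8/encoder/denoising.h) are all just for the sketch.

#include <string.h>
#include "vp8/encoder/denoising.h"  /* assumed header providing the prototype */

/* Hypothetical driver for a single 8x8 U (or V) block. */
static void denoise_one_uv_block_example(void) {
  unsigned char mc_running_avg_uv[8 * 8];  /* motion-compensated running avg */
  unsigned char running_avg_uv[8 * 8];     /* running average to be updated  */
  unsigned char sig[8 * 8];                /* noisy source block             */
  int decision;

  memset(mc_running_avg_uv, 128, sizeof(mc_running_avg_uv));
  memset(running_avg_uv, 128, sizeof(running_avg_uv));
  memset(sig, 140, sizeof(sig));

  /* Strides equal the block width because these example buffers are packed. */
  decision = vp8_denoiser_filter_uv_c(mc_running_avg_uv, 8,
                                      running_avg_uv, 8,
                                      sig, 8,
                                      0 /* motion_magnitude */,
                                      0 /* increase_denoising */);
  /* FILTER_BLOCK: sig now holds the denoised block (copied back from the
   * running average). COPY_BLOCK: the block was left untouched. */
  (void)decision;
}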
@@ -260,6 +402,8 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
     unsigned int motion_magnitude2;
     unsigned int sse_thresh;
     int sse_diff_thresh = 0;
+    // Denoise the UV channel.
+    int apply_color_denoise = 0;
     // Spatial loop filter: only applied selectively based on
     // temporal filter state of block relative to top/left neighbors.
     int apply_spatial_loop_filter = 1;

@@ -267,6 +411,8 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
     MV_REFERENCE_FRAME zero_frame = x->best_zeromv_reference_frame;
 
     enum vp8_denoiser_decision decision = FILTER_BLOCK;
+    enum vp8_denoiser_decision decision_u = FILTER_BLOCK;
+    enum vp8_denoiser_decision decision_v = FILTER_BLOCK;
 
     if (zero_frame)
     {

@@ -376,11 +522,37 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
 
         /* Filter. */
         decision = vp8_denoiser_filter(mc_running_avg_y, mc_avg_y_stride,
                                        running_avg_y, avg_y_stride,
                                        x->thismb, 16, motion_magnitude2,
                                        x->increase_denoising);
         denoiser->denoise_state[block_index] = motion_magnitude2 > 0 ?
                                                kFilterNonZeroMV : kFilterZeroMV;
+        // Only denoise UV for zero motion, and if the y channel was denoised.
+        if (apply_color_denoise &&
+            motion_magnitude2 == 0 &&
+            decision == FILTER_BLOCK) {
+          unsigned char *mc_running_avg_u =
+              denoiser->yv12_mc_running_avg.u_buffer + recon_uvoffset;
+          unsigned char *running_avg_u =
+              denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset;
+          unsigned char *mc_running_avg_v =
+              denoiser->yv12_mc_running_avg.v_buffer + recon_uvoffset;
+          unsigned char *running_avg_v =
+              denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset;
+          int mc_avg_uv_stride = denoiser->yv12_mc_running_avg.uv_stride;
+          int avg_uv_stride = denoiser->yv12_running_avg[INTRA_FRAME].uv_stride;
+          int signal_stride = x->block[16].src_stride;
+          decision_u =
+              vp8_denoiser_filter_uv(mc_running_avg_u, mc_avg_uv_stride,
+                                     running_avg_u, avg_uv_stride,
+                                     x->block[16].src + *x->block[16].base_src,
+                                     signal_stride, motion_magnitude2, 0);
+          decision_v =
+              vp8_denoiser_filter_uv(mc_running_avg_v, mc_avg_uv_stride,
+                                     running_avg_v, avg_uv_stride,
+                                     x->block[20].src + *x->block[20].base_src,
+                                     signal_stride, motion_magnitude2, 0);
+        }
     }
     if (decision == COPY_BLOCK)
     {

@@ -393,7 +565,21 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
                           denoiser->yv12_running_avg[INTRA_FRAME].y_stride);
         denoiser->denoise_state[block_index] = kNoFilter;
     }
-    // Option to selectively deblock the denoised signal.
+    if (apply_color_denoise) {
+      if (decision_u == COPY_BLOCK) {
+        vp8_copy_mem8x8(
+            x->block[16].src + *x->block[16].base_src, x->block[16].src_stride,
+            denoiser->yv12_running_avg[INTRA_FRAME].u_buffer + recon_uvoffset,
+            denoiser->yv12_running_avg[INTRA_FRAME].uv_stride);
+      }
+      if (decision_v == COPY_BLOCK) {
+        vp8_copy_mem8x8(
+            x->block[20].src + *x->block[20].base_src, x->block[16].src_stride,
+            denoiser->yv12_running_avg[INTRA_FRAME].v_buffer + recon_uvoffset,
+            denoiser->yv12_running_avg[INTRA_FRAME].uv_stride);
+      }
+    }
+    // Option to selectively deblock the denoised signal, for y channel only.
     if (apply_spatial_loop_filter) {
         loop_filter_info lfi;
         int apply_filter_col = 0;
@@ -22,6 +22,11 @@ extern "C" {
 #define SUM_DIFF_THRESHOLD_HIGH (16 * 16 * 3)
 #define MOTION_MAGNITUDE_THRESHOLD (8*3)
 
+#define SUM_DIFF_THRESHOLD_UV (96)   // (8 * 8 * 1.5)
+#define SUM_DIFF_THRESHOLD_HIGH_UV (8 * 8 * 2)
+#define SUM_DIFF_FROM_AVG_THRESH_UV (8 * 8 * 4)
+#define MOTION_MAGNITUDE_THRESHOLD_UV (8*3)
+
 enum vp8_denoiser_decision
 {
     COPY_BLOCK,
@@ -17,10 +17,23 @@
 #include <emmintrin.h>
 #include "vpx_ports/emmintrin_compat.h"
 
-union sum_union {
-    __m128i v;
-    signed char e[16];
-};
+/* Compute the sum of all pixel differences of this MB. */
+static inline unsigned int abs_sum_diff_16x1(__m128i acc_diff) {
+    const __m128i k_1 = _mm_set1_epi16(1);
+    const __m128i acc_diff_lo = _mm_srai_epi16(
+        _mm_unpacklo_epi8(acc_diff, acc_diff), 8);
+    const __m128i acc_diff_hi = _mm_srai_epi16(
+        _mm_unpackhi_epi8(acc_diff, acc_diff), 8);
+    const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
+    const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
+    const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,
+                                            _mm_srli_si128(hg_fe_dc_ba, 8));
+    const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,
+                                           _mm_srli_si128(hgfe_dcba, 4));
+    unsigned int sum_diff = _mm_cvtsi128_si32(hgfedcba);
+
+    return abs(sum_diff);
+}
 
 int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
                              int mc_avg_y_stride,
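For readers less familiar with the intrinsics, abs_sum_diff_16x1 above sign-extends the sixteen byte accumulators to 16 bits, reduces them with _mm_madd_epi16 plus two shift-and-add folds, and returns the absolute value of the total. A scalar equivalent, shown only for illustration (the name abs_sum_diff_16x1_scalar is not part of the patch):

#include <stdlib.h>

/* Illustrative scalar equivalent of abs_sum_diff_16x1: acc holds the sixteen
 * signed 8-bit per-lane accumulators. */
static unsigned int abs_sum_diff_16x1_scalar(const signed char acc[16]) {
    int sum = 0;
    int i;
    for (i = 0; i < 16; ++i)
        sum += acc[i];
    return (unsigned int)abs(sum);
}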
@@ -103,16 +116,10 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
 
     {
         /* Compute the sum of all pixel differences of this MB. */
-        union sum_union s;
-        int sum_diff = 0;
-        s.v = acc_diff;
-        sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]
-                 + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
-                 + s.e[12] + s.e[13] + s.e[14] + s.e[15];
-
+        unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
         sum_diff_thresh = SUM_DIFF_THRESHOLD;
         if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
-        if (abs(sum_diff) > sum_diff_thresh) {
+        if (abs_sum_diff > sum_diff_thresh) {
            // Before returning to copy the block (i.e., apply no denoising),
            // check if we can still apply some (weaker) temporal filtering to
            // this block, that would otherwise not be denoised at all. Simplest
@@ -123,7 +130,7 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
 
            // The delta is set by the excess of absolute pixel diff over the
            // threshold.
-           int delta = ((abs(sum_diff) - sum_diff_thresh) >> 8) + 1;
+           int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
            // Only apply the adjustment for max delta up to 3.
            if (delta < 4) {
                const __m128i k_delta = _mm_set1_epi8(delta);
@@ -162,16 +169,9 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
                    mc_running_avg_y += mc_avg_y_stride;
                    running_avg_y += avg_y_stride;
                }
-               {
-                   // Update the sum of all pixel differences of this MB.
-                   union sum_union s;
-                   s.v = acc_diff;
-                   sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]
-                            + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
-                            + s.e[12] + s.e[13] + s.e[14] + s.e[15];
-                   if (abs(sum_diff) > sum_diff_thresh) {
-                       return COPY_BLOCK;
-                   }
-               }
+               abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+               if (abs_sum_diff > sum_diff_thresh) {
+                   return COPY_BLOCK;
+               }
            } else {
                return COPY_BLOCK;
@@ -182,3 +182,198 @@ int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
     vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
     return FILTER_BLOCK;
 }
+
+int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg,
+                                int mc_avg_stride,
+                                unsigned char *running_avg, int avg_stride,
+                                unsigned char *sig, int sig_stride,
+                                unsigned int motion_magnitude,
+                                int increase_denoising) {
+    unsigned char *running_avg_start = running_avg;
+    unsigned char *sig_start = sig;
+    int sum_diff_thresh;
+    int r;
+    int shift_inc = (increase_denoising &&
+                     motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 1 : 0;
+    __m128i acc_diff = _mm_setzero_si128();
+    const __m128i k_0 = _mm_setzero_si128();
+    const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
+    const __m128i k_8 = _mm_set1_epi8(8);
+    const __m128i k_16 = _mm_set1_epi8(16);
+    /* Modify each level's adjustment according to motion_magnitude. */
+    const __m128i l3 = _mm_set1_epi8(
+        (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ?
+        7 + shift_inc : 6);
+    /* Difference between level 3 and level 2 is 2. */
+    const __m128i l32 = _mm_set1_epi8(2);
+    /* Difference between level 2 and level 1 is 1. */
+    const __m128i l21 = _mm_set1_epi8(1);
+
+    {
+        const __m128i k_1 = _mm_set1_epi16(1);
+        __m128i vec_sum_block = _mm_setzero_si128();
+
+        // Avoid denoising the color signal if it is close to the average level.
+        for (r = 0; r < 8; ++r) {
+            const __m128i v_sig = _mm_loadl_epi64((__m128i *)(&sig[0]));
+            const __m128i v_sig_unpack = _mm_unpacklo_epi8(v_sig, k_0);
+            vec_sum_block = _mm_add_epi16(vec_sum_block, v_sig_unpack);
+            sig += sig_stride;
+        }
+        sig -= sig_stride * 8;
+        {
+            const __m128i hg_fe_dc_ba = _mm_madd_epi16(vec_sum_block, k_1);
+            const __m128i hgfe_dcba = _mm_add_epi32(hg_fe_dc_ba,
+                                                    _mm_srli_si128(hg_fe_dc_ba, 8));
+            const __m128i hgfedcba = _mm_add_epi32(hgfe_dcba,
+                                                   _mm_srli_si128(hgfe_dcba, 4));
+            const int sum_block = _mm_cvtsi128_si32(hgfedcba);
+            if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
+                return COPY_BLOCK;
+            }
+        }
+    }
+
+    for (r = 0; r < 4; ++r) {
+        /* Calculate differences. */
+        const __m128i v_sig_low = _mm_castpd_si128(
+            _mm_load_sd((double *)(&sig[0])));
+        const __m128i v_sig = _mm_castpd_si128(
+            _mm_loadh_pd(_mm_castsi128_pd(v_sig_low),
+                         (double *)(&sig[sig_stride])));
+        const __m128i v_mc_running_avg_low = _mm_castpd_si128(
+            _mm_load_sd((double *)(&mc_running_avg[0])));
+        const __m128i v_mc_running_avg = _mm_castpd_si128(
+            _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
+                         (double *)(&mc_running_avg[mc_avg_stride])));
+        const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
+        const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
+        /* Obtain the sign. FF if diff is negative. */
+        const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+        /* Clamp absolute difference to 16 to be used to get mask. Doing this
+         * allows us to use _mm_cmpgt_epi8, which operates on signed bytes. */
+        const __m128i clamped_absdiff = _mm_min_epu8(
+            _mm_or_si128(pdiff, ndiff), k_16);
+        /* Get masks for l2, l1, and l0 adjustments. */
+        const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
+        const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
+        const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
+        /* Get adjustments for l2, l1, and l0. */
+        __m128i adj2 = _mm_and_si128(mask2, l32);
+        const __m128i adj1 = _mm_and_si128(mask1, l21);
+        const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
+        __m128i adj, padj, nadj;
+        __m128i v_running_avg;
+
+        /* Combine the adjustments and get absolute adjustments. */
+        adj2 = _mm_add_epi8(adj2, adj1);
+        adj = _mm_sub_epi8(l3, adj2);
+        adj = _mm_andnot_si128(mask0, adj);
+        adj = _mm_or_si128(adj, adj0);
+
+        /* Restore the sign and get positive and negative adjustments. */
+        padj = _mm_andnot_si128(diff_sign, adj);
+        nadj = _mm_and_si128(diff_sign, adj);
+
+        /* Calculate filtered value. */
+        v_running_avg = _mm_adds_epu8(v_sig, padj);
+        v_running_avg = _mm_subs_epu8(v_running_avg, nadj);
+
+        _mm_storel_pd((double *)&running_avg[0],
+                      _mm_castsi128_pd(v_running_avg));
+        _mm_storeh_pd((double *)&running_avg[avg_stride],
+                      _mm_castsi128_pd(v_running_avg));
+
+        /* Adjustments <= 7, so each element in acc_diff can fit in a signed
+         * char. */
+        acc_diff = _mm_adds_epi8(acc_diff, padj);
+        acc_diff = _mm_subs_epi8(acc_diff, nadj);
+
+        /* Update pointers for next iteration. */
+        sig += sig_stride * 2;
+        mc_running_avg += mc_avg_stride * 2;
+        running_avg += avg_stride * 2;
+    }
+
+    {
+        unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+        sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
+        if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
+        if (abs_sum_diff > sum_diff_thresh) {
+            // Before returning to copy the block (i.e., apply no denoising),
+            // check if we can still apply some (weaker) temporal filtering to
+            // this block, that would otherwise not be denoised at all. The
+            // simplest option is to apply an additional adjustment to
+            // running_avg_y to bring it closer to sig. The adjustment is
+            // capped by a maximum delta, and chosen such that in most cases
+            // the resulting sum_diff will be within the acceptable range
+            // given by sum_diff_thresh.
+
+            // The delta is set by the excess of the absolute pixel diff over
+            // the threshold.
+            int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
+            // Only apply the adjustment for max delta up to 3.
+            if (delta < 4) {
+                const __m128i k_delta = _mm_set1_epi8(delta);
+                sig -= sig_stride * 8;
+                mc_running_avg -= mc_avg_stride * 8;
+                running_avg -= avg_stride * 8;
+                for (r = 0; r < 4; ++r) {
+                    // Calculate differences.
+                    const __m128i v_sig_low = _mm_castpd_si128(
+                        _mm_load_sd((double *)(&sig[0])));
+                    const __m128i v_sig = _mm_castpd_si128(
+                        _mm_loadh_pd(_mm_castsi128_pd(v_sig_low),
+                                     (double *)(&sig[sig_stride])));
+                    const __m128i v_mc_running_avg_low = _mm_castpd_si128(
+                        _mm_load_sd((double *)(&mc_running_avg[0])));
+                    const __m128i v_mc_running_avg = _mm_castpd_si128(
+                        _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
+                                     (double *)(&mc_running_avg[mc_avg_stride])));
+                    const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
+                    const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
+                    // Obtain the sign. FF if diff is negative.
+                    const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
+                    // Clamp absolute difference to delta to get the adjustment.
+                    const __m128i adj =
+                        _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
+                    // Restore the sign and get positive and negative
+                    // adjustments.
+                    __m128i padj, nadj;
+                    const __m128i v_running_avg_low = _mm_castpd_si128(
+                        _mm_load_sd((double *)(&running_avg[0])));
+                    __m128i v_running_avg = _mm_castpd_si128(
+                        _mm_loadh_pd(_mm_castsi128_pd(v_running_avg_low),
+                                     (double *)(&running_avg[avg_stride])));
+                    padj = _mm_andnot_si128(diff_sign, adj);
+                    nadj = _mm_and_si128(diff_sign, adj);
+                    // Calculate filtered value.
+                    v_running_avg = _mm_subs_epu8(v_running_avg, padj);
+                    v_running_avg = _mm_adds_epu8(v_running_avg, nadj);
+
+                    _mm_storel_pd((double *)&running_avg[0],
+                                  _mm_castsi128_pd(v_running_avg));
+                    _mm_storeh_pd((double *)&running_avg[avg_stride],
+                                  _mm_castsi128_pd(v_running_avg));
+
+                    // Accumulate the adjustments.
+                    acc_diff = _mm_subs_epi8(acc_diff, padj);
+                    acc_diff = _mm_adds_epi8(acc_diff, nadj);
+
+                    // Update pointers for next iteration.
+                    sig += sig_stride * 2;
+                    mc_running_avg += mc_avg_stride * 2;
+                    running_avg += avg_stride * 2;
+                }
+                abs_sum_diff = abs_sum_diff_16x1(acc_diff);
+                if (abs_sum_diff > sum_diff_thresh) {
+                    return COPY_BLOCK;
+                }
+            } else {
+                return COPY_BLOCK;
+            }
+        }
+    }
+
+    vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride);
+    return FILTER_BLOCK;
+}