vp9: Refactor vp9_denoiser_NxM_sse2.

The denoiser is about 1.5% faster at speeds 6 through 8.

Change-Id: I7b350f3c50cce6773d9c4eded4c0c1b722d0a5fc
This commit is contained in:
JackyChen 2016-05-01 16:15:29 -07:00
Родитель 94806e029d
Коммит b29612fbbe
1 изменённый файл: 32 добавления и 36 удалений

Просмотреть файл

@@ -147,8 +147,9 @@ static int vp9_denoiser_NxM_sse2_small(
const __m128i l32 = _mm_set1_epi8(2);
// Difference between level 2 and level 1 is 1.
const __m128i l21 = _mm_set1_epi8(1);
const int b_height = (4 << b_height_log2_lookup[bs]) >> 1;
for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) {
for (r = 0; r < b_height; ++r) {
memcpy(sig_buffer[r], sig, width);
memcpy(sig_buffer[r] + width, sig + sig_stride, width);
memcpy(mc_running_buffer[r], mc_running_avg_y, width);
@@ -188,8 +189,8 @@ static int vp9_denoiser_NxM_sse2_small(
// Only apply the adjustment for max delta up to 3.
if (delta < 4) {
const __m128i k_delta = _mm_set1_epi8(delta);
running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> 1); ++r) {
running_avg_y -= avg_y_stride * (b_height << 1);
for (r = 0; r < b_height; ++r) {
acc_diff = vp9_denoiser_adj_16x1_sse2(
sig_buffer[r], mc_running_buffer[r], running_buffer[r],
k_0, k_delta, acc_diff);
@@ -235,38 +236,37 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
const __m128i l32 = _mm_set1_epi8(2);
// Difference between level 2 and level 1 is 1.
const __m128i l21 = _mm_set1_epi8(1);
const int b_width = (4 << b_width_log2_lookup[bs]);
const int b_height = (4 << b_height_log2_lookup[bs]);
const int b_width_shift4 = b_width >> 4;
for (c = 0; c < 4; ++c) {
for (r = 0; r < 4; ++r) {
for (r = 0; r < 4; ++r) {
for (c = 0; c < b_width_shift4; ++c) {
acc_diff[c][r] = _mm_setzero_si128();
}
}
for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
acc_diff[c>>4][r>>4] = vp9_denoiser_16x1_sse2(
for (r = 0; r < b_height; ++r) {
for (c = 0; c < b_width_shift4; ++c) {
acc_diff[c][r>>4] = vp9_denoiser_16x1_sse2(
sig, mc_running_avg_y, running_avg_y, &k_0, &k_4,
&k_8, &k_16, &l3, &l32, &l21, acc_diff[c>>4][r>>4]);
&k_8, &k_16, &l3, &l32, &l21, acc_diff[c][r>>4]);
// Update pointers for next iteration.
sig += 16;
mc_running_avg_y += 16;
running_avg_y += 16;
}
if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
for (c = 0; c < b_width_shift4; ++c) {
sum_diff += sum_diff_16x1(acc_diff[c][r>>4]);
}
}
// Update pointers for next iteration.
sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
mc_running_avg_y = mc_running_avg_y -
16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
mc_avg_y_stride;
running_avg_y = running_avg_y -
16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
avg_y_stride;
sig = sig - b_width + sig_stride;
mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
running_avg_y = running_avg_y - b_width + avg_y_stride;
}
{
@@ -278,33 +278,29 @@ static int vp9_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
// Only apply the adjustment for max delta up to 3.
if (delta < 4) {
const __m128i k_delta = _mm_set1_epi8(delta);
sig -= sig_stride * (4 << b_height_log2_lookup[bs]);
mc_running_avg_y -= mc_avg_y_stride * (4 << b_height_log2_lookup[bs]);
running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
sig -= sig_stride * b_height;
mc_running_avg_y -= mc_avg_y_stride * b_height;
running_avg_y -= avg_y_stride * b_height;
sum_diff = 0;
for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
acc_diff[c>>4][r>>4] = vp9_denoiser_adj_16x1_sse2(
for (r = 0; r < b_height; ++r) {
for (c = 0; c < b_width_shift4; ++c) {
acc_diff[c][r>>4] = vp9_denoiser_adj_16x1_sse2(
sig, mc_running_avg_y, running_avg_y, k_0,
k_delta, acc_diff[c>>4][r>>4]);
k_delta, acc_diff[c][r>>4]);
// Update pointers for next iteration.
sig += 16;
mc_running_avg_y += 16;
running_avg_y += 16;
}
if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
if ((r & 0xf) == 0xf || (bs == BLOCK_16X8 && r == 7)) {
for (c = 0; c < b_width_shift4; ++c) {
sum_diff += sum_diff_16x1(acc_diff[c][r>>4]);
}
}
sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
mc_running_avg_y = mc_running_avg_y -
16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
mc_avg_y_stride;
running_avg_y = running_avg_y -
16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
avg_y_stride;
sig = sig - b_width + sig_stride;
mc_running_avg_y = mc_running_avg_y - b_width + mc_avg_y_stride;
running_avg_y = running_avg_y - b_width + avg_y_stride;
}
if (abs(sum_diff) > sum_diff_thresh) {
return COPY_BLOCK;