Merge "Highbd fht4x4 SSE4.1 optimization for DCT_DCT mode - Setup function vp10_highbd_fht4x4_sse4_1 for highbd SSE4.1 intrinsics optimization. - Wrote SSE4.1 functions: load_buffer_4x4(), write_buffer_4x4(), and fdct4x4_sse4_1(). - Used logic right shift to avoid coeff memory write/read. - Turned on vp10_highbd_fht4x4_sse4_1 for DCT_DCT mode only. - Improved overall encoding performance >2.3% for 50 frames sequence, park_joy_1080p_12.y4m, in which, --input-bit-depth=12, --bit-depth=12, 50 frames. - Unit test passed." into nextgenv2
This commit is contained in:
Коммит
deb33056d1
|
@ -670,7 +670,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
|||
|
||||
# fdct functions
|
||||
add_proto qw/void vp10_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp10_highbd_fht4x4/;
|
||||
specialize qw/vp10_highbd_fht4x4 sse4_1/;
|
||||
|
||||
add_proto qw/void vp10_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
|
||||
specialize qw/vp10_highbd_fht8x8/;
|
||||
|
|
|
@ -202,10 +202,12 @@ void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
|
|||
|
||||
switch (tx_type) {
|
||||
case DCT_DCT:
|
||||
vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
|
||||
break;
|
||||
case ADST_DCT:
|
||||
case DCT_ADST:
|
||||
case ADST_ADST:
|
||||
vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
|
||||
vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
|
||||
break;
|
||||
#if CONFIG_EXT_TX
|
||||
case FLIPADST_DCT:
|
||||
|
@ -213,7 +215,7 @@ void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
|
|||
case FLIPADST_FLIPADST:
|
||||
case ADST_FLIPADST:
|
||||
case FLIPADST_ADST:
|
||||
vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
|
||||
vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
|
||||
break;
|
||||
case V_DCT:
|
||||
case H_DCT:
|
||||
|
|
|
@ -0,0 +1,211 @@
|
|||
/*
|
||||
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include <assert.h>
|
||||
#include <smmintrin.h> /* SSE4.1 */
|
||||
|
||||
#include "./vp10_rtcd.h"
|
||||
#include "./vpx_config.h"
|
||||
#include "vpx_dsp/txfm_common.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
|
||||
#if CONFIG_VP9_HIGHBITDEPTH
|
||||
|
||||
static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
|
||||
int stride, int flipud, int fliplr) {
|
||||
const __m128i k__nonzero_bias_a = _mm_setr_epi32(0, 1, 1, 1);
|
||||
const __m128i k__nonzero_bias_b = _mm_setr_epi32(1, 0, 0, 0);
|
||||
__m128i mask;
|
||||
|
||||
if (!flipud) {
|
||||
in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
|
||||
in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
|
||||
in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
|
||||
in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
|
||||
} else {
|
||||
in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
|
||||
in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
|
||||
in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
|
||||
in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
|
||||
}
|
||||
|
||||
if (fliplr) {
|
||||
in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
|
||||
in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
|
||||
in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
|
||||
in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
|
||||
}
|
||||
|
||||
in[0] = _mm_cvtepi16_epi32(in[0]);
|
||||
in[1] = _mm_cvtepi16_epi32(in[1]);
|
||||
in[2] = _mm_cvtepi16_epi32(in[2]);
|
||||
in[3] = _mm_cvtepi16_epi32(in[3]);
|
||||
|
||||
in[0] = _mm_slli_epi32(in[0], 4);
|
||||
in[1] = _mm_slli_epi32(in[1], 4);
|
||||
in[2] = _mm_slli_epi32(in[2], 4);
|
||||
in[3] = _mm_slli_epi32(in[3], 4);
|
||||
|
||||
mask = _mm_cmpeq_epi32(in[0], k__nonzero_bias_a);
|
||||
in[0] = _mm_add_epi32(in[0], mask);
|
||||
in[0] = _mm_add_epi32(in[0], k__nonzero_bias_b);
|
||||
}
|
||||
|
||||
static void fdct4x4_sse4_1(__m128i *in) {
|
||||
const __m128i k__cospi_p16_p16 = _mm_set1_epi64x(cospi_16_64);
|
||||
const __m128i k__cospi_m16_m16 = _mm_set1_epi64x(-cospi_16_64);
|
||||
const __m128i k__cospi_p08_p08 = _mm_set1_epi64x(cospi_8_64);
|
||||
const __m128i k__cospi_m08_m08 = _mm_set1_epi64x(-cospi_8_64);
|
||||
const __m128i k__cospi_p24_p24 = _mm_set1_epi64x(cospi_24_64);
|
||||
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi64x(DCT_CONST_ROUNDING);
|
||||
|
||||
__m128i s[8];
|
||||
__m128i u0, u1, u2, u3;
|
||||
__m128i v0, v1, v2, v3, v4, v5, v6, v7;
|
||||
|
||||
s[0] = _mm_add_epi32(in[0], in[3]);
|
||||
s[1] = _mm_add_epi32(in[1], in[2]);
|
||||
s[2] = _mm_sub_epi32(in[1], in[2]);
|
||||
s[3] = _mm_sub_epi32(in[0], in[3]);
|
||||
|
||||
v0 = _mm_cvtepi32_epi64(s[0]); // s01 s00
|
||||
v1 = _mm_cvtepi32_epi64(_mm_unpackhi_epi64(s[0], s[0])); // s03 s02
|
||||
v2 = _mm_cvtepi32_epi64(s[1]); // s11 s10
|
||||
v3 = _mm_cvtepi32_epi64(_mm_unpackhi_epi64(s[1], s[1])); // s13 s12
|
||||
v4 = _mm_cvtepi32_epi64(s[2]); // s21 s20
|
||||
v5 = _mm_cvtepi32_epi64(_mm_unpackhi_epi64(s[2], s[2])); // s23 s22
|
||||
v6 = _mm_cvtepi32_epi64(s[3]); // s31 s30
|
||||
v7 = _mm_cvtepi32_epi64(_mm_unpackhi_epi64(s[3], s[3])); // s33 s32
|
||||
|
||||
u0 = _mm_mul_epi32(v0, k__cospi_p16_p16);
|
||||
u1 = _mm_mul_epi32(v1, k__cospi_p16_p16);
|
||||
u2 = _mm_mul_epi32(v2, k__cospi_p16_p16);
|
||||
u3 = _mm_mul_epi32(v3, k__cospi_p16_p16);
|
||||
|
||||
s[0] = _mm_add_epi64(u0, u2); // y10 y00
|
||||
s[1] = _mm_add_epi64(u1, u3); // y30 y20
|
||||
|
||||
u2 = _mm_mul_epi32(v2, k__cospi_m16_m16);
|
||||
u3 = _mm_mul_epi32(v3, k__cospi_m16_m16);
|
||||
|
||||
s[2] = _mm_add_epi64(u0, u2); // y12 y02
|
||||
s[3] = _mm_add_epi64(u1, u3); // y32 y22
|
||||
|
||||
u0 = _mm_mul_epi32(v6, k__cospi_p08_p08);
|
||||
u1 = _mm_mul_epi32(v5, k__cospi_p24_p24);
|
||||
u2 = _mm_mul_epi32(v4, k__cospi_p24_p24);
|
||||
u3 = _mm_mul_epi32(v7, k__cospi_p08_p08);
|
||||
|
||||
s[4] = _mm_add_epi64(u0, u2); // y11 y01
|
||||
s[5] = _mm_add_epi64(u1, u3); // y31 y21
|
||||
|
||||
u0 = _mm_mul_epi32(v4, k__cospi_m08_m08);
|
||||
u1 = _mm_mul_epi32(v5, k__cospi_m08_m08);
|
||||
u2 = _mm_mul_epi32(v6, k__cospi_p24_p24);
|
||||
u3 = _mm_mul_epi32(v7, k__cospi_p24_p24);
|
||||
|
||||
s[6] = _mm_add_epi64(u0, u2); // y13 y03
|
||||
s[7] = _mm_add_epi64(u1, u3); // y33 y23
|
||||
|
||||
s[0] = _mm_add_epi64(s[0], k__DCT_CONST_ROUNDING);
|
||||
s[1] = _mm_add_epi64(s[1], k__DCT_CONST_ROUNDING);
|
||||
s[2] = _mm_add_epi64(s[2], k__DCT_CONST_ROUNDING);
|
||||
s[3] = _mm_add_epi64(s[3], k__DCT_CONST_ROUNDING);
|
||||
s[4] = _mm_add_epi64(s[4], k__DCT_CONST_ROUNDING);
|
||||
s[5] = _mm_add_epi64(s[5], k__DCT_CONST_ROUNDING);
|
||||
s[6] = _mm_add_epi64(s[6], k__DCT_CONST_ROUNDING);
|
||||
s[7] = _mm_add_epi64(s[7], k__DCT_CONST_ROUNDING);
|
||||
|
||||
s[0] = _mm_srli_epi64(s[0], DCT_CONST_BITS);
|
||||
s[1] = _mm_srli_epi64(s[1], DCT_CONST_BITS);
|
||||
s[2] = _mm_srli_epi64(s[2], DCT_CONST_BITS);
|
||||
s[3] = _mm_srli_epi64(s[3], DCT_CONST_BITS);
|
||||
s[4] = _mm_srli_epi64(s[4], DCT_CONST_BITS);
|
||||
s[5] = _mm_srli_epi64(s[5], DCT_CONST_BITS);
|
||||
s[6] = _mm_srli_epi64(s[6], DCT_CONST_BITS);
|
||||
s[7] = _mm_srli_epi64(s[7], DCT_CONST_BITS);
|
||||
|
||||
s[0] = _mm_shuffle_epi32(s[0], 0x88);
|
||||
s[1] = _mm_shuffle_epi32(s[1], 0x88);
|
||||
s[2] = _mm_shuffle_epi32(s[2], 0x88);
|
||||
s[3] = _mm_shuffle_epi32(s[3], 0x88);
|
||||
s[4] = _mm_shuffle_epi32(s[4], 0x88);
|
||||
s[5] = _mm_shuffle_epi32(s[5], 0x88);
|
||||
s[6] = _mm_shuffle_epi32(s[6], 0x88);
|
||||
s[7] = _mm_shuffle_epi32(s[7], 0x88);
|
||||
|
||||
v0 = _mm_unpacklo_epi32(s[0], s[4]);
|
||||
v1 = _mm_unpacklo_epi32(s[2], s[6]);
|
||||
v2 = _mm_unpacklo_epi32(s[1], s[5]);
|
||||
v3 = _mm_unpacklo_epi32(s[3], s[7]);
|
||||
|
||||
in[0] = _mm_unpacklo_epi64(v0, v1);
|
||||
in[1] = _mm_unpackhi_epi64(v0, v1);
|
||||
in[2] = _mm_unpacklo_epi64(v2, v3);
|
||||
in[3] = _mm_unpackhi_epi64(v2, v3);
|
||||
}
|
||||
|
||||
static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
|
||||
const __m128i kOne = _mm_set1_epi32(1);
|
||||
res[0] = _mm_add_epi32(res[0], kOne);
|
||||
res[1] = _mm_add_epi32(res[1], kOne);
|
||||
res[2] = _mm_add_epi32(res[2], kOne);
|
||||
res[3] = _mm_add_epi32(res[3], kOne);
|
||||
res[0] = _mm_srai_epi32(res[0], 2);
|
||||
res[1] = _mm_srai_epi32(res[1], 2);
|
||||
res[2] = _mm_srai_epi32(res[2], 2);
|
||||
res[3] = _mm_srai_epi32(res[3], 2);
|
||||
_mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
|
||||
_mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
|
||||
_mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
|
||||
_mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
|
||||
}
|
||||
|
||||
void vp10_highbd_fht4x4_sse4_1(const int16_t *input, tran_low_t *output,
|
||||
int stride, int tx_type) {
|
||||
__m128i in[4];
|
||||
|
||||
switch (tx_type) {
|
||||
case DCT_DCT:
|
||||
load_buffer_4x4(input, in, stride, 0, 0);
|
||||
fdct4x4_sse4_1(in);
|
||||
fdct4x4_sse4_1(in);
|
||||
write_buffer_4x4(output, in);
|
||||
break;
|
||||
case ADST_DCT:
|
||||
case DCT_ADST:
|
||||
case ADST_ADST:
|
||||
vp10_highbd_fht4x4_c(input, output, stride, tx_type);
|
||||
break;
|
||||
#if CONFIG_EXT_TX
|
||||
case FLIPADST_DCT:
|
||||
case DCT_FLIPADST:
|
||||
case FLIPADST_FLIPADST:
|
||||
case ADST_FLIPADST:
|
||||
case FLIPADST_ADST:
|
||||
vp10_highbd_fht4x4_c(input, output, stride, tx_type);
|
||||
break;
|
||||
case DST_DST:
|
||||
case DCT_DST:
|
||||
case DST_DCT:
|
||||
case DST_ADST:
|
||||
case ADST_DST:
|
||||
case DST_FLIPADST:
|
||||
case FLIPADST_DST:
|
||||
vp10_highbd_fht4x4_c(input, output, stride, tx_type);
|
||||
break;
|
||||
#endif // CONFIG_EXT_TX
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // CONFIG_VP9_HIGHBITDEPTH
|
|
@ -112,6 +112,7 @@ endif
|
|||
|
||||
VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.c
|
||||
VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c
|
||||
VP10_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_fwd_txfm_sse4.c
|
||||
|
||||
ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
|
||||
VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoiser_sse2.c
|
||||
|
|
Загрузка…
Ссылка в новой задаче