From 25520d8dc3a0c295115a4f22a0965b7561dcbefa Mon Sep 17 00:00:00 2001 From: Angie Chiang Date: Thu, 24 Mar 2016 15:34:27 -0700 Subject: [PATCH] change vp10_fwd_txfm2d_#x#_sse2 to vp10_fwd_txfm2d_#x#_sse4_1 The speed performance for running 20k times is as follows. Note that the vp10_highbd_fdct#x#_sse2 version is the 16-bit version plus a range check; the rest are 32-bit versions. vp10_fwd_txfm2d_4x4_c (2 ms) vp10_fwd_txfm2d_8x8_c (9 ms) vp10_fwd_txfm2d_16x16_c (45 ms) vp10_fwd_txfm2d_32x32_c (233 ms) vp10_fwd_txfm2d_4x4_sse4_1 (2 ms) vp10_fwd_txfm2d_8x8_sse4_1 (3 ms) vp10_fwd_txfm2d_16x16_sse4_1 (16 ms) vp10_fwd_txfm2d_32x32_sse4_1 (80 ms) vp10_highbd_fdct4x4_c (1 ms) vp10_highbd_fdct8x8_c (3 ms) vp10_highbd_fdct16x16_c (17 ms) highbd_fdct32x32_c (160 ms) vp10_highbd_fdct4x4_sse2 (0 ms) vp10_highbd_fdct8x8_sse2 (2 ms) vp10_highbd_fdct16x16_sse2 (8 ms) highbd_fdct32x32_sse2 (105 ms) Change-Id: I24daf1e0d4d66e91e4ce61ef71cefa7b70ee90ce --- test/test.mk | 2 +- ...2_test.cc => vp10_fwd_txfm2d_sse4_test.cc} | 19 +- vp10/common/vp10_rtcd_defs.pl | 10 +- ...d_txfm1d_sse2.c => vp10_fwd_txfm1d_sse4.c} | 890 +++++++++--------- ...d_txfm2d_sse2.c => vp10_fwd_txfm2d_sse4.c} | 48 +- vp10/common/x86/vp10_txfm1d_sse2.h | 153 --- vp10/common/x86/vp10_txfm1d_sse4.h | 145 +++ vp10/vp10_common.mk | 7 +- 8 files changed, 633 insertions(+), 641 deletions(-) rename test/{vp10_fwd_txfm2d_sse2_test.cc => vp10_fwd_txfm2d_sse4_test.cc} (76%) rename vp10/common/x86/{vp10_fwd_txfm1d_sse2.c => vp10_fwd_txfm1d_sse4.c} (74%) rename vp10/common/x86/{vp10_fwd_txfm2d_sse2.c => vp10_fwd_txfm2d_sse4.c} (68%) delete mode 100644 vp10/common/x86/vp10_txfm1d_sse2.h create mode 100644 vp10/common/x86/vp10_txfm1d_sse4.h diff --git a/test/test.mk b/test/test.mk index d466b4712..7c3f10142 100644 --- a/test/test.mk +++ b/test/test.mk @@ -174,7 +174,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ANS) += vp10_ans_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += sum_squares_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += 
subtract_test.cc -LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp10_fwd_txfm2d_sse2_test.cc +LIBVPX_TEST_SRCS-$(HAVE_SSE4_1) += vp10_fwd_txfm2d_sse4_test.cc ifeq ($(CONFIG_EXT_INTER),yes) LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc diff --git a/test/vp10_fwd_txfm2d_sse2_test.cc b/test/vp10_fwd_txfm2d_sse4_test.cc similarity index 76% rename from test/vp10_fwd_txfm2d_sse2_test.cc rename to test/vp10_fwd_txfm2d_sse4_test.cc index f5cc15944..d3882cd9e 100644 --- a/test/vp10_fwd_txfm2d_sse2_test.cc +++ b/test/vp10_fwd_txfm2d_sse4_test.cc @@ -12,9 +12,9 @@ using libvpx_test::ACMRandom; namespace { #if CONFIG_VP9_HIGHBITDEPTH -TEST(vp10_fwd_txfm2d_sse2, accuracy) { +TEST(vp10_fwd_txfm2d_sse4_1, accuracy) { int16_t input[4096] = {0}; - int32_t output_sse2[4096] = {0}; + int32_t output_sse4_1[4096] = {0}; int32_t output_c[4096] = {0}; int txfm_num = 17; @@ -36,10 +36,10 @@ TEST(vp10_fwd_txfm2d_sse2, accuracy) { vp10_fwd_txfm2d_32x32_c, vp10_fwd_txfm2d_64x64_c, }; - Fwd_Txfm2d_Func txfm2d_func_sse2_list[] = { - vp10_fwd_txfm2d_4x4_sse2, vp10_fwd_txfm2d_8x8_sse2, - vp10_fwd_txfm2d_16x16_sse2, vp10_fwd_txfm2d_32x32_sse2, - vp10_fwd_txfm2d_64x64_sse2, + Fwd_Txfm2d_Func txfm2d_func_sse4_1_list[] = { + vp10_fwd_txfm2d_4x4_sse4_1, vp10_fwd_txfm2d_8x8_sse4_1, + vp10_fwd_txfm2d_16x16_sse4_1, vp10_fwd_txfm2d_32x32_sse4_1, + vp10_fwd_txfm2d_64x64_sse4_1, }; for (int i = 0; i < txfm_num; i++) { @@ -47,7 +47,7 @@ TEST(vp10_fwd_txfm2d_sse2, accuracy) { int txfm_size = cfg.txfm_size; int func_idx = get_max_bit(txfm_size) - 2; Fwd_Txfm2d_Func txfm2d_func_c = txfm2d_func_c_list[func_idx]; - Fwd_Txfm2d_Func txfm2d_func_sse2 = txfm2d_func_sse2_list[func_idx]; + Fwd_Txfm2d_Func txfm2d_func_sse4_1 = txfm2d_func_sse4_1_list[func_idx]; ACMRandom rnd(ACMRandom::DeterministicSeed()); @@ -59,10 +59,11 @@ TEST(vp10_fwd_txfm2d_sse2, accuracy) { } txfm2d_func_c(input, output_c, cfg.txfm_size, &cfg, 10); - txfm2d_func_sse2(input, output_sse2, cfg.txfm_size, &cfg, 10); + txfm2d_func_sse4_1(input, 
output_sse4_1, cfg.txfm_size, &cfg, 10); for (int r = 0; r < txfm_size; r++) { for (int c = 0; c < txfm_size; c++) { - EXPECT_EQ(output_c[r * txfm_size + c], output_sse2[r * txfm_size + c]); + EXPECT_EQ(output_c[r * txfm_size + c], + output_sse4_1[r * txfm_size + c]); } } } diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl index ec619c3ab..7b2023938 100644 --- a/vp10/common/vp10_rtcd_defs.pl +++ b/vp10/common/vp10_rtcd_defs.pl @@ -615,15 +615,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { #fwd txfm add_proto qw/void vp10_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd"; - specialize qw/vp10_fwd_txfm2d_4x4 sse2/; + specialize qw/vp10_fwd_txfm2d_4x4 sse4_1/; add_proto qw/void vp10_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd"; - specialize qw/vp10_fwd_txfm2d_8x8 sse2/; + specialize qw/vp10_fwd_txfm2d_8x8 sse4_1/; add_proto qw/void vp10_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd"; - specialize qw/vp10_fwd_txfm2d_16x16 sse2/; + specialize qw/vp10_fwd_txfm2d_16x16 sse4_1/; add_proto qw/void vp10_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd"; - specialize qw/vp10_fwd_txfm2d_32x32 sse2/; + specialize qw/vp10_fwd_txfm2d_32x32 sse4_1/; add_proto qw/void vp10_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd"; - specialize qw/vp10_fwd_txfm2d_64x64 sse2/; + specialize qw/vp10_fwd_txfm2d_64x64 sse4_1/; #inv txfm add_proto qw/void vp10_inv_txfm2d_add_4x4/, "const int32_t *input, uint16_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd"; diff --git a/vp10/common/x86/vp10_fwd_txfm1d_sse2.c b/vp10/common/x86/vp10_fwd_txfm1d_sse4.c similarity 
index 74% rename from vp10/common/x86/vp10_fwd_txfm1d_sse2.c rename to vp10/common/x86/vp10_fwd_txfm1d_sse4.c index fd9e7a3bb..5ade8bd3f 100644 --- a/vp10/common/x86/vp10_fwd_txfm1d_sse2.c +++ b/vp10/common/x86/vp10_fwd_txfm1d_sse4.c @@ -1,7 +1,7 @@ -#include "vp10/common/x86/vp10_txfm1d_sse2.h" +#include "vp10/common/x86/vp10_txfm1d_sse4.h" -void vp10_fdct4_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range) { +void vp10_fdct4_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range) { const int txfm_size = 4; const int num_per_128 = 4; const int32_t* cospi; @@ -32,10 +32,10 @@ void vp10_fdct4_new_sse2(const __m128i* input, __m128i* output, stage_idx++; bit = cos_bit[stage_idx]; cospi = cospi_arr[bit - cos_bit_min]; - btf_32_sse2_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0], buf0[1], - bit); - btf_32_sse2_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2], buf0[3], - bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0], + buf0[1], bit); + btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2], + buf0[3], bit); // stage 3 stage_idx++; @@ -53,8 +53,8 @@ void vp10_fdct4_new_sse2(const __m128i* input, __m128i* output, } } -void vp10_fdct8_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range) { +void vp10_fdct8_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range) { const int txfm_size = 8; const int num_per_128 = 4; const int32_t* cospi; @@ -98,18 +98,18 @@ void vp10_fdct8_new_sse2(const __m128i* input, __m128i* output, buf0[1] = _mm_add_epi32(buf1[1], buf1[2]); buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]); buf0[4] = buf1[4]; - btf_32_sse2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6], - bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], + buf0[6], bit); buf0[7] = buf1[7]; // stage 3 stage_idx++; 
bit = cos_bit[stage_idx]; cospi = cospi_arr[bit - cos_bit_min]; - btf_32_sse2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], - bit); - btf_32_sse2_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3], - bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], + buf1[1], bit); + btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], + buf1[3], bit); buf1[4] = _mm_add_epi32(buf0[4], buf0[5]); buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]); buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]); @@ -123,10 +123,10 @@ void vp10_fdct8_new_sse2(const __m128i* input, __m128i* output, buf0[1] = buf1[1]; buf0[2] = buf1[2]; buf0[3] = buf1[3]; - btf_32_sse2_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7], - bit); - btf_32_sse2_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6], - bit); + btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7], + bit); + btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], + buf0[6], bit); // stage 5 stage_idx++; @@ -152,8 +152,8 @@ void vp10_fdct8_new_sse2(const __m128i* input, __m128i* output, } } -void vp10_fdct16_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range) { +void vp10_fdct16_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range) { const int txfm_size = 16; const int num_per_128 = 4; const int32_t* cospi; @@ -218,10 +218,10 @@ void vp10_fdct16_new_sse2(const __m128i* input, __m128i* output, buf0[4] = _mm_sub_epi32(buf1[3], buf1[4]); buf0[8] = buf1[8]; buf0[9] = buf1[9]; - btf_32_sse2_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10], - buf0[13], bit); - btf_32_sse2_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11], - buf0[12], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10], + buf0[13], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11], + buf0[12], bit); 
buf0[14] = buf1[14]; buf0[15] = buf1[15]; @@ -234,8 +234,8 @@ void vp10_fdct16_new_sse2(const __m128i* input, __m128i* output, buf1[1] = _mm_add_epi32(buf0[1], buf0[2]); buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]); buf1[4] = buf0[4]; - btf_32_sse2_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5], buf1[6], - bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5], + buf1[6], bit); buf1[7] = buf0[7]; buf1[8] = _mm_add_epi32(buf0[8], buf0[11]); buf1[11] = _mm_sub_epi32(buf0[8], buf0[11]); @@ -250,19 +250,19 @@ void vp10_fdct16_new_sse2(const __m128i* input, __m128i* output, stage_idx++; bit = cos_bit[stage_idx]; cospi = cospi_arr[bit - cos_bit_min]; - btf_32_sse2_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0], buf0[1], - bit); - btf_32_sse2_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2], buf0[3], - bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0], + buf0[1], bit); + btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2], + buf0[3], bit); buf0[4] = _mm_add_epi32(buf1[4], buf1[5]); buf0[5] = _mm_sub_epi32(buf1[4], buf1[5]); buf0[6] = _mm_sub_epi32(buf1[7], buf1[6]); buf0[7] = _mm_add_epi32(buf1[7], buf1[6]); buf0[8] = buf1[8]; - btf_32_sse2_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9], - buf0[14], bit); - btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10], - buf0[13], bit); + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9], + buf0[14], bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10], + buf0[13], bit); buf0[11] = buf1[11]; buf0[12] = buf1[12]; buf0[15] = buf1[15]; @@ -275,10 +275,10 @@ void vp10_fdct16_new_sse2(const __m128i* input, __m128i* output, buf1[1] = buf0[1]; buf1[2] = buf0[2]; buf1[3] = buf0[3]; - btf_32_sse2_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7], - bit); - btf_32_sse2_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5], buf1[6], - bit); + btf_32_sse4_1_type1(cospi[56], 
cospi[8], buf0[4], buf0[7], buf1[4], buf1[7], + bit); + btf_32_sse4_1_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5], + buf1[6], bit); buf1[8] = _mm_add_epi32(buf0[8], buf0[9]); buf1[9] = _mm_sub_epi32(buf0[8], buf0[9]); buf1[10] = _mm_sub_epi32(buf0[11], buf0[10]); @@ -300,14 +300,14 @@ void vp10_fdct16_new_sse2(const __m128i* input, __m128i* output, buf0[5] = buf1[5]; buf0[6] = buf1[6]; buf0[7] = buf1[7]; - btf_32_sse2_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8], buf0[15], - bit); - btf_32_sse2_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9], - buf0[14], bit); - btf_32_sse2_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10], - buf0[13], bit); - btf_32_sse2_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11], - buf0[12], bit); + btf_32_sse4_1_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8], + buf0[15], bit); + btf_32_sse4_1_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9], + buf0[14], bit); + btf_32_sse4_1_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10], + buf0[13], bit); + btf_32_sse4_1_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11], + buf0[12], bit); // stage 7 stage_idx++; @@ -349,8 +349,8 @@ void vp10_fdct16_new_sse2(const __m128i* input, __m128i* output, } } -void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range) { +void vp10_fdct32_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range) { const int txfm_size = 32; const int num_per_128 = 4; const int32_t* cospi; @@ -457,14 +457,14 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output, buf0[17] = buf1[17]; buf0[18] = buf1[18]; buf0[19] = buf1[19]; - btf_32_sse2_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], - buf0[27], bit); - btf_32_sse2_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], - buf0[26], bit); - btf_32_sse2_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], - buf0[25], bit); - 
btf_32_sse2_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], - buf0[24], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20], + buf0[27], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21], + buf0[26], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22], + buf0[25], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23], + buf0[24], bit); buf0[28] = buf1[28]; buf0[29] = buf1[29]; buf0[30] = buf1[30]; @@ -484,10 +484,10 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output, buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]); buf1[8] = buf0[8]; buf1[9] = buf0[9]; - btf_32_sse2_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], - buf1[13], bit); - btf_32_sse2_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], - buf1[12], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10], + buf1[13], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11], + buf1[12], bit); buf1[14] = buf0[14]; buf1[15] = buf0[15]; buf1[16] = _mm_add_epi32(buf0[16], buf0[23]); @@ -516,8 +516,8 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output, buf0[1] = _mm_add_epi32(buf1[1], buf1[2]); buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]); buf0[4] = buf1[4]; - btf_32_sse2_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], buf0[6], - bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5], + buf0[6], bit); buf0[7] = buf1[7]; buf0[8] = _mm_add_epi32(buf1[8], buf1[11]); buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]); @@ -529,14 +529,14 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output, buf0[14] = _mm_add_epi32(buf1[14], buf1[13]); buf0[16] = buf1[16]; buf0[17] = buf1[17]; - btf_32_sse2_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], - buf0[29], bit); - btf_32_sse2_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], - buf0[28], bit); - 
btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], - buf0[27], bit); - btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], - buf0[26], bit); + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18], + buf0[29], bit); + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19], + buf0[28], bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20], + buf0[27], bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21], + buf0[26], bit); buf0[22] = buf1[22]; buf0[23] = buf1[23]; buf0[24] = buf1[24]; @@ -548,19 +548,19 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output, stage_idx++; bit = cos_bit[stage_idx]; cospi = cospi_arr[bit - cos_bit_min]; - btf_32_sse2_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], buf1[1], - bit); - btf_32_sse2_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], buf1[3], - bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0], + buf1[1], bit); + btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2], + buf1[3], bit); buf1[4] = _mm_add_epi32(buf0[4], buf0[5]); buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]); buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]); buf1[7] = _mm_add_epi32(buf0[7], buf0[6]); buf1[8] = buf0[8]; - btf_32_sse2_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], - buf1[14], bit); - btf_32_sse2_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], - buf1[13], bit); + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9], + buf1[14], bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10], + buf1[13], bit); buf1[11] = buf0[11]; buf1[12] = buf0[12]; buf1[15] = buf0[15]; @@ -589,10 +589,10 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output, buf0[1] = buf1[1]; buf0[2] = buf1[2]; buf0[3] = buf1[3]; - btf_32_sse2_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7], - bit); - 
btf_32_sse2_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], buf0[6], - bit); + btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7], + bit); + btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5], + buf0[6], bit); buf0[8] = _mm_add_epi32(buf1[8], buf1[9]); buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]); buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]); @@ -602,16 +602,16 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output, buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]); buf0[15] = _mm_add_epi32(buf1[15], buf1[14]); buf0[16] = buf1[16]; - btf_32_sse2_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], - buf0[30], bit); - btf_32_sse2_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], - buf0[29], bit); + btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17], + buf0[30], bit); + btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18], + buf0[29], bit); buf0[19] = buf1[19]; buf0[20] = buf1[20]; - btf_32_sse2_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], - buf0[26], bit); - btf_32_sse2_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], - buf0[25], bit); + btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21], + buf0[26], bit); + btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22], + buf0[25], bit); buf0[23] = buf1[23]; buf0[24] = buf1[24]; buf0[27] = buf1[27]; @@ -630,14 +630,14 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output, buf1[5] = buf0[5]; buf1[6] = buf0[6]; buf1[7] = buf0[7]; - btf_32_sse2_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8], buf1[15], - bit); - btf_32_sse2_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], - buf1[14], bit); - btf_32_sse2_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10], - buf1[13], bit); - btf_32_sse2_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11], - buf1[12], bit); + btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], 
buf1[8], + buf1[15], bit); + btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9], + buf1[14], bit); + btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10], + buf1[13], bit); + btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11], + buf1[12], bit); buf1[16] = _mm_add_epi32(buf0[16], buf0[17]); buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]); buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]); @@ -675,22 +675,22 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output, buf0[13] = buf1[13]; buf0[14] = buf1[14]; buf0[15] = buf1[15]; - btf_32_sse2_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], - buf0[31], bit); - btf_32_sse2_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17], - buf0[30], bit); - btf_32_sse2_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18], - buf0[29], bit); - btf_32_sse2_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19], - buf0[28], bit); - btf_32_sse2_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20], - buf0[27], bit); - btf_32_sse2_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21], - buf0[26], bit); - btf_32_sse2_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22], - buf0[25], bit); - btf_32_sse2_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23], - buf0[24], bit); + btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16], + buf0[31], bit); + btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17], + buf0[30], bit); + btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18], + buf0[29], bit); + btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19], + buf0[28], bit); + btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20], + buf0[27], bit); + btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21], + buf0[26], bit); + btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22], + buf0[25], bit); + btf_32_sse4_1_type1(cospi[6], cospi[58], 
buf1[23], buf1[24], buf0[23], + buf0[24], bit); // stage 9 stage_idx++; @@ -764,8 +764,8 @@ void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output, } } -void vp10_fadst4_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range) { +void vp10_fadst4_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range) { const int txfm_size = 4; const int num_per_128 = 4; const int32_t* cospi; @@ -796,10 +796,10 @@ void vp10_fadst4_new_sse2(const __m128i* input, __m128i* output, stage_idx++; bit = cos_bit[stage_idx]; cospi = cospi_arr[bit - cos_bit_min]; - btf_32_sse2_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1], - bit); - btf_32_sse2_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], buf0[3], - bit); + btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1], + bit); + btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2], + buf0[3], bit); // stage 3 stage_idx++; @@ -816,8 +816,8 @@ void vp10_fadst4_new_sse2(const __m128i* input, __m128i* output, cospi = cospi_arr[bit - cos_bit_min]; buf0[0] = buf1[0]; buf0[1] = buf1[1]; - btf_32_sse2_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], buf0[3], - bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], + buf0[3], bit); // stage 5 stage_idx++; @@ -835,8 +835,8 @@ void vp10_fadst4_new_sse2(const __m128i* input, __m128i* output, } } -void vp10_fadst8_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range) { +void vp10_fadst8_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range) { const int txfm_size = 8; const int num_per_128 = 4; const int32_t* cospi; @@ -875,14 +875,14 @@ void vp10_fadst8_new_sse2(const __m128i* input, __m128i* output, stage_idx++; bit = cos_bit[stage_idx]; cospi = cospi_arr[bit - cos_bit_min]; - btf_32_sse2_type0(cospi[4], cospi[60], buf1[0], buf1[1], 
buf0[0], buf0[1], - bit); - btf_32_sse2_type0(cospi[20], cospi[44], buf1[2], buf1[3], buf0[2], buf0[3], - bit); - btf_32_sse2_type0(cospi[36], cospi[28], buf1[4], buf1[5], buf0[4], buf0[5], - bit); - btf_32_sse2_type0(cospi[52], cospi[12], buf1[6], buf1[7], buf0[6], buf0[7], - bit); + btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[0], buf1[1], buf0[0], buf0[1], + bit); + btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[2], buf1[3], buf0[2], + buf0[3], bit); + btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[4], buf1[5], buf0[4], + buf0[5], bit); + btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[6], buf1[7], buf0[6], + buf0[7], bit); // stage 3 stage_idx++; @@ -905,10 +905,10 @@ void vp10_fadst8_new_sse2(const __m128i* input, __m128i* output, buf0[1] = buf1[1]; buf0[2] = buf1[2]; buf0[3] = buf1[3]; - btf_32_sse2_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4], buf0[5], - bit); - btf_32_sse2_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6], buf0[7], - bit); + btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4], + buf0[5], bit); + btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6], + buf0[7], bit); // stage 5 stage_idx++; @@ -929,12 +929,12 @@ void vp10_fadst8_new_sse2(const __m128i* input, __m128i* output, cospi = cospi_arr[bit - cos_bit_min]; buf0[0] = buf1[0]; buf0[1] = buf1[1]; - btf_32_sse2_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], buf0[3], - bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], + buf0[3], bit); buf0[4] = buf1[4]; buf0[5] = buf1[5]; - btf_32_sse2_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6], buf0[7], - bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6], + buf0[7], bit); // stage 7 stage_idx++; @@ -960,8 +960,8 @@ void vp10_fadst8_new_sse2(const __m128i* input, __m128i* output, } } -void vp10_fadst16_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range) { +void vp10_fadst16_new_sse4_1(const 
__m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range) { const int txfm_size = 16; const int num_per_128 = 4; const int32_t* cospi; @@ -1016,22 +1016,22 @@ void vp10_fadst16_new_sse2(const __m128i* input, __m128i* output, stage_idx++; bit = cos_bit[stage_idx]; cospi = cospi_arr[bit - cos_bit_min]; - btf_32_sse2_type0(cospi[2], cospi[62], buf1[0], buf1[1], buf0[0], buf0[1], - bit); - btf_32_sse2_type0(cospi[10], cospi[54], buf1[2], buf1[3], buf0[2], buf0[3], - bit); - btf_32_sse2_type0(cospi[18], cospi[46], buf1[4], buf1[5], buf0[4], buf0[5], - bit); - btf_32_sse2_type0(cospi[26], cospi[38], buf1[6], buf1[7], buf0[6], buf0[7], - bit); - btf_32_sse2_type0(cospi[34], cospi[30], buf1[8], buf1[9], buf0[8], buf0[9], - bit); - btf_32_sse2_type0(cospi[42], cospi[22], buf1[10], buf1[11], buf0[10], - buf0[11], bit); - btf_32_sse2_type0(cospi[50], cospi[14], buf1[12], buf1[13], buf0[12], - buf0[13], bit); - btf_32_sse2_type0(cospi[58], cospi[6], buf1[14], buf1[15], buf0[14], - buf0[15], bit); + btf_32_sse4_1_type0(cospi[2], cospi[62], buf1[0], buf1[1], buf0[0], buf0[1], + bit); + btf_32_sse4_1_type0(cospi[10], cospi[54], buf1[2], buf1[3], buf0[2], + buf0[3], bit); + btf_32_sse4_1_type0(cospi[18], cospi[46], buf1[4], buf1[5], buf0[4], + buf0[5], bit); + btf_32_sse4_1_type0(cospi[26], cospi[38], buf1[6], buf1[7], buf0[6], + buf0[7], bit); + btf_32_sse4_1_type0(cospi[34], cospi[30], buf1[8], buf1[9], buf0[8], + buf0[9], bit); + btf_32_sse4_1_type0(cospi[42], cospi[22], buf1[10], buf1[11], buf0[10], + buf0[11], bit); + btf_32_sse4_1_type0(cospi[50], cospi[14], buf1[12], buf1[13], buf0[12], + buf0[13], bit); + btf_32_sse4_1_type0(cospi[58], cospi[6], buf1[14], buf1[15], buf0[14], + buf0[15], bit); // stage 3 stage_idx++; @@ -1066,14 +1066,14 @@ void vp10_fadst16_new_sse2(const __m128i* input, __m128i* output, buf0[5] = buf1[5]; buf0[6] = buf1[6]; buf0[7] = buf1[7]; - btf_32_sse2_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9], - bit); - 
btf_32_sse2_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10], - buf0[11], bit); - btf_32_sse2_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12], - buf0[13], bit); - btf_32_sse2_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14], - buf0[15], bit); + btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9], + bit); + btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10], + buf0[11], bit); + btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12], + buf0[13], bit); + btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14], + buf0[15], bit); // stage 5 stage_idx++; @@ -1104,18 +1104,18 @@ void vp10_fadst16_new_sse2(const __m128i* input, __m128i* output, buf0[1] = buf1[1]; buf0[2] = buf1[2]; buf0[3] = buf1[3]; - btf_32_sse2_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4], buf0[5], - bit); - btf_32_sse2_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6], buf0[7], - bit); + btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4], + buf0[5], bit); + btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6], + buf0[7], bit); buf0[8] = buf1[8]; buf0[9] = buf1[9]; buf0[10] = buf1[10]; buf0[11] = buf1[11]; - btf_32_sse2_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12], - buf0[13], bit); - btf_32_sse2_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14], - buf0[15], bit); + btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12], + buf0[13], bit); + btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14], + buf0[15], bit); // stage 7 stage_idx++; @@ -1144,20 +1144,20 @@ void vp10_fadst16_new_sse2(const __m128i* input, __m128i* output, cospi = cospi_arr[bit - cos_bit_min]; buf0[0] = buf1[0]; buf0[1] = buf1[1]; - btf_32_sse2_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], buf0[3], - bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], + buf0[3], bit); buf0[4] = buf1[4]; buf0[5] = 
buf1[5]; - btf_32_sse2_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6], buf0[7], - bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6], + buf0[7], bit); buf0[8] = buf1[8]; buf0[9] = buf1[9]; - btf_32_sse2_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10], - buf0[11], bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10], + buf0[11], bit); buf0[12] = buf1[12]; buf0[13] = buf1[13]; - btf_32_sse2_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14], - buf0[15], bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14], + buf0[15], bit); // stage 9 stage_idx++; @@ -1199,8 +1199,8 @@ void vp10_fadst16_new_sse2(const __m128i* input, __m128i* output, } } -void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range) { +void vp10_fadst32_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range) { const int txfm_size = 32; const int num_per_128 = 4; const int32_t* cospi; @@ -1287,38 +1287,38 @@ void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output, stage_idx++; bit = cos_bit[stage_idx]; cospi = cospi_arr[bit - cos_bit_min]; - btf_32_sse2_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1], - bit); - btf_32_sse2_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3], - bit); - btf_32_sse2_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5], - bit); - btf_32_sse2_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6], buf0[7], - bit); - btf_32_sse2_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8], buf0[9], - bit); - btf_32_sse2_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10], - buf0[11], bit); - btf_32_sse2_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12], - buf0[13], bit); - btf_32_sse2_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14], - buf0[15], bit); - btf_32_sse2_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16], - 
buf0[17], bit); - btf_32_sse2_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18], - buf0[19], bit); - btf_32_sse2_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20], - buf0[21], bit); - btf_32_sse2_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22], - buf0[23], bit); - btf_32_sse2_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24], - buf0[25], bit); - btf_32_sse2_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26], - buf0[27], bit); - btf_32_sse2_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28], - buf0[29], bit); - btf_32_sse2_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30], - buf0[31], bit); + btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1], + bit); + btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3], + bit); + btf_32_sse4_1_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5], + bit); + btf_32_sse4_1_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6], + buf0[7], bit); + btf_32_sse4_1_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8], + buf0[9], bit); + btf_32_sse4_1_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10], + buf0[11], bit); + btf_32_sse4_1_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12], + buf0[13], bit); + btf_32_sse4_1_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14], + buf0[15], bit); + btf_32_sse4_1_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16], + buf0[17], bit); + btf_32_sse4_1_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18], + buf0[19], bit); + btf_32_sse4_1_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20], + buf0[21], bit); + btf_32_sse4_1_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22], + buf0[23], bit); + btf_32_sse4_1_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24], + buf0[25], bit); + btf_32_sse4_1_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26], + buf0[27], bit); + btf_32_sse4_1_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28], + buf0[29], bit); 
+ btf_32_sse4_1_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30], + buf0[31], bit); // stage 3 stage_idx++; @@ -1377,22 +1377,22 @@ void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output, buf0[13] = buf1[13]; buf0[14] = buf1[14]; buf0[15] = buf1[15]; - btf_32_sse2_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16], - buf0[17], bit); - btf_32_sse2_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18], - buf0[19], bit); - btf_32_sse2_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20], - buf0[21], bit); - btf_32_sse2_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22], - buf0[23], bit); - btf_32_sse2_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24], - buf0[25], bit); - btf_32_sse2_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26], - buf0[27], bit); - btf_32_sse2_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28], - buf0[29], bit); - btf_32_sse2_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30], - buf0[31], bit); + btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16], + buf0[17], bit); + btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18], + buf0[19], bit); + btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20], + buf0[21], bit); + btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22], + buf0[23], bit); + btf_32_sse4_1_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24], + buf0[25], bit); + btf_32_sse4_1_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26], + buf0[27], bit); + btf_32_sse4_1_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28], + buf0[29], bit); + btf_32_sse4_1_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30], + buf0[31], bit); // stage 5 stage_idx++; @@ -1443,14 +1443,14 @@ void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output, buf0[5] = buf1[5]; buf0[6] = buf1[6]; buf0[7] = buf1[7]; - btf_32_sse2_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9], - bit); - 
btf_32_sse2_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10], - buf0[11], bit); - btf_32_sse2_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12], - buf0[13], bit); - btf_32_sse2_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14], - buf0[15], bit); + btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9], + bit); + btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10], + buf0[11], bit); + btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12], + buf0[13], bit); + btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14], + buf0[15], bit); buf0[16] = buf1[16]; buf0[17] = buf1[17]; buf0[18] = buf1[18]; @@ -1459,14 +1459,14 @@ void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output, buf0[21] = buf1[21]; buf0[22] = buf1[22]; buf0[23] = buf1[23]; - btf_32_sse2_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24], - buf0[25], bit); - btf_32_sse2_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26], - buf0[27], bit); - btf_32_sse2_type0(-cospi[56], cospi[8], buf1[28], buf1[29], buf0[28], - buf0[29], bit); - btf_32_sse2_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30], - buf0[31], bit); + btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24], + buf0[25], bit); + btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26], + buf0[27], bit); + btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[28], buf1[29], buf0[28], + buf0[29], bit); + btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30], + buf0[31], bit); // stage 7 stage_idx++; @@ -1513,34 +1513,34 @@ void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output, buf0[1] = buf1[1]; buf0[2] = buf1[2]; buf0[3] = buf1[3]; - btf_32_sse2_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4], buf0[5], - bit); - btf_32_sse2_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6], buf0[7], - bit); + btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4], + 
buf0[5], bit); + btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6], + buf0[7], bit); buf0[8] = buf1[8]; buf0[9] = buf1[9]; buf0[10] = buf1[10]; buf0[11] = buf1[11]; - btf_32_sse2_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12], - buf0[13], bit); - btf_32_sse2_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14], - buf0[15], bit); + btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12], + buf0[13], bit); + btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14], + buf0[15], bit); buf0[16] = buf1[16]; buf0[17] = buf1[17]; buf0[18] = buf1[18]; buf0[19] = buf1[19]; - btf_32_sse2_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20], - buf0[21], bit); - btf_32_sse2_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22], - buf0[23], bit); + btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20], + buf0[21], bit); + btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22], + buf0[23], bit); buf0[24] = buf1[24]; buf0[25] = buf1[25]; buf0[26] = buf1[26]; buf0[27] = buf1[27]; - btf_32_sse2_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28], - buf0[29], bit); - btf_32_sse2_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30], - buf0[31], bit); + btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28], + buf0[29], bit); + btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30], + buf0[31], bit); // stage 9 stage_idx++; @@ -1585,36 +1585,36 @@ void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output, cospi = cospi_arr[bit - cos_bit_min]; buf0[0] = buf1[0]; buf0[1] = buf1[1]; - btf_32_sse2_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], buf0[3], - bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2], + buf0[3], bit); buf0[4] = buf1[4]; buf0[5] = buf1[5]; - btf_32_sse2_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6], buf0[7], - bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], 
buf0[6], + buf0[7], bit); buf0[8] = buf1[8]; buf0[9] = buf1[9]; - btf_32_sse2_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10], - buf0[11], bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10], + buf0[11], bit); buf0[12] = buf1[12]; buf0[13] = buf1[13]; - btf_32_sse2_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14], - buf0[15], bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14], + buf0[15], bit); buf0[16] = buf1[16]; buf0[17] = buf1[17]; - btf_32_sse2_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18], - buf0[19], bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18], + buf0[19], bit); buf0[20] = buf1[20]; buf0[21] = buf1[21]; - btf_32_sse2_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22], - buf0[23], bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22], + buf0[23], bit); buf0[24] = buf1[24]; buf0[25] = buf1[25]; - btf_32_sse2_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26], - buf0[27], bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26], + buf0[27], bit); buf0[28] = buf1[28]; buf0[29] = buf1[29]; - btf_32_sse2_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30], - buf0[31], bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30], + buf0[31], bit); // stage 11 stage_idx++; @@ -1688,8 +1688,8 @@ void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output, } } -void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range) { +void vp10_fdct64_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range) { const int txfm_size = 64; const int num_per_128 = 4; const int32_t* cospi; @@ -1880,22 +1880,22 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output, buf0[37] = buf1[37]; buf0[38] = buf1[38]; buf0[39] = buf1[39]; - btf_32_sse2_type0(-cospi[32], cospi[32], 
buf1[40], buf1[55], buf0[40], - buf0[55], bit); - btf_32_sse2_type0(-cospi[32], cospi[32], buf1[41], buf1[54], buf0[41], - buf0[54], bit); - btf_32_sse2_type0(-cospi[32], cospi[32], buf1[42], buf1[53], buf0[42], - buf0[53], bit); - btf_32_sse2_type0(-cospi[32], cospi[32], buf1[43], buf1[52], buf0[43], - buf0[52], bit); - btf_32_sse2_type0(-cospi[32], cospi[32], buf1[44], buf1[51], buf0[44], - buf0[51], bit); - btf_32_sse2_type0(-cospi[32], cospi[32], buf1[45], buf1[50], buf0[45], - buf0[50], bit); - btf_32_sse2_type0(-cospi[32], cospi[32], buf1[46], buf1[49], buf0[46], - buf0[49], bit); - btf_32_sse2_type0(-cospi[32], cospi[32], buf1[47], buf1[48], buf0[47], - buf0[48], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[40], buf1[55], buf0[40], + buf0[55], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[41], buf1[54], buf0[41], + buf0[54], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[42], buf1[53], buf0[42], + buf0[53], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[43], buf1[52], buf0[43], + buf0[52], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[44], buf1[51], buf0[44], + buf0[51], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[45], buf1[50], buf0[45], + buf0[50], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[46], buf1[49], buf0[46], + buf0[49], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[47], buf1[48], buf0[47], + buf0[48], bit); buf0[56] = buf1[56]; buf0[57] = buf1[57]; buf0[58] = buf1[58]; @@ -1929,14 +1929,14 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output, buf1[17] = buf0[17]; buf1[18] = buf0[18]; buf1[19] = buf0[19]; - btf_32_sse2_type0(-cospi[32], cospi[32], buf0[20], buf0[27], buf1[20], - buf1[27], bit); - btf_32_sse2_type0(-cospi[32], cospi[32], buf0[21], buf0[26], buf1[21], - buf1[26], bit); - btf_32_sse2_type0(-cospi[32], cospi[32], buf0[22], buf0[25], buf1[22], - buf1[25], bit); - btf_32_sse2_type0(-cospi[32], cospi[32], buf0[23], buf0[24], buf1[23], - 
buf1[24], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[20], buf0[27], buf1[20], + buf1[27], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[21], buf0[26], buf1[21], + buf1[26], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[22], buf0[25], buf1[22], + buf1[25], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[23], buf0[24], buf1[23], + buf1[24], bit); buf1[28] = buf0[28]; buf1[29] = buf0[29]; buf1[30] = buf0[30]; @@ -1988,10 +1988,10 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output, buf0[4] = _mm_sub_epi32(buf1[3], buf1[4]); buf0[8] = buf1[8]; buf0[9] = buf1[9]; - btf_32_sse2_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10], - buf0[13], bit); - btf_32_sse2_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11], - buf0[12], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10], + buf0[13], bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11], + buf0[12], bit); buf0[14] = buf1[14]; buf0[15] = buf1[15]; buf0[16] = _mm_add_epi32(buf1[16], buf1[23]); @@ -2014,22 +2014,22 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output, buf0[33] = buf1[33]; buf0[34] = buf1[34]; buf0[35] = buf1[35]; - btf_32_sse2_type0(-cospi[16], cospi[48], buf1[36], buf1[59], buf0[36], - buf0[59], bit); - btf_32_sse2_type0(-cospi[16], cospi[48], buf1[37], buf1[58], buf0[37], - buf0[58], bit); - btf_32_sse2_type0(-cospi[16], cospi[48], buf1[38], buf1[57], buf0[38], - buf0[57], bit); - btf_32_sse2_type0(-cospi[16], cospi[48], buf1[39], buf1[56], buf0[39], - buf0[56], bit); - btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[40], buf1[55], buf0[40], - buf0[55], bit); - btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[41], buf1[54], buf0[41], - buf0[54], bit); - btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[42], buf1[53], buf0[42], - buf0[53], bit); - btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[43], buf1[52], buf0[43], - buf0[52], bit); + btf_32_sse4_1_type0(-cospi[16], 
cospi[48], buf1[36], buf1[59], buf0[36], + buf0[59], bit); + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[37], buf1[58], buf0[37], + buf0[58], bit); + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[38], buf1[57], buf0[38], + buf0[57], bit); + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[39], buf1[56], buf0[39], + buf0[56], bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[40], buf1[55], buf0[40], + buf0[55], bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[41], buf1[54], buf0[41], + buf0[54], bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[42], buf1[53], buf0[42], + buf0[53], bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[43], buf1[52], buf0[43], + buf0[52], bit); buf0[44] = buf1[44]; buf0[45] = buf1[45]; buf0[46] = buf1[46]; @@ -2052,8 +2052,8 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output, buf1[1] = _mm_add_epi32(buf0[1], buf0[2]); buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]); buf1[4] = buf0[4]; - btf_32_sse2_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5], buf1[6], - bit); + btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5], + buf1[6], bit); buf1[7] = buf0[7]; buf1[8] = _mm_add_epi32(buf0[8], buf0[11]); buf1[11] = _mm_sub_epi32(buf0[8], buf0[11]); @@ -2065,14 +2065,14 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output, buf1[14] = _mm_add_epi32(buf0[14], buf0[13]); buf1[16] = buf0[16]; buf1[17] = buf0[17]; - btf_32_sse2_type0(-cospi[16], cospi[48], buf0[18], buf0[29], buf1[18], - buf1[29], bit); - btf_32_sse2_type0(-cospi[16], cospi[48], buf0[19], buf0[28], buf1[19], - buf1[28], bit); - btf_32_sse2_type0(-cospi[48], -cospi[16], buf0[20], buf0[27], buf1[20], - buf1[27], bit); - btf_32_sse2_type0(-cospi[48], -cospi[16], buf0[21], buf0[26], buf1[21], - buf1[26], bit); + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[18], buf0[29], buf1[18], + buf1[29], bit); + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[19], buf0[28], buf1[19], + buf1[28], bit); + 
btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[20], buf0[27], buf1[20], + buf1[27], bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[21], buf0[26], buf1[21], + buf1[26], bit); buf1[22] = buf0[22]; buf1[23] = buf0[23]; buf1[24] = buf0[24]; @@ -2116,19 +2116,19 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output, stage_idx++; bit = cos_bit[stage_idx]; cospi = cospi_arr[bit - cos_bit_min]; - btf_32_sse2_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0], buf0[1], - bit); - btf_32_sse2_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2], buf0[3], - bit); + btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0], + buf0[1], bit); + btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2], + buf0[3], bit); buf0[4] = _mm_add_epi32(buf1[4], buf1[5]); buf0[5] = _mm_sub_epi32(buf1[4], buf1[5]); buf0[6] = _mm_sub_epi32(buf1[7], buf1[6]); buf0[7] = _mm_add_epi32(buf1[7], buf1[6]); buf0[8] = buf1[8]; - btf_32_sse2_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9], - buf0[14], bit); - btf_32_sse2_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10], - buf0[13], bit); + btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9], + buf0[14], bit); + btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10], + buf0[13], bit); buf0[11] = buf1[11]; buf0[12] = buf1[12]; buf0[15] = buf1[15]; @@ -2150,26 +2150,26 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output, buf0[30] = _mm_add_epi32(buf1[30], buf1[29]); buf0[32] = buf1[32]; buf0[33] = buf1[33]; - btf_32_sse2_type0(-cospi[8], cospi[56], buf1[34], buf1[61], buf0[34], - buf0[61], bit); - btf_32_sse2_type0(-cospi[8], cospi[56], buf1[35], buf1[60], buf0[35], - buf0[60], bit); - btf_32_sse2_type0(-cospi[56], -cospi[8], buf1[36], buf1[59], buf0[36], - buf0[59], bit); - btf_32_sse2_type0(-cospi[56], -cospi[8], buf1[37], buf1[58], buf0[37], - buf0[58], bit); + btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[34], buf1[61], buf0[34], + 
buf0[61], bit); + btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[35], buf1[60], buf0[35], + buf0[60], bit); + btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[36], buf1[59], buf0[36], + buf0[59], bit); + btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[37], buf1[58], buf0[37], + buf0[58], bit); buf0[38] = buf1[38]; buf0[39] = buf1[39]; buf0[40] = buf1[40]; buf0[41] = buf1[41]; - btf_32_sse2_type0(-cospi[40], cospi[24], buf1[42], buf1[53], buf0[42], - buf0[53], bit); - btf_32_sse2_type0(-cospi[40], cospi[24], buf1[43], buf1[52], buf0[43], - buf0[52], bit); - btf_32_sse2_type0(-cospi[24], -cospi[40], buf1[44], buf1[51], buf0[44], - buf0[51], bit); - btf_32_sse2_type0(-cospi[24], -cospi[40], buf1[45], buf1[50], buf0[45], - buf0[50], bit); + btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[42], buf1[53], buf0[42], + buf0[53], bit); + btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[43], buf1[52], buf0[43], + buf0[52], bit); + btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[44], buf1[51], buf0[44], + buf0[51], bit); + btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[45], buf1[50], buf0[45], + buf0[50], bit); buf0[46] = buf1[46]; buf0[47] = buf1[47]; buf0[48] = buf1[48]; @@ -2189,10 +2189,10 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output, buf1[1] = buf0[1]; buf1[2] = buf0[2]; buf1[3] = buf0[3]; - btf_32_sse2_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7], - bit); - btf_32_sse2_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5], buf1[6], - bit); + btf_32_sse4_1_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7], + bit); + btf_32_sse4_1_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5], + buf1[6], bit); buf1[8] = _mm_add_epi32(buf0[8], buf0[9]); buf1[9] = _mm_sub_epi32(buf0[8], buf0[9]); buf1[10] = _mm_sub_epi32(buf0[11], buf0[10]); @@ -2202,16 +2202,16 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output, buf1[14] = _mm_sub_epi32(buf0[15], buf0[14]); buf1[15] = _mm_add_epi32(buf0[15], buf0[14]); buf1[16] = 
buf0[16]; - btf_32_sse2_type0(-cospi[8], cospi[56], buf0[17], buf0[30], buf1[17], - buf1[30], bit); - btf_32_sse2_type0(-cospi[56], -cospi[8], buf0[18], buf0[29], buf1[18], - buf1[29], bit); + btf_32_sse4_1_type0(-cospi[8], cospi[56], buf0[17], buf0[30], buf1[17], + buf1[30], bit); + btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf0[18], buf0[29], buf1[18], + buf1[29], bit); buf1[19] = buf0[19]; buf1[20] = buf0[20]; - btf_32_sse2_type0(-cospi[40], cospi[24], buf0[21], buf0[26], buf1[21], - buf1[26], bit); - btf_32_sse2_type0(-cospi[24], -cospi[40], buf0[22], buf0[25], buf1[22], - buf1[25], bit); + btf_32_sse4_1_type0(-cospi[40], cospi[24], buf0[21], buf0[26], buf1[21], + buf1[26], bit); + btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf0[22], buf0[25], buf1[22], + buf1[25], bit); buf1[23] = buf0[23]; buf1[24] = buf0[24]; buf1[27] = buf0[27]; @@ -2262,14 +2262,14 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output, buf0[5] = buf1[5]; buf0[6] = buf1[6]; buf0[7] = buf1[7]; - btf_32_sse2_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8], buf0[15], - bit); - btf_32_sse2_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9], - buf0[14], bit); - btf_32_sse2_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10], - buf0[13], bit); - btf_32_sse2_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11], - buf0[12], bit); + btf_32_sse4_1_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8], + buf0[15], bit); + btf_32_sse4_1_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9], + buf0[14], bit); + btf_32_sse4_1_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10], + buf0[13], bit); + btf_32_sse4_1_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11], + buf0[12], bit); buf0[16] = _mm_add_epi32(buf1[16], buf1[17]); buf0[17] = _mm_sub_epi32(buf1[16], buf1[17]); buf0[18] = _mm_sub_epi32(buf1[19], buf1[18]); @@ -2287,28 +2287,28 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output, buf0[30] = _mm_sub_epi32(buf1[31], buf1[30]); buf0[31] = 
_mm_add_epi32(buf1[31], buf1[30]); buf0[32] = buf1[32]; - btf_32_sse2_type0(-cospi[4], cospi[60], buf1[33], buf1[62], buf0[33], - buf0[62], bit); - btf_32_sse2_type0(-cospi[60], -cospi[4], buf1[34], buf1[61], buf0[34], - buf0[61], bit); + btf_32_sse4_1_type0(-cospi[4], cospi[60], buf1[33], buf1[62], buf0[33], + buf0[62], bit); + btf_32_sse4_1_type0(-cospi[60], -cospi[4], buf1[34], buf1[61], buf0[34], + buf0[61], bit); buf0[35] = buf1[35]; buf0[36] = buf1[36]; - btf_32_sse2_type0(-cospi[36], cospi[28], buf1[37], buf1[58], buf0[37], - buf0[58], bit); - btf_32_sse2_type0(-cospi[28], -cospi[36], buf1[38], buf1[57], buf0[38], - buf0[57], bit); + btf_32_sse4_1_type0(-cospi[36], cospi[28], buf1[37], buf1[58], buf0[37], + buf0[58], bit); + btf_32_sse4_1_type0(-cospi[28], -cospi[36], buf1[38], buf1[57], buf0[38], + buf0[57], bit); buf0[39] = buf1[39]; buf0[40] = buf1[40]; - btf_32_sse2_type0(-cospi[20], cospi[44], buf1[41], buf1[54], buf0[41], - buf0[54], bit); - btf_32_sse2_type0(-cospi[44], -cospi[20], buf1[42], buf1[53], buf0[42], - buf0[53], bit); + btf_32_sse4_1_type0(-cospi[20], cospi[44], buf1[41], buf1[54], buf0[41], + buf0[54], bit); + btf_32_sse4_1_type0(-cospi[44], -cospi[20], buf1[42], buf1[53], buf0[42], + buf0[53], bit); buf0[43] = buf1[43]; buf0[44] = buf1[44]; - btf_32_sse2_type0(-cospi[52], cospi[12], buf1[45], buf1[50], buf0[45], - buf0[50], bit); - btf_32_sse2_type0(-cospi[12], -cospi[52], buf1[46], buf1[49], buf0[46], - buf0[49], bit); + btf_32_sse4_1_type0(-cospi[52], cospi[12], buf1[45], buf1[50], buf0[45], + buf0[50], bit); + btf_32_sse4_1_type0(-cospi[12], -cospi[52], buf1[46], buf1[49], buf0[46], + buf0[49], bit); buf0[47] = buf1[47]; buf0[48] = buf1[48]; buf0[51] = buf1[51]; @@ -2339,22 +2339,22 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output, buf1[13] = buf0[13]; buf1[14] = buf0[14]; buf1[15] = buf0[15]; - btf_32_sse2_type1(cospi[62], cospi[2], buf0[16], buf0[31], buf1[16], - buf1[31], bit); - btf_32_sse2_type1(cospi[30], 
cospi[34], buf0[17], buf0[30], buf1[17], - buf1[30], bit); - btf_32_sse2_type1(cospi[46], cospi[18], buf0[18], buf0[29], buf1[18], - buf1[29], bit); - btf_32_sse2_type1(cospi[14], cospi[50], buf0[19], buf0[28], buf1[19], - buf1[28], bit); - btf_32_sse2_type1(cospi[54], cospi[10], buf0[20], buf0[27], buf1[20], - buf1[27], bit); - btf_32_sse2_type1(cospi[22], cospi[42], buf0[21], buf0[26], buf1[21], - buf1[26], bit); - btf_32_sse2_type1(cospi[38], cospi[26], buf0[22], buf0[25], buf1[22], - buf1[25], bit); - btf_32_sse2_type1(cospi[6], cospi[58], buf0[23], buf0[24], buf1[23], - buf1[24], bit); + btf_32_sse4_1_type1(cospi[62], cospi[2], buf0[16], buf0[31], buf1[16], + buf1[31], bit); + btf_32_sse4_1_type1(cospi[30], cospi[34], buf0[17], buf0[30], buf1[17], + buf1[30], bit); + btf_32_sse4_1_type1(cospi[46], cospi[18], buf0[18], buf0[29], buf1[18], + buf1[29], bit); + btf_32_sse4_1_type1(cospi[14], cospi[50], buf0[19], buf0[28], buf1[19], + buf1[28], bit); + btf_32_sse4_1_type1(cospi[54], cospi[10], buf0[20], buf0[27], buf1[20], + buf1[27], bit); + btf_32_sse4_1_type1(cospi[22], cospi[42], buf0[21], buf0[26], buf1[21], + buf1[26], bit); + btf_32_sse4_1_type1(cospi[38], cospi[26], buf0[22], buf0[25], buf1[22], + buf1[25], bit); + btf_32_sse4_1_type1(cospi[6], cospi[58], buf0[23], buf0[24], buf1[23], + buf1[24], bit); buf1[32] = _mm_add_epi32(buf0[32], buf0[33]); buf1[33] = _mm_sub_epi32(buf0[32], buf0[33]); buf1[34] = _mm_sub_epi32(buf0[35], buf0[34]); @@ -2424,38 +2424,38 @@ void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output, buf0[29] = buf1[29]; buf0[30] = buf1[30]; buf0[31] = buf1[31]; - btf_32_sse2_type1(cospi[63], cospi[1], buf1[32], buf1[63], buf0[32], - buf0[63], bit); - btf_32_sse2_type1(cospi[31], cospi[33], buf1[33], buf1[62], buf0[33], - buf0[62], bit); - btf_32_sse2_type1(cospi[47], cospi[17], buf1[34], buf1[61], buf0[34], - buf0[61], bit); - btf_32_sse2_type1(cospi[15], cospi[49], buf1[35], buf1[60], buf0[35], - buf0[60], bit); - 
btf_32_sse2_type1(cospi[55], cospi[9], buf1[36], buf1[59], buf0[36], - buf0[59], bit); - btf_32_sse2_type1(cospi[23], cospi[41], buf1[37], buf1[58], buf0[37], - buf0[58], bit); - btf_32_sse2_type1(cospi[39], cospi[25], buf1[38], buf1[57], buf0[38], - buf0[57], bit); - btf_32_sse2_type1(cospi[7], cospi[57], buf1[39], buf1[56], buf0[39], - buf0[56], bit); - btf_32_sse2_type1(cospi[59], cospi[5], buf1[40], buf1[55], buf0[40], - buf0[55], bit); - btf_32_sse2_type1(cospi[27], cospi[37], buf1[41], buf1[54], buf0[41], - buf0[54], bit); - btf_32_sse2_type1(cospi[43], cospi[21], buf1[42], buf1[53], buf0[42], - buf0[53], bit); - btf_32_sse2_type1(cospi[11], cospi[53], buf1[43], buf1[52], buf0[43], - buf0[52], bit); - btf_32_sse2_type1(cospi[51], cospi[13], buf1[44], buf1[51], buf0[44], - buf0[51], bit); - btf_32_sse2_type1(cospi[19], cospi[45], buf1[45], buf1[50], buf0[45], - buf0[50], bit); - btf_32_sse2_type1(cospi[35], cospi[29], buf1[46], buf1[49], buf0[46], - buf0[49], bit); - btf_32_sse2_type1(cospi[3], cospi[61], buf1[47], buf1[48], buf0[47], - buf0[48], bit); + btf_32_sse4_1_type1(cospi[63], cospi[1], buf1[32], buf1[63], buf0[32], + buf0[63], bit); + btf_32_sse4_1_type1(cospi[31], cospi[33], buf1[33], buf1[62], buf0[33], + buf0[62], bit); + btf_32_sse4_1_type1(cospi[47], cospi[17], buf1[34], buf1[61], buf0[34], + buf0[61], bit); + btf_32_sse4_1_type1(cospi[15], cospi[49], buf1[35], buf1[60], buf0[35], + buf0[60], bit); + btf_32_sse4_1_type1(cospi[55], cospi[9], buf1[36], buf1[59], buf0[36], + buf0[59], bit); + btf_32_sse4_1_type1(cospi[23], cospi[41], buf1[37], buf1[58], buf0[37], + buf0[58], bit); + btf_32_sse4_1_type1(cospi[39], cospi[25], buf1[38], buf1[57], buf0[38], + buf0[57], bit); + btf_32_sse4_1_type1(cospi[7], cospi[57], buf1[39], buf1[56], buf0[39], + buf0[56], bit); + btf_32_sse4_1_type1(cospi[59], cospi[5], buf1[40], buf1[55], buf0[40], + buf0[55], bit); + btf_32_sse4_1_type1(cospi[27], cospi[37], buf1[41], buf1[54], buf0[41], + buf0[54], bit); + 
btf_32_sse4_1_type1(cospi[43], cospi[21], buf1[42], buf1[53], buf0[42], + buf0[53], bit); + btf_32_sse4_1_type1(cospi[11], cospi[53], buf1[43], buf1[52], buf0[43], + buf0[52], bit); + btf_32_sse4_1_type1(cospi[51], cospi[13], buf1[44], buf1[51], buf0[44], + buf0[51], bit); + btf_32_sse4_1_type1(cospi[19], cospi[45], buf1[45], buf1[50], buf0[45], + buf0[50], bit); + btf_32_sse4_1_type1(cospi[35], cospi[29], buf1[46], buf1[49], buf0[46], + buf0[49], bit); + btf_32_sse4_1_type1(cospi[3], cospi[61], buf1[47], buf1[48], buf0[47], + buf0[48], bit); // stage 11 stage_idx++; diff --git a/vp10/common/x86/vp10_fwd_txfm2d_sse2.c b/vp10/common/x86/vp10_fwd_txfm2d_sse4.c similarity index 68% rename from vp10/common/x86/vp10_fwd_txfm2d_sse2.c rename to vp10/common/x86/vp10_fwd_txfm2d_sse4.c index 5af682fc2..6664bd5dc 100644 --- a/vp10/common/x86/vp10_fwd_txfm2d_sse2.c +++ b/vp10/common/x86/vp10_fwd_txfm2d_sse4.c @@ -1,4 +1,4 @@ -#include "vp10/common/x86/vp10_txfm1d_sse2.h" +#include "vp10/common/x86/vp10_txfm1d_sse4.h" static inline void int16_array_with_stride_to_int32_array_without_stride( const int16_t *input, int stride, int32_t *output, int txfm1d_size) { @@ -16,31 +16,31 @@ typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output, static inline TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { switch (txfm_type) { case TXFM_TYPE_DCT4: - return vp10_fdct4_new_sse2; + return vp10_fdct4_new_sse4_1; break; case TXFM_TYPE_DCT8: - return vp10_fdct8_new_sse2; + return vp10_fdct8_new_sse4_1; break; case TXFM_TYPE_DCT16: - return vp10_fdct16_new_sse2; + return vp10_fdct16_new_sse4_1; break; case TXFM_TYPE_DCT32: - return vp10_fdct32_new_sse2; + return vp10_fdct32_new_sse4_1; break; case TXFM_TYPE_DCT64: - return vp10_fdct64_new_sse2; + return vp10_fdct64_new_sse4_1; break; case TXFM_TYPE_ADST4: - return vp10_fadst4_new_sse2; + return vp10_fadst4_new_sse4_1; break; case TXFM_TYPE_ADST8: - return vp10_fadst8_new_sse2; + return vp10_fadst8_new_sse4_1; break; case 
TXFM_TYPE_ADST16: - return vp10_fadst16_new_sse2; + return vp10_fadst16_new_sse4_1; break; case TXFM_TYPE_ADST32: - return vp10_fadst32_new_sse2; + return vp10_fadst32_new_sse4_1; break; default: assert(0); @@ -48,7 +48,7 @@ static inline TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) { return NULL; } -static inline void fwd_txfm2d_sse2(const int16_t *input, int32_t *output, +static inline void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, int32_t *txfm_buf) { const int txfm_size = cfg->txfm_size; @@ -67,51 +67,51 @@ static inline void fwd_txfm2d_sse2(const int16_t *input, int32_t *output, int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf, txfm_size); - round_shift_array_32_sse2(buf_128, out_128, txfm2d_size_128, -shift[0]); + round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]); txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col); - round_shift_array_32_sse2(buf_128, out_128, txfm2d_size_128, -shift[1]); + round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]); transpose_32(txfm_size, out_128, buf_128); txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row); - round_shift_array_32_sse2(out_128, buf_128, txfm2d_size_128, -shift[2]); + round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]); transpose_32(txfm_size, buf_128, out_128); } -void vp10_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output, +void vp10_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd) { int32_t txfm_buf[16]; (void)bd; - fwd_txfm2d_sse2(input, output, stride, cfg, txfm_buf); + fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf); } -void vp10_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, +void vp10_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd) { int32_t txfm_buf[64]; 
(void)bd; - fwd_txfm2d_sse2(input, output, stride, cfg, txfm_buf); + fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf); } -void vp10_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output, +void vp10_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd) { int32_t txfm_buf[256]; (void)bd; - fwd_txfm2d_sse2(input, output, stride, cfg, txfm_buf); + fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf); } -void vp10_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output, +void vp10_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd) { int32_t txfm_buf[1024]; (void)bd; - fwd_txfm2d_sse2(input, output, stride, cfg, txfm_buf); + fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf); } -void vp10_fwd_txfm2d_64x64_sse2(const int16_t *input, int32_t *output, +void vp10_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output, const int stride, const TXFM_2D_CFG *cfg, const int bd) { int32_t txfm_buf[4096]; (void)bd; - fwd_txfm2d_sse2(input, output, stride, cfg, txfm_buf); + fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf); } diff --git a/vp10/common/x86/vp10_txfm1d_sse2.h b/vp10/common/x86/vp10_txfm1d_sse2.h deleted file mode 100644 index fc25013d6..000000000 --- a/vp10/common/x86/vp10_txfm1d_sse2.h +++ /dev/null @@ -1,153 +0,0 @@ -#ifndef VP10_TXMF1D_SSE2_H_ -#define VP10_TXMF1D_SSE2_H_ - -#include <emmintrin.h> -#include "vp10/common/vp10_txfm.h" - -#ifdef __cplusplus -extern "C" { -#endif - -void vp10_fdct4_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range); -void vp10_fdct8_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range); -void vp10_fdct16_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range); -void vp10_fdct32_new_sse2(const __m128i* input, __m128i* output, - const int8_t*
cos_bit, const int8_t* stage_range); -void vp10_fdct64_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range); - -void vp10_fadst4_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range); -void vp10_fadst8_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range); -void vp10_fadst16_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range); -void vp10_fadst32_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range); - -void vp10_idct4_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range); -void vp10_idct8_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range); -void vp10_idct16_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range); -void vp10_idct32_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range); -void vp10_idct64_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range); - -void vp10_iadst4_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range); -void vp10_iadst8_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range); -void vp10_iadst16_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range); -void vp10_iadst32_new_sse2(const __m128i* input, __m128i* output, - const int8_t* cos_bit, const int8_t* stage_range); - -static INLINE void transpose_32_4x4(int stride, const __m128i* input, - __m128i* output) { - __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]); - __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]); - __m128i temp2 = 
_mm_unpacklo_epi32(input[1 * stride], input[3 * stride]); - __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]); - - output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2); - output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2); - output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3); - output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3); -} - -// the entire input block can be represent by a grid of 4x4 blocks -// each 4x4 blocks can be represent by 4 vertical __m128i -// we first transpose each 4x4 block internally -// than transpose the grid -static INLINE void transpose_32(int txfm_size, const __m128i* input, - __m128i* output) { - const int num_per_128 = 4; - const int row_size = txfm_size; - const int col_size = txfm_size / num_per_128; - int r, c; - - // transpose each 4x4 block internally - for (r = 0; r < row_size; r += 4) { - for (c = 0; c < col_size; c++) { - transpose_32_4x4(col_size, &input[r * col_size + c], - &output[c * 4 * col_size + r / 4]); - } - } -} - -#define mullo_epi32(a, b) \ - ({ \ - __m128i tmp1 = _mm_mul_epu32(a, b); \ - __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); \ - _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), \ - _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0))); \ - }) - -#define round_shift_32_sse2(vec, bit) \ - ({ \ - __m128i tmp, round; \ - round = _mm_set1_epi32(1 << (bit - 1)); \ - tmp = _mm_add_epi32(vec, round); \ - _mm_srai_epi32(tmp, bit); \ - }) - -#define round_shift_array_32_sse2(input, output, size, bit) \ - ({ \ - if (bit > 0) { \ - int i; \ - for (i = 0; i < size; i++) { \ - output[i] = round_shift_32_sse2(input[i], bit); \ - } \ - } else { \ - int i; \ - for (i = 0; i < size; i++) { \ - output[i] = _mm_slli_epi32(input[i], -bit); \ - } \ - } \ - }) - -// out0 = in0*w0 + in1*w1 -// out1 = -in1*w0 + in0*w1 -#define btf_32_sse2_type0(w0, w1, in0, in1, out0, out1, bit) \ - ({ \ - __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \ - ww0 = 
_mm_set1_epi32(w0); \ - ww1 = _mm_set1_epi32(w1); \ - in0_w0 = mullo_epi32(in0, ww0); \ - in1_w1 = mullo_epi32(in1, ww1); \ - out0 = _mm_add_epi32(in0_w0, in1_w1); \ - out0 = round_shift_32_sse2(out0, bit); \ - in0_w1 = mullo_epi32(in0, ww1); \ - in1_w0 = mullo_epi32(in1, ww0); \ - out1 = _mm_sub_epi32(in0_w1, in1_w0); \ - out1 = round_shift_32_sse2(out1, bit); \ - }) - -// out0 = in0*w0 + in1*w1 -// out1 = in1*w0 - in0*w1 -#define btf_32_sse2_type1(w0, w1, in0, in1, out0, out1, bit) \ - ({ \ - __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \ - ww0 = _mm_set1_epi32(w0); \ - ww1 = _mm_set1_epi32(w1); \ - in0_w0 = mullo_epi32(in0, ww0); \ - in1_w1 = mullo_epi32(in1, ww1); \ - out0 = _mm_add_epi32(in0_w0, in1_w1); \ - out0 = round_shift_32_sse2(out0, bit); \ - in0_w1 = mullo_epi32(in0, ww1); \ - in1_w0 = mullo_epi32(in1, ww0); \ - out1 = _mm_sub_epi32(in1_w0, in0_w1); \ - out1 = round_shift_32_sse2(out1, bit); \ - }) - -#ifdef __cplusplus -} -#endif - -#endif // VP10_TXMF1D_SSE2_H_ diff --git a/vp10/common/x86/vp10_txfm1d_sse4.h b/vp10/common/x86/vp10_txfm1d_sse4.h new file mode 100644 index 000000000..803b86d3e --- /dev/null +++ b/vp10/common/x86/vp10_txfm1d_sse4.h @@ -0,0 +1,145 @@ +#ifndef VP10_TXMF1D_SSE2_H_ +#define VP10_TXMF1D_SSE2_H_ + +#include <smmintrin.h> +#include "vp10/common/vp10_txfm.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vp10_fdct4_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range); +void vp10_fdct8_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range); +void vp10_fdct16_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range); +void vp10_fdct32_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range); +void vp10_fdct64_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range); + +void vp10_fadst4_new_sse4_1(const
__m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range); +void vp10_fadst8_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range); +void vp10_fadst16_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range); +void vp10_fadst32_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range); + +void vp10_idct4_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range); +void vp10_idct8_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range); +void vp10_idct16_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range); +void vp10_idct32_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range); +void vp10_idct64_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range); + +void vp10_iadst4_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range); +void vp10_iadst8_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range); +void vp10_iadst16_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range); +void vp10_iadst32_new_sse4_1(const __m128i* input, __m128i* output, + const int8_t* cos_bit, const int8_t* stage_range); + +static INLINE void transpose_32_4x4(int stride, const __m128i* input, + __m128i* output) { + __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]); + __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]); + __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]); + __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]); + + output[0 * stride] = _mm_unpacklo_epi32(temp0, 
temp2); + output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2); + output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3); + output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3); +} + +// the entire input block can be represented by a grid of 4x4 blocks +// each 4x4 block can be represented by 4 vertical __m128i +// we first transpose each 4x4 block internally +// then transpose the grid +static INLINE void transpose_32(int txfm_size, const __m128i* input, + __m128i* output) { + const int num_per_128 = 4; + const int row_size = txfm_size; + const int col_size = txfm_size / num_per_128; + int r, c; + + // transpose each 4x4 block internally + for (r = 0; r < row_size; r += 4) { + for (c = 0; c < col_size; c++) { + transpose_32_4x4(col_size, &input[r * col_size + c], + &output[c * 4 * col_size + r / 4]); + } + } +} + +#define round_shift_32_sse4_1(vec, bit) \ + ({ \ + __m128i tmp, round; \ + round = _mm_set1_epi32(1 << (bit - 1)); \ + tmp = _mm_add_epi32(vec, round); \ + _mm_srai_epi32(tmp, bit); \ + }) + +#define round_shift_array_32_sse4_1(input, output, size, bit) \ + ({ \ + if (bit > 0) { \ + int i; \ + for (i = 0; i < size; i++) { \ + output[i] = round_shift_32_sse4_1(input[i], bit); \ + } \ + } else { \ + int i; \ + for (i = 0; i < size; i++) { \ + output[i] = _mm_slli_epi32(input[i], -bit); \ + } \ + } \ + }) + +// out0 = in0*w0 + in1*w1 +// out1 = -in1*w0 + in0*w1 +#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \ + ({ \ + __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \ + ww0 = _mm_set1_epi32(w0); \ + ww1 = _mm_set1_epi32(w1); \ + in0_w0 = _mm_mullo_epi32(in0, ww0); \ + in1_w1 = _mm_mullo_epi32(in1, ww1); \ + out0 = _mm_add_epi32(in0_w0, in1_w1); \ + out0 = round_shift_32_sse4_1(out0, bit); \ + in0_w1 = _mm_mullo_epi32(in0, ww1); \ + in1_w0 = _mm_mullo_epi32(in1, ww0); \ + out1 = _mm_sub_epi32(in0_w1, in1_w0); \ + out1 = round_shift_32_sse4_1(out1, bit); \ + }) + +// out0 = in0*w0 + in1*w1 +// out1 = in1*w0 - in0*w1 +#define
btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \ + ({ \ + __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0; \ + ww0 = _mm_set1_epi32(w0); \ + ww1 = _mm_set1_epi32(w1); \ + in0_w0 = _mm_mullo_epi32(in0, ww0); \ + in1_w1 = _mm_mullo_epi32(in1, ww1); \ + out0 = _mm_add_epi32(in0_w0, in1_w1); \ + out0 = round_shift_32_sse4_1(out0, bit); \ + in0_w1 = _mm_mullo_epi32(in0, ww1); \ + in1_w0 = _mm_mullo_epi32(in1, ww0); \ + out1 = _mm_sub_epi32(in1_w0, in0_w1); \ + out1 = round_shift_32_sse4_1(out1, bit); \ + }) + +#ifdef __cplusplus +} +#endif + +#endif // VP10_TXMF1D_SSE2_H_ diff --git a/vp10/vp10_common.mk b/vp10/vp10_common.mk index 84eacadb2..40699a356 100644 --- a/vp10/vp10_common.mk +++ b/vp10/vp10_common.mk @@ -110,10 +110,9 @@ VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_intrin_sse2.c VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm_sse2.c VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_dct32x32_impl_sse2.h VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm_impl_sse2.h -VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_txfm1d_sse2.h -VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm1d_sse2.h -VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm1d_sse2.c -VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm2d_sse2.c +VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_txfm1d_sse4.h +VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_fwd_txfm1d_sse4.c +VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_fwd_txfm2d_sse4.c ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP10_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht4x4_add_neon.c