Implemented DST 16x16 SSE2 intrinsics optimization
- Implemented fdst16_sse2(), fdst16_8col() against C version: fdst16(). - Turned on 7 DST related hybrid txfm types in vp10_fht16x16_sse2(). - Replaced vp10_fht16x16_c() with vp10_fht16x16_sse2() in fwd_txfm_16x16(). - Added vp10_fht16x16_sse2() unit test against C version: vp10_fht16x16_c() (--gtest_filter=*VP10Trans16x16*). - Unit test passed. - Speed improvement: 2.4%, 3.2%, 3.2%, for city_cif.y4m, garden_sif.y4m, and mobile_cif.y4m. Change-Id: Ib30a67ce5d5964bef143d588d0f8fa438be8901f
This commit is contained in:
Родитель
cf9c95c32c
Коммит
50a164a1f6
|
@ -168,6 +168,7 @@ LIBVPX_TEST_SRCS-yes += vp10_inv_txfm_test.cc
|
|||
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
|
||||
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_fht4x4_test.cc
|
||||
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_fht8x8_test.cc
|
||||
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_fht16x16_test.cc
|
||||
LIBVPX_TEST_SRCS-$(CONFIG_ANS) += vp10_ans_test.cc
|
||||
|
||||
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += sum_squares_test.cc
|
||||
|
|
|
@ -0,0 +1,124 @@
|
|||
/*
|
||||
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
|
||||
*
|
||||
* Use of this source code is governed by a BSD-style license
|
||||
* that can be found in the LICENSE file in the root of the source
|
||||
* tree. An additional intellectual property rights grant can be found
|
||||
* in the file PATENTS. All contributing project authors may
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "third_party/googletest/src/include/gtest/gtest.h"
|
||||
|
||||
#include "./vp10_rtcd.h"
|
||||
#include "./vpx_dsp_rtcd.h"
|
||||
|
||||
#include "test/acm_random.h"
|
||||
#include "test/clear_system_state.h"
|
||||
#include "test/register_state_check.h"
|
||||
#include "test/transform_test_base.h"
|
||||
#include "test/util.h"
|
||||
#include "vpx_ports/mem.h"
|
||||
|
||||
using libvpx_test::ACMRandom;
|
||||
|
||||
namespace {
|
||||
typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
|
||||
int tx_type);
|
||||
|
||||
using libvpx_test::FhtFunc;
|
||||
typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t, int>
|
||||
Ht16x16Param;
|
||||
|
||||
// Reference forward 16x16 hybrid transform: forwards every argument
// unchanged to the plain C implementation, vp10_fht16x16_c().
void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride,
                  int tx_type) {
  vp10_fht16x16_c(in, out, stride, tx_type);
}
|
||||
|
||||
class VP10Trans16x16HT
|
||||
: public libvpx_test::TransformTestBase,
|
||||
public ::testing::TestWithParam<Ht16x16Param> {
|
||||
public:
|
||||
virtual ~VP10Trans16x16HT() {}
|
||||
|
||||
virtual void SetUp() {
|
||||
fwd_txfm_ = GET_PARAM(0);
|
||||
inv_txfm_ = GET_PARAM(1);
|
||||
tx_type_ = GET_PARAM(2);
|
||||
pitch_ = 16;
|
||||
fwd_txfm_ref = fht16x16_ref;
|
||||
bit_depth_ = GET_PARAM(3);
|
||||
mask_ = (1 << bit_depth_) - 1;
|
||||
num_coeffs_ = GET_PARAM(4);
|
||||
}
|
||||
virtual void TearDown() { libvpx_test::ClearSystemState(); }
|
||||
|
||||
protected:
|
||||
void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
|
||||
fwd_txfm_(in, out, stride, tx_type_);
|
||||
}
|
||||
|
||||
void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
|
||||
inv_txfm_(out, dst, stride, tx_type_);
|
||||
}
|
||||
|
||||
FhtFunc fwd_txfm_;
|
||||
IhtFunc inv_txfm_;
|
||||
};
|
||||
|
||||
// Runs the coefficient check provided by the fixture (RunCoeffCheck()) for
// every instantiated parameter tuple.
TEST_P(VP10Trans16x16HT, CoeffCheck) { RunCoeffCheck(); }
|
||||
|
||||
using std::tr1::make_tuple;
|
||||
|
||||
#if HAVE_SSE2
// Parameter table for the SSE2 forward 16x16 hybrid transform.
//
// The table is built outside of INSTANTIATE_TEST_CASE_P on purpose: placing
// #if/#else/#endif inside a macro's argument list is undefined behaviour in
// C++ and is rejected by some preprocessors. Keeping the conditional here
// also lets the four base transform types be listed only once.
//
// Each entry is <forward fn, inverse fn, tx_type, bit depth, coeff count>.
const Ht16x16Param kArrayHt16x16Param_sse2[] = {
  // tx_type 0..3: always available.
  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 0,
             VPX_BITS_8, 256),
  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 1,
             VPX_BITS_8, 256),
  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 2,
             VPX_BITS_8, 256),
  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 3,
             VPX_BITS_8, 256),
#if CONFIG_EXT_TX
  // tx_type 4..15: only exercised when CONFIG_EXT_TX is enabled.
  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 4,
             VPX_BITS_8, 256),
  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 5,
             VPX_BITS_8, 256),
  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 6,
             VPX_BITS_8, 256),
  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 7,
             VPX_BITS_8, 256),
  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 8,
             VPX_BITS_8, 256),
  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 9,
             VPX_BITS_8, 256),
  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 10,
             VPX_BITS_8, 256),
  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 11,
             VPX_BITS_8, 256),
  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 12,
             VPX_BITS_8, 256),
  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 13,
             VPX_BITS_8, 256),
  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 14,
             VPX_BITS_8, 256),
  make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 15,
             VPX_BITS_8, 256),
#endif  // CONFIG_EXT_TX
};

INSTANTIATE_TEST_CASE_P(SSE2, VP10Trans16x16HT,
                        ::testing::ValuesIn(kArrayHt16x16Param_sse2));
#endif  // HAVE_SSE2
|
||||
|
||||
} // namespace
|
|
@ -155,7 +155,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
|
|||
specialize qw/vp10_iht8x8_64_add sse2/;
|
||||
|
||||
add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
|
||||
specialize qw/vp10_iht16x16_256_add/;
|
||||
specialize qw/vp10_iht16x16_256_add sse2/;
|
||||
|
||||
add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
|
||||
specialize qw/vp10_fdct4x4 sse2/;
|
||||
|
|
|
@ -134,8 +134,6 @@ static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
|
|||
case FLIPADST_FLIPADST:
|
||||
case ADST_FLIPADST:
|
||||
case FLIPADST_ADST:
|
||||
vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
|
||||
break;
|
||||
case DST_DST:
|
||||
case DCT_DST:
|
||||
case DST_DCT:
|
||||
|
@ -143,8 +141,7 @@ static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
|
|||
case ADST_DST:
|
||||
case DST_FLIPADST:
|
||||
case FLIPADST_DST:
|
||||
// Use C version since DST exists only in C
|
||||
vp10_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
|
||||
vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
|
||||
break;
|
||||
case H_DCT:
|
||||
case V_DCT:
|
||||
|
|
|
@ -2420,6 +2420,351 @@ static void fadst16_8col(__m128i *in) {
|
|||
in[15] = _mm_sub_epi16(kZero, s[1]);
|
||||
}
|
||||
|
||||
#if CONFIG_EXT_TX
|
||||
static void fdst16_8col(__m128i *in) {
|
||||
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
|
||||
const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t) cospi_16_64);
|
||||
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
|
||||
const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
|
||||
|
||||
const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t) -cospi_16_64);
|
||||
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
|
||||
const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
|
||||
const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
|
||||
const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
|
||||
const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
|
||||
|
||||
const __m128i k__cospi_m08_m24 = pair_set_epi16(-cospi_8_64, -cospi_24_64);
|
||||
const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
|
||||
|
||||
const __m128i k__cospi_m30_p02 = pair_set_epi16(-cospi_30_64, cospi_2_64);
|
||||
const __m128i k__cospi_m14_p18 = pair_set_epi16(-cospi_14_64, cospi_18_64);
|
||||
const __m128i k__cospi_m22_p10 = pair_set_epi16(-cospi_22_64, cospi_10_64);
|
||||
const __m128i k__cospi_m06_p26 = pair_set_epi16(-cospi_6_64, cospi_26_64);
|
||||
const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
|
||||
const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
|
||||
const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
|
||||
const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
|
||||
|
||||
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
|
||||
|
||||
__m128i u0, u1, u2, u3, u4, u5, u6, u7;
|
||||
__m128i v0, v1, v2, v3, v4, v5, v6, v7;
|
||||
__m128i s0, s1, s2, s3, s4, s5, s6, s7;
|
||||
__m128i x0, x1, x2, x3, t0, t1, t2, t3;
|
||||
__m128i y0, y1, y2, y3, y4, y5, y6, y7;
|
||||
__m128i w0, w1, w2, w3, w4, w5, w6, w7;
|
||||
|
||||
// (1)
|
||||
u0 = _mm_sub_epi16(in[0], in[15]);
|
||||
v7 = _mm_add_epi16(in[0], in[15]);
|
||||
|
||||
u1 = _mm_sub_epi16(in[1], in[14]); // -u1
|
||||
v6 = _mm_add_epi16(in[1], in[14]); // -v6
|
||||
|
||||
u2 = _mm_sub_epi16(in[2], in[13]);
|
||||
v5 = _mm_add_epi16(in[2], in[13]);
|
||||
|
||||
u3 = _mm_sub_epi16(in[3], in[12]); // -u3
|
||||
v4 = _mm_add_epi16(in[3], in[12]); // -v4
|
||||
|
||||
u4 = _mm_sub_epi16(in[4], in[11]);
|
||||
v3 = _mm_add_epi16(in[4], in[11]);
|
||||
|
||||
u5 = _mm_sub_epi16(in[5], in[10]); // -u5
|
||||
v2 = _mm_add_epi16(in[5], in[10]); // -v2
|
||||
|
||||
u6 = _mm_sub_epi16(in[6], in[9]);
|
||||
v1 = _mm_add_epi16(in[6], in[9]);
|
||||
|
||||
u7 = _mm_sub_epi16(in[7], in[8]); // -u7
|
||||
v0 = _mm_add_epi16(in[7], in[8]); // -v0
|
||||
|
||||
s0 = _mm_sub_epi16(u0, u7);
|
||||
s1 = _mm_sub_epi16(u1, u6); // -s1
|
||||
s2 = _mm_sub_epi16(u2, u5);
|
||||
s3 = _mm_sub_epi16(u3, u4); // -s3
|
||||
s4 = _mm_add_epi16(u3, u4); // -s4
|
||||
s5 = _mm_add_epi16(u2, u5);
|
||||
s6 = _mm_add_epi16(u1, u6); // -s6
|
||||
s7 = _mm_add_epi16(u0, u7);
|
||||
|
||||
x0 = _mm_sub_epi16(s0, s3);
|
||||
x1 = _mm_sub_epi16(s1, s2); // -x1
|
||||
x2 = _mm_add_epi16(s1, s2); // -x2
|
||||
x3 = _mm_add_epi16(s0, s3);
|
||||
|
||||
y0 = _mm_unpacklo_epi16(x0, x1);
|
||||
y1 = _mm_unpackhi_epi16(x0, x1);
|
||||
y2 = _mm_unpacklo_epi16(x2, x3);
|
||||
y3 = _mm_unpackhi_epi16(x2, x3);
|
||||
|
||||
t0 = _mm_madd_epi16(y0, k__cospi_p16_m16);
|
||||
t1 = _mm_madd_epi16(y1, k__cospi_p16_m16);
|
||||
t2 = _mm_madd_epi16(y0, k__cospi_p16_p16);
|
||||
t3 = _mm_madd_epi16(y1, k__cospi_p16_p16);
|
||||
x0 = _mm_madd_epi16(y2, k__cospi_m24_p08);
|
||||
x1 = _mm_madd_epi16(y3, k__cospi_m24_p08);
|
||||
x2 = _mm_madd_epi16(y2, k__cospi_p08_p24);
|
||||
x3 = _mm_madd_epi16(y3, k__cospi_p08_p24);
|
||||
|
||||
y0 = _mm_add_epi32(t0, k__DCT_CONST_ROUNDING);
|
||||
y1 = _mm_add_epi32(t1, k__DCT_CONST_ROUNDING);
|
||||
y2 = _mm_add_epi32(t2, k__DCT_CONST_ROUNDING);
|
||||
y3 = _mm_add_epi32(t3, k__DCT_CONST_ROUNDING);
|
||||
y4 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
|
||||
y5 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
|
||||
y6 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
|
||||
y7 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
|
||||
|
||||
t0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
|
||||
t1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
|
||||
t2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
|
||||
t3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
|
||||
x0 = _mm_srai_epi32(y4, DCT_CONST_BITS);
|
||||
x1 = _mm_srai_epi32(y5, DCT_CONST_BITS);
|
||||
x2 = _mm_srai_epi32(y6, DCT_CONST_BITS);
|
||||
x3 = _mm_srai_epi32(y7, DCT_CONST_BITS);
|
||||
|
||||
in[15] = _mm_packs_epi32(t0, t1);
|
||||
in[11] = _mm_packs_epi32(x0, x1);
|
||||
in[7] = _mm_packs_epi32(t2, t3);
|
||||
in[3] = _mm_packs_epi32(x2, x3);
|
||||
|
||||
// (2)
|
||||
t0 = _mm_unpacklo_epi16(s6, s5);
|
||||
t1 = _mm_unpackhi_epi16(s6, s5);
|
||||
|
||||
y0 = _mm_madd_epi16(t0, k__cospi_m16_m16);
|
||||
y1 = _mm_madd_epi16(t1, k__cospi_m16_m16);
|
||||
y2 = _mm_madd_epi16(t0, k__cospi_m16_p16);
|
||||
y3 = _mm_madd_epi16(t1, k__cospi_m16_p16);
|
||||
|
||||
x0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
|
||||
x1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
|
||||
x2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
|
||||
x3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
|
||||
|
||||
y4 = _mm_srai_epi32(x0, DCT_CONST_BITS);
|
||||
y5 = _mm_srai_epi32(x1, DCT_CONST_BITS);
|
||||
y6 = _mm_srai_epi32(x2, DCT_CONST_BITS);
|
||||
y7 = _mm_srai_epi32(x3, DCT_CONST_BITS);
|
||||
|
||||
t2 = _mm_packs_epi32(y4, y5);
|
||||
t3 = _mm_packs_epi32(y6, y7);
|
||||
|
||||
x0 = _mm_sub_epi16(s4, t2); // -x0
|
||||
x1 = _mm_add_epi16(s4, t2); // -x1
|
||||
x2 = _mm_sub_epi16(s7, t3);
|
||||
x3 = _mm_add_epi16(s7, t3);
|
||||
|
||||
y0 = _mm_unpacklo_epi16(x0, x3);
|
||||
y1 = _mm_unpackhi_epi16(x0, x3);
|
||||
y2 = _mm_unpacklo_epi16(x1, x2);
|
||||
y3 = _mm_unpackhi_epi16(x1, x2);
|
||||
|
||||
w0 = _mm_madd_epi16(y0, k__cospi_m28_p04);
|
||||
w1 = _mm_madd_epi16(y1, k__cospi_m28_p04);
|
||||
w2 = _mm_madd_epi16(y2, k__cospi_m12_p20);
|
||||
w3 = _mm_madd_epi16(y3, k__cospi_m12_p20);
|
||||
w4 = _mm_madd_epi16(y2, k__cospi_p20_p12);
|
||||
w5 = _mm_madd_epi16(y3, k__cospi_p20_p12);
|
||||
w6 = _mm_madd_epi16(y0, k__cospi_p04_p28);
|
||||
w7 = _mm_madd_epi16(y1, k__cospi_p04_p28);
|
||||
|
||||
u0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
|
||||
u1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
|
||||
u2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
|
||||
u3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
|
||||
u4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
|
||||
u5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
|
||||
u6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
|
||||
u7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
|
||||
|
||||
y0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
|
||||
y1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
|
||||
y2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
|
||||
y3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
|
||||
y4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
|
||||
y5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
|
||||
y6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
|
||||
y7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
|
||||
|
||||
in[13] = _mm_packs_epi32(y0, y1);
|
||||
in[9] = _mm_packs_epi32(y4, y5);
|
||||
in[5] = _mm_packs_epi32(y2, y3);
|
||||
in[1] = _mm_packs_epi32(y6, y7);
|
||||
|
||||
// (3)
|
||||
y0 = _mm_unpacklo_epi16(v5, v2);
|
||||
y1 = _mm_unpackhi_epi16(v5, v2);
|
||||
y2 = _mm_unpacklo_epi16(v4, v3);
|
||||
y3 = _mm_unpackhi_epi16(v4, v3);
|
||||
|
||||
u0 = _mm_madd_epi16(y0, k__cospi_p16_p16);
|
||||
u1 = _mm_madd_epi16(y1, k__cospi_p16_p16);
|
||||
u2 = _mm_madd_epi16(y2, k__cospi_m16_m16);
|
||||
u3 = _mm_madd_epi16(y3, k__cospi_m16_m16);
|
||||
u4 = _mm_madd_epi16(y2, k__cospi_m16_p16);
|
||||
u5 = _mm_madd_epi16(y3, k__cospi_m16_p16);
|
||||
u6 = _mm_madd_epi16(y0, k__cospi_p16_m16);
|
||||
u7 = _mm_madd_epi16(y1, k__cospi_p16_m16);
|
||||
|
||||
w0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
|
||||
w1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
|
||||
w2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
|
||||
w3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
|
||||
w4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
|
||||
w5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
|
||||
w6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
|
||||
w7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
|
||||
|
||||
s0 = _mm_srai_epi32(w0, DCT_CONST_BITS);
|
||||
s1 = _mm_srai_epi32(w1, DCT_CONST_BITS);
|
||||
s2 = _mm_srai_epi32(w2, DCT_CONST_BITS);
|
||||
s3 = _mm_srai_epi32(w3, DCT_CONST_BITS);
|
||||
s4 = _mm_srai_epi32(w4, DCT_CONST_BITS);
|
||||
s5 = _mm_srai_epi32(w5, DCT_CONST_BITS);
|
||||
s6 = _mm_srai_epi32(w6, DCT_CONST_BITS);
|
||||
s7 = _mm_srai_epi32(w7, DCT_CONST_BITS);
|
||||
|
||||
y2 = _mm_packs_epi32(s0, s1);
|
||||
y3 = _mm_packs_epi32(s2, s3);
|
||||
y4 = _mm_packs_epi32(s4, s5);
|
||||
y5 = _mm_packs_epi32(s6, s7);
|
||||
|
||||
// step 3
|
||||
w0 = _mm_sub_epi16(v0, y3); // -w0
|
||||
w1 = _mm_add_epi16(v1, y2);
|
||||
w2 = _mm_sub_epi16(v1, y2);
|
||||
w3 = _mm_add_epi16(v0, y3); // -w3
|
||||
w4 = _mm_sub_epi16(v7, y4);
|
||||
w5 = _mm_add_epi16(v6, y5); // -w5
|
||||
w6 = _mm_sub_epi16(v6, y5); // -w6
|
||||
w7 = _mm_add_epi16(v7, y4);
|
||||
|
||||
// step 4
|
||||
x0 = _mm_unpacklo_epi16(w1, w6);
|
||||
x1 = _mm_unpackhi_epi16(w1, w6);
|
||||
x2 = _mm_unpacklo_epi16(w2, w5);
|
||||
x3 = _mm_unpackhi_epi16(w2, w5);
|
||||
|
||||
u0 = _mm_madd_epi16(x0, k__cospi_m08_m24);
|
||||
u1 = _mm_madd_epi16(x1, k__cospi_m08_m24);
|
||||
u2 = _mm_madd_epi16(x2, k__cospi_p24_m08);
|
||||
u3 = _mm_madd_epi16(x3, k__cospi_p24_m08);
|
||||
u4 = _mm_madd_epi16(x2, k__cospi_p08_p24);
|
||||
u5 = _mm_madd_epi16(x3, k__cospi_p08_p24);
|
||||
u6 = _mm_madd_epi16(x0, k__cospi_p24_m08);
|
||||
u7 = _mm_madd_epi16(x1, k__cospi_p24_m08);
|
||||
|
||||
s0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
|
||||
s1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
|
||||
s2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
|
||||
s3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
|
||||
s4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
|
||||
s5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
|
||||
s6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
|
||||
s7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
|
||||
|
||||
u0 = _mm_srai_epi32(s0, DCT_CONST_BITS);
|
||||
u1 = _mm_srai_epi32(s1, DCT_CONST_BITS);
|
||||
u2 = _mm_srai_epi32(s2, DCT_CONST_BITS);
|
||||
u3 = _mm_srai_epi32(s3, DCT_CONST_BITS);
|
||||
u4 = _mm_srai_epi32(s4, DCT_CONST_BITS);
|
||||
u5 = _mm_srai_epi32(s5, DCT_CONST_BITS);
|
||||
u6 = _mm_srai_epi32(s6, DCT_CONST_BITS);
|
||||
u7 = _mm_srai_epi32(s7, DCT_CONST_BITS);
|
||||
|
||||
y1 = _mm_packs_epi32(u0, u1);
|
||||
y2 = _mm_packs_epi32(u2, u3);
|
||||
y5 = _mm_packs_epi32(u4, u5);
|
||||
y6 = _mm_packs_epi32(u6, u7);
|
||||
|
||||
// step 5
|
||||
v0 = _mm_sub_epi16(w0, y1); // -v0
|
||||
v1 = _mm_add_epi16(w0, y1); // -v1
|
||||
v2 = _mm_sub_epi16(w3, y2); // -v2
|
||||
v3 = _mm_add_epi16(w3, y2); // -v3
|
||||
v4 = _mm_sub_epi16(w4, y5);
|
||||
v5 = _mm_add_epi16(w4, y5);
|
||||
v6 = _mm_sub_epi16(w7, y6);
|
||||
v7 = _mm_add_epi16(w7, y6);
|
||||
|
||||
u0 = _mm_unpacklo_epi16(v0, v7);
|
||||
u1 = _mm_unpackhi_epi16(v0, v7);
|
||||
u2 = _mm_unpacklo_epi16(v1, v6);
|
||||
u3 = _mm_unpackhi_epi16(v1, v6);
|
||||
u4 = _mm_unpacklo_epi16(v2, v5);
|
||||
u5 = _mm_unpackhi_epi16(v2, v5);
|
||||
u6 = _mm_unpacklo_epi16(v3, v4);
|
||||
u7 = _mm_unpackhi_epi16(v3, v4);
|
||||
|
||||
s0 = _mm_madd_epi16(u0, k__cospi_m30_p02); // x0
|
||||
s1 = _mm_madd_epi16(u1, k__cospi_m30_p02);
|
||||
s2 = _mm_madd_epi16(u2, k__cospi_m14_p18); // x1
|
||||
s3 = _mm_madd_epi16(u3, k__cospi_m14_p18);
|
||||
s4 = _mm_madd_epi16(u4, k__cospi_m22_p10); // x2
|
||||
s5 = _mm_madd_epi16(u5, k__cospi_m22_p10);
|
||||
s6 = _mm_madd_epi16(u6, k__cospi_m06_p26); // x3
|
||||
s7 = _mm_madd_epi16(u7, k__cospi_m06_p26);
|
||||
|
||||
w0 = _mm_madd_epi16(u6, k__cospi_p26_p06); // x4
|
||||
w1 = _mm_madd_epi16(u7, k__cospi_p26_p06);
|
||||
w2 = _mm_madd_epi16(u4, k__cospi_p10_p22); // x5
|
||||
w3 = _mm_madd_epi16(u5, k__cospi_p10_p22);
|
||||
w4 = _mm_madd_epi16(u2, k__cospi_p18_p14); // x6
|
||||
w5 = _mm_madd_epi16(u3, k__cospi_p18_p14);
|
||||
w6 = _mm_madd_epi16(u0, k__cospi_p02_p30); // x7
|
||||
w7 = _mm_madd_epi16(u1, k__cospi_p02_p30);
|
||||
|
||||
v0 = _mm_add_epi32(s0, k__DCT_CONST_ROUNDING);
|
||||
v1 = _mm_add_epi32(s1, k__DCT_CONST_ROUNDING);
|
||||
v2 = _mm_add_epi32(s2, k__DCT_CONST_ROUNDING);
|
||||
v3 = _mm_add_epi32(s3, k__DCT_CONST_ROUNDING);
|
||||
v4 = _mm_add_epi32(s4, k__DCT_CONST_ROUNDING);
|
||||
v5 = _mm_add_epi32(s5, k__DCT_CONST_ROUNDING);
|
||||
v6 = _mm_add_epi32(s6, k__DCT_CONST_ROUNDING);
|
||||
v7 = _mm_add_epi32(s7, k__DCT_CONST_ROUNDING);
|
||||
|
||||
y0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
|
||||
y1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
|
||||
y2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
|
||||
y3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
|
||||
y4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
|
||||
y5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
|
||||
y6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
|
||||
y7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
|
||||
|
||||
u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
|
||||
u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
|
||||
u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
|
||||
u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
|
||||
u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
|
||||
u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
|
||||
u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
|
||||
u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
|
||||
|
||||
s0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
|
||||
s1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
|
||||
s2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
|
||||
s3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
|
||||
s4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
|
||||
s5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
|
||||
s6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
|
||||
s7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
|
||||
|
||||
in[14] = _mm_packs_epi32(u0, u1);
|
||||
in[6] = _mm_packs_epi32(u2, u3);
|
||||
in[10] = _mm_packs_epi32(u4, u5);
|
||||
in[2] = _mm_packs_epi32(u6, u7);
|
||||
in[12] = _mm_packs_epi32(s0, s1);
|
||||
in[4] = _mm_packs_epi32(s2, s3);
|
||||
in[8] = _mm_packs_epi32(s4, s5);
|
||||
in[0] = _mm_packs_epi32(s6, s7);
|
||||
}
|
||||
#endif // CONFIG_EXT_TX
|
||||
|
||||
static void fdct16_sse2(__m128i *in0, __m128i *in1) {
|
||||
fdct16_8col(in0);
|
||||
fdct16_8col(in1);
|
||||
|
@ -2432,6 +2777,14 @@ static void fadst16_sse2(__m128i *in0, __m128i *in1) {
|
|||
array_transpose_16x16(in0, in1);
|
||||
}
|
||||
|
||||
#if CONFIG_EXT_TX
|
||||
// One 16-point forward DST pass over a 16x16 block held as two 16-row
// halves: each half goes through fdst16_8col() in turn, then
// array_transpose_16x16() is applied to the pair.
static void fdst16_sse2(__m128i *in0, __m128i *in1) {
  // Transform the two halves independently.
  fdst16_8col(in0);
  fdst16_8col(in1);

  // Transpose across both halves.
  array_transpose_16x16(in0, in1);
}
|
||||
#endif // CONFIG_EXT_TX
|
||||
|
||||
void vp10_fht16x16_sse2(const int16_t *input, tran_low_t *output,
|
||||
int stride, int tx_type) {
|
||||
__m128i in0[16], in1[16];
|
||||
|
@ -2497,6 +2850,55 @@ void vp10_fht16x16_sse2(const int16_t *input, tran_low_t *output,
|
|||
fadst16_sse2(in0, in1);
|
||||
write_buffer_16x16(output, in0, in1, 16);
|
||||
break;
|
||||
case DST_DST:
|
||||
load_buffer_16x16(input, in0, in1, stride, 0, 0);
|
||||
fdst16_sse2(in0, in1);
|
||||
right_shift_16x16(in0, in1);
|
||||
fdst16_sse2(in0, in1);
|
||||
write_buffer_16x16(output, in0, in1, 16);
|
||||
break;
|
||||
case DCT_DST:
|
||||
load_buffer_16x16(input, in0, in1, stride, 0, 0);
|
||||
fdct16_sse2(in0, in1);
|
||||
right_shift_16x16(in0, in1);
|
||||
fdst16_sse2(in0, in1);
|
||||
write_buffer_16x16(output, in0, in1, 16);
|
||||
break;
|
||||
case DST_DCT:
|
||||
load_buffer_16x16(input, in0, in1, stride, 0, 0);
|
||||
fdst16_sse2(in0, in1);
|
||||
right_shift_16x16(in0, in1);
|
||||
fdct16_sse2(in0, in1);
|
||||
write_buffer_16x16(output, in0, in1, 16);
|
||||
break;
|
||||
case DST_ADST:
|
||||
load_buffer_16x16(input, in0, in1, stride, 0, 0);
|
||||
fdst16_sse2(in0, in1);
|
||||
right_shift_16x16(in0, in1);
|
||||
fadst16_sse2(in0, in1);
|
||||
write_buffer_16x16(output, in0, in1, 16);
|
||||
break;
|
||||
case ADST_DST:
|
||||
load_buffer_16x16(input, in0, in1, stride, 0, 0);
|
||||
fadst16_sse2(in0, in1);
|
||||
right_shift_16x16(in0, in1);
|
||||
fdst16_sse2(in0, in1);
|
||||
write_buffer_16x16(output, in0, in1, 16);
|
||||
break;
|
||||
case DST_FLIPADST:
|
||||
load_buffer_16x16(input, in0, in1, stride, 0, 1);
|
||||
fdst16_sse2(in0, in1);
|
||||
right_shift_16x16(in0, in1);
|
||||
fadst16_sse2(in0, in1);
|
||||
write_buffer_16x16(output, in0, in1, 16);
|
||||
break;
|
||||
case FLIPADST_DST:
|
||||
load_buffer_16x16(input, in0, in1, stride, 1, 0);
|
||||
fadst16_sse2(in0, in1);
|
||||
right_shift_16x16(in0, in1);
|
||||
fdst16_sse2(in0, in1);
|
||||
write_buffer_16x16(output, in0, in1, 16);
|
||||
break;
|
||||
#endif // CONFIG_EXT_TX
|
||||
default:
|
||||
assert(0);
|
||||
|
|
Загрузка…
Ссылка в новой задаче