From a4593f17ca8b35c5ea70a503ffa68fa6edbee5f3 Mon Sep 17 00:00:00 2001
From: Yi Luo
Date: Mon, 25 Apr 2016 09:41:11 -0700
Subject: [PATCH] HBD hybrid transform 4x4 SSE4.1 optimization

- Optimize tx_type DCT_DCT, DCT_ADST, ADST_DCT and ADST_ADST.
- Overall encoder speed improves by ~4.5%-6%.
- Update the bit-exact unit test against the current C version.

Change-Id: If751c030612245b1c2470200c9570cf40d655504
---
 test/vp10_fht4x4_test.cc                | 239 ++++++++----------------
 vp10/common/vp10_fwd_txfm2d.c           |   2 +-
 vp10/common/x86/vp10_fwd_txfm2d_sse4.c  |   9 -
 vp10/encoder/x86/highbd_fwd_txfm_sse4.c | 138 +++++++++++---
 4 files changed, 191 insertions(+), 197 deletions(-)

diff --git a/test/vp10_fht4x4_test.cc b/test/vp10_fht4x4_test.cc
index 63d9ec7ed..9daf063f1 100644
--- a/test/vp10_fht4x4_test.cc
+++ b/test/vp10_fht4x4_test.cc
@@ -25,9 +25,9 @@ using libvpx_test::ACMRandom;
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         int tx_type);
-
+using std::tr1::tuple;
 using libvpx_test::FhtFunc;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t, int>
-Ht4x4Param;
+typedef tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t, int> Ht4x4Param;
 
 void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
@@ -37,13 +37,14 @@ void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
 #if CONFIG_VP9_HIGHBITDEPTH
 typedef void (*IhighbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                               int tx_type, int bd);
+typedef void (*HBDFhtFunc)(const int16_t *input, int32_t *output, int stride,
+                           int tx_type, int bd);
+// Target optimized function, tx_type, bit depth
+typedef tuple<HBDFhtFunc, int, int> HighbdHt4x4Param;
-typedef std::tr1::tuple<FhtFunc, IhighbdHtFunc, int, vpx_bit_depth_t, int>
-HighbdHt4x4Param;
-
-void highbe_fht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
-                       int tx_type) {
-  vp10_highbd_fht4x4_c(in, out, stride, tx_type);
+void highbd_fht4x4_ref(const int16_t *in, int32_t *out, int stride,
+                       int tx_type, int bd) {
+  vp10_fwd_txfm2d_4x4_c(in, out, stride, tx_type, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -83,98 +84,76 @@ TEST_P(VP10Trans4x4HT, CoeffCheck) {
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-class VP10HighbdTrans4x4HT
-    : public libvpx_test::TransformTestBase,
-      public ::testing::TestWithParam<HighbdHt4x4Param> {
+class VP10HighbdTrans4x4HT : public ::testing::TestWithParam<HighbdHt4x4Param> {
  public:
   virtual ~VP10HighbdTrans4x4HT() {}
 
   virtual void SetUp() {
     fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    tx_type_ = GET_PARAM(2);
-    pitch_ = 4;
-    fwd_txfm_ref = highbe_fht4x4_ref;
-    bit_depth_ = GET_PARAM(3);
+    fwd_txfm_ref_ = highbd_fht4x4_ref;
+    tx_type_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
     mask_ = (1 << bit_depth_) - 1;
-    num_coeffs_ = GET_PARAM(4);
+    num_coeffs_ = 16;
+
+    input_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(int16_t) * num_coeffs_));
+    output_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
+    output_ref_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
+  }
+
+  virtual void TearDown() {
+    vpx_free(input_);
+    vpx_free(output_);
+    vpx_free(output_ref_);
+    libvpx_test::ClearSystemState();
   }
-
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
  protected:
-  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride, tx_type_);
-  }
+  void RunBitexactCheck();
 
-  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride, tx_type_, bit_depth_);
-  }
-
-  FhtFunc fwd_txfm_;
-  IhighbdHtFunc inv_txfm_;
+ private:
+  HBDFhtFunc fwd_txfm_;
+  HBDFhtFunc fwd_txfm_ref_;
+  int tx_type_;
+  int bit_depth_;
+  int mask_;
+  int num_coeffs_;
+  int16_t *input_;
+  int32_t *output_;
+  int32_t *output_ref_;
 };
 
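+// Feeds 200,000 pseudo-random 4x4 residual blocks, with samples in
+// [-mask_, mask_], through both the SSE4.1 transform and the C reference,
+// and requires bit-exact output coefficients.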
+void VP10HighbdTrans4x4HT::RunBitexactCheck() {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int i, j;
+  const int stride = 4;
+  const int num_tests = 200000;
+  const int num_coeffs = 16;
+
+  for (i = 0; i < num_tests; ++i) {
+    for (j = 0; j < num_coeffs; ++j) {
+      input_[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+    }
+
+    fwd_txfm_ref_(input_, output_ref_, stride, tx_type_, bit_depth_);
+    fwd_txfm_(input_, output_, stride, tx_type_, bit_depth_);
+
+    for (j = 0; j < num_coeffs; ++j) {
+      EXPECT_EQ(output_[j], output_ref_[j])
+          << "Not bit-exact result at index: " << j
+          << " at test block: " << i;
+    }
+  }
+}
+
 TEST_P(VP10HighbdTrans4x4HT, HighbdCoeffCheck) {
-  RunCoeffCheck();
+  RunBitexactCheck();
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#define SPEED_TEST (0)
-#if SPEED_TEST
-#if CONFIG_EXT_TX
-TEST(VP10Trans4x4HTSpeedTest, C_version) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 200000;
-  int bit_depth = 8;
-  int mask = (1 << bit_depth) - 1;
-  const int num_coeffs = 16;
-  int16_t *input = new int16_t[num_coeffs];
-  tran_low_t *output = new tran_low_t[num_coeffs];
-  const int stride = 4;
-  int tx_type;
-
-  for (int j = 0; j < num_coeffs; ++j) {
-    input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask);
-  }
-  for (int i = 0; i < count_test_block; ++i) {
-    for (tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) {
-      vp10_fht4x4_c(input, output, stride, tx_type);
-    }
-  }
-
-  delete[] input;
-  delete[] output;
-}
-#endif  // CONFIG_EXT_TX
-
-#if HAVE_SSE2 && CONFIG_EXT_TX
-TEST(VP10Trans4x4HTSpeedTest, SSE2_version) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 200000;
-  int bit_depth = 8;
-  int mask = (1 << bit_depth) - 1;
-  const int num_coeffs = 16;
-  int16_t *input = new int16_t[num_coeffs];
-  tran_low_t *output = reinterpret_cast<tran_low_t *>(
-      vpx_memalign(16, num_coeffs * sizeof(tran_low_t)));
-  const int stride = 4;
-  int tx_type;
-
-  for (int j = 0; j < num_coeffs; ++j) {
-    input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask);
-  }
-  for (int i = 0; i < count_test_block; ++i) {
-    for (tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) {
-      vp10_fht4x4_sse2(input, output, stride, tx_type);
-    }
-  }
-
-  delete[] input;
-  vpx_free(output);
-}
-#endif  // HAVE_SSE2 && CONFIG_EXT_TX
-#endif  // SPEED_TEST
-
 using std::tr1::make_tuple;
 
 #if HAVE_SSE2
@@ -229,83 +208,23 @@ INSTANTIATE_TEST_CASE_P(
     SSE4_1, VP10HighbdTrans4x4HT,
     ::testing::Values(
 #if !CONFIG_EXT_TX
-        // make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 0,
-        //            VPX_BITS_10, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 1,
-                   VPX_BITS_10, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 2,
-                   VPX_BITS_10, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 3,
-                   VPX_BITS_10, 16),
-        // make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 0,
-        //            VPX_BITS_12, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 1,
-                   VPX_BITS_12, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 2,
-                   VPX_BITS_12, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 3,
-                   VPX_BITS_12, 16)));
+        make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 0, 10),
+        make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 0, 12),
+        make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 1, 10),
+        make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 1, 12),
+        make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 2, 10),
+        make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 2, 12),
+        make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 3, 10),
+        make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 3, 12)));
 #else
-        // make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 0,
-        //            VPX_BITS_10, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 1,
-                   VPX_BITS_10, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 2,
-                   VPX_BITS_10, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 3,
-                   VPX_BITS_10, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 4,
-                   VPX_BITS_10, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 5,
-                   VPX_BITS_10, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 6,
-                   VPX_BITS_10, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 7,
-                   VPX_BITS_10, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 8,
-                   VPX_BITS_10, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 10,
-                   VPX_BITS_10, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 11,
-                   VPX_BITS_10, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 12,
-                   VPX_BITS_10, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 13,
-                   VPX_BITS_10, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 14,
-                   VPX_BITS_10, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 15,
-                   VPX_BITS_10, 16),
-        // make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 0,
-        //            VPX_BITS_12, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 1,
-                   VPX_BITS_12, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 2,
-                   VPX_BITS_12, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 3,
-                   VPX_BITS_12, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 4,
-                   VPX_BITS_12, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 5,
-                   VPX_BITS_12, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 6,
-                   VPX_BITS_12, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 7,
-                   VPX_BITS_12, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 8,
-                   VPX_BITS_12, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 10,
-                   VPX_BITS_12, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 11,
-                   VPX_BITS_12, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 12,
-                   VPX_BITS_12, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 13,
-                   VPX_BITS_12, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 14,
-                   VPX_BITS_12, 16),
-        make_tuple(&vp10_highbd_fht4x4_sse4_1, &vp10_highbd_iht4x4_16_add_c, 15,
-                   VPX_BITS_12, 16)));
+        make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 0, 10),
+        make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 0, 12),
+        make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 1, 10),
+        make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 1, 12),
+        make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 2, 10),
+        make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 2, 12),
+        make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 3, 10),
+        make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 3, 12)));
 #endif  // !CONFIG_EXT_TX
 #endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp10/common/vp10_fwd_txfm2d.c b/vp10/common/vp10_fwd_txfm2d.c
index 00f883449..ccb820f0a 100644
--- a/vp10/common/vp10_fwd_txfm2d.c
+++ b/vp10/common/vp10_fwd_txfm2d.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
  *
  * Use of this source code is governed by a BSD-style license
  * that can be found in the LICENSE file in the root of the source
diff --git a/vp10/common/x86/vp10_fwd_txfm2d_sse4.c b/vp10/common/x86/vp10_fwd_txfm2d_sse4.c
index d884571f2..1b1108723 100644
--- a/vp10/common/x86/vp10_fwd_txfm2d_sse4.c
+++ b/vp10/common/x86/vp10_fwd_txfm2d_sse4.c
@@ -87,15 +87,6 @@ static inline void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
   transpose_32(txfm_size, buf_128, out_128);
 }
 
-void vp10_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *output,
-                                const int stride, int tx_type,
-                                const int bd) {
-  int32_t txfm_buf[16];
-  const TXFM_2D_CFG* cfg = vp10_get_txfm_4x4_cfg(tx_type);
-  (void)bd;
-  fwd_txfm2d_sse4_1(input, output, stride, cfg, txfm_buf);
-}
-
 void vp10_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *output,
                                 const int stride, int tx_type,
                                 const int bd) {
diff --git a/vp10/encoder/x86/highbd_fwd_txfm_sse4.c b/vp10/encoder/x86/highbd_fwd_txfm_sse4.c
index 16323b3ce..3cda783a7 100644
--- a/vp10/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/vp10/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -111,52 +111,136 @@ static void fdct4x4_sse4_1(__m128i *in, int bit) {
   in[3] = _mm_unpackhi_epi64(v1, v3);
 }
 
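+// write_buffer_4x4() emits one 4x4 block of 32-bit coefficients with aligned
+// stores, so `output` must be 16-byte aligned (the unit test allocates it
+// with vpx_memalign(16, ...)).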
-static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
+static INLINE void write_buffer_4x4(__m128i *res, tran_low_t *output) {
   _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
   _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
   _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
   _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
 }
 
+// Note:
+// We now implement vp10_fwd_txfm2d_4x4(); this function is kept as a stub
+// because vp10_highbd_fht4x4_c() has not been removed yet.
 void vp10_highbd_fht4x4_sse4_1(const int16_t *input, tran_low_t *output,
                                int stride, int tx_type) {
+  (void)input;
+  (void)output;
+  (void)stride;
+  (void)tx_type;
+  assert(0);
+}
+
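+// 4-point forward ADST operating on four columns held in 32-bit lanes.
+// Each _mm_mullo_epi32 / add-rounding / _mm_srai_epi32 sequence below is a
+// fixed-point multiply by a scaled trig constant:
+//   (x * cospi[k] + (1 << (bit - 1))) >> bit.
+// The final unpack sequence is a 4x4 32-bit transpose, so the same kernel
+// serves both the column and the row pass.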
+static void fadst4x4_sse4_1(__m128i *in, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i kZero = _mm_setzero_si128();
+  __m128i s0, s1, s2, s3;
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3;
+
+  // stage 0
+  // stage 1
+  // stage 2
+  u0 = _mm_mullo_epi32(in[3], cospi8);
+  u1 = _mm_mullo_epi32(in[0], cospi56);
+  u2 = _mm_add_epi32(u0, u1);
+  s0 = _mm_add_epi32(u2, rnding);
+  s0 = _mm_srai_epi32(s0, bit);
+
+  v0 = _mm_mullo_epi32(in[3], cospi56);
+  v1 = _mm_mullo_epi32(in[0], cospi8);
+  v2 = _mm_sub_epi32(v0, v1);
+  s1 = _mm_add_epi32(v2, rnding);
+  s1 = _mm_srai_epi32(s1, bit);
+
+  u0 = _mm_mullo_epi32(in[1], cospi40);
+  u1 = _mm_mullo_epi32(in[2], cospi24);
+  u2 = _mm_add_epi32(u0, u1);
+  s2 = _mm_add_epi32(u2, rnding);
+  s2 = _mm_srai_epi32(s2, bit);
+
+  v0 = _mm_mullo_epi32(in[1], cospi24);
+  v1 = _mm_mullo_epi32(in[2], cospi40);
+  v2 = _mm_sub_epi32(v0, v1);
+  s3 = _mm_add_epi32(v2, rnding);
+  s3 = _mm_srai_epi32(s3, bit);
+
+  // stage 3
+  u0 = _mm_add_epi32(s0, s2);
+  u2 = _mm_sub_epi32(s0, s2);
+  u1 = _mm_add_epi32(s1, s3);
+  u3 = _mm_sub_epi32(s1, s3);
+
+  // stage 4
+  v0 = _mm_mullo_epi32(u2, cospi32);
+  v1 = _mm_mullo_epi32(u3, cospi32);
+  v2 = _mm_add_epi32(v0, v1);
+  s2 = _mm_add_epi32(v2, rnding);
+  u2 = _mm_srai_epi32(s2, bit);
+
+  v2 = _mm_sub_epi32(v0, v1);
+  s3 = _mm_add_epi32(v2, rnding);
+  u3 = _mm_srai_epi32(s3, bit);
+
+  // u0, u1, u2, u3
+  u2 = _mm_sub_epi32(kZero, u2);
+  u1 = _mm_sub_epi32(kZero, u1);
+
+  // u0, u2, u3, u1
+  // Transpose 4x4 32-bit
+  v0 = _mm_unpacklo_epi32(u0, u2);
+  v1 = _mm_unpackhi_epi32(u0, u2);
+  v2 = _mm_unpacklo_epi32(u3, u1);
+  v3 = _mm_unpackhi_epi32(u3, u1);
+
+  in[0] = _mm_unpacklo_epi64(v0, v2);
+  in[1] = _mm_unpackhi_epi64(v0, v2);
+  in[2] = _mm_unpacklo_epi64(v1, v3);
+  in[3] = _mm_unpackhi_epi64(v1, v3);
+}
+
+void vp10_fwd_txfm2d_4x4_sse4_1(const int16_t *input, tran_low_t *coeff,
+                                int input_stride, int tx_type,
+                                const int bd) {
   __m128i in[4];
-  const TXFM_2D_CFG *cfg;
-  int bit;
+  const TXFM_2D_CFG *cfg = NULL;
 
   switch (tx_type) {
     case DCT_DCT:
       cfg = &fwd_txfm_2d_cfg_dct_dct_4;
-      load_buffer_4x4(input, in, stride, 0, 0, cfg->shift[0]);
-      bit = cfg->cos_bit_col[2];
-      fdct4x4_sse4_1(in, bit);
-      bit = cfg->cos_bit_row[2];
-      fdct4x4_sse4_1(in, bit);
-      write_buffer_4x4(output, in);
+      load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
+      fdct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fdct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
       break;
     case ADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_4;
+      load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fdct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
     case DCT_ADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_4;
+      load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
+      fdct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
     case ADST_ADST:
-      vp10_highbd_fht4x4_c(input, output, stride, tx_type);
+      cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
       break;
-#if CONFIG_EXT_TX
-    case FLIPADST_DCT:
-    case DCT_FLIPADST:
-    case FLIPADST_FLIPADST:
-    case ADST_FLIPADST:
-    case FLIPADST_ADST:
-      vp10_highbd_fht4x4_c(input, output, stride, tx_type);
-      break;
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-      vp10_highbd_fht4x4_c(input, output, stride, tx_type);
-      break;
-#endif  // CONFIG_EXT_TX
     default:
       assert(0);
   }
+  (void)bd;
 }
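
For reference, a minimal calling sketch of the new entry point (not part of
the patch). The prototype is taken from the hunk above; the include path, the
DECLARE_ALIGNED macro, and the example function are illustrative assumptions:

#include <stdint.h>
#include "vpx_ports/mem.h"  /* DECLARE_ALIGNED(); include path assumed */

/* Prototype as added above; tran_low_t is 32-bit in high-bitdepth builds. */
void vp10_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
                                int input_stride, int tx_type, const int bd);

static void fwd_4x4_example(void) {
  /* 16-byte alignment matters: write_buffer_4x4() uses _mm_store_si128(). */
  DECLARE_ALIGNED(16, int16_t, src[16]) = { 0 };  /* 4x4 residual, stride 4 */
  DECLARE_ALIGNED(16, int32_t, coeff[16]);
  /* tx_type 0 is DCT_DCT in the test tuples above; bd = 10 for 10-bit. */
  vp10_fwd_txfm2d_4x4_sse4_1(src, coeff, 4, /*tx_type=*/0, /*bd=*/10);
}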