Merge "Highbd fht4x4 SSE4.1 optimization for DCT_DCT mode - Setup function vp10_highbd_fht4x4_sse4_1 for highbd SSE4.1 intrinsics optimization. - Wrote SSE4.1 functions: load_buffer_4x4(), write_buffer_4x4(), and fdct4x4_sse4_1(). - Used logic right shift to avoid coeff memory write/read. - Turned on vp10_highbd_fht4x4_sse4_1 for DCT_DCT mode only. - Improved overall encoding performance >2.3% for 50 frames sequence, park_joy_1080p_12.y4m, in which, --input-bit-depth=12, --bit-depth=12, 50 frames. - Unit test passed." into nextgenv2

2016-03-23 18:30:39 +00:00 · 2016-03-23 18:30:39 +00:00 · deb33056d1
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@ -670,7 +670,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {

  # fdct functions
  add_proto qw/void vp10_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp10_highbd_fht4x4/;
+  specialize qw/vp10_highbd_fht4x4 sse4_1/;

  add_proto qw/void vp10_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
  specialize qw/vp10_highbd_fht8x8/;
--- a/vp10/encoder/hybrid_fwd_txfm.c
+++ b/vp10/encoder/hybrid_fwd_txfm.c
@ -202,10 +202,12 @@ void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,

  switch (tx_type) {
    case DCT_DCT:
+      vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
+      break;
    case ADST_DCT:
    case DCT_ADST:
    case ADST_ADST:
-      vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
+      vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
      break;
 #if CONFIG_EXT_TX
    case FLIPADST_DCT:
@ -213,7 +215,7 @@ void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
    case FLIPADST_FLIPADST:
    case ADST_FLIPADST:
    case FLIPADST_ADST:
-      vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
+      vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
      break;
    case V_DCT:
    case H_DCT:
--- a/vp10/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/vp10/encoder/x86/highbd_fwd_txfm_sse4.c
@ -0,0 +1,211 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./vp10_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_ports/mem.h"
+
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+                                   int stride, int flipud, int fliplr) {
+  const __m128i k__nonzero_bias_a = _mm_setr_epi32(0, 1, 1, 1);
+  const __m128i k__nonzero_bias_b = _mm_setr_epi32(1, 0, 0, 0);
+  __m128i mask;
+
+  if (!flipud) {
+    in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+    in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+    in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+    in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+  } else {
+    in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+    in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+    in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+    in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+  }
+
+  if (fliplr) {
+    in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+    in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+    in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+    in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+  }
+
+  in[0] = _mm_cvtepi16_epi32(in[0]);
+  in[1] = _mm_cvtepi16_epi32(in[1]);
+  in[2] = _mm_cvtepi16_epi32(in[2]);
+  in[3] = _mm_cvtepi16_epi32(in[3]);
+
+  in[0] = _mm_slli_epi32(in[0], 4);
+  in[1] = _mm_slli_epi32(in[1], 4);
+  in[2] = _mm_slli_epi32(in[2], 4);
+  in[3] = _mm_slli_epi32(in[3], 4);
+
+  mask = _mm_cmpeq_epi32(in[0], k__nonzero_bias_a);
+  in[0] = _mm_add_epi32(in[0], mask);
+  in[0] = _mm_add_epi32(in[0], k__nonzero_bias_b);
+}
+
+static void fdct4x4_sse4_1(__m128i *in) {
+  const __m128i k__cospi_p16_p16 = _mm_set1_epi64x(cospi_16_64);
+  const __m128i k__cospi_m16_m16 = _mm_set1_epi64x(-cospi_16_64);
+  const __m128i k__cospi_p08_p08 = _mm_set1_epi64x(cospi_8_64);
+  const __m128i k__cospi_m08_m08 = _mm_set1_epi64x(-cospi_8_64);
+  const __m128i k__cospi_p24_p24 = _mm_set1_epi64x(cospi_24_64);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi64x(DCT_CONST_ROUNDING);
+
+  __m128i s[8];
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+
+  s[0] = _mm_add_epi32(in[0], in[3]);
+  s[1] = _mm_add_epi32(in[1], in[2]);
+  s[2] = _mm_sub_epi32(in[1], in[2]);
+  s[3] = _mm_sub_epi32(in[0], in[3]);
+
+  v0 = _mm_cvtepi32_epi64(s[0]);  // s01 s00
+  v1 = _mm_cvtepi32_epi64(_mm_unpackhi_epi64(s[0], s[0]));  // s03 s02
+  v2 = _mm_cvtepi32_epi64(s[1]);  // s11 s10
+  v3 = _mm_cvtepi32_epi64(_mm_unpackhi_epi64(s[1], s[1]));  // s13 s12
+  v4 = _mm_cvtepi32_epi64(s[2]);  // s21 s20
+  v5 = _mm_cvtepi32_epi64(_mm_unpackhi_epi64(s[2], s[2]));  // s23 s22
+  v6 = _mm_cvtepi32_epi64(s[3]);  // s31 s30
+  v7 = _mm_cvtepi32_epi64(_mm_unpackhi_epi64(s[3], s[3]));  // s33 s32
+
+  u0 = _mm_mul_epi32(v0, k__cospi_p16_p16);
+  u1 = _mm_mul_epi32(v1, k__cospi_p16_p16);
+  u2 = _mm_mul_epi32(v2, k__cospi_p16_p16);
+  u3 = _mm_mul_epi32(v3, k__cospi_p16_p16);
+
+  s[0] = _mm_add_epi64(u0, u2);  // y10 y00
+  s[1] = _mm_add_epi64(u1, u3);  // y30 y20
+
+  u2 = _mm_mul_epi32(v2, k__cospi_m16_m16);
+  u3 = _mm_mul_epi32(v3, k__cospi_m16_m16);
+
+  s[2] = _mm_add_epi64(u0, u2);  // y12 y02
+  s[3] = _mm_add_epi64(u1, u3);  // y32 y22
+
+  u0 = _mm_mul_epi32(v6, k__cospi_p08_p08);
+  u1 = _mm_mul_epi32(v5, k__cospi_p24_p24);
+  u2 = _mm_mul_epi32(v4, k__cospi_p24_p24);
+  u3 = _mm_mul_epi32(v7, k__cospi_p08_p08);
+
+  s[4] = _mm_add_epi64(u0, u2);  // y11 y01
+  s[5] = _mm_add_epi64(u1, u3);  // y31 y21
+
+  u0 = _mm_mul_epi32(v4, k__cospi_m08_m08);
+  u1 = _mm_mul_epi32(v5, k__cospi_m08_m08);
+  u2 = _mm_mul_epi32(v6, k__cospi_p24_p24);
+  u3 = _mm_mul_epi32(v7, k__cospi_p24_p24);
+
+  s[6] = _mm_add_epi64(u0, u2);  // y13 y03
+  s[7] = _mm_add_epi64(u1, u3);  // y33 y23
+
+  s[0] = _mm_add_epi64(s[0], k__DCT_CONST_ROUNDING);
+  s[1] = _mm_add_epi64(s[1], k__DCT_CONST_ROUNDING);
+  s[2] = _mm_add_epi64(s[2], k__DCT_CONST_ROUNDING);
+  s[3] = _mm_add_epi64(s[3], k__DCT_CONST_ROUNDING);
+  s[4] = _mm_add_epi64(s[4], k__DCT_CONST_ROUNDING);
+  s[5] = _mm_add_epi64(s[5], k__DCT_CONST_ROUNDING);
+  s[6] = _mm_add_epi64(s[6], k__DCT_CONST_ROUNDING);
+  s[7] = _mm_add_epi64(s[7], k__DCT_CONST_ROUNDING);
+
+  s[0] = _mm_srli_epi64(s[0], DCT_CONST_BITS);
+  s[1] = _mm_srli_epi64(s[1], DCT_CONST_BITS);
+  s[2] = _mm_srli_epi64(s[2], DCT_CONST_BITS);
+  s[3] = _mm_srli_epi64(s[3], DCT_CONST_BITS);
+  s[4] = _mm_srli_epi64(s[4], DCT_CONST_BITS);
+  s[5] = _mm_srli_epi64(s[5], DCT_CONST_BITS);
+  s[6] = _mm_srli_epi64(s[6], DCT_CONST_BITS);
+  s[7] = _mm_srli_epi64(s[7], DCT_CONST_BITS);
+
+  s[0] = _mm_shuffle_epi32(s[0], 0x88);
+  s[1] = _mm_shuffle_epi32(s[1], 0x88);
+  s[2] = _mm_shuffle_epi32(s[2], 0x88);
+  s[3] = _mm_shuffle_epi32(s[3], 0x88);
+  s[4] = _mm_shuffle_epi32(s[4], 0x88);
+  s[5] = _mm_shuffle_epi32(s[5], 0x88);
+  s[6] = _mm_shuffle_epi32(s[6], 0x88);
+  s[7] = _mm_shuffle_epi32(s[7], 0x88);
+
+  v0 = _mm_unpacklo_epi32(s[0], s[4]);
+  v1 = _mm_unpacklo_epi32(s[2], s[6]);
+  v2 = _mm_unpacklo_epi32(s[1], s[5]);
+  v3 = _mm_unpacklo_epi32(s[3], s[7]);
+
+  in[0] = _mm_unpacklo_epi64(v0, v1);
+  in[1] = _mm_unpackhi_epi64(v0, v1);
+  in[2] = _mm_unpacklo_epi64(v2, v3);
+  in[3] = _mm_unpackhi_epi64(v2, v3);
+}
+
+static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
+  const __m128i kOne = _mm_set1_epi32(1);
+  res[0] = _mm_add_epi32(res[0], kOne);
+  res[1] = _mm_add_epi32(res[1], kOne);
+  res[2] = _mm_add_epi32(res[2], kOne);
+  res[3] = _mm_add_epi32(res[3], kOne);
+  res[0] = _mm_srai_epi32(res[0], 2);
+  res[1] = _mm_srai_epi32(res[1], 2);
+  res[2] = _mm_srai_epi32(res[2], 2);
+  res[3] = _mm_srai_epi32(res[3], 2);
+  _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
+  _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
+  _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
+  _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
+}
+
+void vp10_highbd_fht4x4_sse4_1(const int16_t *input, tran_low_t *output,
+                               int stride, int tx_type) {
+  __m128i in[4];
+
+  switch (tx_type) {
+    case DCT_DCT:
+      load_buffer_4x4(input, in, stride, 0, 0);
+      fdct4x4_sse4_1(in);
+      fdct4x4_sse4_1(in);
+      write_buffer_4x4(output, in);
+      break;
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_highbd_fht4x4_c(input, output, stride, tx_type);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_highbd_fht4x4_c(input, output, stride, tx_type);
+      break;
+    case DST_DST:
+    case DCT_DST:
+    case DST_DCT:
+    case DST_ADST:
+    case ADST_DST:
+    case DST_FLIPADST:
+    case FLIPADST_DST:
+      vp10_highbd_fht4x4_c(input, output, stride, tx_type);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
+  }
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk
@ -112,6 +112,7 @@ endif

 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.c
 VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c
+VP10_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_fwd_txfm_sse4.c

 ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoiser_sse2.c