Factor out x86 SIMD intrinsic synonyms

Change-Id: Idc4ac3ccd2ba19087cdb74a3e4a6774ac50386aa
Geza Lore 2016-06-01 13:53:16 +01:00
Parent 73bc3119be
Commit 9ebca46933
2 changed files with 149 additions and 111 deletions
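The commit moves the file-local mm_*() load/store and rounding helpers out of the SSE4.1 blend code and into a new shared header, vpx_dsp/x86/synonyms.h, renaming them with an xx_ prefix (xx_ for 128-bit XMM operations, yy_ reserved for 256-bit YMM ones). A minimal sketch of how the shared synonyms are meant to be used after this change; copy4() and its arguments are hypothetical, only the xx_ helpers come from the new header:

#include "vpx_dsp/x86/synonyms.h"

// Hypothetical user of the shared synonyms: copy four bytes through an XMM
// register without spelling out the usual pointer casts.
static INLINE void copy4(uint8_t *dst, const uint8_t *src) {
  const __m128i v = xx_loadl_32(src);  // was the local mm_loadl_32()
  xx_storel_32(dst, v);                // was the local mm_storel_32()
}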


@ -16,51 +16,20 @@
#include "vpx_ports/mem.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/x86/synonyms.h"
#include "./vpx_dsp_rtcd.h"
#define MASK_BITS 6
static INLINE __m128i mm_loadl_32(const void *a) {
return _mm_cvtsi32_si128(*(const uint32_t*)a);
}
static INLINE __m128i mm_loadl_64(const void *a) {
return _mm_loadl_epi64((const __m128i*)a);
}
static INLINE __m128i mm_loadu_128(const void *a) {
return _mm_loadu_si128((const __m128i*)a);
}
static INLINE void mm_storel_32(void *const a, const __m128i v) {
*(uint32_t*)a = _mm_cvtsi128_si32(v);
}
static INLINE void mm_storel_64(void *const a, const __m128i v) {
_mm_storel_epi64((__m128i*)a, v);
}
static INLINE void mm_storeu_128(void *const a, const __m128i v) {
_mm_storeu_si128((__m128i*)a, v);
}
static INLINE __m128i mm_round_epu16(__m128i v_val_w) {
return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
}
static INLINE __m128i mm_roundn_epu16(__m128i v_val_w, int bits) {
const __m128i v_s_w =_mm_srli_epi16(v_val_w, bits-1);
return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
}
//////////////////////////////////////////////////////////////////////////////
// Common kernels
//////////////////////////////////////////////////////////////////////////////
static INLINE __m128i blend_4(uint8_t*src0, uint8_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
const __m128i v_s0_b = mm_loadl_32(src0);
const __m128i v_s1_b = mm_loadl_32(src1);
const __m128i v_s0_b = xx_loadl_32(src0);
const __m128i v_s1_b = xx_loadl_32(src1);
const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
@ -69,15 +38,15 @@ static INLINE __m128i blend_4(uint8_t*src0, uint8_t *src1,
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
return v_res_w;
}
static INLINE __m128i blend_8(uint8_t*src0, uint8_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
const __m128i v_s0_b = mm_loadl_64(src0);
const __m128i v_s1_b = mm_loadl_64(src1);
const __m128i v_s0_b = xx_loadl_64(src0);
const __m128i v_s1_b = xx_loadl_64(src1);
const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
@ -86,7 +55,7 @@ static INLINE __m128i blend_8(uint8_t*src0, uint8_t *src1,
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
return v_res_w;
}
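Both kernels above compute the same weighted average per pixel: the callers set v_maxval_w to 1 << MASK_BITS and derive v_m1_w = maxval - v_m0_w, so m0 + m1 = 64 and each output byte is (src0 * m0 + src1 * m1 + 32) >> 6. A scalar model of what blend_4()/blend_8() produce for one pixel (the function name here is illustrative, not part of the change):

// Scalar reference for the SSE4.1 blend kernels. Assumes m0 + m1 == 64
// (1 << MASK_BITS), so the 16-bit intermediate cannot overflow.
static uint8_t blend_pixel(uint8_t s0, uint8_t s1, int m0, int m1) {
  const int sum = s0 * m0 + s1 * m1;  // at most 255 * 64 = 16320
  return (uint8_t)((sum + 32) >> 6);  // same result as xx_roundn_epu16(sum, MASK_BITS)
}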
@ -106,7 +75,7 @@ static void blend_mask6_w4_sse4_1(
(void)w;
do {
const __m128i v_m0_b = mm_loadl_32(mask);
const __m128i v_m0_b = xx_loadl_32(mask);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
@ -114,7 +83,7 @@ static void blend_mask6_w4_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
mm_storel_32(dst, v_res_b);
xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@ -134,7 +103,7 @@ static void blend_mask6_w8_sse4_1(
(void)w;
do {
const __m128i v_m0_b = mm_loadl_64(mask);
const __m128i v_m0_b = xx_loadl_64(mask);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
@ -142,7 +111,7 @@ static void blend_mask6_w8_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
mm_storel_64(dst, v_res_b);
xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@ -162,8 +131,8 @@ static void blend_mask6_w16n_sse4_1(
do {
int c;
for (c = 0; c < w; c += 16) {
const __m128i v_m0l_b = mm_loadl_64(mask + c);
const __m128i v_m0h_b = mm_loadl_64(mask + c + 8);
const __m128i v_m0l_b = xx_loadl_64(mask + c);
const __m128i v_m0h_b = xx_loadl_64(mask + c + 8);
const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b);
const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b);
const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
@ -176,7 +145,7 @@ static void blend_mask6_w16n_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
mm_storeu_128(dst + c, v_res_b);
xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
@ -202,7 +171,7 @@ static void blend_mask6_sx_w4_sse4_1(
(void)w;
do {
const __m128i v_r_b = mm_loadl_64(mask);
const __m128i v_r_b = xx_loadl_64(mask);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
@ -212,7 +181,7 @@ static void blend_mask6_sx_w4_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
mm_storel_32(dst, v_res_b);
xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
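In the sx (horizontally subsampled mask) variants the mask row holds two entries per output pixel, so adjacent bytes are first averaged: _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)) pairs each byte with its right-hand neighbour, and the AND with v_zmask_b (set up in lines elided from this hunk, presumably a 0x00ff-per-16-bit-lane mask) keeps the even positions as 16-bit weights. A scalar model of that downsampling step, with illustrative names:

// Scalar model: 2:1 horizontal downsample of the mask with rounding.
// _mm_avg_epu8 computes (a + b + 1) >> 1 per byte.
static void downsample_mask_x(uint16_t *m_out, const uint8_t *m_in, int w_out) {
  int i;
  for (i = 0; i < w_out; ++i)
    m_out[i] = (uint16_t)((m_in[2 * i] + m_in[2 * i + 1] + 1) >> 1);
}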
@ -234,7 +203,7 @@ static void blend_mask6_sx_w8_sse4_1(
(void)w;
do {
const __m128i v_r_b = mm_loadu_128(mask);
const __m128i v_r_b = xx_loadu_128(mask);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
@ -244,7 +213,7 @@ static void blend_mask6_sx_w8_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
mm_storel_64(dst, v_res_b);
xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@ -266,8 +235,8 @@ static void blend_mask6_sx_w16n_sse4_1(
do {
int c;
for (c = 0; c < w; c += 16) {
const __m128i v_rl_b = mm_loadu_128(mask + 2 * c);
const __m128i v_rh_b = mm_loadu_128(mask + 2 * c + 16);
const __m128i v_rl_b = xx_loadu_128(mask + 2 * c);
const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16);
const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1));
const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1));
@ -283,7 +252,7 @@ static void blend_mask6_sx_w16n_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
mm_storeu_128(dst + c, v_res_b);
xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
@ -307,8 +276,8 @@ static void blend_mask6_sy_w4_sse4_1(
(void)w;
do {
const __m128i v_ra_b = mm_loadl_32(mask);
const __m128i v_rb_b = mm_loadl_32(mask + mask_stride);
const __m128i v_ra_b = xx_loadl_32(mask);
const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
@ -318,7 +287,7 @@ static void blend_mask6_sy_w4_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
mm_storel_32(dst, v_res_b);
xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@ -338,8 +307,8 @@ static void blend_mask6_sy_w8_sse4_1(
(void)w;
do {
const __m128i v_ra_b = mm_loadl_64(mask);
const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
const __m128i v_ra_b = xx_loadl_64(mask);
const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
@ -349,7 +318,7 @@ static void blend_mask6_sy_w8_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
mm_storel_64(dst, v_res_b);
xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@ -370,8 +339,8 @@ static void blend_mask6_sy_w16n_sse4_1(
do {
int c;
for (c = 0; c < w; c += 16) {
const __m128i v_ra_b = mm_loadu_128(mask + c);
const __m128i v_rb_b = mm_loadu_128(mask + c + mask_stride);
const __m128i v_ra_b = xx_loadu_128(mask + c);
const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b);
@ -386,7 +355,7 @@ static void blend_mask6_sy_w16n_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
mm_storeu_128(dst + c, v_res_b);
xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
@ -412,22 +381,22 @@ static void blend_mask6_sx_sy_w4_sse4_1(
(void)w;
do {
const __m128i v_ra_b = mm_loadl_64(mask);
const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
const __m128i v_ra_b = xx_loadl_64(mask);
const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
mm_storel_32(dst, v_res_b);
xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
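The sx_sy variants collapse a 2x2 block of mask samples into one weight: the two rows are summed byte-wise (safe in 8 bits, since these 6-bit masks do not exceed 64 and 64 + 64 = 128), the even and odd byte lanes of that sum are added as 16-bit values, and xx_roundn_epu16(..., 2) divides by 4 with rounding. A scalar model of that mask reduction, names illustrative:

// Scalar model of the 2x2 mask downsample in the sx_sy kernels.
// xx_roundn_epu16(v, 2) is equivalent to (v + 2) >> 2 for 16-bit v.
static uint16_t downsample_mask_xy(const uint8_t *row_a, const uint8_t *row_b,
                                   int i) {
  const int s = row_a[2 * i] + row_a[2 * i + 1] +
                row_b[2 * i] + row_b[2 * i + 1];
  return (uint16_t)((s + 2) >> 2);
}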
@ -449,22 +418,22 @@ static void blend_mask6_sx_sy_w8_sse4_1(
(void)w;
do {
const __m128i v_ra_b = mm_loadu_128(mask);
const __m128i v_rb_b = mm_loadu_128(mask + mask_stride);
const __m128i v_ra_b = xx_loadu_128(mask);
const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
mm_storel_64(dst, v_res_b);
xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@ -486,10 +455,10 @@ static void blend_mask6_sx_sy_w16n_sse4_1(
do {
int c;
for (c = 0; c < w; c += 16) {
const __m128i v_ral_b = mm_loadu_128(mask + 2 * c);
const __m128i v_rah_b = mm_loadu_128(mask + 2 * c + 16);
const __m128i v_rbl_b = mm_loadu_128(mask + mask_stride + 2 * c);
const __m128i v_rbh_b = mm_loadu_128(mask + mask_stride + 2 * c + 16);
const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
@ -501,8 +470,8 @@ static void blend_mask6_sx_sy_w16n_sse4_1(
const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
const __m128i v_m0l_w = mm_roundn_epu16(v_rsl_w, 2);
const __m128i v_m0h_w = mm_roundn_epu16(v_rsh_w, 2);
const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
@ -513,7 +482,7 @@ static void blend_mask6_sx_sy_w16n_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
mm_storeu_128(dst + c, v_res_b);
xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
@ -575,38 +544,38 @@ typedef __m128i (*blend_unit_fn)(uint16_t*src0, uint16_t *src1,
static INLINE __m128i blend_4_b10(uint16_t*src0, uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
const __m128i v_s0_w = mm_loadl_64(src0);
const __m128i v_s1_w = mm_loadl_64(src1);
const __m128i v_s0_w = xx_loadl_64(src0);
const __m128i v_s1_w = xx_loadl_64(src1);
const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
return v_res_w;
}
static INLINE __m128i blend_8_b10(uint16_t*src0, uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
const __m128i v_s0_w = mm_loadu_128(src0);
const __m128i v_s1_w = mm_loadu_128(src1);
const __m128i v_s0_w = xx_loadu_128(src0);
const __m128i v_s1_w = xx_loadu_128(src1);
const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
return v_res_w;
}
static INLINE __m128i blend_4_b12(uint16_t*src0, uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
const __m128i v_s0_w = mm_loadl_64(src0);
const __m128i v_s1_w = mm_loadl_64(src1);
const __m128i v_s0_w = xx_loadl_64(src0);
const __m128i v_s1_w = xx_loadl_64(src1);
// Interleave
const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
@ -622,15 +591,15 @@ static INLINE __m128i blend_4_b12(uint16_t*src0, uint16_t *src1,
const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
// Round
const __m128i v_res_w = mm_round_epu16(v_pssum_d);
const __m128i v_res_w = xx_round_epu16(v_pssum_d);
return v_res_w;
}
static INLINE __m128i blend_8_b12(uint16_t*src0, uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
const __m128i v_s0_w = mm_loadu_128(src0);
const __m128i v_s1_w = mm_loadu_128(src1);
const __m128i v_s0_w = xx_loadu_128(src0);
const __m128i v_s1_w = xx_loadu_128(src1);
// Interleave
const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
@ -650,7 +619,7 @@ static INLINE __m128i blend_8_b12(uint16_t*src0, uint16_t *src1,
const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
// Round
const __m128i v_res_w = mm_round_epu16(v_pssum_d);
const __m128i v_res_w = xx_round_epu16(v_pssum_d);
return v_res_w;
}
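The _b12 kernels cannot reuse the plain 16-bit multiply of the _b10 path: with 12-bit samples a single product needs up to 18 bits (4095 * 64 = 262080), so mask and source words are interleaved and the products are accumulated in 32-bit lanes (the multiply-add itself sits in the part of this hunk not shown) before being packed back to 16 bits and rounded. A scalar illustration of the width requirement; the function is a model, not code from the diff:

// Why the 12-bit path needs 32-bit accumulation while the 10-bit path does not.
// With m0 + m1 == 64 the weighted sum is bounded by max_sample * 64:
//   10-bit: 1023 * 64 =  65472  -> still fits an unsigned 16-bit lane
//   12-bit: 4095 * 64 = 262080  -> overflows 16 bits, needs 32-bit lanes
static uint16_t blend_pixel_hbd(uint16_t s0, uint16_t s1, int m0, int m1) {
  const uint32_t sum = (uint32_t)s0 * m0 + (uint32_t)s1 * m1;
  return (uint16_t)((sum + 32) >> 6);  // round by MASK_BITS == 6
}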
@ -668,13 +637,13 @@ static INLINE void blend_mask6_bn_w4_sse4_1(
const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
do {
const __m128i v_m0_b = mm_loadl_32(mask);
const __m128i v_m0_b = xx_loadl_32(mask);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
mm_storel_64(dst, v_res_w);
xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
@ -718,13 +687,13 @@ static inline void blend_mask6_bn_w8n_sse4_1(
do {
int c;
for (c = 0; c < w; c += 8) {
const __m128i v_m0_b = mm_loadl_64(mask + c);
const __m128i v_m0_b = xx_loadl_64(mask + c);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
mm_storeu_128(dst + c, v_res_w);
xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
@ -770,7 +739,7 @@ static INLINE void blend_mask6_bn_sx_w4_sse4_1(
const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
do {
const __m128i v_r_b = mm_loadl_64(mask);
const __m128i v_r_b = xx_loadl_64(mask);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
@ -778,7 +747,7 @@ static INLINE void blend_mask6_bn_sx_w4_sse4_1(
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
mm_storel_64(dst, v_res_w);
xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
@ -824,7 +793,7 @@ static INLINE void blend_mask6_bn_sx_w8n_sse4_1(
do {
int c;
for (c = 0; c < w; c += 8) {
const __m128i v_r_b = mm_loadu_128(mask + 2 * c);
const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
@ -832,7 +801,7 @@ static INLINE void blend_mask6_bn_sx_w8n_sse4_1(
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
mm_storeu_128(dst + c, v_res_w);
xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
@ -876,8 +845,8 @@ static INLINE void blend_mask6_bn_sy_w4_sse4_1(
const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
do {
const __m128i v_ra_b = mm_loadl_32(mask);
const __m128i v_rb_b = mm_loadl_32(mask + mask_stride);
const __m128i v_ra_b = xx_loadl_32(mask);
const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
@ -885,7 +854,7 @@ static INLINE void blend_mask6_bn_sy_w4_sse4_1(
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
mm_storel_64(dst, v_res_w);
xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
@ -929,8 +898,8 @@ static INLINE void blend_mask6_bn_sy_w8n_sse4_1(
do {
int c;
for (c = 0; c < w; c += 8) {
const __m128i v_ra_b = mm_loadl_64(mask + c);
const __m128i v_rb_b = mm_loadl_64(mask + c + mask_stride);
const __m128i v_ra_b = xx_loadl_64(mask + c);
const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
@ -938,7 +907,7 @@ static INLINE void blend_mask6_bn_sy_w8n_sse4_1(
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
mm_storeu_128(dst + c, v_res_w);
xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
@ -984,20 +953,20 @@ static INLINE void blend_mask6_bn_sx_sy_w4_sse4_1(
const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
do {
const __m128i v_ra_b = mm_loadl_64(mask);
const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
const __m128i v_ra_b = xx_loadl_64(mask);
const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
mm_storel_64(dst, v_res_w);
xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
@ -1043,20 +1012,20 @@ static INLINE void blend_mask6_bn_sx_sy_w8n_sse4_1(
do {
int c;
for (c = 0; c < w; c += 8) {
const __m128i v_ra_b = mm_loadu_128(mask + 2 * c);
const __m128i v_rb_b = mm_loadu_128(mask + 2 * c +mask_stride);
const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
mm_storeu_128(dst + c, v_res_w);
xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;

vpx_dsp/x86/synonyms.h (new file, 69 lines)

@ -0,0 +1,69 @@
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_DSP_X86_SYNONYMS_H_
#define VPX_DSP_X86_SYNONYMS_H_
#include <immintrin.h>
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
/**
* Various reusable shorthands for x86 SIMD intrinsics.
*
* Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
* Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
*/
// Loads and stores to do away with the tedium of casting the address
// to the right type.
static INLINE __m128i xx_loadl_32(const void *a) {
return _mm_cvtsi32_si128(*(const uint32_t*)a);
}
static INLINE __m128i xx_loadl_64(const void *a) {
return _mm_loadl_epi64((const __m128i*)a);
}
static INLINE __m128i xx_load_128(const void *a) {
return _mm_load_si128((const __m128i*)a);
}
static INLINE __m128i xx_loadu_128(const void *a) {
return _mm_loadu_si128((const __m128i*)a);
}
static INLINE void xx_storel_32(void *const a, const __m128i v) {
*(uint32_t*)a = _mm_cvtsi128_si32(v);
}
static INLINE void xx_storel_64(void *const a, const __m128i v) {
_mm_storel_epi64((__m128i*)a, v);
}
static INLINE void xx_store_128(void *const a, const __m128i v) {
_mm_store_si128((__m128i*)a, v);
}
static INLINE void xx_storeu_128(void *const a, const __m128i v) {
_mm_storeu_si128((__m128i*)a, v);
}
static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
}
static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
}
#endif  // VPX_DSP_X86_SYNONYMS_H_
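A note on the two rounding synonyms: _mm_avg_epu16 returns (a + b + 1) >> 1, so averaging with zero rounds a halved value to nearest. xx_roundn_epu16(v, bits) is therefore equivalent to the usual (v + (1 << (bits - 1))) >> bits, while also avoiding any 16-bit overflow from adding the rounding constant up front. A standalone scalar check of that equivalence (illustrative test code, not part of the header):

#include <assert.h>
#include <stdint.h>

// Scalar models of the header's rounding helpers.
static uint16_t round_epu16(uint16_t v) {             /* xx_round_epu16 */
  return (uint16_t)((v + 1) >> 1);                    /* avg with zero */
}
static uint16_t roundn_epu16(uint16_t v, int bits) {  /* xx_roundn_epu16 */
  return (uint16_t)(((v >> (bits - 1)) + 1) >> 1);    /* srli, then avg with zero */
}

int main(void) {
  uint32_t v;
  for (v = 0; v <= 0xffff; ++v) {
    int bits;
    assert(round_epu16((uint16_t)v) == ((v + 1) >> 1));
    for (bits = 1; bits <= 15; ++bits) {
      // Reference form, computed in 32 bits to sidestep the overflow that the
      // shift-then-average form avoids by construction.
      assert(roundn_epu16((uint16_t)v, bits) ==
             ((v + (1u << (bits - 1))) >> bits));
    }
  }
  return 0;
}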