Factor out x86 SIMD intrinsic synonyms

Change-Id: Idc4ac3ccd2ba19087cdb74a3e4a6774ac50386aa
Geza Lore 2016-06-01 13:53:16 +01:00
Parent 73bc3119be
Commit 9ebca46933
2 changed files with 149 additions and 111 deletions
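The commit moves the file-local mm_*() load/store and rounding helpers out of the SSE4.1 blend code and into a new shared header, vpx_dsp/x86/synonyms.h, renaming them with an xx_ prefix (xx_ for 128-bit XMM operations, yy_ reserved for 256-bit YMM ones). A minimal sketch of how the shared synonyms are meant to be used after this change; copy4() and its arguments are hypothetical, only the xx_ helpers come from the new header:

#include "vpx_dsp/x86/synonyms.h"

// Hypothetical user of the shared synonyms: copy four bytes through an XMM
// register without spelling out the usual pointer casts.
static INLINE void copy4(uint8_t *dst, const uint8_t *src) {
  const __m128i v = xx_loadl_32(src);  // was the local mm_loadl_32()
  xx_storel_32(dst, v);                // was the local mm_storel_32()
}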


@ -16,51 +16,20 @@
#include "vpx_ports/mem.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/x86/synonyms.h"
#include "./vpx_dsp_rtcd.h"
#define MASK_BITS 6
static INLINE __m128i mm_loadl_32(const void *a) {
return _mm_cvtsi32_si128(*(const uint32_t*)a);
}
static INLINE __m128i mm_loadl_64(const void *a) {
return _mm_loadl_epi64((const __m128i*)a);
}
static INLINE __m128i mm_loadu_128(const void *a) {
return _mm_loadu_si128((const __m128i*)a);
}
static INLINE void mm_storel_32(void *const a, const __m128i v) {
*(uint32_t*)a = _mm_cvtsi128_si32(v);
}
static INLINE void mm_storel_64(void *const a, const __m128i v) {
_mm_storel_epi64((__m128i*)a, v);
}
static INLINE void mm_storeu_128(void *const a, const __m128i v) {
_mm_storeu_si128((__m128i*)a, v);
}
static INLINE __m128i mm_round_epu16(__m128i v_val_w) {
return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
}
static INLINE __m128i mm_roundn_epu16(__m128i v_val_w, int bits) {
const __m128i v_s_w =_mm_srli_epi16(v_val_w, bits-1);
return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
}
//////////////////////////////////////////////////////////////////////////////
// Common kernels
//////////////////////////////////////////////////////////////////////////////
static INLINE __m128i blend_4(uint8_t*src0, uint8_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
const __m128i v_s0_b = mm_loadl_32(src0);
const __m128i v_s1_b = mm_loadl_32(src1);
const __m128i v_s0_b = xx_loadl_32(src0);
const __m128i v_s1_b = xx_loadl_32(src1);
const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
@ -69,15 +38,15 @@ static INLINE __m128i blend_4(uint8_t*src0, uint8_t *src1,
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
return v_res_w;
}
static INLINE __m128i blend_8(uint8_t*src0, uint8_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
const __m128i v_s0_b = mm_loadl_64(src0);
const __m128i v_s1_b = mm_loadl_64(src1);
const __m128i v_s0_b = xx_loadl_64(src0);
const __m128i v_s1_b = xx_loadl_64(src1);
const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
@ -86,7 +55,7 @@ static INLINE __m128i blend_8(uint8_t*src0, uint8_t *src1,
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
return v_res_w;
}
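Both kernels above compute the same weighted average per pixel: the callers set v_maxval_w to 1 << MASK_BITS and derive v_m1_w = maxval - v_m0_w, so m0 + m1 = 64 and each output byte is (src0 * m0 + src1 * m1 + 32) >> 6. A scalar model of what blend_4()/blend_8() produce for one pixel (the function name here is illustrative, not part of the change):

// Scalar reference for the SSE4.1 blend kernels. Assumes m0 + m1 == 64
// (1 << MASK_BITS), so the 16-bit intermediate cannot overflow.
static uint8_t blend_pixel(uint8_t s0, uint8_t s1, int m0, int m1) {
  const int sum = s0 * m0 + s1 * m1;  // at most 255 * 64 = 16320
  return (uint8_t)((sum + 32) >> 6);  // same result as xx_roundn_epu16(sum, MASK_BITS)
}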
@ -106,7 +75,7 @@ static void blend_mask6_w4_sse4_1(
(void)w;
do {
const __m128i v_m0_b = mm_loadl_32(mask);
const __m128i v_m0_b = xx_loadl_32(mask);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
@ -114,7 +83,7 @@ static void blend_mask6_w4_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
mm_storel_32(dst, v_res_b);
xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@ -134,7 +103,7 @@ static void blend_mask6_w8_sse4_1(
(void)w;
do {
const __m128i v_m0_b = mm_loadl_64(mask);
const __m128i v_m0_b = xx_loadl_64(mask);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
@ -142,7 +111,7 @@ static void blend_mask6_w8_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
mm_storel_64(dst, v_res_b);
xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@ -162,8 +131,8 @@ static void blend_mask6_w16n_sse4_1(
do {
int c;
for (c = 0; c < w; c += 16) {
const __m128i v_m0l_b = mm_loadl_64(mask + c);
const __m128i v_m0h_b = mm_loadl_64(mask + c + 8);
const __m128i v_m0l_b = xx_loadl_64(mask + c);
const __m128i v_m0h_b = xx_loadl_64(mask + c + 8);
const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b);
const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b);
const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
@ -176,7 +145,7 @@ static void blend_mask6_w16n_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
mm_storeu_128(dst + c, v_res_b);
xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
@ -202,7 +171,7 @@ static void blend_mask6_sx_w4_sse4_1(
(void)w;
do {
const __m128i v_r_b = mm_loadl_64(mask);
const __m128i v_r_b = xx_loadl_64(mask);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
@ -212,7 +181,7 @@ static void blend_mask6_sx_w4_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
mm_storel_32(dst, v_res_b);
xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
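In the sx (horizontally subsampled mask) variants the mask row holds two entries per output pixel, so adjacent bytes are first averaged: _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)) pairs each byte with its right-hand neighbour, and the AND with v_zmask_b (set up in lines elided from this hunk, presumably a 0x00ff-per-16-bit-lane mask) keeps the even positions as 16-bit weights. A scalar model of that downsampling step, with illustrative names:

// Scalar model: 2:1 horizontal downsample of the mask with rounding.
// _mm_avg_epu8 computes (a + b + 1) >> 1 per byte.
static void downsample_mask_x(uint16_t *m_out, const uint8_t *m_in, int w_out) {
  int i;
  for (i = 0; i < w_out; ++i)
    m_out[i] = (uint16_t)((m_in[2 * i] + m_in[2 * i + 1] + 1) >> 1);
}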
@ -234,7 +203,7 @@ static void blend_mask6_sx_w8_sse4_1(
(void)w;
do {
const __m128i v_r_b = mm_loadu_128(mask);
const __m128i v_r_b = xx_loadu_128(mask);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
@ -244,7 +213,7 @@ static void blend_mask6_sx_w8_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
mm_storel_64(dst, v_res_b);
xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@ -266,8 +235,8 @@ static void blend_mask6_sx_w16n_sse4_1(
do {
int c;
for (c = 0; c < w; c += 16) {
const __m128i v_rl_b = mm_loadu_128(mask + 2 * c);
const __m128i v_rh_b = mm_loadu_128(mask + 2 * c + 16);
const __m128i v_rl_b = xx_loadu_128(mask + 2 * c);
const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16);
const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1));
const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1));
@ -283,7 +252,7 @@ static void blend_mask6_sx_w16n_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
mm_storeu_128(dst + c, v_res_b);
xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
@ -307,8 +276,8 @@ static void blend_mask6_sy_w4_sse4_1(
(void)w;
do {
const __m128i v_ra_b = mm_loadl_32(mask);
const __m128i v_rb_b = mm_loadl_32(mask + mask_stride);
const __m128i v_ra_b = xx_loadl_32(mask);
const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
@ -318,7 +287,7 @@ static void blend_mask6_sy_w4_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
mm_storel_32(dst, v_res_b);
xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@ -338,8 +307,8 @@ static void blend_mask6_sy_w8_sse4_1(
(void)w;
do {
const __m128i v_ra_b = mm_loadl_64(mask);
const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
const __m128i v_ra_b = xx_loadl_64(mask);
const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
@ -349,7 +318,7 @@ static void blend_mask6_sy_w8_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
mm_storel_64(dst, v_res_b);
xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@ -370,8 +339,8 @@ static void blend_mask6_sy_w16n_sse4_1(
do {
int c;
for (c = 0; c < w; c += 16) {
const __m128i v_ra_b = mm_loadu_128(mask + c);
const __m128i v_rb_b = mm_loadu_128(mask + c + mask_stride);
const __m128i v_ra_b = xx_loadu_128(mask + c);
const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b);
@ -386,7 +355,7 @@ static void blend_mask6_sy_w16n_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
mm_storeu_128(dst + c, v_res_b);
xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
@ -412,22 +381,22 @@ static void blend_mask6_sx_sy_w4_sse4_1(
(void)w;
do {
const __m128i v_ra_b = mm_loadl_64(mask);
const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
const __m128i v_ra_b = xx_loadl_64(mask);
const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
mm_storel_32(dst, v_res_b);
xx_storel_32(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
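The sx_sy variants collapse a 2x2 block of mask samples into one weight: the two rows are summed byte-wise (safe in 8 bits, since these 6-bit masks do not exceed 64 and 64 + 64 = 128), the even and odd byte lanes of that sum are added as 16-bit values, and xx_roundn_epu16(..., 2) divides by 4 with rounding. A scalar model of that mask reduction, names illustrative:

// Scalar model of the 2x2 mask downsample in the sx_sy kernels.
// xx_roundn_epu16(v, 2) is equivalent to (v + 2) >> 2 for 16-bit v.
static uint16_t downsample_mask_xy(const uint8_t *row_a, const uint8_t *row_b,
                                   int i) {
  const int s = row_a[2 * i] + row_a[2 * i + 1] +
                row_b[2 * i] + row_b[2 * i + 1];
  return (uint16_t)((s + 2) >> 2);
}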
@ -449,22 +418,22 @@ static void blend_mask6_sx_sy_w8_sse4_1(
(void)w;
do {
const __m128i v_ra_b = mm_loadu_128(mask);
const __m128i v_rb_b = mm_loadu_128(mask + mask_stride);
const __m128i v_ra_b = xx_loadu_128(mask);
const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
mm_storel_64(dst, v_res_b);
xx_storel_64(dst, v_res_b);
dst += dst_stride;
src0 += src0_stride;
@ -486,10 +455,10 @@ static void blend_mask6_sx_sy_w16n_sse4_1(
do {
int c;
for (c = 0; c < w; c += 16) {
const __m128i v_ral_b = mm_loadu_128(mask + 2 * c);
const __m128i v_rah_b = mm_loadu_128(mask + 2 * c + 16);
const __m128i v_rbl_b = mm_loadu_128(mask + mask_stride + 2 * c);
const __m128i v_rbh_b = mm_loadu_128(mask + mask_stride + 2 * c + 16);
const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
@ -501,8 +470,8 @@ static void blend_mask6_sx_sy_w16n_sse4_1(
const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
const __m128i v_m0l_w = mm_roundn_epu16(v_rsl_w, 2);
const __m128i v_m0h_w = mm_roundn_epu16(v_rsh_w, 2);
const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
@ -513,7 +482,7 @@ static void blend_mask6_sx_sy_w16n_sse4_1(
const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
mm_storeu_128(dst + c, v_res_b);
xx_storeu_128(dst + c, v_res_b);
}
dst += dst_stride;
src0 += src0_stride;
@ -575,38 +544,38 @@ typedef __m128i (*blend_unit_fn)(uint16_t*src0, uint16_t *src1,
static INLINE __m128i blend_4_b10(uint16_t*src0, uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
const __m128i v_s0_w = mm_loadl_64(src0);
const __m128i v_s1_w = mm_loadl_64(src1);
const __m128i v_s0_w = xx_loadl_64(src0);
const __m128i v_s1_w = xx_loadl_64(src1);
const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
return v_res_w;
}
static INLINE __m128i blend_8_b10(uint16_t*src0, uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
const __m128i v_s0_w = mm_loadu_128(src0);
const __m128i v_s1_w = mm_loadu_128(src1);
const __m128i v_s0_w = xx_loadu_128(src0);
const __m128i v_s1_w = xx_loadu_128(src1);
const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
return v_res_w;
}
static INLINE __m128i blend_4_b12(uint16_t*src0, uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
const __m128i v_s0_w = mm_loadl_64(src0);
const __m128i v_s1_w = mm_loadl_64(src1);
const __m128i v_s0_w = xx_loadl_64(src0);
const __m128i v_s1_w = xx_loadl_64(src1);
// Interleave
const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
@ -622,15 +591,15 @@ static INLINE __m128i blend_4_b12(uint16_t*src0, uint16_t *src1,
const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
// Round
const __m128i v_res_w = mm_round_epu16(v_pssum_d);
const __m128i v_res_w = xx_round_epu16(v_pssum_d);
return v_res_w;
}
static INLINE __m128i blend_8_b12(uint16_t*src0, uint16_t *src1,
const __m128i v_m0_w, const __m128i v_m1_w) {
const __m128i v_s0_w = mm_loadu_128(src0);
const __m128i v_s1_w = mm_loadu_128(src1);
const __m128i v_s0_w = xx_loadu_128(src0);
const __m128i v_s1_w = xx_loadu_128(src1);
// Interleave
const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
@ -650,7 +619,7 @@ static INLINE __m128i blend_8_b12(uint16_t*src0, uint16_t *src1,
const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
// Round
const __m128i v_res_w = mm_round_epu16(v_pssum_d);
const __m128i v_res_w = xx_round_epu16(v_pssum_d);
return v_res_w;
}
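The _b12 kernels cannot reuse the plain 16-bit multiply of the _b10 path: with 12-bit samples a single product needs up to 18 bits (4095 * 64 = 262080), so mask and source words are interleaved and the products are accumulated in 32-bit lanes (the multiply-add itself sits in the part of this hunk not shown) before being packed back to 16 bits and rounded. A scalar illustration of the width requirement; the function is a model, not code from the diff:

// Why the 12-bit path needs 32-bit accumulation while the 10-bit path does not.
// With m0 + m1 == 64 the weighted sum is bounded by max_sample * 64:
//   10-bit: 1023 * 64 =  65472  -> still fits an unsigned 16-bit lane
//   12-bit: 4095 * 64 = 262080  -> overflows 16 bits, needs 32-bit lanes
static uint16_t blend_pixel_hbd(uint16_t s0, uint16_t s1, int m0, int m1) {
  const uint32_t sum = (uint32_t)s0 * m0 + (uint32_t)s1 * m1;
  return (uint16_t)((sum + 32) >> 6);  // round by MASK_BITS == 6
}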
@ -668,13 +637,13 @@ static INLINE void blend_mask6_bn_w4_sse4_1(
const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
do {
const __m128i v_m0_b = mm_loadl_32(mask);
const __m128i v_m0_b = xx_loadl_32(mask);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
mm_storel_64(dst, v_res_w);
xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
@ -718,13 +687,13 @@ static inline void blend_mask6_bn_w8n_sse4_1(
do {
int c;
for (c = 0; c < w; c += 8) {
const __m128i v_m0_b = mm_loadl_64(mask + c);
const __m128i v_m0_b = xx_loadl_64(mask + c);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
mm_storeu_128(dst + c, v_res_w);
xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
@ -770,7 +739,7 @@ static INLINE void blend_mask6_bn_sx_w4_sse4_1(
const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
do {
const __m128i v_r_b = mm_loadl_64(mask);
const __m128i v_r_b = xx_loadl_64(mask);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
@ -778,7 +747,7 @@ static INLINE void blend_mask6_bn_sx_w4_sse4_1(
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
mm_storel_64(dst, v_res_w);
xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
@ -824,7 +793,7 @@ static INLINE void blend_mask6_bn_sx_w8n_sse4_1(
do {
int c;
for (c = 0; c < w; c += 8) {
const __m128i v_r_b = mm_loadu_128(mask + 2 * c);
const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
@ -832,7 +801,7 @@ static INLINE void blend_mask6_bn_sx_w8n_sse4_1(
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
mm_storeu_128(dst + c, v_res_w);
xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
@ -876,8 +845,8 @@ static INLINE void blend_mask6_bn_sy_w4_sse4_1(
const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
do {
const __m128i v_ra_b = mm_loadl_32(mask);
const __m128i v_rb_b = mm_loadl_32(mask + mask_stride);
const __m128i v_ra_b = xx_loadl_32(mask);
const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
@ -885,7 +854,7 @@ static INLINE void blend_mask6_bn_sy_w4_sse4_1(
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
mm_storel_64(dst, v_res_w);
xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
@ -929,8 +898,8 @@ static INLINE void blend_mask6_bn_sy_w8n_sse4_1(
do {
int c;
for (c = 0; c < w; c += 8) {
const __m128i v_ra_b = mm_loadl_64(mask + c);
const __m128i v_rb_b = mm_loadl_64(mask + c + mask_stride);
const __m128i v_ra_b = xx_loadl_64(mask + c);
const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
@ -938,7 +907,7 @@ static INLINE void blend_mask6_bn_sy_w8n_sse4_1(
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
mm_storeu_128(dst + c, v_res_w);
xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;
@ -984,20 +953,20 @@ static INLINE void blend_mask6_bn_sx_sy_w4_sse4_1(
const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
do {
const __m128i v_ra_b = mm_loadl_64(mask);
const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
const __m128i v_ra_b = xx_loadl_64(mask);
const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
mm_storel_64(dst, v_res_w);
xx_storel_64(dst, v_res_w);
dst += dst_stride;
src0 += src0_stride;
@ -1043,20 +1012,20 @@ static INLINE void blend_mask6_bn_sx_sy_w8n_sse4_1(
do {
int c;
for (c = 0; c < w; c += 8) {
const __m128i v_ra_b = mm_loadu_128(mask + 2 * c);
const __m128i v_rb_b = mm_loadu_128(mask + 2 * c +mask_stride);
const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
v_zmask_b);
const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
mm_storeu_128(dst + c, v_res_w);
xx_storeu_128(dst + c, v_res_w);
}
dst += dst_stride;
src0 += src0_stride;

vpx_dsp/x86/synonyms.h (new file, 69 lines)

@ -0,0 +1,69 @@
/*
* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#ifndef VPX_DSP_X86_SYNONYMS_H_
#define VPX_DSP_X86_SYNONYMS_H_
#include <immintrin.h>
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
/**
* Various reusable shorthands for x86 SIMD intrinsics.
*
* Intrinsics prefixed with xx_ operate on or return 128bit XMM registers.
* Intrinsics prefixed with yy_ operate on or return 256bit YMM registers.
*/
// Loads and stores to do away with the tedium of casting the address
// to the right type.
static INLINE __m128i xx_loadl_32(const void *a) {
return _mm_cvtsi32_si128(*(const uint32_t*)a);
}
static INLINE __m128i xx_loadl_64(const void *a) {
return _mm_loadl_epi64((const __m128i*)a);
}
static INLINE __m128i xx_load_128(const void *a) {
return _mm_load_si128((const __m128i*)a);
}
static INLINE __m128i xx_loadu_128(const void *a) {
return _mm_loadu_si128((const __m128i*)a);
}
static INLINE void xx_storel_32(void *const a, const __m128i v) {
*(uint32_t*)a = _mm_cvtsi128_si32(v);
}
static INLINE void xx_storel_64(void *const a, const __m128i v) {
_mm_storel_epi64((__m128i*)a, v);
}
static INLINE void xx_store_128(void *const a, const __m128i v) {
_mm_store_si128((__m128i*)a, v);
}
static INLINE void xx_storeu_128(void *const a, const __m128i v) {
_mm_storeu_si128((__m128i*)a, v);
}
static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
}
static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
}
#endif  // VPX_DSP_X86_SYNONYMS_H_
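A note on the two rounding synonyms: _mm_avg_epu16 returns (a + b + 1) >> 1, so averaging with zero rounds a halved value to nearest. xx_roundn_epu16(v, bits) is therefore equivalent to the usual (v + (1 << (bits - 1))) >> bits, while also avoiding any 16-bit overflow from adding the rounding constant up front. A standalone scalar check of that equivalence (illustrative test code, not part of the header):

#include <assert.h>
#include <stdint.h>

// Scalar models of the header's rounding helpers.
static uint16_t round_epu16(uint16_t v) {             /* xx_round_epu16 */
  return (uint16_t)((v + 1) >> 1);                    /* avg with zero */
}
static uint16_t roundn_epu16(uint16_t v, int bits) {  /* xx_roundn_epu16 */
  return (uint16_t)(((v >> (bits - 1)) + 1) >> 1);    /* srli, then avg with zero */
}

int main(void) {
  uint32_t v;
  for (v = 0; v <= 0xffff; ++v) {
    int bits;
    assert(round_epu16((uint16_t)v) == ((v + 1) >> 1));
    for (bits = 1; bits <= 15; ++bits) {
      // Reference form, computed in 32 bits to sidestep the overflow that the
      // shift-then-average form avoids by construction.
      assert(roundn_epu16((uint16_t)v, bits) ==
             ((v + (1u << (bits - 1))) >> bits));
    }
  }
  return 0;
}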