Add newly necessary casts in SIMD compat headers (#14733)

https://github.com/llvm/llvm-project/commit/db7efcab7dd9 updated the
wasm_*_extract_lane intrinsics to be functions instead of macros, which means
their arguments must now be v128_t rather than any other 128-bit vector type
under -fno-lax-vector-conversions. Update the SIMD compat headers with more
casts to keep them compiling cleanly.
This commit is contained in:
Thomas Lively 2021-07-22 14:08:24 -07:00 committed by GitHub
Parent 54cfc2829e
Commit ecf79d7cee
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
5 changed files: 45 additions and 40 deletions

View file

@ -155,17 +155,23 @@ _mm_maskload_ps(const float *__mem_addr, __m128i __mask)
/* Store each f64 lane of __a to __mem_addr only where the sign bit of the
 * corresponding i64 mask lane is set.  The (v128_t) casts are required now
 * that wasm_f64x2_extract_lane is a function (not a macro) and rejects other
 * vector types under -fno-lax-vector-conversions.  Stale pre-cast duplicate
 * stores (diff residue) were removed. */
static __inline__ void __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_maskstore_pd(double *__mem_addr, __m128i __mask, __m128d __a)
{
  if ((wasm_i64x2_extract_lane(__mask, 0) & 0x8000000000000000ull) != 0)
    __mem_addr[0] = wasm_f64x2_extract_lane((v128_t)__a, 0);
  if ((wasm_i64x2_extract_lane(__mask, 1) & 0x8000000000000000ull) != 0)
    __mem_addr[1] = wasm_f64x2_extract_lane((v128_t)__a, 1);
}
/* Store each f32 lane of __a to __mem_addr only where the sign bit of the
 * corresponding i32 mask lane is set.  (v128_t) casts are required by the
 * function-form wasm_f32x4_extract_lane; stale pre-cast duplicate stores
 * (diff residue) were removed. */
static __inline__ void __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_maskstore_ps(float *__mem_addr, __m128i __mask, __m128 __a)
{
  if ((wasm_i32x4_extract_lane(__mask, 0) & 0x80000000ull) != 0)
    __mem_addr[0] = wasm_f32x4_extract_lane((v128_t)__a, 0);
  if ((wasm_i32x4_extract_lane(__mask, 1) & 0x80000000ull) != 0)
    __mem_addr[1] = wasm_f32x4_extract_lane((v128_t)__a, 1);
  if ((wasm_i32x4_extract_lane(__mask, 2) & 0x80000000ull) != 0)
    __mem_addr[2] = wasm_f32x4_extract_lane((v128_t)__a, 2);
  if ((wasm_i32x4_extract_lane(__mask, 3) & 0x80000000ull) != 0)
    __mem_addr[3] = wasm_f32x4_extract_lane((v128_t)__a, 3);
}
#define _mm_permute_pd(__a, __imm) __extension__ ({ \
@ -181,18 +187,17 @@ static __inline__ __m128d __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_pd(__m128d __a, __m128d __b)
{
  /* Select each result lane of __a using bit 1 of the corresponding i64
   * control lane in __b (SSE-style VPERMILPD semantics).  The diff residue
   * left wasm_f64x2_make with four arguments (old + new lines); only the
   * (v128_t)-cast pair is kept. */
  return (__m128d)wasm_f64x2_make(
      ((__f64x2)__a)[(wasm_i64x2_extract_lane((v128_t)__b, 0) >> 1) & 1],
      ((__f64x2)__a)[(wasm_i64x2_extract_lane((v128_t)__b, 1) >> 1) & 1]);
}
/* Select each result lane of __a using the low two bits of the corresponding
 * i32 control lane in __b (VPERMILPS semantics).  Stale uncast duplicate
 * return (diff residue) removed; casts required by function-form
 * wasm_i32x4_extract_lane. */
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))
_mm_permutevar_ps(__m128 __a, __m128 __b)
{
  return (__m128)wasm_f32x4_make(((__f32x4)__a)[wasm_i32x4_extract_lane((v128_t)__b, 0) & 3],
                                 ((__f32x4)__a)[wasm_i32x4_extract_lane((v128_t)__b, 1) & 3],
                                 ((__f32x4)__a)[wasm_i32x4_extract_lane((v128_t)__b, 2) & 3],
                                 ((__f32x4)__a)[wasm_i32x4_extract_lane((v128_t)__b, 3) & 3]);
}
static __inline__ int __attribute__((__always_inline__, __nodebug__))

View file

@ -289,73 +289,73 @@ _mm_cmpnge_sd(__m128d __a, __m128d __b)
/* Ordered lane-0 equality compare; stale uncast duplicate return (diff
 * residue) removed, keeping the (v128_t)-cast version. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comieq_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) == wasm_f64x2_extract_lane((v128_t)__b, 0);
}
/* Ordered lane-0 less-than compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comilt_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) < wasm_f64x2_extract_lane((v128_t)__b, 0);
}
/* Ordered lane-0 less-or-equal compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comile_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) <= wasm_f64x2_extract_lane((v128_t)__b, 0);
}
/* Ordered lane-0 greater-than compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comigt_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) > wasm_f64x2_extract_lane((v128_t)__b, 0);
}
/* Ordered lane-0 greater-or-equal compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comige_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) >= wasm_f64x2_extract_lane((v128_t)__b, 0);
}
/* Lane-0 not-equal compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_comineq_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) != wasm_f64x2_extract_lane((v128_t)__b, 0);
}
/* Unordered lane-0 equality compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomieq_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) == wasm_f64x2_extract_lane((v128_t)__b, 0);
}
/* Unordered lane-0 less-than compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomilt_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) < wasm_f64x2_extract_lane((v128_t)__b, 0);
}
/* Unordered lane-0 less-or-equal compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomile_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) <= wasm_f64x2_extract_lane((v128_t)__b, 0);
}
/* Unordered lane-0 greater-than compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomigt_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) > wasm_f64x2_extract_lane((v128_t)__b, 0);
}
/* Unordered lane-0 greater-or-equal compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomige_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) >= wasm_f64x2_extract_lane((v128_t)__b, 0);
}
/* Unordered lane-0 not-equal compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__))
_mm_ucomineq_sd(__m128d __a, __m128d __b)
{
  return wasm_f64x2_extract_lane((v128_t)__a, 0) != wasm_f64x2_extract_lane((v128_t)__b, 0);
}
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__))

View file

@ -262,8 +262,8 @@ _mm_max_epu32(__m128i __a, __m128i __b)
(((__imm8) & 4) ? 6 : 2), \
(((__imm8) & 8) ? 7 : 3)); })
/* Extract the selected 32-bit lane of __a as an int.  The diff residue left
 * both the old and new #define in place (a redefinition); only the
 * (v128_t)-cast version, required by the function-form
 * wasm_i32x4_extract_lane, is kept. */
#define _mm_extract_ps(__a, __imm8) \
  __extension__({ wasm_i32x4_extract_lane((v128_t)(__a), (__imm8)&3); })
/* Store lane N of the f32x4 vector X into the lvalue D (lane access via the
 * __f32x4 vector type needs no cast, so this macro was unaffected by the
 * extract-lane change). */
#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __f32x4 __a = (__f32x4)(X); \
(D) = __a[N]; }))

View file

@ -515,73 +515,73 @@ _mm_cmpnlt_ss(__m128 __a, __m128 __b)
/* Ordered lane-0 equality compare; stale uncast duplicate return (diff
 * residue) removed, keeping the (v128_t)-cast version. */
static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_comieq_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) == wasm_f32x4_extract_lane((v128_t)__b, 0);
}
/* Ordered lane-0 greater-or-equal compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_comige_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) >= wasm_f32x4_extract_lane((v128_t)__b, 0);
}
/* Ordered lane-0 greater-than compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_comigt_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) > wasm_f32x4_extract_lane((v128_t)__b, 0);
}
/* Ordered lane-0 less-or-equal compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_comile_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) <= wasm_f32x4_extract_lane((v128_t)__b, 0);
}
/* Ordered lane-0 less-than compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_comilt_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) < wasm_f32x4_extract_lane((v128_t)__b, 0);
}
/* Lane-0 not-equal compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_comineq_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) != wasm_f32x4_extract_lane((v128_t)__b, 0);
}
/* Unordered lane-0 equality compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_ucomieq_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) == wasm_f32x4_extract_lane((v128_t)__b, 0);
}
/* Unordered lane-0 greater-or-equal compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_ucomige_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) >= wasm_f32x4_extract_lane((v128_t)__b, 0);
}
/* Unordered lane-0 greater-than compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_ucomigt_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) > wasm_f32x4_extract_lane((v128_t)__b, 0);
}
/* Unordered lane-0 less-or-equal compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_ucomile_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) <= wasm_f32x4_extract_lane((v128_t)__b, 0);
}
/* Unordered lane-0 less-than compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_ucomilt_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) < wasm_f32x4_extract_lane((v128_t)__b, 0);
}
/* Unordered lane-0 not-equal compare; stale uncast duplicate return removed. */
static __inline__ int __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))
_mm_ucomineq_ss(__m128 __a, __m128 __b)
{
  return wasm_f32x4_extract_lane((v128_t)__a, 0) != wasm_f32x4_extract_lane((v128_t)__b, 0);
}
static __inline__ __m128 __attribute__((__always_inline__, __nodebug__, DIAGNOSE_SLOW))

View file

@ -11,5 +11,5 @@
/* Smoke test: build a vector, take its reciprocal estimate, and return the
 * truncated low lane.  The (v128_t) cast is required by the function-form
 * wasm_f32x4_extract_lane; the stale uncast duplicate return (diff residue)
 * was removed. */
int main() {
  __m128 a = _mm_set_ps(1.0f, 2.0f, 3.0f, 4.0f);
  __m128 b = _mm_rcp_ps(a);
  return (int)wasm_f32x4_extract_lane((v128_t)b, 0);
}