// DirectXMath/Extensions/DirectXMathSSE4.h
//
// 418 lines
// 12 KiB
// C++

//-------------------------------------------------------------------------------------
// DirectXMathSSE4.h -- SSE4.1 extensions for SIMD C++ Math library
//
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
//
// http://go.microsoft.com/fwlink/?LinkID=615560
//-------------------------------------------------------------------------------------
#pragma once
#if defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC) || __arm__ || __aarch64__
#error SSE4 not supported on ARM platform
#endif
#include <cstring>
#include <smmintrin.h>
#include <DirectXMath.h>
namespace DirectX
{
namespace SSE4
{
// Reports whether the executing CPU advertises SSE4.1 through CPUID.
// Expected to be true on AMD Bulldozer, Intel Core 2 ("Penryn"), Intel Core i7
// ("Nehalem") and later processors.
// See http://msdn.microsoft.com/en-us/library/hskdteyh.aspx
//
// Returns false when CPUID leaf 1 is unavailable or the SSE4.1 bit is clear.
inline bool XMVerifySSE4Support()
{
    // CPUID leaf 1, ECX bit 19 (0x80000) is the SSE4.1 feature flag.
    constexpr int c_SSE41Flag = 0x80000;

    // regs[0..3] receive EAX, EBX, ECX, EDX respectively.
    int regs[4] = { -1 };

    // Leaf 0: EAX reports the highest standard leaf the CPU supports.
#if (defined(__clang__) || defined(__GNUC__)) && defined(__cpuid)
    __cpuid(0, regs[0], regs[1], regs[2], regs[3]);
#else
    __cpuid(regs, 0);
#endif
    const bool hasFeatureLeaf = ( regs[0] >= 1 );
    if ( !hasFeatureLeaf )
    {
        return false;
    }

    // Leaf 1: processor feature bits.
#if (defined(__clang__) || defined(__GNUC__)) && defined(__cpuid)
    __cpuid(1, regs[0], regs[1], regs[2], regs[3]);
#else
    __cpuid(regs, 1);
#endif
    // Only the SSE4.1 instruction set is tested; SSE4.2 instructions are not used.
    return ( regs[2] & c_SSE41Flag ) != 0;
}
//-------------------------------------------------------------------------------------
// Vector
//-------------------------------------------------------------------------------------
#ifdef __clang__
#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
#endif
// Writes the Y component (lane 1) of V to *y.
//   y - destination float; must not be null (asserted).
//   V - source vector.
inline void XM_CALLCONV XMVectorGetYPtr(_Out_ float *y, _In_ FXMVECTOR V)
{
    assert( y != nullptr );
    // _mm_extract_ps returns the lane's raw bit pattern as an int. Copy those
    // bits with memcpy rather than storing through reinterpret_cast<int*>(y):
    // writing a float object through an int lvalue violates strict aliasing.
    const int bits = _mm_extract_ps( V, 1 );
    static_assert( sizeof(bits) == sizeof(float), "int and float must be the same size" );
    std::memcpy( y, &bits, sizeof(bits) );
}
// Writes the Z component (lane 2) of V to *z.
//   z - destination float; must not be null (asserted).
//   V - source vector.
inline void XM_CALLCONV XMVectorGetZPtr(_Out_ float *z, _In_ FXMVECTOR V)
{
    assert( z != nullptr );
    // _mm_extract_ps returns the lane's raw bit pattern as an int. Copy those
    // bits with memcpy rather than storing through reinterpret_cast<int*>(z):
    // writing a float object through an int lvalue violates strict aliasing.
    const int bits = _mm_extract_ps( V, 2 );
    static_assert( sizeof(bits) == sizeof(float), "int and float must be the same size" );
    std::memcpy( z, &bits, sizeof(bits) );
}
// Writes the W component (lane 3) of V to *w.
//   w - destination float; must not be null (asserted).
//   V - source vector.
inline void XM_CALLCONV XMVectorGetWPtr(_Out_ float *w, _In_ FXMVECTOR V)
{
    assert( w != nullptr );
    // _mm_extract_ps returns the lane's raw bit pattern as an int. Copy those
    // bits with memcpy rather than storing through reinterpret_cast<int*>(w):
    // writing a float object through an int lvalue violates strict aliasing.
    const int bits = _mm_extract_ps( V, 3 );
    static_assert( sizeof(bits) == sizeof(float), "int and float must be the same size" );
    std::memcpy( w, &bits, sizeof(bits) );
}
// Returns the raw 32-bit contents of lane 1 (Y) of V as an unsigned integer.
inline uint32_t XM_CALLCONV XMVectorGetIntY(FXMVECTOR V)
{
    // Reinterpret the float lanes as integers, then pull out lane 1.
    const int lane = _mm_extract_epi32( _mm_castps_si128( V ), 1 );
    return static_cast<uint32_t>( lane );
}
// Returns the raw 32-bit contents of lane 2 (Z) of V as an unsigned integer.
inline uint32_t XM_CALLCONV XMVectorGetIntZ(FXMVECTOR V)
{
    // Reinterpret the float lanes as integers, then pull out lane 2.
    const int lane = _mm_extract_epi32( _mm_castps_si128( V ), 2 );
    return static_cast<uint32_t>( lane );
}
// Returns the raw 32-bit contents of lane 3 (W) of V as an unsigned integer.
inline uint32_t XM_CALLCONV XMVectorGetIntW(FXMVECTOR V)
{
    // Reinterpret the float lanes as integers, then pull out lane 3.
    const int lane = _mm_extract_epi32( _mm_castps_si128( V ), 3 );
    return static_cast<uint32_t>( lane );
}
// Stores the raw 32-bit contents of lane 1 (Y) of V into *y.
//   y - destination; must not be null (asserted).
//   V - source vector.
inline void XM_CALLCONV XMVectorGetIntYPtr(_Out_ uint32_t *y, _In_ FXMVECTOR V)
{
    assert( y != nullptr );
    const int lane = _mm_extract_epi32( _mm_castps_si128( V ), 1 );
    *y = static_cast<uint32_t>( lane );
}
// Stores the raw 32-bit contents of lane 2 (Z) of V into *z.
//   z - destination; must not be null (asserted).
//   V - source vector.
inline void XM_CALLCONV XMVectorGetIntZPtr(_Out_ uint32_t *z, _In_ FXMVECTOR V)
{
    assert( z != nullptr );
    const int lane = _mm_extract_epi32( _mm_castps_si128( V ), 2 );
    *z = static_cast<uint32_t>( lane );
}
// Stores the raw 32-bit contents of lane 3 (W) of V into *w.
//   w - destination; must not be null (asserted).
//   V - source vector.
inline void XM_CALLCONV XMVectorGetIntWPtr(_Out_ uint32_t *w, _In_ FXMVECTOR V)
{
    assert( w != nullptr );
    const int lane = _mm_extract_epi32( _mm_castps_si128( V ), 3 );
    *w = static_cast<uint32_t>( lane );
}
// Returns a copy of V with its Y component (lane 1) replaced by y.
inline XMVECTOR XM_CALLCONV XMVectorSetY(FXMVECTOR V, float y)
{
    // Place y in lane 0 of a scratch vector.
    const XMVECTOR scalar = _mm_set_ss(y);
    // 0x10: take source lane 0, write destination lane 1, zero no lanes.
    return _mm_insert_ps( V, scalar, 0x10 );
}
// Returns a copy of V with its Z component (lane 2) replaced by z.
inline XMVECTOR XM_CALLCONV XMVectorSetZ(FXMVECTOR V, float z)
{
    // Place z in lane 0 of a scratch vector.
    const XMVECTOR scalar = _mm_set_ss(z);
    // 0x20: take source lane 0, write destination lane 2, zero no lanes.
    return _mm_insert_ps( V, scalar, 0x20 );
}
// Returns a copy of V with its W component (lane 3) replaced by w.
inline XMVECTOR XM_CALLCONV XMVectorSetW(FXMVECTOR V, float w)
{
    // Place w in lane 0 of a scratch vector.
    const XMVECTOR scalar = _mm_set_ss(w);
    // 0x30: take source lane 0, write destination lane 3, zero no lanes.
    return _mm_insert_ps( V, scalar, 0x30 );
}
// Returns a copy of V whose lane 1 (Y) holds the 32-bit pattern y.
inline XMVECTOR XM_CALLCONV XMVectorSetIntY(FXMVECTOR V, uint32_t y)
{
    // Work on the integer view of the register, then cast back to floats.
    const __m128i updated = _mm_insert_epi32( _mm_castps_si128( V ), static_cast<int>(y), 1 );
    return _mm_castsi128_ps( updated );
}
// Returns a copy of V whose lane 2 (Z) holds the 32-bit pattern z.
inline XMVECTOR XM_CALLCONV XMVectorSetIntZ(FXMVECTOR V, uint32_t z)
{
    // Work on the integer view of the register, then cast back to floats.
    const __m128i updated = _mm_insert_epi32( _mm_castps_si128( V ), static_cast<int>(z), 2 );
    return _mm_castsi128_ps( updated );
}
// Returns a copy of V whose lane 3 (W) holds the 32-bit pattern w.
inline XMVECTOR XM_CALLCONV XMVectorSetIntW(FXMVECTOR V, uint32_t w)
{
    // Work on the integer view of the register, then cast back to floats.
    const __m128i updated = _mm_insert_epi32( _mm_castps_si128( V ), static_cast<int>(w), 3 );
    return _mm_castsi128_ps( updated );
}
// Rounds every component of V to the nearest integral value using the
// SSE4.1 rounding instruction, with floating-point exceptions suppressed.
inline XMVECTOR XM_CALLCONV XMVectorRound( FXMVECTOR V )
{
    const XMVECTOR rounded = _mm_round_ps( V, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
    return rounded;
}
// Rounds every component of V toward zero using the SSE4.1 rounding
// instruction, with floating-point exceptions suppressed.
inline XMVECTOR XM_CALLCONV XMVectorTruncate( FXMVECTOR V )
{
    const XMVECTOR truncated = _mm_round_ps( V, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC );
    return truncated;
}
// Rounds every component of V down (toward negative infinity) via _mm_floor_ps.
inline XMVECTOR XM_CALLCONV XMVectorFloor( FXMVECTOR V )
{
    const XMVECTOR floored = _mm_floor_ps( V );
    return floored;
}
// Rounds every component of V up (toward positive infinity) via _mm_ceil_ps.
inline XMVECTOR XM_CALLCONV XMVectorCeiling( FXMVECTOR V )
{
    const XMVECTOR raised = _mm_ceil_ps( V );
    return raised;
}
//-------------------------------------------------------------------------------------
// Vector2
//-------------------------------------------------------------------------------------
// 2D dot product of V1 and V2, replicated into all four lanes of the result.
inline XMVECTOR XM_CALLCONV XMVector2Dot( FXMVECTOR V1, FXMVECTOR V2 )
{
    // 0x3f: multiply lanes x,y (high nibble 0x3); broadcast the sum to
    // every output lane (low nibble 0xf).
    const XMVECTOR dot = _mm_dp_ps( V1, V2, 0x3f );
    return dot;
}
// Squared 2D length of V (x*x + y*y), replicated into all four lanes.
inline XMVECTOR XM_CALLCONV XMVector2LengthSq( FXMVECTOR V )
{
    // Dot product of V with itself over lanes x,y (mask 0x3f).
    const XMVECTOR lengthSq = _mm_dp_ps( V, V, 0x3f );
    return lengthSq;
}
// Approximate 1/|V| for the 2D length of V, replicated into all four lanes.
// Uses the fast reciprocal-square-root estimate (_mm_rsqrt_ps).
inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLengthEst( FXMVECTOR V )
{
    // Squared length over lanes x,y (mask 0x3f), then the rsqrt estimate.
    return _mm_rsqrt_ps( _mm_dp_ps( V, V, 0x3f ) );
}
// 1/|V| for the 2D length of V, replicated into all four lanes.
// Computed with a full square root followed by a divide of g_XMOne.
inline XMVECTOR XM_CALLCONV XMVector2ReciprocalLength( FXMVECTOR V )
{
    // Length = sqrt of the x,y dot product (mask 0x3f).
    const XMVECTOR length = _mm_sqrt_ps( _mm_dp_ps( V, V, 0x3f ) );
    return _mm_div_ps( g_XMOne, length );
}
// 2D length of V, replicated into all four lanes.
// This variant uses _mm_sqrt_ps; no approximation intrinsic is involved.
inline XMVECTOR XM_CALLCONV XMVector2LengthEst( FXMVECTOR V )
{
    // sqrt of the x,y dot product (mask 0x3f).
    return _mm_sqrt_ps( _mm_dp_ps( V, V, 0x3f ) );
}
// 2D length of V (sqrt(x*x + y*y)), replicated into all four lanes.
inline XMVECTOR XM_CALLCONV XMVector2Length( FXMVECTOR V )
{
    // sqrt of the x,y dot product (mask 0x3f).
    return _mm_sqrt_ps( _mm_dp_ps( V, V, 0x3f ) );
}
// Approximate 2D normalization: scales all lanes of V by an estimate of
// 1/|V|, where |V| is measured over x,y. No zero or infinite length handling.
inline XMVECTOR XM_CALLCONV XMVector2NormalizeEst( FXMVECTOR V )
{
    // Reciprocal length estimate from the x,y dot product (mask 0x3f).
    const XMVECTOR invLength = _mm_rsqrt_ps( _mm_dp_ps( V, V, 0x3f ) );
    return _mm_mul_ps( invLength, V );
}
// Normalizes V by its 2D length (measured over x,y).
// Every lane of V is divided by that length. Special cases:
//   - length compares equal to zero  -> result lanes are zeroed
//   - squared length == g_XMInfinity -> result lanes are g_XMQNaN
inline XMVECTOR XM_CALLCONV XMVector2Normalize( FXMVECTOR V )
{
    // Squared length over x,y, broadcast to every lane (mask 0x3f).
    const XMVECTOR lengthSq = _mm_dp_ps( V, V, 0x3f );
    const XMVECTOR length = _mm_sqrt_ps( lengthSq );
    // All-ones where length != 0; a float compare so that -0.0 counts as zero.
    const XMVECTOR nonZeroMask = _mm_cmpneq_ps( _mm_setzero_ps(), length );
    // All-ones where the squared length is not infinity.
    const XMVECTOR finiteMask = _mm_cmpneq_ps( lengthSq, g_XMInfinity );
    // Divide to normalize, then clear the lanes when the length was zero.
    const XMVECTOR quotient = _mm_and_ps( _mm_div_ps( V, length ), nonZeroMask );
    // Bitwise select: QNaN where the length was infinite, the quotient elsewhere.
    const XMVECTOR nanLanes = _mm_andnot_ps( finiteMask, g_XMQNaN );
    const XMVECTOR keptLanes = _mm_and_ps( quotient, finiteMask );
    return _mm_or_ps( nanLanes, keptLanes );
}
//-------------------------------------------------------------------------------------
// Vector3
//-------------------------------------------------------------------------------------
// 3D dot product of V1 and V2, replicated into all four lanes of the result.
inline XMVECTOR XM_CALLCONV XMVector3Dot( FXMVECTOR V1, FXMVECTOR V2 )
{
    // 0x7f: multiply lanes x,y,z (high nibble 0x7); broadcast the sum to
    // every output lane (low nibble 0xf).
    const XMVECTOR dot = _mm_dp_ps( V1, V2, 0x7f );
    return dot;
}
// Squared 3D length of V (x*x + y*y + z*z), replicated into all four lanes.
inline XMVECTOR XM_CALLCONV XMVector3LengthSq( FXMVECTOR V )
{
    // Dot product of V with itself over lanes x,y,z (mask 0x7f).
    const XMVECTOR lengthSq = _mm_dp_ps( V, V, 0x7f );
    return lengthSq;
}
// Approximate 1/|V| for the 3D length of V, replicated into all four lanes.
// Uses the fast reciprocal-square-root estimate (_mm_rsqrt_ps).
inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLengthEst( FXMVECTOR V )
{
    // Squared length over lanes x,y,z (mask 0x7f), then the rsqrt estimate.
    return _mm_rsqrt_ps( _mm_dp_ps( V, V, 0x7f ) );
}
// 1/|V| for the 3D length of V, replicated into all four lanes.
// Computed with a full square root followed by a divide of g_XMOne.
inline XMVECTOR XM_CALLCONV XMVector3ReciprocalLength( FXMVECTOR V )
{
    // Length = sqrt of the x,y,z dot product (mask 0x7f).
    const XMVECTOR length = _mm_sqrt_ps( _mm_dp_ps( V, V, 0x7f ) );
    return _mm_div_ps( g_XMOne, length );
}
// 3D length of V, replicated into all four lanes.
// This variant uses _mm_sqrt_ps; no approximation intrinsic is involved.
inline XMVECTOR XM_CALLCONV XMVector3LengthEst( FXMVECTOR V )
{
    // sqrt of the x,y,z dot product (mask 0x7f).
    return _mm_sqrt_ps( _mm_dp_ps( V, V, 0x7f ) );
}
// 3D length of V (sqrt(x*x + y*y + z*z)), replicated into all four lanes.
inline XMVECTOR XM_CALLCONV XMVector3Length( FXMVECTOR V )
{
    // sqrt of the x,y,z dot product (mask 0x7f).
    return _mm_sqrt_ps( _mm_dp_ps( V, V, 0x7f ) );
}
// Approximate 3D normalization: scales all lanes of V by an estimate of
// 1/|V|, where |V| is measured over x,y,z. No zero or infinite length handling.
inline XMVECTOR XM_CALLCONV XMVector3NormalizeEst( FXMVECTOR V )
{
    // Reciprocal length estimate from the x,y,z dot product (mask 0x7f).
    const XMVECTOR invLength = _mm_rsqrt_ps( _mm_dp_ps( V, V, 0x7f ) );
    return _mm_mul_ps( invLength, V );
}
// Normalizes V by its 3D length (measured over x,y,z).
// Every lane of V is divided by that length. Special cases:
//   - length compares equal to zero  -> result lanes are zeroed
//   - squared length == g_XMInfinity -> result lanes are g_XMQNaN
inline XMVECTOR XM_CALLCONV XMVector3Normalize( FXMVECTOR V )
{
    // Squared length over x,y,z, broadcast to every lane (mask 0x7f).
    const XMVECTOR lengthSq = _mm_dp_ps( V, V, 0x7f );
    const XMVECTOR length = _mm_sqrt_ps( lengthSq );
    // All-ones where length != 0; a float compare so that -0.0 counts as zero.
    const XMVECTOR nonZeroMask = _mm_cmpneq_ps( _mm_setzero_ps(), length );
    // All-ones where the squared length is not infinity.
    const XMVECTOR finiteMask = _mm_cmpneq_ps( lengthSq, g_XMInfinity );
    // Divide to normalize, then clear the lanes when the length was zero.
    const XMVECTOR quotient = _mm_and_ps( _mm_div_ps( V, length ), nonZeroMask );
    // Bitwise select: QNaN where the length was infinite, the quotient elsewhere.
    const XMVECTOR nanLanes = _mm_andnot_ps( finiteMask, g_XMQNaN );
    const XMVECTOR keptLanes = _mm_and_ps( quotient, finiteMask );
    return _mm_or_ps( nanLanes, keptLanes );
}
//-------------------------------------------------------------------------------------
// Vector4
//-------------------------------------------------------------------------------------
// 4D dot product of V1 and V2, replicated into all four lanes of the result.
inline XMVECTOR XM_CALLCONV XMVector4Dot( FXMVECTOR V1, FXMVECTOR V2 )
{
    // 0xff: multiply all four lanes (high nibble 0xf); broadcast the sum to
    // every output lane (low nibble 0xf).
    const XMVECTOR dot = _mm_dp_ps( V1, V2, 0xff );
    return dot;
}
// Squared 4D length of V, replicated into all four lanes.
inline XMVECTOR XM_CALLCONV XMVector4LengthSq( FXMVECTOR V )
{
    // Dot product of V with itself over all four lanes (mask 0xff).
    const XMVECTOR lengthSq = _mm_dp_ps( V, V, 0xff );
    return lengthSq;
}
// Approximate 1/|V| for the 4D length of V, replicated into all four lanes.
// Uses the fast reciprocal-square-root estimate (_mm_rsqrt_ps).
inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLengthEst( FXMVECTOR V )
{
    // Squared length over all four lanes (mask 0xff), then the rsqrt estimate.
    return _mm_rsqrt_ps( _mm_dp_ps( V, V, 0xff ) );
}
// 1/|V| for the 4D length of V, replicated into all four lanes.
// Computed with a full square root followed by a divide of g_XMOne.
inline XMVECTOR XM_CALLCONV XMVector4ReciprocalLength( FXMVECTOR V )
{
    // Length = sqrt of the four-lane dot product (mask 0xff).
    const XMVECTOR length = _mm_sqrt_ps( _mm_dp_ps( V, V, 0xff ) );
    return _mm_div_ps( g_XMOne, length );
}
// 4D length of V, replicated into all four lanes.
// This variant uses _mm_sqrt_ps; no approximation intrinsic is involved.
inline XMVECTOR XM_CALLCONV XMVector4LengthEst( FXMVECTOR V )
{
    // sqrt of the four-lane dot product (mask 0xff).
    return _mm_sqrt_ps( _mm_dp_ps( V, V, 0xff ) );
}
// 4D length of V, replicated into all four lanes.
inline XMVECTOR XM_CALLCONV XMVector4Length( FXMVECTOR V )
{
    // sqrt of the four-lane dot product (mask 0xff).
    return _mm_sqrt_ps( _mm_dp_ps( V, V, 0xff ) );
}
// Approximate 4D normalization: scales all lanes of V by an estimate of
// 1/|V|, where |V| is measured over all four lanes. No zero or infinite
// length handling.
inline XMVECTOR XM_CALLCONV XMVector4NormalizeEst( FXMVECTOR V )
{
    // Reciprocal length estimate from the four-lane dot product (mask 0xff).
    const XMVECTOR invLength = _mm_rsqrt_ps( _mm_dp_ps( V, V, 0xff ) );
    return _mm_mul_ps( invLength, V );
}
// Normalizes V by its 4D length (measured over all four lanes).
// Every lane of V is divided by that length. Special cases:
//   - length compares equal to zero  -> result lanes are zeroed
//   - squared length == g_XMInfinity -> result lanes are g_XMQNaN
inline XMVECTOR XM_CALLCONV XMVector4Normalize( FXMVECTOR V )
{
    // Squared length over all four lanes, broadcast to every lane (mask 0xff).
    const XMVECTOR lengthSq = _mm_dp_ps( V, V, 0xff );
    const XMVECTOR length = _mm_sqrt_ps( lengthSq );
    // All-ones where length != 0; a float compare so that -0.0 counts as zero.
    const XMVECTOR nonZeroMask = _mm_cmpneq_ps( _mm_setzero_ps(), length );
    // All-ones where the squared length is not infinity.
    const XMVECTOR finiteMask = _mm_cmpneq_ps( lengthSq, g_XMInfinity );
    // Divide to normalize, then clear the lanes when the length was zero.
    const XMVECTOR quotient = _mm_and_ps( _mm_div_ps( V, length ), nonZeroMask );
    // Bitwise select: QNaN where the length was infinite, the quotient elsewhere.
    const XMVECTOR nanLanes = _mm_andnot_ps( finiteMask, g_XMQNaN );
    const XMVECTOR keptLanes = _mm_and_ps( quotient, finiteMask );
    return _mm_or_ps( nanLanes, keptLanes );
}
//-------------------------------------------------------------------------------------
// Plane
//-------------------------------------------------------------------------------------
// Approximate plane normalization: scales all four lanes of P by an estimate
// of 1/length, where the length is measured over x,y,z only.
// No zero or infinite length handling.
inline XMVECTOR XM_CALLCONV XMPlaneNormalizeEst( FXMVECTOR P )
{
    // Reciprocal length estimate from the x,y,z dot product (mask 0x7f).
    const XMVECTOR invLength = _mm_rsqrt_ps( _mm_dp_ps( P, P, 0x7f ) );
    return _mm_mul_ps( invLength, P );
}
// Normalizes plane P: divides all four lanes by the length measured over
// x,y,z only. When the squared length equals g_XMInfinity the result lanes
// are zeroed. There is no explicit zero-length check in this routine.
inline XMVECTOR XM_CALLCONV XMPlaneNormalize( FXMVECTOR P )
{
    // Squared length over x,y,z, broadcast to every lane (mask 0x7f).
    const XMVECTOR lengthSq = _mm_dp_ps( P, P, 0x7f );
    const XMVECTOR length = _mm_sqrt_ps( lengthSq );
    // All-ones where the squared length is not infinity.
    const XMVECTOR finiteMask = _mm_cmpneq_ps( lengthSq, g_XMInfinity );
    // Divide to normalize, then clear the lanes when the length was infinite.
    const XMVECTOR quotient = _mm_div_ps( P, length );
    return _mm_and_ps( quotient, finiteMask );
}
} // namespace SSE4
} // namespace DirectX