Mirror of https://github.com/mozilla/gecko-dev.git
Backed out 3 changesets (bug 1804226, bug 1801557) for AudioNodeEngine-related bustages. CLOSED TREE
Backed out changeset 55a4d00bc8b2 (bug 1804226) Backed out changeset f47bedfe0f5f (bug 1801557) Backed out changeset cccb159a5b46 (bug 1801557)
Parent: 1481a588ef
Commit: 848e2cfb4b
@@ -3091,5 +3091,3 @@ set_config("MMX_FLAGS", ["-mmmx"])
 set_config("SSE_FLAGS", ["-msse"])
 set_config("SSE2_FLAGS", ["-msse2"])
 set_config("SSSE3_FLAGS", ["-mssse3"])
-set_config("SSE4_2_FLAGS", ["-msse4.2"])
-set_config("FMA_FLAGS", ["-mfma"])

@@ -9,15 +9,12 @@
 #include "mozilla/AbstractThread.h"
 #ifdef USE_NEON
 #  include "mozilla/arm.h"
-#  include "AudioNodeEngineGeneric.h"
+#  include "AudioNodeEngineNEON.h"
 #endif
 #ifdef USE_SSE2
 #  include "mozilla/SSE.h"
-#  include "AudioNodeEngineGeneric.h"
-#endif
-#if defined(USE_SSE42) && defined(USE_FMA3)
-#  include "mozilla/SSE.h"
-#  include "AudioNodeEngineGeneric.h"
+#  include "AlignmentUtils.h"
+#  include "AudioNodeEngineSSE2.h"
 #endif
 #include "AudioBlock.h"
 #include "Tracing.h"
@@ -70,8 +67,7 @@ void AudioBufferAddWithScale(const float* aInput, float aScale, float* aOutput,
                              uint32_t aSize) {
 #ifdef USE_NEON
   if (mozilla::supports_neon()) {
-    Engine<xsimd::neon>::AudioBufferAddWithScale(aInput, aScale, aOutput,
-                                                 aSize);
+    AudioBufferAddWithScale_NEON(aInput, aScale, aOutput, aSize);
     return;
   }
 #endif
@@ -97,16 +93,7 @@ void AudioBufferAddWithScale(const float* aInput, float aScale, float* aOutput,
     // we need to round aSize down to the nearest multiple of 16
     uint32_t alignedSize = aSize & ~0x0F;
     if (alignedSize > 0) {
-#  if defined(USE_SSE42) && defined(USE_FMA3)
-      if (mozilla::supports_fma3() && mozilla::supports_sse4_2()) {
-        Engine<xsimd::fma3<xsimd::sse4_2>>::AudioBufferAddWithScale(
-            aInput, aScale, aOutput, alignedSize);
-      } else
-#  endif
-      {
-        Engine<xsimd::sse2>::AudioBufferAddWithScale(aInput, aScale, aOutput,
-                                                     alignedSize);
-      }
+      AudioBufferAddWithScale_SSE(aInput, aScale, aOutput, alignedSize);
 
       // adjust parameters for use with scalar operations below
       aInput += alignedSize;
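
A note on the SSE path restored above: aSize & ~0x0F clears the low four bits, rounding the sample count down to a multiple of 16 (four 4-lane SSE vectors); the vector kernel runs on that prefix and the scalar code after the hunk consumes the remainder. A minimal sketch of the same prefix/tail split, with hypothetical ProcessVector/ProcessScalar helpers standing in for the real kernels:

    #include <cstdint>

    // Hypothetical helpers (not from the tree): ProcessVector consumes 16
    // floats per call, ProcessScalar consumes one float at a time.
    void ProcessVector(float* aData, uint32_t aCount);
    void ProcessScalar(float* aData, uint32_t aCount);

    void Process(float* aData, uint32_t aSize) {
      // Clear the low four bits: 37 & ~0x0F == 32, i.e. round down to the
      // nearest multiple of 16.
      uint32_t alignedSize = aSize & ~0x0F;
      if (alignedSize > 0) {
        ProcessVector(aData, alignedSize);
        aData += alignedSize;  // advance past the vectorized prefix
        aSize -= alignedSize;  // the scalar tail handles the remainder
      }
      ProcessScalar(aData, aSize);
    }
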
@@ -140,16 +127,14 @@ void AudioBlockCopyChannelWithScale(const float* aInput, float aScale,
   } else {
 #ifdef USE_NEON
     if (mozilla::supports_neon()) {
-      Engine<xsimd::neon>::AudioBlockCopyChannelWithScale(aInput, aScale,
-                                                          aOutput);
+      AudioBlockCopyChannelWithScale_NEON(aInput, aScale, aOutput);
       return;
     }
 #endif
 
 #ifdef USE_SSE2
     if (mozilla::supports_sse2()) {
-      Engine<xsimd::sse2>::AudioBlockCopyChannelWithScale(aInput, aScale,
-                                                          aOutput);
+      AudioBlockCopyChannelWithScale_SSE(aInput, aScale, aOutput);
       return;
     }
 #endif
@@ -164,16 +149,7 @@ void BufferComplexMultiply(const float* aInput, const float* aScale,
                            float* aOutput, uint32_t aSize) {
 #ifdef USE_SSE2
   if (mozilla::supports_sse()) {
-#  if defined(USE_SSE42) && defined(USE_FMA3)
-    if (mozilla::supports_fma3() && mozilla::supports_sse4_2()) {
-      Engine<xsimd::fma3<xsimd::sse4_2>>::BufferComplexMultiply(aInput, aScale,
-                                                                aOutput, aSize);
-    } else
-#  endif
-    {
-      Engine<xsimd::sse2>::BufferComplexMultiply(aInput, aScale, aOutput,
-                                                 aSize);
-    }
+    BufferComplexMultiply_SSE(aInput, aScale, aOutput, aSize);
     return;
   }
 #endif
@@ -206,16 +182,14 @@ void AudioBlockCopyChannelWithScale(const float aInput[WEBAUDIO_BLOCK_SIZE],
                                     float aOutput[WEBAUDIO_BLOCK_SIZE]) {
 #ifdef USE_NEON
   if (mozilla::supports_neon()) {
-    Engine<xsimd::neon>::AudioBlockCopyChannelWithScale(aInput, aScale,
-                                                        aOutput);
+    AudioBlockCopyChannelWithScale_NEON(aInput, aScale, aOutput);
     return;
   }
 #endif
 
 #ifdef USE_SSE2
   if (mozilla::supports_sse2()) {
-    Engine<xsimd::sse2>::AudioBlockCopyChannelWithScale(aInput, aScale,
-                                                        aOutput);
+    AudioBlockCopyChannelWithScale_SSE(aInput, aScale, aOutput);
     return;
   }
 #endif
@@ -240,14 +214,14 @@ void AudioBufferInPlaceScale(float* aBlock, float aScale, uint32_t aSize) {
   }
 #ifdef USE_NEON
   if (mozilla::supports_neon()) {
-    Engine<xsimd::neon>::AudioBufferInPlaceScale(aBlock, aScale, aSize);
+    AudioBufferInPlaceScale_NEON(aBlock, aScale, aSize);
     return;
   }
 #endif
 
 #ifdef USE_SSE2
   if (mozilla::supports_sse2()) {
-    Engine<xsimd::sse2>::AudioBufferInPlaceScale(aBlock, aScale, aSize);
+    AudioBufferInPlaceScale_SSE(aBlock, aScale, aSize);
     return;
   }
 #endif
@@ -260,14 +234,14 @@ void AudioBufferInPlaceScale(float* aBlock, float aScale, uint32_t aSize) {
 void AudioBufferInPlaceScale(float* aBlock, float* aScale, uint32_t aSize) {
 #ifdef USE_NEON
   if (mozilla::supports_neon()) {
-    Engine<xsimd::neon>::AudioBufferInPlaceScale(aBlock, aScale, aSize);
+    AudioBufferInPlaceScale_NEON(aBlock, aScale, aSize);
     return;
   }
 #endif
 
 #ifdef USE_SSE2
   if (mozilla::supports_sse2()) {
-    Engine<xsimd::sse2>::AudioBufferInPlaceScale(aBlock, aScale, aSize);
+    AudioBufferInPlaceScale_SSE(aBlock, aScale, aSize);
     return;
   }
 #endif
@@ -301,24 +275,16 @@ void AudioBlockPanStereoToStereo(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                  float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
 #ifdef USE_NEON
   if (mozilla::supports_neon()) {
-    Engine<xsimd::neon>::AudioBlockPanStereoToStereo(
-        aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
+    AudioBlockPanStereoToStereo_NEON(aInputL, aInputR, aGainL, aGainR,
+                                     aIsOnTheLeft, aOutputL, aOutputR);
     return;
   }
 #endif
 
 #ifdef USE_SSE2
   if (mozilla::supports_sse2()) {
-#  if defined(USE_SSE42) && defined(USE_FMA3)
-    if (mozilla::supports_fma3() && mozilla::supports_sse4_2()) {
-      Engine<xsimd::fma3<xsimd::sse4_2>>::AudioBlockPanStereoToStereo(
-          aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
-    } else
-#  endif
-    {
-      Engine<xsimd::sse2>::AudioBlockPanStereoToStereo(
-          aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
-    }
+    AudioBlockPanStereoToStereo_SSE(aInputL, aInputR, aGainL, aGainR,
+                                    aIsOnTheLeft, aOutputL, aOutputR);
     return;
   }
 #endif
@@ -347,24 +313,8 @@ void AudioBlockPanStereoToStereo(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                  float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
 #ifdef USE_NEON
   if (mozilla::supports_neon()) {
-    Engine<xsimd::neon>::AudioBlockPanStereoToStereo(
-        aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
-    return;
-  }
-#endif
-
-#ifdef USE_SSE2
-  if (mozilla::supports_sse2()) {
-#  if defined(USE_SSE42) && defined(USE_FMA3)
-    if (mozilla::supports_fma3() && mozilla::supports_sse4_2()) {
-      Engine<xsimd::fma3<xsimd::sse2>>::AudioBlockPanStereoToStereo(
-          aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
-    } else
-#  endif
-    {
-      Engine<xsimd::sse2>::AudioBlockPanStereoToStereo(
-          aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
-    }
+    AudioBlockPanStereoToStereo_NEON(aInputL, aInputR, aGainL, aGainR,
+                                     aIsOnTheLeft, aOutputL, aOutputR);
     return;
   }
 #endif
@@ -399,16 +349,7 @@ float AudioBufferSumOfSquares(const float* aInput, uint32_t aLength) {
     }
 
     uint32_t vLength = (aLength >> 4) << 4;
-#  if defined(USE_SSE42) && defined(USE_FMA3)
-    if (mozilla::supports_fma3() && mozilla::supports_sse4_2()) {
-      sum += Engine<xsimd::fma3<xsimd::sse4_2>>::AudioBufferSumOfSquares(
-          alignedInput, vLength);
-    } else
-#  endif
-    {
-      sum +=
-          Engine<xsimd::sse2>::AudioBufferSumOfSquares(alignedInput, vLength);
-    }
+    sum += AudioBufferSumOfSquares_SSE(alignedInput, vLength);
 
     // adjust aInput and aLength to use scalar operations for any
     // remaining values
@@ -427,7 +368,7 @@ float AudioBufferSumOfSquares(const float* aInput, uint32_t aLength) {
 void NaNToZeroInPlace(float* aSamples, size_t aCount) {
 #ifdef USE_SSE2
   if (mozilla::supports_sse2()) {
-    Engine<xsimd::sse2>::NaNToZeroInPlace(aSamples, aCount);
+    NaNToZeroInPlace_SSE(aSamples, aCount);
     return;
   }
 #endif
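
Every hunk above has the same shape: the templated xsimd call (Engine<Arch>::...) is swapped back for a direct call into a hand-written _NEON/_SSE kernel, while the compile-time guards (#ifdef USE_SSE2) and the runtime CPU checks (mozilla::supports_sse2(), from the mozilla/SSE.h seen later in this diff) stay in place. A minimal sketch of that dispatch pattern, with a hypothetical Kernel_SSE standing in for a per-arch entry point:

    #include <cstdint>

    // Hypothetical per-arch kernel, built in its own translation unit with
    // -msse2 so the intrinsics are available there and only there.
    void Kernel_SSE(float* aData, uint32_t aSize);

    void Kernel(float* aData, uint32_t aSize) {
    #ifdef USE_SSE2
      // Compile-time guard: the SSE unit exists only in x86 builds.
      if (mozilla::supports_sse2()) {
        // Runtime guard: taken only if the CPU actually reports SSE2.
        Kernel_SSE(aData, aSize);
        return;
      }
    #endif
      // Portable scalar fallback.
      for (uint32_t i = 0; i < aSize; ++i) {
        aData[i] *= 2.0f;
      }
    }
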
@@ -1,261 +0,0 @@
-/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
-/* this source code form is subject to the terms of the mozilla public
- * license, v. 2.0. if a copy of the mpl was not distributed with this file,
- * You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-#ifndef MOZILLA_AUDIONODEENGINEGENERIC_H_
-#define MOZILLA_AUDIONODEENGINEGENERIC_H_
-
-#include "AudioNodeEngine.h"
-#include "AlignmentUtils.h"
-
-#include "xsimd/xsimd.hpp"
-
-namespace mozilla {
-
-template <class Arch>
-struct Engine {
-  static void AudioBufferAddWithScale(const float* aInput, float aScale,
-                                      float* aOutput, uint32_t aSize) {
-    ASSERT_ALIGNED16(aInput);
-    ASSERT_ALIGNED16(aOutput);
-    ASSERT_MULTIPLE16(aSize);
-
-    xsimd::batch<float, Arch> vgain(aScale);
-
-#pragma GCC unroll(4)
-    for (unsigned i = 0; i < aSize; i += 4 * xsimd::batch<float, Arch>::size) {
-      auto vin1 = xsimd::batch<float, Arch>::load_aligned(&aInput[i]);
-      auto vin2 = xsimd::batch<float, Arch>::load_aligned(&aOutput[i]);
-      auto vout = xsimd::fma(vin1, vgain, vin2);
-      vout.store_aligned(&aOutput[i]);
-    }
-  };
-
-  static void AudioBlockCopyChannelWithScale(const float* aInput, float aScale,
-                                             float* aOutput) {
-    ASSERT_ALIGNED16(aInput);
-    ASSERT_ALIGNED16(aOutput);
-
-    xsimd::batch<float, Arch> vgain = (aScale);
-
-#pragma GCC unroll(4)
-    for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
-         i += xsimd::batch<float, Arch>::size) {
-      auto vin = xsimd::batch<float, Arch>::load_aligned(&aInput[i]);
-      auto vout = vin * vgain;
-      vout.store_aligned(&aOutput[i]);
-    }
-  };
-
-  static void AudioBlockCopyChannelWithScale(
-      const float aInput[WEBAUDIO_BLOCK_SIZE],
-      const float aScale[WEBAUDIO_BLOCK_SIZE],
-      float aOutput[WEBAUDIO_BLOCK_SIZE]) {
-    ASSERT_ALIGNED16(aInput);
-    ASSERT_ALIGNED16(aScale);
-    ASSERT_ALIGNED16(aOutput);
-
-#pragma GCC unroll(4)
-    for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
-         i += xsimd::batch<float, Arch>::size) {
-      auto vscaled = xsimd::batch<float, Arch>::load_aligned(&aScale[i]);
-      auto vin = xsimd::batch<float, Arch>::load_aligned(&aInput[i]);
-      auto vout = vin * vscaled;
-      vout.store_aligned(&aOutput[i]);
-    }
-  };
-
-  static void AudioBufferInPlaceScale(float* aBlock, float aScale,
-                                      uint32_t aSize) {
-    ASSERT_ALIGNED16(aBlock);
-    ASSERT_MULTIPLE16(aSize);
-
-    xsimd::batch<float, Arch> vgain(aScale);
-
-#pragma GCC unroll(4)
-    for (unsigned i = 0; i < aSize; i += xsimd::batch<float, Arch>::size) {
-      auto vin = xsimd::batch<float, Arch>::load_aligned(&aBlock[i]);
-      auto vout = vin * vgain;
-      vout.store_aligned(&aBlock[i]);
-    }
-  };
-
-  static void AudioBufferInPlaceScale(float* aBlock, float* aScale,
-                                      uint32_t aSize) {
-    ASSERT_ALIGNED16(aBlock);
-    ASSERT_MULTIPLE16(aSize);
-
-#pragma GCC unroll(4)
-    for (unsigned i = 0; i < aSize; i += xsimd::batch<float, Arch>::size) {
-      auto vin = xsimd::batch<float, Arch>::load_aligned(&aBlock[i]);
-      auto vgain = xsimd::batch<float, Arch>::load_aligned(&aScale[i]);
-      auto vout = vin * vgain;
-      vout.store_aligned(&aBlock[i]);
-    }
-  };
-
-  static void AudioBlockPanStereoToStereo(
-      const float aInputL[WEBAUDIO_BLOCK_SIZE],
-      const float aInputR[WEBAUDIO_BLOCK_SIZE], float aGainL, float aGainR,
-      bool aIsOnTheLeft, float aOutputL[WEBAUDIO_BLOCK_SIZE],
-      float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
-    ASSERT_ALIGNED16(aInputL);
-    ASSERT_ALIGNED16(aInputR);
-    ASSERT_ALIGNED16(aOutputL);
-    ASSERT_ALIGNED16(aOutputR);
-
-    xsimd::batch<float, Arch> vgainl(aGainL);
-    xsimd::batch<float, Arch> vgainr(aGainR);
-
-    if (aIsOnTheLeft) {
-#pragma GCC unroll(2)
-      for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
-           i += xsimd::batch<float, Arch>::size) {
-        auto vinl = xsimd::batch<float, Arch>::load_aligned(&aInputL[i]);
-        auto vinr = xsimd::batch<float, Arch>::load_aligned(&aInputR[i]);
-
-        /* left channel : aOutputL = aInputL + aInputR * gainL */
-        auto vout = xsimd::fma(vinr, vgainl, vinl);
-        vout.store_aligned(&aOutputL[i]);
-
-        /* right channel : aOutputR = aInputR * gainR */
-        auto vscaled = vinr * vgainr;
-        vscaled.store_aligned(&aOutputR[i]);
-      }
-    } else {
-#pragma GCC unroll(2)
-      for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
-           i += xsimd::batch<float, Arch>::size) {
-        auto vinl = xsimd::batch<float, Arch>::load_aligned(&aInputL[i]);
-        auto vinr = xsimd::batch<float, Arch>::load_aligned(&aInputR[i]);
-
-        /* left channel : aInputL * gainL */
-        auto vscaled = vinl * vgainl;
-        vscaled.store_aligned(&aOutputL[i]);
-
-        /* right channel: aOutputR = aInputR + aInputL * gainR */
-        auto vout = xsimd::fma(vinl, vgainr, vinr);
-        vout.store_aligned(&aOutputR[i]);
-      }
-    }
-  };
-
-  static void BufferComplexMultiply(const float* aInput, const float* aScale,
-                                    float* aOutput, uint32_t aSize) {
-    ASSERT_ALIGNED16(aInput);
-    ASSERT_ALIGNED16(aScale);
-    ASSERT_ALIGNED16(aOutput);
-    ASSERT_MULTIPLE16(aSize);
-
-#pragma GCC unroll(2)
-    for (unsigned i = 0; i < aSize * 2;
-         i += 2 * xsimd::batch<std::complex<float>>::size) {
-      auto in1 = xsimd::batch<std::complex<float>>::load_aligned(
-          reinterpret_cast<const std::complex<float>*>(&aInput[i]));
-      auto in2 = xsimd::batch<std::complex<float>>::load_aligned(
-          reinterpret_cast<const std::complex<float>*>(&aScale[i]));
-      auto out = in1 * in2;
-      out.store_aligned(reinterpret_cast<std::complex<float>*>(&aOutput[i]));
-    }
-  };
-
-  static float AudioBufferSumOfSquares(const float* aInput, uint32_t aLength) {
-    ASSERT_ALIGNED16(aInput);
-    ASSERT_MULTIPLE16(aLength);
-
-    constexpr uint32_t unroll_factor = 4;
-    xsimd::batch<float, Arch> accs[unroll_factor] = {0.f, 0.f, 0.f, 0.f};
-
-    for (uint32_t i = 0; i < aLength;
-         i += unroll_factor * xsimd::batch<float, Arch>::size) {
-#pragma GCC unroll
-      for (uint32_t j = 0; j < unroll_factor; ++j) {
-        auto in = xsimd::batch<float, Arch>::load_aligned(
-            &aInput[i + xsimd::batch<float, Arch>::size * j]);
-        accs[j] = xsimd::fma(in, in, accs[j]);
-      }
-    }
-
-    return reduce_add((accs[0] + accs[1]) + (accs[2] + accs[3]));
-  };
-
-  static void NaNToZeroInPlace(float* aSamples, size_t aCount) {
-    float* samplesAligned16 = ALIGNED16(aSamples);
-    size_t leadingElementsScalar =
-        std::min(static_cast<size_t>(samplesAligned16 - aSamples), aCount);
-    size_t remainingElements = aCount - leadingElementsScalar;
-    size_t vectoredEnd =
-        aCount - remainingElements % (4 * xsimd::batch<float, Arch>::size);
-
-    MOZ_ASSERT(!((vectoredEnd - leadingElementsScalar) %
-                 (4 * xsimd::batch<float, Arch>::size)));
-
-    size_t i = 0;
-    for (; i < leadingElementsScalar; i++) {
-      if (aSamples[i] != aSamples[i]) {
-        aSamples[i] = 0.0;
-      }
-    }
-
-    ASSERT_ALIGNED16(&aSamples[i]);
-
-#pragma GCC unroll(4)
-    for (; i < vectoredEnd; i += xsimd::batch<float, Arch>::size) {
-      auto vin = xsimd::batch<float, Arch>::load_aligned(&aSamples[i]);
-      auto vout =
-          xsimd::select(xsimd::isnan(vin), xsimd::batch<float, Arch>(0.f), vin);
-      vout.store_aligned(&aSamples[i]);
-    }
-    for (; i < aCount; i++) {
-      if (aSamples[i] != aSamples[i]) {
-        aSamples[i] = 0.0;
-      }
-    }
-  };
-
-  static void AudioBlockPanStereoToStereo(
-      const float aInputL[WEBAUDIO_BLOCK_SIZE],
-      const float aInputR[WEBAUDIO_BLOCK_SIZE],
-      const float aGainL[WEBAUDIO_BLOCK_SIZE],
-      const float aGainR[WEBAUDIO_BLOCK_SIZE],
-      const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
-      float aOutputL[WEBAUDIO_BLOCK_SIZE],
-      float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
-    ASSERT_ALIGNED16(aInputL);
-    ASSERT_ALIGNED16(aInputR);
-    ASSERT_ALIGNED16(aGainL);
-    ASSERT_ALIGNED16(aGainR);
-    ASSERT_ALIGNED16(aIsOnTheLeft);
-    ASSERT_ALIGNED16(aOutputL);
-    ASSERT_ALIGNED16(aOutputR);
-
-#pragma GCC unroll(2)
-    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE;
-         i += xsimd::batch<float, Arch>::size) {
-      auto mask =
-          xsimd::batch_bool<float, Arch>::load_aligned(&aIsOnTheLeft[i]);
-
-      auto inputL = xsimd::batch<float, Arch>::load_aligned(&aInputL[i]);
-      auto inputR = xsimd::batch<float, Arch>::load_aligned(&aInputR[i]);
-      auto gainL = xsimd::batch<float, Arch>::load_aligned(&aGainL[i]);
-      auto gainR = xsimd::batch<float, Arch>::load_aligned(&aGainR[i]);
-
-      auto outL_true = xsimd::fma(inputR, gainL, inputL);
-      auto outR_true = inputR * gainR;
-
-      auto outL_false = inputL * gainL;
-      auto outR_false = xsimd::fma(inputL, gainR, inputR);
-
-      auto outL = xsimd::select(mask, outL_true, outL_false);
-      auto outR = xsimd::select(mask, outR_true, outR_false);
-
-      outL.store_aligned(&aOutputL[i]);
-      outR.store_aligned(&aOutputR[i]);
-    }
-  }
-};
-
-}  // namespace mozilla
-
-#endif
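
The header deleted above carried the design being backed out: each kernel is written once against xsimd's batch<float, Arch> type and instantiated per target (Engine<xsimd::sse2>, Engine<xsimd::neon>, ...). A standalone sketch of that style, assuming only xsimd's documented batch API (load_aligned, store_aligned, arithmetic operators):

    #include <cstdint>
    #include "xsimd/xsimd.hpp"

    // Scale aBlock by aScale in place; aBlock must be suitably aligned and
    // aSize a multiple of the batch width, as the deleted header asserted.
    template <class Arch>
    void InPlaceScale(float* aBlock, float aScale, uint32_t aSize) {
      xsimd::batch<float, Arch> vgain(aScale);  // broadcast gain to all lanes
      for (uint32_t i = 0; i < aSize; i += xsimd::batch<float, Arch>::size) {
        auto vin = xsimd::batch<float, Arch>::load_aligned(&aBlock[i]);
        (vin * vgain).store_aligned(&aBlock[i]);
      }
    }

    // One explicit instantiation per SIMD flavor, each compiled in its own
    // translation unit with the matching compiler flags, e.g.:
    // template void InPlaceScale<xsimd::sse2>(float*, float, uint32_t);
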
@@ -3,7 +3,350 @@
  * license, v. 2.0. if a copy of the mpl was not distributed with this file,
  * You can obtain one at http://mozilla.org/MPL/2.0/. */
 
-#include "AudioNodeEngineGeneric.h"
+#include "AudioNodeEngineNEON.h"
+#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__)
+#  include <arm64_neon.h>
+#else
+#  include <arm_neon.h>
+#endif
+
+//#ifdef DEBUG
+#if 0  // see bug 921099
+#  define ASSERT_ALIGNED(ptr)                                       \
+    MOZ_ASSERT((((uintptr_t)ptr + 15) & ~0x0F) == (uintptr_t)ptr,   \
+               #ptr " has to be aligned 16-bytes aligned.");
+#else
+#  define ASSERT_ALIGNED(ptr)
+#endif
+
+#define ADDRESS_OF(array, index) ((float32_t*)&array[index])
+
 namespace mozilla {
-template struct Engine<xsimd::neon>;
+void AudioBufferAddWithScale_NEON(const float* aInput, float aScale,
+                                  float* aOutput, uint32_t aSize) {
+  ASSERT_ALIGNED(aInput);
+  ASSERT_ALIGNED(aOutput);
+
+  float32x4_t vin0, vin1, vin2, vin3;
+  float32x4_t vout0, vout1, vout2, vout3;
+  float32x4_t vscale = vmovq_n_f32(aScale);
+
+  uint32_t dif = aSize % 16;
+  aSize -= dif;
+  unsigned i = 0;
+  for (; i < aSize; i += 16) {
+    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
+    vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
+    vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
+    vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));
+
+    vout0 = vld1q_f32(ADDRESS_OF(aOutput, i));
+    vout1 = vld1q_f32(ADDRESS_OF(aOutput, i + 4));
+    vout2 = vld1q_f32(ADDRESS_OF(aOutput, i + 8));
+    vout3 = vld1q_f32(ADDRESS_OF(aOutput, i + 12));
+
+    vout0 = vmlaq_f32(vout0, vin0, vscale);
+    vout1 = vmlaq_f32(vout1, vin1, vscale);
+    vout2 = vmlaq_f32(vout2, vin2, vscale);
+    vout3 = vmlaq_f32(vout3, vin3, vscale);
+
+    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
+    vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
+    vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
+    vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
+  }
+
+  for (unsigned j = 0; j < dif; ++i, ++j) {
+    aOutput[i] += aInput[i] * aScale;
+  }
+}
+void AudioBlockCopyChannelWithScale_NEON(const float* aInput, float aScale,
+                                         float* aOutput) {
+  ASSERT_ALIGNED(aInput);
+  ASSERT_ALIGNED(aOutput);
+
+  float32x4_t vin0, vin1, vin2, vin3;
+  float32x4_t vout0, vout1, vout2, vout3;
+  float32x4_t vscale = vmovq_n_f32(aScale);
+
+  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
+    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
+    vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
+    vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
+    vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));
+
+    vout0 = vmulq_f32(vin0, vscale);
+    vout1 = vmulq_f32(vin1, vscale);
+    vout2 = vmulq_f32(vin2, vscale);
+    vout3 = vmulq_f32(vin3, vscale);
+
+    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
+    vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
+    vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
+    vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
+  }
+}
+
+void AudioBlockCopyChannelWithScale_NEON(
+    const float aInput[WEBAUDIO_BLOCK_SIZE],
+    const float aScale[WEBAUDIO_BLOCK_SIZE],
+    float aOutput[WEBAUDIO_BLOCK_SIZE]) {
+  ASSERT_ALIGNED(aInput);
+  ASSERT_ALIGNED(aScale);
+  ASSERT_ALIGNED(aOutput);
+
+  float32x4_t vin0, vin1, vin2, vin3;
+  float32x4_t vout0, vout1, vout2, vout3;
+  float32x4_t vscale0, vscale1, vscale2, vscale3;
+
+  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
+    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
+    vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
+    vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
+    vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));
+
+    vscale0 = vld1q_f32(ADDRESS_OF(aScale, i));
+    vscale1 = vld1q_f32(ADDRESS_OF(aScale, i + 4));
+    vscale2 = vld1q_f32(ADDRESS_OF(aScale, i + 8));
+    vscale3 = vld1q_f32(ADDRESS_OF(aScale, i + 12));
+
+    vout0 = vmulq_f32(vin0, vscale0);
+    vout1 = vmulq_f32(vin1, vscale1);
+    vout2 = vmulq_f32(vin2, vscale2);
+    vout3 = vmulq_f32(vin3, vscale3);
+
+    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
+    vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
+    vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
+    vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
+  }
+}
+
+void AudioBufferInPlaceScale_NEON(float* aBlock, float aScale, uint32_t aSize) {
+  ASSERT_ALIGNED(aBlock);
+
+  float32x4_t vin0, vin1, vin2, vin3;
+  float32x4_t vout0, vout1, vout2, vout3;
+  float32x4_t vscale = vmovq_n_f32(aScale);
+
+  uint32_t dif = aSize % 16;
+  uint32_t vectorSize = aSize - dif;
+  uint32_t i = 0;
+  for (; i < vectorSize; i += 16) {
+    vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
+    vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
+    vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
+    vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));
+
+    vout0 = vmulq_f32(vin0, vscale);
+    vout1 = vmulq_f32(vin1, vscale);
+    vout2 = vmulq_f32(vin2, vscale);
+    vout3 = vmulq_f32(vin3, vscale);
+
+    vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
+    vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
+    vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
+    vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
+  }
+
+  for (unsigned j = 0; j < dif; ++i, ++j) {
+    aBlock[i] *= aScale;
+  }
+}
+
+void AudioBufferInPlaceScale_NEON(float* aBlock, float* aScale,
+                                  uint32_t aSize) {
+  ASSERT_ALIGNED(aBlock);
+
+  float32x4_t vin0, vin1, vin2, vin3;
+  float32x4_t vout0, vout1, vout2, vout3;
+  float32x4_t vscale0, vscale1, vscale2, vscale3;
+
+  uint32_t dif = aSize % 16;
+  uint32_t vectorSize = aSize - dif;
+  uint32_t i = 0;
+  for (; i < vectorSize; i += 16) {
+    vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
+    vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
+    vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
+    vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));
+
+    vscale0 = vld1q_f32(ADDRESS_OF(aScale, i));
+    vscale1 = vld1q_f32(ADDRESS_OF(aScale, i + 4));
+    vscale2 = vld1q_f32(ADDRESS_OF(aScale, i + 8));
+    vscale3 = vld1q_f32(ADDRESS_OF(aScale, i + 12));
+
+    vout0 = vmulq_f32(vin0, vscale0);
+    vout1 = vmulq_f32(vin1, vscale1);
+    vout2 = vmulq_f32(vin2, vscale2);
+    vout3 = vmulq_f32(vin3, vscale3);
+
+    vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
+    vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
+    vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
+    vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
+  }
+
+  for (unsigned j = 0; j < dif; ++i, ++j) {
+    aBlock[i] *= aScale[i];
+  }
+}
+
+void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
+                                      const float aInputR[WEBAUDIO_BLOCK_SIZE],
+                                      float aGainL, float aGainR,
+                                      bool aIsOnTheLeft,
+                                      float aOutputL[WEBAUDIO_BLOCK_SIZE],
+                                      float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
+  ASSERT_ALIGNED(aInputL);
+  ASSERT_ALIGNED(aInputR);
+  ASSERT_ALIGNED(aOutputL);
+  ASSERT_ALIGNED(aOutputR);
+
+  float32x4_t vinL0, vinL1;
+  float32x4_t vinR0, vinR1;
+  float32x4_t voutL0, voutL1;
+  float32x4_t voutR0, voutR1;
+  float32x4_t vscaleL = vmovq_n_f32(aGainL);
+  float32x4_t vscaleR = vmovq_n_f32(aGainR);
+
+  if (aIsOnTheLeft) {
+    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
+      vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
+      vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));
+
+      vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
+      vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));
+
+      voutL0 = vmlaq_f32(vinL0, vinR0, vscaleL);
+      voutL1 = vmlaq_f32(vinL1, vinR1, vscaleL);
+
+      vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
+      vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
+
+      voutR0 = vmulq_f32(vinR0, vscaleR);
+      voutR1 = vmulq_f32(vinR1, vscaleR);
+
+      vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
+      vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
+    }
+  } else {
+    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
+      vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
+      vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));
+
+      vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
+      vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));
+
+      voutL0 = vmulq_f32(vinL0, vscaleL);
+      voutL1 = vmulq_f32(vinL1, vscaleL);
+
+      vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
+      vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
+
+      voutR0 = vmlaq_f32(vinR0, vinL0, vscaleR);
+      voutR1 = vmlaq_f32(vinR1, vinL1, vscaleR);
+
+      vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
+      vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
+    }
+  }
+}
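
A reading aid for these restored NEON kernels: vmlaq_f32(a, b, c) is a per-lane multiply-accumulate computing a + b * c, so vmlaq_f32(vinL0, vinR0, vscaleL) is exactly the commented math aOutputL = aInputL + aInputR * gainL. A scalar model of one lane, as a sketch:

    // Scalar model of vmlaq_f32(a, b, c): per-lane multiply-accumulate.
    // (On ARMv7 this lowers to VMLA, which may round twice; the math intended
    // by the kernel is the same either way.)
    inline float MlaLane(float a, float b, float c) { return a + b * c; }
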
+
+void AudioBlockPanStereoToStereo_NEON(
+    const float aInputL[WEBAUDIO_BLOCK_SIZE],
+    const float aInputR[WEBAUDIO_BLOCK_SIZE],
+    const float aGainL[WEBAUDIO_BLOCK_SIZE],
+    const float aGainR[WEBAUDIO_BLOCK_SIZE],
+    const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
+    float aOutputL[WEBAUDIO_BLOCK_SIZE], float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
+  ASSERT_ALIGNED(aInputL);
+  ASSERT_ALIGNED(aInputR);
+  ASSERT_ALIGNED(aGainL);
+  ASSERT_ALIGNED(aGainR);
+  ASSERT_ALIGNED(aIsOnTheLeft);
+  ASSERT_ALIGNED(aOutputL);
+  ASSERT_ALIGNED(aOutputR);
+
+  float32x4_t vinL0, vinL1;
+  float32x4_t vinR0, vinR1;
+  float32x4_t voutL0, voutL1;
+  float32x4_t voutR0, voutR1;
+  float32x4_t vscaleL0, vscaleL1;
+  float32x4_t vscaleR0, vscaleR1;
+  float32x4_t onleft0, onleft1, notonleft0, notonleft1;
+
+  float32x4_t zero = vmovq_n_f32(0);
+  uint8x8_t isOnTheLeft;
+
+  // Although MSVC throws uninitialized value warning for voutL0 and voutL1,
+  // since we fill all lanes by vsetq_lane_f32, we can ignore it. But to avoid
+  // compiler warning, set zero.
+  voutL0 = zero;
+  voutL1 = zero;
+
+  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
+    vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
+    vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));
+
+    vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
+    vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));
+
+    vscaleL0 = vld1q_f32(ADDRESS_OF(aGainL, i));
+    vscaleL1 = vld1q_f32(ADDRESS_OF(aGainL, i + 4));
+
+    vscaleR0 = vld1q_f32(ADDRESS_OF(aGainR, i));
+    vscaleR1 = vld1q_f32(ADDRESS_OF(aGainR, i + 4));
+
+    // Load output with boolean "on the left" values. This assumes that
+    // bools are stored as a single byte.
+    isOnTheLeft = vld1_u8((uint8_t*)&aIsOnTheLeft[i]);
+    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 0), voutL0, 0);
+    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 1), voutL0, 1);
+    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 2), voutL0, 2);
+    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 3), voutL0, 3);
+    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 4), voutL1, 0);
+    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 5), voutL1, 1);
+    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 6), voutL1, 2);
+    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 7), voutL1, 3);
+
+    // Convert the boolean values into masks by setting all bits to 1
+    // if true.
+    voutL0 = (float32x4_t)vcgtq_f32(voutL0, zero);
+    voutL1 = (float32x4_t)vcgtq_f32(voutL1, zero);
+
+    // The right output masks are the same as the left masks
+    voutR0 = voutL0;
+    voutR1 = voutL1;
+
+    // Calculate left channel assuming isOnTheLeft
+    onleft0 = vmlaq_f32(vinL0, vinR0, vscaleL0);
+    onleft1 = vmlaq_f32(vinL1, vinR1, vscaleL1);
+
+    // Calculate left channel assuming not isOnTheLeft
+    notonleft0 = vmulq_f32(vinL0, vscaleL0);
+    notonleft1 = vmulq_f32(vinL1, vscaleL1);
+
+    // Write results using previously stored masks
+    voutL0 = vbslq_f32((uint32x4_t)voutL0, onleft0, notonleft0);
+    voutL1 = vbslq_f32((uint32x4_t)voutL1, onleft1, notonleft1);
+
+    // Calculate right channel assuming isOnTheLeft
+    onleft0 = vmulq_f32(vinR0, vscaleR0);
+    onleft1 = vmulq_f32(vinR1, vscaleR1);
+
+    // Calculate right channel assuming not isOnTheLeft
+    notonleft0 = vmlaq_f32(vinR0, vinL0, vscaleR0);
+    notonleft1 = vmlaq_f32(vinR1, vinL1, vscaleR1);
+
+    // Write results using previously stored masks
+    voutR0 = vbslq_f32((uint32x4_t)voutR0, onleft0, notonleft0);
+    voutR1 = vbslq_f32((uint32x4_t)voutR1, onleft1, notonleft1);
+
+    vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
+    vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
+    vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
+    vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
+  }
+}
 }  // namespace mozilla
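
The bool-array pan variant above cannot branch per sample, so it widens each bool byte into a float lane, turns it into an all-ones/all-zero mask with vcgtq_f32 against zero, and uses vbslq_f32 to bit-select between the two precomputed results. A scalar model of one left-channel lane, as a sketch:

    #include <cstdint>

    // Scalar model of one lane of the NEON mask-and-select above.
    float PanLeftLane(float inL, float inR, float gainL, bool isOnTheLeft) {
      // vcgtq_f32(bool-as-float, 0) -> all-ones mask when the bool was true.
      uint32_t mask = isOnTheLeft ? 0xFFFFFFFFu : 0u;
      float onLeft = inL + inR * gainL;  // vmlaq_f32 path
      float notOnLeft = inL * gainL;     // vmulq_f32 path
      // vbslq_f32 takes bits from onLeft where mask bits are set.
      return mask ? onLeft : notOnLeft;
    }
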
@@ -0,0 +1,42 @@
+/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* this source code form is subject to the terms of the mozilla public
+ * license, v. 2.0. if a copy of the mpl was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef MOZILLA_AUDIONODEENGINENEON_H_
+#define MOZILLA_AUDIONODEENGINENEON_H_
+
+#include "AudioNodeEngine.h"
+
+namespace mozilla {
+void AudioBufferAddWithScale_NEON(const float* aInput, float aScale,
+                                  float* aOutput, uint32_t aSize);
+
+void AudioBlockCopyChannelWithScale_NEON(const float* aInput, float aScale,
+                                         float* aOutput);
+
+void AudioBlockCopyChannelWithScale_NEON(
+    const float aInput[WEBAUDIO_BLOCK_SIZE],
+    const float aScale[WEBAUDIO_BLOCK_SIZE],
+    float aOutput[WEBAUDIO_BLOCK_SIZE]);
+
+void AudioBufferInPlaceScale_NEON(float* aBlock, float aScale, uint32_t aSize);
+void AudioBufferInPlaceScale_NEON(float* aBlock, float* aScale, uint32_t aSize);
+
+void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
+                                      const float aInputR[WEBAUDIO_BLOCK_SIZE],
+                                      float aGainL, float aGainR,
+                                      bool aIsOnTheLeft,
+                                      float aOutputL[WEBAUDIO_BLOCK_SIZE],
+                                      float aOutputR[WEBAUDIO_BLOCK_SIZE]);
+
+void AudioBlockPanStereoToStereo_NEON(
+    const float aInputL[WEBAUDIO_BLOCK_SIZE],
+    const float aInputR[WEBAUDIO_BLOCK_SIZE],
+    const float aGainL[WEBAUDIO_BLOCK_SIZE],
+    const float aGainR[WEBAUDIO_BLOCK_SIZE],
+    const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
+    float aOutputL[WEBAUDIO_BLOCK_SIZE], float aOutputR[WEBAUDIO_BLOCK_SIZE]);
+}  // namespace mozilla
+
+#endif /* MOZILLA_AUDIONODEENGINENEON_H_ */
@@ -3,8 +3,361 @@
  * license, v. 2.0. if a copy of the mpl was not distributed with this file,
  * You can obtain one at http://mozilla.org/MPL/2.0/. */
 
-#include "AudioNodeEngineGeneric.h"
+#include "AudioNodeEngineSSE2.h"
+#include "AlignmentUtils.h"
+#include <emmintrin.h>
 
 namespace mozilla {
-template struct Engine<xsimd::sse2>;
+void AudioBufferAddWithScale_SSE(const float* aInput, float aScale,
+                                 float* aOutput, uint32_t aSize) {
+  __m128 vin0, vin1, vin2, vin3, vscaled0, vscaled1, vscaled2, vscaled3, vout0,
+      vout1, vout2, vout3, vgain;
+
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_ALIGNED16(aOutput);
+  ASSERT_MULTIPLE16(aSize);
+
+  vgain = _mm_load1_ps(&aScale);
+
+  for (unsigned i = 0; i < aSize; i += 16) {
+    vin0 = _mm_load_ps(&aInput[i]);
+    vin1 = _mm_load_ps(&aInput[i + 4]);
+    vin2 = _mm_load_ps(&aInput[i + 8]);
+    vin3 = _mm_load_ps(&aInput[i + 12]);
+
+    vscaled0 = _mm_mul_ps(vin0, vgain);
+    vscaled1 = _mm_mul_ps(vin1, vgain);
+    vscaled2 = _mm_mul_ps(vin2, vgain);
+    vscaled3 = _mm_mul_ps(vin3, vgain);
+
+    vin0 = _mm_load_ps(&aOutput[i]);
+    vin1 = _mm_load_ps(&aOutput[i + 4]);
+    vin2 = _mm_load_ps(&aOutput[i + 8]);
+    vin3 = _mm_load_ps(&aOutput[i + 12]);
+
+    vout0 = _mm_add_ps(vin0, vscaled0);
+    vout1 = _mm_add_ps(vin1, vscaled1);
+    vout2 = _mm_add_ps(vin2, vscaled2);
+    vout3 = _mm_add_ps(vin3, vscaled3);
+
+    _mm_store_ps(&aOutput[i], vout0);
+    _mm_store_ps(&aOutput[i + 4], vout1);
+    _mm_store_ps(&aOutput[i + 8], vout2);
+    _mm_store_ps(&aOutput[i + 12], vout3);
+  }
+}
+
+void AudioBlockCopyChannelWithScale_SSE(const float* aInput, float aScale,
+                                        float* aOutput) {
+  __m128 vin0, vin1, vin2, vin3, vout0, vout1, vout2, vout3;
+
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_ALIGNED16(aOutput);
+
+  __m128 vgain = _mm_load1_ps(&aScale);
+
+  for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
+    vin0 = _mm_load_ps(&aInput[i]);
+    vin1 = _mm_load_ps(&aInput[i + 4]);
+    vin2 = _mm_load_ps(&aInput[i + 8]);
+    vin3 = _mm_load_ps(&aInput[i + 12]);
+    vout0 = _mm_mul_ps(vin0, vgain);
+    vout1 = _mm_mul_ps(vin1, vgain);
+    vout2 = _mm_mul_ps(vin2, vgain);
+    vout3 = _mm_mul_ps(vin3, vgain);
+    _mm_store_ps(&aOutput[i], vout0);
+    _mm_store_ps(&aOutput[i + 4], vout1);
+    _mm_store_ps(&aOutput[i + 8], vout2);
+    _mm_store_ps(&aOutput[i + 12], vout3);
+  }
+}
+
+void AudioBlockCopyChannelWithScale_SSE(const float aInput[WEBAUDIO_BLOCK_SIZE],
+                                        const float aScale[WEBAUDIO_BLOCK_SIZE],
+                                        float aOutput[WEBAUDIO_BLOCK_SIZE]) {
+  __m128 vin0, vin1, vin2, vin3, vscaled0, vscaled1, vscaled2, vscaled3, vout0,
+      vout1, vout2, vout3;
+
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_ALIGNED16(aScale);
+  ASSERT_ALIGNED16(aOutput);
+
+  for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
+    vscaled0 = _mm_load_ps(&aScale[i]);
+    vscaled1 = _mm_load_ps(&aScale[i + 4]);
+    vscaled2 = _mm_load_ps(&aScale[i + 8]);
+    vscaled3 = _mm_load_ps(&aScale[i + 12]);
+
+    vin0 = _mm_load_ps(&aInput[i]);
+    vin1 = _mm_load_ps(&aInput[i + 4]);
+    vin2 = _mm_load_ps(&aInput[i + 8]);
+    vin3 = _mm_load_ps(&aInput[i + 12]);
+
+    vout0 = _mm_mul_ps(vin0, vscaled0);
+    vout1 = _mm_mul_ps(vin1, vscaled1);
+    vout2 = _mm_mul_ps(vin2, vscaled2);
+    vout3 = _mm_mul_ps(vin3, vscaled3);
+
+    _mm_store_ps(&aOutput[i], vout0);
+    _mm_store_ps(&aOutput[i + 4], vout1);
+    _mm_store_ps(&aOutput[i + 8], vout2);
+    _mm_store_ps(&aOutput[i + 12], vout3);
+  }
+}
+
+void AudioBufferInPlaceScale_SSE(float* aBlock, float aScale, uint32_t aSize) {
+  __m128 vout0, vout1, vout2, vout3, vin0, vin1, vin2, vin3;
+
+  ASSERT_ALIGNED16(aBlock);
+  ASSERT_MULTIPLE16(aSize);
+
+  __m128 vgain = _mm_load1_ps(&aScale);
+
+  for (unsigned i = 0; i < aSize; i += 16) {
+    vin0 = _mm_load_ps(&aBlock[i]);
+    vin1 = _mm_load_ps(&aBlock[i + 4]);
+    vin2 = _mm_load_ps(&aBlock[i + 8]);
+    vin3 = _mm_load_ps(&aBlock[i + 12]);
+    vout0 = _mm_mul_ps(vin0, vgain);
+    vout1 = _mm_mul_ps(vin1, vgain);
+    vout2 = _mm_mul_ps(vin2, vgain);
+    vout3 = _mm_mul_ps(vin3, vgain);
+    _mm_store_ps(&aBlock[i], vout0);
+    _mm_store_ps(&aBlock[i + 4], vout1);
+    _mm_store_ps(&aBlock[i + 8], vout2);
+    _mm_store_ps(&aBlock[i + 12], vout3);
+  }
+}
+
+void AudioBufferInPlaceScale_SSE(float* aBlock, float* aScale, uint32_t aSize) {
+  __m128 vout0, vout1, vout2, vout3, vgain0, vgain1, vgain2, vgain3, vin0, vin1,
+      vin2, vin3;
+
+  ASSERT_ALIGNED16(aBlock);
+  ASSERT_MULTIPLE16(aSize);
+
+  for (unsigned i = 0; i < aSize; i += 16) {
+    vin0 = _mm_load_ps(&aBlock[i]);
+    vin1 = _mm_load_ps(&aBlock[i + 4]);
+    vin2 = _mm_load_ps(&aBlock[i + 8]);
+    vin3 = _mm_load_ps(&aBlock[i + 12]);
+    vgain0 = _mm_load_ps(&aScale[i]);
+    vgain1 = _mm_load_ps(&aScale[i + 4]);
+    vgain2 = _mm_load_ps(&aScale[i + 8]);
+    vgain3 = _mm_load_ps(&aScale[i + 12]);
+    vout0 = _mm_mul_ps(vin0, vgain0);
+    vout1 = _mm_mul_ps(vin1, vgain1);
+    vout2 = _mm_mul_ps(vin2, vgain2);
+    vout3 = _mm_mul_ps(vin3, vgain3);
+    _mm_store_ps(&aBlock[i], vout0);
+    _mm_store_ps(&aBlock[i + 4], vout1);
+    _mm_store_ps(&aBlock[i + 8], vout2);
+    _mm_store_ps(&aBlock[i + 12], vout3);
+  }
+}
+
+void AudioBlockPanStereoToStereo_SSE(const float aInputL[WEBAUDIO_BLOCK_SIZE],
+                                     const float aInputR[WEBAUDIO_BLOCK_SIZE],
+                                     float aGainL, float aGainR,
+                                     bool aIsOnTheLeft,
+                                     float aOutputL[WEBAUDIO_BLOCK_SIZE],
+                                     float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
+  __m128 vinl0, vinr0, vinl1, vinr1, vout0, vout1, vscaled0, vscaled1, vgainl,
+      vgainr;
+
+  ASSERT_ALIGNED16(aInputL);
+  ASSERT_ALIGNED16(aInputR);
+  ASSERT_ALIGNED16(aOutputL);
+  ASSERT_ALIGNED16(aOutputR);
+
+  vgainl = _mm_load1_ps(&aGainL);
+  vgainr = _mm_load1_ps(&aGainR);
+
+  if (aIsOnTheLeft) {
+    for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
+      vinl0 = _mm_load_ps(&aInputL[i]);
+      vinr0 = _mm_load_ps(&aInputR[i]);
+      vinl1 = _mm_load_ps(&aInputL[i + 4]);
+      vinr1 = _mm_load_ps(&aInputR[i + 4]);
+
+      /* left channel : aOutputL = aInputL + aInputR * gainL */
+      vscaled0 = _mm_mul_ps(vinr0, vgainl);
+      vscaled1 = _mm_mul_ps(vinr1, vgainl);
+      vout0 = _mm_add_ps(vscaled0, vinl0);
+      vout1 = _mm_add_ps(vscaled1, vinl1);
+      _mm_store_ps(&aOutputL[i], vout0);
+      _mm_store_ps(&aOutputL[i + 4], vout1);
+
+      /* right channel : aOutputR = aInputR * gainR */
+      vscaled0 = _mm_mul_ps(vinr0, vgainr);
+      vscaled1 = _mm_mul_ps(vinr1, vgainr);
+      _mm_store_ps(&aOutputR[i], vscaled0);
+      _mm_store_ps(&aOutputR[i + 4], vscaled1);
+    }
+  } else {
+    for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
+      vinl0 = _mm_load_ps(&aInputL[i]);
+      vinr0 = _mm_load_ps(&aInputR[i]);
+      vinl1 = _mm_load_ps(&aInputL[i + 4]);
+      vinr1 = _mm_load_ps(&aInputR[i + 4]);
+
+      /* left channel : aInputL * gainL */
+      vscaled0 = _mm_mul_ps(vinl0, vgainl);
+      vscaled1 = _mm_mul_ps(vinl1, vgainl);
+      _mm_store_ps(&aOutputL[i], vscaled0);
+      _mm_store_ps(&aOutputL[i + 4], vscaled1);
+
+      /* right channel: aOutputR = aInputR + aInputL * gainR */
+      vscaled0 = _mm_mul_ps(vinl0, vgainr);
+      vscaled1 = _mm_mul_ps(vinl1, vgainr);
+      vout0 = _mm_add_ps(vscaled0, vinr0);
+      vout1 = _mm_add_ps(vscaled1, vinr1);
+      _mm_store_ps(&aOutputR[i], vout0);
+      _mm_store_ps(&aOutputR[i + 4], vout1);
+    }
+  }
+}
+
+void BufferComplexMultiply_SSE(const float* aInput, const float* aScale,
+                               float* aOutput, uint32_t aSize) {
+  unsigned i;
+  __m128 in0, in1, in2, in3, outreal0, outreal1, outreal2, outreal3, outimag0,
+      outimag1, outimag2, outimag3;
+
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_ALIGNED16(aScale);
+  ASSERT_ALIGNED16(aOutput);
+  ASSERT_MULTIPLE16(aSize);
+
+  for (i = 0; i < aSize * 2; i += 16) {
+    in0 = _mm_load_ps(&aInput[i]);
+    in1 = _mm_load_ps(&aInput[i + 4]);
+    in2 = _mm_load_ps(&aInput[i + 8]);
+    in3 = _mm_load_ps(&aInput[i + 12]);
+
+    outreal0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
+    outimag0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
+    outreal2 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(2, 0, 2, 0));
+    outimag2 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 1, 3, 1));
+
+    in0 = _mm_load_ps(&aScale[i]);
+    in1 = _mm_load_ps(&aScale[i + 4]);
+    in2 = _mm_load_ps(&aScale[i + 8]);
+    in3 = _mm_load_ps(&aScale[i + 12]);
+
+    outreal1 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
+    outimag1 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
+    outreal3 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(2, 0, 2, 0));
+    outimag3 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 1, 3, 1));
+
+    in0 = _mm_sub_ps(_mm_mul_ps(outreal0, outreal1),
+                     _mm_mul_ps(outimag0, outimag1));
+    in1 = _mm_add_ps(_mm_mul_ps(outreal0, outimag1),
+                     _mm_mul_ps(outimag0, outreal1));
+    in2 = _mm_sub_ps(_mm_mul_ps(outreal2, outreal3),
+                     _mm_mul_ps(outimag2, outimag3));
+    in3 = _mm_add_ps(_mm_mul_ps(outreal2, outimag3),
+                     _mm_mul_ps(outimag2, outreal3));
+
+    outreal0 = _mm_unpacklo_ps(in0, in1);
+    outreal1 = _mm_unpackhi_ps(in0, in1);
+    outreal2 = _mm_unpacklo_ps(in2, in3);
+    outreal3 = _mm_unpackhi_ps(in2, in3);
+
+    _mm_store_ps(&aOutput[i], outreal0);
+    _mm_store_ps(&aOutput[i + 4], outreal1);
+    _mm_store_ps(&aOutput[i + 8], outreal2);
+    _mm_store_ps(&aOutput[i + 12], outreal3);
+  }
+}
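
BufferComplexMultiply_SSE operates on interleaved re,im pairs: the _mm_shuffle_ps calls de-interleave four complex values into a vector of reals and a vector of imaginaries, the mul/add/sub lines compute (a+bi)(c+di) = (ac-bd) + (ad+bc)i, and _mm_unpacklo_ps/_mm_unpackhi_ps re-interleave the products. The per-element math, as a scalar sketch:

    // Scalar model of one complex product from the SSE kernel above;
    // inputs and outputs are interleaved as {re, im}.
    void ComplexMultiplyScalar(const float in[2], const float scale[2],
                               float out[2]) {
      out[0] = in[0] * scale[0] - in[1] * scale[1];  // ac - bd
      out[1] = in[0] * scale[1] + in[1] * scale[0];  // ad + bc
    }
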
+
+float AudioBufferSumOfSquares_SSE(const float* aInput, uint32_t aLength) {
+  unsigned i;
+  __m128 in0, in1, in2, in3, acc0, acc1, acc2, acc3;
+  float out[4];
+
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_MULTIPLE16(aLength);
+
+  acc0 = _mm_setzero_ps();
+  acc1 = _mm_setzero_ps();
+  acc2 = _mm_setzero_ps();
+  acc3 = _mm_setzero_ps();
+
+  for (i = 0; i < aLength; i += 16) {
+    in0 = _mm_load_ps(&aInput[i]);
+    in1 = _mm_load_ps(&aInput[i + 4]);
+    in2 = _mm_load_ps(&aInput[i + 8]);
+    in3 = _mm_load_ps(&aInput[i + 12]);
+
+    in0 = _mm_mul_ps(in0, in0);
+    in1 = _mm_mul_ps(in1, in1);
+    in2 = _mm_mul_ps(in2, in2);
+    in3 = _mm_mul_ps(in3, in3);
+
+    acc0 = _mm_add_ps(acc0, in0);
+    acc1 = _mm_add_ps(acc1, in1);
+    acc2 = _mm_add_ps(acc2, in2);
+    acc3 = _mm_add_ps(acc3, in3);
+  }
+
+  acc0 = _mm_add_ps(acc0, acc1);
+  acc0 = _mm_add_ps(acc0, acc2);
+  acc0 = _mm_add_ps(acc0, acc3);
+
+  _mm_store_ps(out, acc0);
+
+  return out[0] + out[1] + out[2] + out[3];
+}
+
+void NaNToZeroInPlace_SSE(float* aSamples, size_t aCount) {
+  __m128 vin0, vin1, vin2, vin3;
+  __m128 vmask0, vmask1, vmask2, vmask3;
+  __m128 vout0, vout1, vout2, vout3;
+
+  float* samplesAligned16 = ALIGNED16(aSamples);
+  size_t leadingElementsScalar =
+      std::min(static_cast<size_t>(samplesAligned16 - aSamples), aCount);
+  size_t remainingElements = aCount - leadingElementsScalar;
+  size_t vectoredEnd = aCount - remainingElements % 16;
+
+  MOZ_ASSERT(!((vectoredEnd - leadingElementsScalar) % 16));
+
+  size_t i = 0;
+  for (; i < leadingElementsScalar; i++) {
+    if (aSamples[i] != aSamples[i]) {
+      aSamples[i] = 0.0;
+    }
+  }
+
+  ASSERT_ALIGNED16(&aSamples[i]);
+
+  for (; i < vectoredEnd; i += 16) {
+    vin0 = _mm_load_ps(&aSamples[i + 0]);
+    vin1 = _mm_load_ps(&aSamples[i + 4]);
+    vin2 = _mm_load_ps(&aSamples[i + 8]);
+    vin3 = _mm_load_ps(&aSamples[i + 12]);
+
+    vmask0 = _mm_cmpord_ps(vin0, vin0);
+    vmask1 = _mm_cmpord_ps(vin1, vin1);
+    vmask2 = _mm_cmpord_ps(vin2, vin2);
+    vmask3 = _mm_cmpord_ps(vin3, vin3);
+
+    vout0 = _mm_and_ps(vin0, vmask0);
+    vout1 = _mm_and_ps(vin1, vmask1);
+    vout2 = _mm_and_ps(vin2, vmask2);
+    vout3 = _mm_and_ps(vin3, vmask3);
+
+    _mm_store_ps(&aSamples[i + 0], vout0);
+    _mm_store_ps(&aSamples[i + 4], vout1);
+    _mm_store_ps(&aSamples[i + 8], vout2);
+    _mm_store_ps(&aSamples[i + 12], vout3);
+  }
+  for (; i < aCount; i++) {
+    if (aSamples[i] != aSamples[i]) {
+      aSamples[i] = 0.0;
+    }
+  }
+}
+
 }  // namespace mozilla
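
NaNToZeroInPlace_SSE leans on NaN being unordered with itself: _mm_cmpord_ps(v, v) yields an all-ones lane exactly when the lane is not NaN, so ANDing the input with that mask passes finite values through and flushes NaN lanes to +0.0f; the surrounding scalar loops use the equivalent aSamples[i] != aSamples[i] test. A scalar model of one lane, as a sketch:

    #include <cstdint>
    #include <cstring>

    // Scalar model of one SSE lane of the NaN flush above.
    float FlushNaNLane(float v) {
      // _mm_cmpord_ps(v, v): all-ones iff v is ordered (not NaN).
      uint32_t mask = (v == v) ? 0xFFFFFFFFu : 0u;
      uint32_t bits;
      std::memcpy(&bits, &v, sizeof bits);
      bits &= mask;  // _mm_and_ps: NaN lanes become 0x00000000 == +0.0f
      std::memcpy(&v, &bits, sizeof v);
      return v;
    }
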
@@ -0,0 +1,35 @@
+/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* this source code form is subject to the terms of the mozilla public
+ * license, v. 2.0. if a copy of the mpl was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "AudioNodeEngine.h"
+
+namespace mozilla {
+void AudioBufferAddWithScale_SSE(const float* aInput, float aScale,
+                                 float* aOutput, uint32_t aSize);
+
+void AudioBlockCopyChannelWithScale_SSE(const float* aInput, float aScale,
+                                        float* aOutput);
+
+void AudioBlockCopyChannelWithScale_SSE(const float aInput[WEBAUDIO_BLOCK_SIZE],
+                                        const float aScale[WEBAUDIO_BLOCK_SIZE],
+                                        float aOutput[WEBAUDIO_BLOCK_SIZE]);
+
+void AudioBufferInPlaceScale_SSE(float* aBlock, float aScale, uint32_t aSize);
+void AudioBufferInPlaceScale_SSE(float* aBlock, float* aScale, uint32_t aSize);
+
+void AudioBlockPanStereoToStereo_SSE(const float aInputL[WEBAUDIO_BLOCK_SIZE],
+                                     const float aInputR[WEBAUDIO_BLOCK_SIZE],
+                                     float aGainL, float aGainR,
+                                     bool aIsOnTheLeft,
+                                     float aOutputL[WEBAUDIO_BLOCK_SIZE],
+                                     float aOutputR[WEBAUDIO_BLOCK_SIZE]);
+
+float AudioBufferSumOfSquares_SSE(const float* aInput, uint32_t aLength);
+
+void BufferComplexMultiply_SSE(const float* aInput, const float* aScale,
+                               float* aOutput, uint32_t aSize);
+
+void NaNToZeroInPlace_SSE(float* aSamples, size_t aCount);
+}  // namespace mozilla
@ -1,10 +0,0 @@
|
|||
/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* this source code form is subject to the terms of the mozilla public
|
||||
* license, v. 2.0. if a copy of the mpl was not distributed with this file,
|
||||
* You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#include "AudioNodeEngineGeneric.h"
|
||||
|
||||
namespace mozilla {
|
||||
template struct Engine<xsimd::fma3<xsimd::sse4_2>>;
|
||||
} // namespace mozilla
|
|
@@ -127,23 +127,16 @@ UNIFIED_SOURCES += [
 
 if CONFIG["CPU_ARCH"] == "aarch64" or CONFIG["BUILD_ARM_NEON"]:
     DEFINES["USE_NEON"] = True
-    LOCAL_INCLUDES += ["/third_party/xsimd/include"]
     SOURCES += ["AudioNodeEngineNEON.cpp"]
     SOURCES["AudioNodeEngineNEON.cpp"].flags += CONFIG["NEON_FLAGS"]
     if CONFIG["BUILD_ARM_NEON"]:
         LOCAL_INCLUDES += ["/media/openmax_dl/dl/api/"]
 
-# Are we targeting x86 or x64? If so, build SSEX files.
+# Are we targeting x86 or x64? If so, build SSE2 files.
 if CONFIG["INTEL_ARCHITECTURE"]:
-    SOURCES += ["AudioNodeEngineSSE2.cpp", "AudioNodeEngineSSE4_2_FMA3.cpp"]
+    SOURCES += ["AudioNodeEngineSSE2.cpp"]
     DEFINES["USE_SSE2"] = True
-    DEFINES["USE_SSE4_2"] = True
-    DEFINES["USE_FMA3"] = True
-    LOCAL_INCLUDES += ["/third_party/xsimd/include"]
     SOURCES["AudioNodeEngineSSE2.cpp"].flags += CONFIG["SSE2_FLAGS"]
-    SOURCES["AudioNodeEngineSSE4_2_FMA3.cpp"].flags += (
-        CONFIG["SSE4_2_FLAGS"] + CONFIG["FMA_FLAGS"]
-    )
 
 include("/ipc/chromium/chromium-config.mozbuild")
 
@@ -147,10 +147,6 @@ bool sse4_1_enabled = has_cpuid_bits(1u, ecx, (1u << 19));
 bool sse4_2_enabled = has_cpuid_bits(1u, ecx, (1u << 20));
 #  endif
 
-#  if !defined(MOZILLA_PRESUME_FMA3)
-bool fma3_enabled = has_cpuid_bits(1u, ecx, (1u << 12));
-#  endif
-
 #  if !defined(MOZILLA_PRESUME_AVX) || !defined(MOZILLA_PRESUME_AVX2)
 static bool has_avx() {
 #  if defined(MOZILLA_PRESUME_AVX)
@@ -215,9 +215,6 @@ extern bool MFBT_DATA sse4_1_enabled;
 #  if !defined(MOZILLA_PRESUME_SSE4_2)
 extern bool MFBT_DATA sse4_2_enabled;
 #  endif
-#  if !defined(MOZILLA_PRESUME_FMA3)
-extern bool MFBT_DATA fma3_enabled;
-#  endif
 #  if !defined(MOZILLA_PRESUME_AVX)
 extern bool MFBT_DATA avx_enabled;
 #  endif
@@ -320,16 +317,6 @@ inline bool supports_sse4_2() { return sse_private::sse4_2_enabled; }
 inline bool supports_sse4_2() { return false; }
 #endif
 
-#if defined(MOZILLA_PRESUME_FMA3)
-#  define MOZILLA_MAY_SUPPORT_FMA3 1
-inline bool supports_fma3() { return true; }
-#elif defined(MOZILLA_SSE_HAVE_CPUID_DETECTION)
-#  define MOZILLA_MAY_SUPPORT_FMA3 1
-inline bool supports_fma3() { return sse_private::fma3_enabled; }
-#else
-inline bool supports_fma3() { return false; }
-#endif
-
 #if defined(MOZILLA_PRESUME_AVX)
 #  define MOZILLA_MAY_SUPPORT_AVX 1
 inline bool supports_avx() { return true; }
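
The detection code removed above reads the FMA3 flag from CPUID leaf 1: ECX bit 12, alongside the SSE4.1/4.2 bits 19 and 20 kept in the first hunk. A standalone sketch of the same check using GCC/Clang's <cpuid.h> (x86-only):

    #include <cpuid.h>

    // Leaf 1, ECX bit 12 is the FMA3 feature flag; bits 19/20 are
    // SSE4.1/SSE4.2, as in the retained has_cpuid_bits() calls.
    bool CpuHasFma3() {
      unsigned eax, ebx, ecx, edx;
      if (!__get_cpuid(1u, &eax, &ebx, &ecx, &edx)) {
        return false;  // CPUID leaf 1 not supported
      }
      return (ecx & (1u << 12)) != 0;
    }
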
@ -1,29 +0,0 @@
|
|||
Copyright (c) 2016, Johan Mabille, Sylvain Corlay, Wolf Vollprecht and Martin Renou
|
||||
Copyright (c) 2016, QuantStack
|
||||
Copyright (c) 2018, Serge Guelton
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@@ -1,152 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_ARITHMETIC_HPP
#define XSIMD_GENERIC_ARITHMETIC_HPP

#include <complex>
#include <type_traits>

#include "./xsimd_generic_details.hpp"

namespace xsimd
{

    namespace kernel
    {

        using namespace types;

        // bitwise_lshift
        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept
                                 { return x << y; },
                                 self, other);
        }

        // bitwise_rshift
        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept
                                 { return x >> y; },
                                 self, other);
        }

        // div
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> div(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept -> T
                                 { return x / y; },
                                 self, other);
        }

        // fma
        template <class A, class T>
        inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
        {
            return x * y + z;
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> fma(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            auto res_r = fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real()));
            auto res_i = fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag()));
            return { res_r, res_i };
        }

        // fms
        template <class A, class T>
        inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
        {
            return x * y - z;
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> fms(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            auto res_r = fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real()));
            auto res_i = fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag()));
            return { res_r, res_i };
        }

        // fnma
        template <class A, class T>
        inline batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
        {
            return -x * y + z;
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> fnma(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            auto res_r = -fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real()));
            auto res_i = -fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag()));
            return { res_r, res_i };
        }

        // fnms
        template <class A, class T>
        inline batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
        {
            return -x * y - z;
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> fnms(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            auto res_r = -fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real()));
            auto res_i = -fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag()));
            return { res_r, res_i };
        }

        // mul
        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept -> T
                                 { return x * y; },
                                 self, other);
        }

        // sadd
        template <class A>
        inline batch<float, A> sadd(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
        {
            return add(self, other); // no saturated arithmetic on floating point numbers
        }
        template <class A>
        inline batch<double, A> sadd(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
        {
            return add(self, other); // no saturated arithmetic on floating point numbers
        }

        // ssub
        template <class A>
        inline batch<float, A> ssub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
        {
            return sub(self, other); // no saturated arithmetic on floating point numbers
        }
        template <class A>
        inline batch<double, A> ssub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
        {
            return sub(self, other); // no saturated arithmetic on floating point numbers
        }

    }

}

#endif
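Note on the complex fma kernel removed above: rather than forming the complex product separately, its real part nests two fms calls and its imaginary part two fma calls. A minimal scalar sketch of that decomposition (not from the removed header; the _s helpers are illustrative stand-ins for the batch fma/fms):

#include <cmath>
#include <complex>
#include <iostream>

// Scalar stand-ins for the batch fma/fms used by the kernel.
static double fma_s(double x, double y, double z) { return std::fma(x, y, z); }
static double fms_s(double x, double y, double z) { return std::fma(x, y, -z); }

int main()
{
    std::complex<double> x(1.5, -2.0), y(0.25, 3.0), z(-1.0, 0.5);
    // Same decomposition as the generic complex fma kernel:
    // real: x.r*y.r - (x.i*y.i - z.r), imag: x.r*y.i + (x.i*y.r + z.i)
    double res_r = fms_s(x.real(), y.real(), fms_s(x.imag(), y.imag(), z.real()));
    double res_i = fma_s(x.real(), y.imag(), fma_s(x.imag(), y.real(), z.imag()));
    std::complex<double> expected = x * y + z;
    // Both differences print as (approximately) zero.
    std::cout << res_r - expected.real() << ' ' << res_i - expected.imag() << '\n';
}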
@@ -1,96 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_COMPLEX_HPP
#define XSIMD_GENERIC_COMPLEX_HPP

#include <complex>

#include "./xsimd_generic_details.hpp"

namespace xsimd
{

    namespace kernel
    {

        using namespace types;

        // real
        template <class A, class T>
        inline batch<T, A> real(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return self;
        }

        template <class A, class T>
        inline batch<T, A> real(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
        {
            return self.real();
        }

        // imag
        template <class A, class T>
        inline batch<T, A> imag(batch<T, A> const& /*self*/, requires_arch<generic>) noexcept
        {
            return batch<T, A>(T(0));
        }

        template <class A, class T>
        inline batch<T, A> imag(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
        {
            return self.imag();
        }

        // arg
        template <class A, class T>
        inline real_batch_type_t<batch<T, A>> arg(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return atan2(imag(self), real(self));
        }

        // conj
        template <class A, class T>
        inline complex_batch_type_t<batch<T, A>> conj(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return { real(self), -imag(self) };
        }

        // norm
        template <class A, class T>
        inline real_batch_type_t<batch<T, A>> norm(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return { fma(real(self), real(self), imag(self) * imag(self)) };
        }

        // proj
        template <class A, class T>
        inline complex_batch_type_t<batch<T, A>> proj(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = complex_batch_type_t<batch<T, A>>;
            using real_batch = typename batch_type::real_batch;
            using real_value_type = typename real_batch::value_type;
            auto cond = xsimd::isinf(real(self)) || xsimd::isinf(imag(self));
            return select(cond,
                          batch_type(constants::infinity<real_batch>(),
                                     copysign(real_batch(real_value_type(0)), imag(self))),
                          batch_type(self));
        }

        template <class A, class T>
        inline batch_bool<T, A> isnan(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
        {
            return batch_bool<T, A>(isnan(self.real()) || isnan(self.imag()));
        }
    }
}

#endif
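The proj kernel above mirrors std::proj lane by lane: any operand with an infinite component collapses to positive infinity, with a zero imaginary part carrying the sign of imag(self). A scalar sketch under that reading (proj_s is a hypothetical helper, not part of the header):

#include <cmath>
#include <complex>
#include <iostream>
#include <limits>

// Scalar rendition of the generic proj kernel: any complex number with an
// infinite component maps to (+inf, copysign(0, imag)).
static std::complex<double> proj_s(std::complex<double> z)
{
    if (std::isinf(z.real()) || std::isinf(z.imag()))
        return { std::numeric_limits<double>::infinity(),
                 std::copysign(0.0, z.imag()) };
    return z;
}

int main()
{
    std::complex<double> z(-std::numeric_limits<double>::infinity(), -2.0);
    std::cout << proj_s(z).real() << ' ' << proj_s(z).imag() << '\n'; // inf -0
    // Matches the standard library's definition:
    std::cout << std::proj(z).real() << ' ' << std::proj(z).imag() << '\n';
}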
@@ -1,239 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_DETAILS_HPP
#define XSIMD_GENERIC_DETAILS_HPP

#include <complex>

#include "../../math/xsimd_rem_pio2.hpp"
#include "../../types/xsimd_generic_arch.hpp"
#include "../../types/xsimd_utils.hpp"
#include "../xsimd_constants.hpp"

namespace xsimd
{
    // Forward declarations. Should we put them in a separate file?
    template <class T, class A>
    inline batch<T, A> abs(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> abs(batch<std::complex<T>, A> const& self) noexcept;
    template <class T, class A>
    inline bool any(batch_bool<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other) noexcept;
    template <class A, class T_out, class T_in>
    inline batch<T_out, A> batch_cast(batch<T_in, A> const&, batch<T_out, A> const& out) noexcept;
    template <class T, class A>
    inline batch<T, A> bitofsign(batch<T, A> const& self) noexcept;
    template <class B, class T, class A>
    inline B bitwise_cast(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> cos(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> cosh(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> exp(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
    template <class T, class A>
    inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
    template <class T, class A>
    inline batch<T, A> frexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
    template <class T, class A, uint64_t... Coefs>
    inline batch<T, A> horner(const batch<T, A>& self) noexcept;
    template <class T, class A>
    inline batch<T, A> hypot(const batch<T, A>& self) noexcept;
    template <class T, class A>
    inline batch_bool<T, A> is_even(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch_bool<T, A> is_flint(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch_bool<T, A> is_odd(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch_bool<T, A> isinf(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline typename batch<T, A>::batch_bool_type isnan(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
    template <class T, class A>
    inline batch<T, A> log(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> nearbyint(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<as_integer_t<T>, A> nearbyint_as_int(const batch<T, A>& x) noexcept;
    template <class T, class A>
    inline T reduce_add(batch<T, A> const&) noexcept;
    template <class T, class A>
    inline batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
    template <class T, class A>
    inline batch<std::complex<T>, A> select(batch_bool<T, A> const&, batch<std::complex<T>, A> const&, batch<std::complex<T>, A> const&) noexcept;
    template <class T, class A>
    inline batch<T, A> sign(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> signnz(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> sin(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> sinh(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> sqrt(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> tan(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<as_float_t<T>, A> to_float(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<as_integer_t<T>, A> to_int(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> trunc(batch<T, A> const& self) noexcept;

    namespace kernel
    {

        namespace detail
        {
            template <class F, class A, class T, class... Batches>
            inline batch<T, A> apply(F&& func, batch<T, A> const& self, batch<T, A> const& other) noexcept
            {
                constexpr std::size_t size = batch<T, A>::size;
                alignas(A::alignment()) T self_buffer[size];
                alignas(A::alignment()) T other_buffer[size];
                self.store_aligned(&self_buffer[0]);
                other.store_aligned(&other_buffer[0]);
                for (std::size_t i = 0; i < size; ++i)
                {
                    self_buffer[i] = func(self_buffer[i], other_buffer[i]);
                }
                return batch<T, A>::load_aligned(self_buffer);
            }

            template <class U, class F, class A, class T>
            inline batch<U, A> apply_transform(F&& func, batch<T, A> const& self) noexcept
            {
                static_assert(batch<T, A>::size == batch<U, A>::size,
                              "Source and destination sizes must match");
                constexpr std::size_t src_size = batch<T, A>::size;
                constexpr std::size_t dest_size = batch<U, A>::size;
                alignas(A::alignment()) T self_buffer[src_size];
                alignas(A::alignment()) U other_buffer[dest_size];
                self.store_aligned(&self_buffer[0]);
                for (std::size_t i = 0; i < src_size; ++i)
                {
                    other_buffer[i] = func(self_buffer[i]);
                }
                return batch<U, A>::load_aligned(other_buffer);
            }
        }

        namespace detail
        {
            // Generic conversion handling machinery. Each architecture must define
            // a conversion function when such a conversion exists in the form of an
            // intrinsic. We then use that information to automatically decide whether
            // to use scalar or vector conversion when doing load / store / batch_cast.
            struct with_fast_conversion
            {
            };
            struct with_slow_conversion
            {
            };

            template <class A, class From, class To, class = void>
            struct conversion_type_impl
            {
                using type = with_slow_conversion;
            };

            using xsimd::detail::void_t;

            template <class A, class From, class To>
            struct conversion_type_impl<A, From, To,
                                        void_t<decltype(fast_cast(std::declval<const batch<From, A>&>(),
                                                                  std::declval<const batch<To, A>&>(),
                                                                  std::declval<const A&>()))>>
            {
                using type = with_fast_conversion;
            };

            template <class A, class From, class To>
            using conversion_type = typename conversion_type_impl<A, From, To>::type;
        }

        namespace detail
        {
            /* origin: boost/simdfunction/horn.hpp*/
            /*
             * ====================================================
             * copyright 2016 NumScale SAS
             *
             * Distributed under the Boost Software License, Version 1.0.
             * (See copy at http://boost.org/LICENSE_1_0.txt)
             * ====================================================
             */
            template <class B, uint64_t c>
            inline B coef() noexcept
            {
                using value_type = typename B::value_type;
                return B(bit_cast<value_type>(as_unsigned_integer_t<value_type>(c)));
            }
            template <class B>
            inline B horner(const B&) noexcept
            {
                return B(typename B::value_type(0.));
            }

            template <class B, uint64_t c0>
            inline B horner(const B&) noexcept
            {
                return coef<B, c0>();
            }

            template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
            inline B horner(const B& self) noexcept
            {
                return fma(self, horner<B, c1, args...>(self), coef<B, c0>());
            }

            /* origin: boost/simdfunction/horn1.hpp*/
            /*
             * ====================================================
             * copyright 2016 NumScale SAS
             *
             * Distributed under the Boost Software License, Version 1.0.
             * (See copy at http://boost.org/LICENSE_1_0.txt)
             * ====================================================
             */
            template <class B>
            inline B horner1(const B&) noexcept
            {
                return B(1.);
            }

            template <class B, uint64_t c0>
            inline B horner1(const B& x) noexcept
            {
                return x + detail::coef<B, c0>();
            }

            template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
            inline B horner1(const B& x) noexcept
            {
                return fma(x, horner1<B, c1, args...>(x), detail::coef<B, c0>());
            }
        }

    }

}

#endif
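The coef/horner helpers above carry polynomial coefficients as raw IEEE-754 bit patterns in the template parameter list and unroll into a fused-multiply-add Horner chain. A scalar sketch of the same idea (coef_s and horner3_s are illustrative names, and std::memcpy stands in for bit_cast):

#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>

// Decode a coefficient carried as a raw IEEE-754 bit pattern.
static double coef_s(uint64_t bits)
{
    double d;
    std::memcpy(&d, &bits, sizeof d);
    return d;
}

// horner<c0, c1, c2>(x) unrolls to fma(x, fma(x, c2, c1), c0),
// i.e. c0 + c1*x + c2*x^2 evaluated in Horner form.
static double horner3_s(double x, uint64_t c0, uint64_t c1, uint64_t c2)
{
    return std::fma(x, std::fma(x, coef_s(c2), coef_s(c1)), coef_s(c0));
}

int main()
{
    // 1.0, 2.0 and 3.0 as bit patterns: p(x) = 1 + 2x + 3x^2.
    double p = horner3_s(2.0, 0x3ff0000000000000ull,
                         0x4000000000000000ull, 0x4008000000000000ull);
    std::cout << p << '\n'; // 17
}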
@@ -1,163 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_LOGICAL_HPP
#define XSIMD_GENERIC_LOGICAL_HPP

#include "./xsimd_generic_details.hpp"

namespace xsimd
{

    namespace kernel
    {

        using namespace types;

        // from mask
        template <class A, class T>
        inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
            // This is inefficient but should never be called. It's just a
            // temporary implementation until arm support is added.
            for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
                buffer[i] = mask & (1ull << i);
            return batch_bool<T, A>::load_aligned(buffer);
        }

        // ge
        template <class A, class T>
        inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return other <= self;
        }

        // gt
        template <class A, class T>
        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return other < self;
        }

        // is_even
        template <class A, class T>
        inline batch_bool<T, A> is_even(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return is_flint(self * T(0.5));
        }

        // is_flint
        template <class A, class T>
        inline batch_bool<T, A> is_flint(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            auto frac = select(isnan(self - self), constants::nan<batch<T, A>>(), self - trunc(self));
            return frac == T(0.);
        }

        // is_odd
        template <class A, class T>
        inline batch_bool<T, A> is_odd(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return is_even(self - T(1.));
        }

        // isinf
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> isinf(batch<T, A> const&, requires_arch<generic>) noexcept
        {
            return batch_bool<T, A>(false);
        }
        template <class A>
        inline batch_bool<float, A> isinf(batch<float, A> const& self, requires_arch<generic>) noexcept
        {
            return abs(self) == std::numeric_limits<float>::infinity();
        }
        template <class A>
        inline batch_bool<double, A> isinf(batch<double, A> const& self, requires_arch<generic>) noexcept
        {
            return abs(self) == std::numeric_limits<double>::infinity();
        }

        // isfinite
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> isfinite(batch<T, A> const&, requires_arch<generic>) noexcept
        {
            return batch_bool<T, A>(true);
        }
        template <class A>
        inline batch_bool<float, A> isfinite(batch<float, A> const& self, requires_arch<generic>) noexcept
        {
            return (self - self) == 0.f;
        }
        template <class A>
        inline batch_bool<double, A> isfinite(batch<double, A> const& self, requires_arch<generic>) noexcept
        {
            return (self - self) == 0.;
        }

        // isnan
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> isnan(batch<T, A> const&, requires_arch<generic>) noexcept
        {
            return batch_bool<T, A>(false);
        }

        // le
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return (self < other) || (self == other);
        }

        // neq
        template <class A, class T>
        inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return !(other == self);
        }

        // logical_and
        template <class A, class T>
        inline batch<T, A> logical_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept
                                 { return x && y; },
                                 self, other);
        }

        // logical_or
        template <class A, class T>
        inline batch<T, A> logical_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept
                                 { return x || y; },
                                 self, other);
        }

        // mask
        template <class A, class T>
        inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
            self.store_aligned(buffer);
            // This is inefficient but should never be called. It's just a
            // temporary implementation until arm support is added.
            uint64_t res = 0;
            for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
                if (buffer[i])
                    res |= 1ull << i;
            return res;
        }
    }
}

#endif
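The mask/from_mask pair above fixes the generic bool-batch to integer-mask mapping: lane i corresponds to bit i. (The shift constant is written 1ull here; the removed file used 1ul, which is only 32 bits wide on some platforms.) A scalar sketch of that round trip for a hypothetical 4-lane batch, using plain arrays instead of the xsimd types:

#include <array>
#include <cstdint>
#include <iostream>

constexpr std::size_t lanes = 4;

// Lane i of the bool "batch" maps to bit i of the mask, as in mask() above.
static uint64_t to_mask(const std::array<bool, lanes>& b)
{
    uint64_t res = 0;
    for (std::size_t i = 0; i < lanes; ++i)
        if (b[i])
            res |= 1ull << i;
    return res;
}

// Inverse mapping, as in from_mask() above.
static std::array<bool, lanes> from_mask(uint64_t mask)
{
    std::array<bool, lanes> b {};
    for (std::size_t i = 0; i < lanes; ++i)
        b[i] = mask & (1ull << i);
    return b;
}

int main()
{
    std::array<bool, lanes> b { true, false, true, true };
    uint64_t m = to_mask(b);                  // 0b1101
    std::cout << m << '\n';                   // 13
    std::cout << (from_mask(m) == b) << '\n'; // round-trips: prints 1
}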
File diff not shown because it is too large
@@ -1,397 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_MEMORY_HPP
#define XSIMD_GENERIC_MEMORY_HPP

#include <algorithm>
#include <cassert>
#include <complex>
#include <stdexcept>

#include "../../types/xsimd_batch_constant.hpp"
#include "./xsimd_generic_details.hpp"

namespace xsimd
{
    template <class batch_type, typename batch_type::value_type... Values>
    struct batch_constant;

    namespace kernel
    {

        using namespace types;

        // extract_pair
        template <class A, class T>
        inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<generic>) noexcept
        {
            constexpr std::size_t size = batch<T, A>::size;
            assert(i < size && "index in bounds");

            alignas(A::alignment()) T self_buffer[size];
            self.store_aligned(self_buffer);

            alignas(A::alignment()) T other_buffer[size];
            other.store_aligned(other_buffer);

            alignas(A::alignment()) T concat_buffer[size];

            for (std::size_t j = 0; j < (size - i); ++j)
            {
                concat_buffer[j] = other_buffer[i + j];
                if (j < i)
                {
                    concat_buffer[size - 1 - j] = self_buffer[i - 1 - j];
                }
            }
            return batch<T, A>::load_aligned(concat_buffer);
        }

        // gather
        namespace detail
        {
            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
            inline batch<T, A> gather(U const* src, batch<V, A> const& index,
                                      ::xsimd::index<N> I) noexcept
            {
                return insert(batch<T, A> {}, static_cast<T>(src[index.get(I)]), I);
            }

            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
            inline batch<T, A>
            gather(U const* src, batch<V, A> const& index, ::xsimd::index<N> I) noexcept
            {
                static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");

                const auto test = gather<N - 1, T, A>(src, index, {});
                return insert(test, static_cast<T>(src[index.get(I)]), I);
            }
        } // namespace detail

        template <typename T, typename A, typename V>
        inline batch<T, A>
        gather(batch<T, A> const&, T const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
        }

        // Gather with runtime indexes and mismatched strides.
        template <typename T, typename A, typename U, typename V>
        inline detail::sizes_mismatch_t<T, U, batch<T, A>>
        gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
        }

        // Gather with runtime indexes and matching strides.
        template <typename T, typename A, typename U, typename V>
        inline detail::stride_match_t<T, U, batch<T, A>>
        gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return batch_cast<T>(kernel::gather(batch<U, A> {}, src, index, A {}));
        }

        // insert
        template <class A, class T, size_t I>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept
        {
            struct index_mask
            {
                static constexpr bool get(size_t index, size_t /* size*/)
                {
                    return index != I;
                }
            };
            batch<T, A> tmp(val);
            return select(make_batch_bool_constant<batch<T, A>, index_mask>(), self, tmp);
        }

        // get
        template <class A, size_t I, class T>
        inline T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, size_t I, class T>
        inline T get(batch_bool<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch_bool<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, size_t I, class T>
        inline auto get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
        {
            alignas(A::alignment()) T buffer[batch<std::complex<T>, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, class T>
        inline T get(batch<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        template <class A, class T>
        inline T get(batch_bool<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        template <class A, class T>
        inline auto get(batch<std::complex<T>, A> const& self, std::size_t i, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
        {
            using T2 = typename batch<std::complex<T>, A>::value_type;
            alignas(A::alignment()) T2 buffer[batch<std::complex<T>, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        // load_aligned
        namespace detail
        {
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
            {
                using batch_type_in = batch<T_in, A>;
                using batch_type_out = batch<T_out, A>;
                return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A {});
            }
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_slow_conversion) noexcept
            {
                static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
                using batch_type_out = batch<T_out, A>;
                alignas(A::alignment()) T_out buffer[batch_type_out::size];
                std::copy(mem, mem + batch_type_out::size, std::begin(buffer));
                return batch_type_out::load_aligned(buffer);
            }
        }
        template <class A, class T_in, class T_out>
        inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
        {
            return detail::load_aligned<A>(mem, cvt, A {}, detail::conversion_type<A, T_in, T_out> {});
        }

        // load_unaligned
        namespace detail
        {
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
            {
                using batch_type_in = batch<T_in, A>;
                using batch_type_out = batch<T_out, A>;
                return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A {});
            }

            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>, with_slow_conversion) noexcept
            {
                static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
                return load_aligned<A>(mem, cvt, generic {}, with_slow_conversion {});
            }
        }
        template <class A, class T_in, class T_out>
        inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
        {
            return detail::load_unaligned<A>(mem, cvt, generic {}, detail::conversion_type<A, T_in, T_out> {});
        }

        namespace detail
        {
            // Scatter with runtime indexes.
            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
            inline void scatter(batch<T, A> const& src, U* dst,
                                batch<V, A> const& index,
                                ::xsimd::index<N> I) noexcept
            {
                dst[index.get(I)] = static_cast<U>(src.get(I));
            }

            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
            inline void
            scatter(batch<T, A> const& src, U* dst, batch<V, A> const& index,
                    ::xsimd::index<N> I) noexcept
            {
                static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");

                kernel::detail::scatter<N - 1, T, A, U, V>(
                    src, dst, index, {});
                dst[index.get(I)] = static_cast<U>(src.get(I));
            }
        } // namespace detail

        template <typename A, typename T, typename V>
        inline void
        scatter(batch<T, A> const& src, T* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            kernel::detail::scatter<batch<V, A>::size - 1, T, A, T, V>(
                src, dst, index, {});
        }

        template <typename A, typename T, typename U, typename V>
        inline detail::sizes_mismatch_t<T, U, void>
        scatter(batch<T, A> const& src, U* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            kernel::detail::scatter<batch<V, A>::size - 1, T, A, U, V>(
                src, dst, index, {});
        }

        template <typename A, typename T, typename U, typename V>
        inline detail::stride_match_t<T, U, void>
        scatter(batch<T, A> const& src, U* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            const auto tmp = batch_cast<U>(src);
            kernel::scatter<A>(tmp, dst, index, A {});
        }

        // store
        template <class T, class A>
        inline void store(batch_bool<T, A> const& self, bool* mem, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            constexpr auto size = batch_bool<T, A>::size;
            alignas(A::alignment()) T buffer[size];
            kernel::store_aligned<A>(&buffer[0], batch_type(self), A {});
            for (std::size_t i = 0; i < size; ++i)
                mem[i] = bool(buffer[i]);
        }

        // store_aligned
        template <class A, class T_in, class T_out>
        inline void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
        {
            static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
            alignas(A::alignment()) T_in buffer[batch<T_in, A>::size];
            store_aligned(&buffer[0], self);
            std::copy(std::begin(buffer), std::end(buffer), mem);
        }

        // store_unaligned
        template <class A, class T_in, class T_out>
        inline void store_unaligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
        {
            static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
            return store_aligned<A>(mem, self, generic {});
        }

        // swizzle
        template <class A, class T, class ITy, ITy... Vs>
        inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<batch<ITy, A>, Vs...> mask, requires_arch<generic>) noexcept
        {
            return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
        }

        namespace detail
        {
            template <class A, class T>
            inline batch<std::complex<T>, A> load_complex(batch<T, A> const& /*hi*/, batch<T, A> const& /*lo*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "load_complex not implemented for the required architecture");
            }

            template <class A, class T>
            inline batch<T, A> complex_high(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "complex_high not implemented for the required architecture");
            }

            template <class A, class T>
            inline batch<T, A> complex_low(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "complex_low not implemented for the required architecture");
            }
        }

        // load_complex_aligned
        template <class A, class T_out, class T_in>
        inline batch<std::complex<T_out>, A> load_complex_aligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_out, A>;
            T_in const* buffer = reinterpret_cast<T_in const*>(mem);
            real_batch hi = real_batch::load_aligned(buffer),
                       lo = real_batch::load_aligned(buffer + real_batch::size);
            return detail::load_complex(hi, lo, A {});
        }

        // load_complex_unaligned
        template <class A, class T_out, class T_in>
        inline batch<std::complex<T_out>, A> load_complex_unaligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_out, A>;
            T_in const* buffer = reinterpret_cast<T_in const*>(mem);
            real_batch hi = real_batch::load_unaligned(buffer),
                       lo = real_batch::load_unaligned(buffer + real_batch::size);
            return detail::load_complex(hi, lo, A {});
        }

        // store_complex_aligned
        template <class A, class T_out, class T_in>
        inline void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_in, A>;
            real_batch hi = detail::complex_high(src, A {});
            real_batch lo = detail::complex_low(src, A {});
            T_out* buffer = reinterpret_cast<T_out*>(dst);
            lo.store_aligned(buffer);
            hi.store_aligned(buffer + real_batch::size);
        }

        // store_complex_unaligned
        template <class A, class T_out, class T_in>
        inline void store_complex_unaligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_in, A>;
            real_batch hi = detail::complex_high(src, A {});
            real_batch lo = detail::complex_low(src, A {});
            T_out* buffer = reinterpret_cast<T_out*>(dst);
            lo.store_unaligned(buffer);
            hi.store_unaligned(buffer + real_batch::size);
        }

    }

}

#endif
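The gather and scatter kernels above recurse over lanes, but their net effect is the usual indexed copy: gather reads dst[i] = src[index[i]] and scatter writes dst[index[i]] = src[i]. A scalar sketch of those semantics (plain arrays stand in for batches):

#include <array>
#include <cstddef>
#include <iostream>

constexpr std::size_t lanes = 4;

int main()
{
    std::array<double, 8> table { 10, 11, 12, 13, 14, 15, 16, 17 };
    std::array<std::size_t, lanes> index { 7, 2, 5, 0 };

    std::array<double, lanes> gathered {};
    for (std::size_t i = 0; i < lanes; ++i) // gather: read through the index
        gathered[i] = table[index[i]];

    std::array<double, 8> out {};
    for (std::size_t i = 0; i < lanes; ++i) // scatter: write through the index
        out[index[i]] = gathered[i];

    for (double v : gathered)
        std::cout << v << ' ';              // 17 12 15 10
    std::cout << '\n';
}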
@@ -1,72 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_ROUNDING_HPP
#define XSIMD_GENERIC_ROUNDING_HPP

#include "./xsimd_generic_details.hpp"

namespace xsimd
{

    namespace kernel
    {

        using namespace types;

        // ceil
        template <class A, class T>
        inline batch<T, A> ceil(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            batch<T, A> truncated_self = trunc(self);
            return select(truncated_self < self, truncated_self + 1, truncated_self);
        }

        // floor
        template <class A, class T>
        inline batch<T, A> floor(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            batch<T, A> truncated_self = trunc(self);
            return select(truncated_self > self, truncated_self - 1, truncated_self);
        }

        // round
        template <class A, class T>
        inline batch<T, A> round(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            auto v = abs(self);
            auto c = ceil(v);
            auto cp = select(c - 0.5 > v, c - 1, c);
            return select(v > constants::maxflint<batch<T, A>>(), self, copysign(cp, self));
        }

        // trunc
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> trunc(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return self;
        }
        template <class A>
        inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<generic>) noexcept
        {
            return select(abs(self) < constants::maxflint<batch<float, A>>(), to_float(to_int(self)), self);
        }
        template <class A>
        inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<generic>) noexcept
        {
            return select(abs(self) < constants::maxflint<batch<double, A>>(), to_float(to_int(self)), self);
        }

    }

}

#endif
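The round kernel above implements round-half-away-from-zero: it takes the ceiling of the absolute value, steps back by one unless the value lies within half a unit of that ceiling, and restores the sign with copysign. A scalar sketch of the same logic (round_s is an illustrative name; the maxflint guard for values too large to carry a fractional part is omitted):

#include <cmath>
#include <iostream>

// Round half away from zero, built from ceil on the absolute value
// plus a sign copy, mirroring the generic kernel.
static double round_s(double x)
{
    double v = std::fabs(x);
    double c = std::ceil(v);
    double cp = (c - 0.5 > v) ? c - 1.0 : c; // keep c only if v >= c - 0.5
    return std::copysign(cp, x);
}

int main()
{
    std::cout << round_s(2.5) << ' '   // 3
              << round_s(-2.5) << ' '  // -3
              << round_s(2.4) << '\n'; // 2
}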
@@ -1,969 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_TRIGO_HPP
#define XSIMD_GENERIC_TRIGO_HPP

#include "./xsimd_generic_details.hpp"

#include <array>

namespace xsimd
{

    namespace kernel
    {
        /* origin: boost/simd/arch/common/detail/simd/trig_base.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */

        using namespace types;

        // acos
        template <class A, class T>
        inline batch<T, A> acos(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            batch_type x = abs(self);
            auto x_larger_05 = x > batch_type(0.5);
            x = select(x_larger_05, sqrt(fma(batch_type(-0.5), x, batch_type(0.5))), self);
            x = asin(x);
            x = select(x_larger_05, x + x, x);
            x = select(self < batch_type(-0.5), constants::pi<batch_type>() - x, x);
            return select(x_larger_05, x, constants::pio2<batch_type>() - x);
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> acos(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            using real_batch = typename batch_type::real_batch;
            batch_type tmp = asin(z);
            return { constants::pio2<real_batch>() - tmp.real(), -tmp.imag() };
        }

        // acosh
        /* origin: boost/simd/arch/common/simd/function/acosh.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */
        template <class A, class T>
        inline batch<T, A> acosh(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            batch_type x = self - batch_type(1.);
            auto test = x > constants::oneotwoeps<batch_type>();
            batch_type z = select(test, self, x + sqrt(x + x + x * x));
            batch_type l1pz = log1p(z);
            return select(test, l1pz + constants::log_2<batch_type>(), l1pz);
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> acosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            batch_type w = acos(z);
            w = batch_type(-w.imag(), w.real());
            return w;
        }

        // asin
        template <class A>
        inline batch<float, A> asin(batch<float, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<float, A>;
            batch_type x = abs(self);
            batch_type sign = bitofsign(self);
            auto x_larger_05 = x > batch_type(0.5);
            batch_type z = select(x_larger_05, batch_type(0.5) * (batch_type(1.) - x), x * x);
            x = select(x_larger_05, sqrt(z), x);
            batch_type z1 = detail::horner<batch_type,
                                           0x3e2aaae4,
                                           0x3d9980f6,
                                           0x3d3a3ec7,
                                           0x3cc617e3,
                                           0x3d2cb352>(z);
            z1 = fma(z1, z * x, x);
            z = select(x_larger_05, constants::pio2<batch_type>() - (z1 + z1), z1);
            return z ^ sign;
        }
        template <class A>
        inline batch<double, A> asin(batch<double, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<double, A>;
            batch_type x = abs(self);
            auto small_cond = x < constants::sqrteps<batch_type>();
            batch_type ct1 = batch_type(bit_cast<double>(int64_t(0x3fe4000000000000)));
            batch_type zz1 = batch_type(1.) - x;
            batch_type vp = zz1 * detail::horner<batch_type, 0x403c896240f3081dull, 0xc03991aaac01ab68ull, 0x401bdff5baf33e6aull, 0xbfe2079259f9290full, 0x3f684fc3988e9f08ull>(zz1) / detail::horner1<batch_type, 0x40756709b0b644beull, 0xc077fe08959063eeull, 0x40626219af6a7f42ull, 0xc035f2a2b6bf5d8cull>(zz1);
            zz1 = sqrt(zz1 + zz1);
            batch_type z = constants::pio4<batch_type>() - zz1;
            zz1 = fms(zz1, vp, constants::pio_2lo<batch_type>());
            z = z - zz1;
            zz1 = z + constants::pio4<batch_type>();
            batch_type zz2 = self * self;
            z = zz2 * detail::horner<batch_type, 0xc020656c06ceafd5ull, 0x40339007da779259ull, 0xc0304331de27907bull, 0x4015c74b178a2dd9ull, 0xbfe34341333e5c16ull, 0x3f716b9b0bd48ad3ull>(zz2) / detail::horner1<batch_type, 0xc04898220a3607acull, 0x4061705684ffbf9dull, 0xc06265bb6d3576d7ull, 0x40519fc025fe9054ull, 0xc02d7b590b5e0eabull>(zz2);
            zz2 = fma(x, z, x);
            return select(x > batch_type(1.), constants::nan<batch_type>(),
                          select(small_cond, x,
                                 select(x > ct1, zz1, zz2))
                              ^ bitofsign(self));
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> asin(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            using real_batch = typename batch_type::real_batch;
            real_batch x = z.real();
            real_batch y = z.imag();

            batch_type ct(-y, x);
            batch_type zz(real_batch(1.) - (x - y) * (x + y), -2 * x * y);
            zz = log(ct + sqrt(zz));
            batch_type resg(zz.imag(), -zz.real());

            return select(y == real_batch(0.),
                          select(fabs(x) > real_batch(1.),
                                 batch_type(constants::pio2<real_batch>(), real_batch(0.)),
                                 batch_type(asin(x), real_batch(0.))),
                          resg);
        }

        // asinh
        /* origin: boost/simd/arch/common/simd/function/asinh.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */
        namespace detail
        {
            template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
            inline batch<T, A>
            average(const batch<T, A>& x1, const batch<T, A>& x2) noexcept
            {
                return (x1 & x2) + ((x1 ^ x2) >> 1);
            }

            template <class A, class T>
            inline batch<T, A>
            averagef(const batch<T, A>& x1, const batch<T, A>& x2) noexcept
            {
                using batch_type = batch<T, A>;
                return fma(x1, batch_type(0.5), x2 * batch_type(0.5));
            }
            template <class A>
            inline batch<float, A> average(batch<float, A> const& x1, batch<float, A> const& x2) noexcept
            {
                return averagef(x1, x2);
            }
            template <class A>
            inline batch<double, A> average(batch<double, A> const& x1, batch<double, A> const& x2) noexcept
            {
                return averagef(x1, x2);
            }
        }
        template <class A>
        inline batch<float, A> asinh(batch<float, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<float, A>;
            batch_type x = abs(self);
            auto lthalf = x < batch_type(0.5);
            batch_type x2 = x * x;
            batch_type bts = bitofsign(self);
            batch_type z(0.);
            if (any(lthalf))
            {
                z = detail::horner<batch_type,
                                   0x3f800000,
                                   0xbe2aa9ad,
                                   0x3d9949b1,
                                   0xbd2ee581,
                                   0x3ca4d6e6>(x2)
                    * x;
                if (all(lthalf))
                    return z ^ bts;
            }
            batch_type tmp = select(x > constants::oneosqrteps<batch_type>(), x, detail::average(x, hypot(batch_type(1.), x)));
#ifndef XSIMD_NO_NANS
            return select(isnan(self), constants::nan<batch_type>(), select(lthalf, z, log(tmp) + constants::log_2<batch_type>()) ^ bts);
#else
            return select(lthalf, z, log(tmp) + constants::log_2<batch_type>()) ^ bts;
#endif
        }
        template <class A>
        inline batch<double, A> asinh(batch<double, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<double, A>;
            batch_type x = abs(self);
            auto test = x > constants::oneosqrteps<batch_type>();
            batch_type z = select(test, x - batch_type(1.), x + x * x / (batch_type(1.) + hypot(batch_type(1.), x)));
#ifndef XSIMD_NO_INFINITIES
            z = select(x == constants::infinity<batch_type>(), x, z);
#endif
            batch_type l1pz = log1p(z);
            z = select(test, l1pz + constants::log_2<batch_type>(), l1pz);
            return bitofsign(self) ^ z;
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> asinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            batch_type w = asin(batch_type(-z.imag(), z.real()));
            w = batch_type(w.imag(), -w.real());
            return w;
        }

        // atan
        namespace detail
        {
            template <class A>
            static inline batch<float, A> kernel_atan(const batch<float, A>& x, const batch<float, A>& recx) noexcept
            {
                using batch_type = batch<float, A>;
                const auto flag1 = x < constants::tan3pio8<batch_type>();
                const auto flag2 = (x >= batch_type(bit_cast<float>((uint32_t)0x3ed413cd))) && flag1;
                batch_type yy = select(flag1, batch_type(0.), constants::pio2<batch_type>());
                yy = select(flag2, constants::pio4<batch_type>(), yy);
                batch_type xx = select(flag1, x, -recx);
                xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx);
                const batch_type z = xx * xx;
                batch_type z1 = detail::horner<batch_type,
                                               0xbeaaaa2aul,
                                               0x3e4c925ful,
                                               0xbe0e1b85ul,
                                               0x3da4f0d1ul>(z);
                z1 = fma(xx, z1 * z, xx);
                z1 = select(flag2, z1 + constants::pio_4lo<batch_type>(), z1);
                z1 = select(!flag1, z1 + constants::pio_2lo<batch_type>(), z1);
                return yy + z1;
            }
            template <class A>
            static inline batch<double, A> kernel_atan(const batch<double, A>& x, const batch<double, A>& recx) noexcept
            {
                using batch_type = batch<double, A>;
                const auto flag1 = x < constants::tan3pio8<batch_type>();
                const auto flag2 = (x >= constants::tanpio8<batch_type>()) && flag1;
                batch_type yy = select(flag1, batch_type(0.), constants::pio2<batch_type>());
                yy = select(flag2, constants::pio4<batch_type>(), yy);
                batch_type xx = select(flag1, x, -recx);
                xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx);
                batch_type z = xx * xx;
                z *= detail::horner<batch_type,
                                    0xc0503669fd28ec8eull,
                                    0xc05eb8bf2d05ba25ull,
                                    0xc052c08c36880273ull,
                                    0xc03028545b6b807aull,
                                    0xbfec007fa1f72594ull>(z)
                    / detail::horner1<batch_type,
                                      0x4068519efbbd62ecull,
                                      0x407e563f13b049eaull,
                                      0x407b0e18d2e2be3bull,
                                      0x4064a0dd43b8fa25ull,
                                      0x4038dbc45b14603cull>(z);
                z = fma(xx, z, xx);
                z = select(flag2, z + constants::pio_4lo<batch_type>(), z);
                z = z + select(flag1, batch_type(0.), constants::pio_2lo<batch_type>());
                return yy + z;
            }
        }
        template <class A, class T>
        inline batch<T, A> atan(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            const batch_type absa = abs(self);
            const batch_type x = detail::kernel_atan(absa, batch_type(1.) / absa);
            return x ^ bitofsign(self);
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> atan(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            using real_batch = typename batch_type::real_batch;
            real_batch x = z.real();
            real_batch y = z.imag();
            real_batch x2 = x * x;
            real_batch one(1.);
            real_batch a = one - x2 - (y * y);
            real_batch w = 0.5 * atan2(2. * x, a);
            real_batch num = y + one;
            num = x2 + num * num;
            real_batch den = y - one;
            den = x2 + den * den;
            batch_type res = select((x == real_batch(0.)) && (y == real_batch(1.)),
                                    batch_type(real_batch(0.), constants::infinity<real_batch>()),
                                    batch_type(w, 0.25 * log(num / den)));
            return res;
        }

        // atanh
        /* origin: boost/simd/arch/common/simd/function/acosh.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */
        template <class A, class T>
        inline batch<T, A> atanh(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            batch_type x = abs(self);
            batch_type t = x + x;
            batch_type z = batch_type(1.) - x;
            auto test = x < batch_type(0.5);
            batch_type tmp = select(test, x, t) / z;
            return bitofsign(self) ^ (batch_type(0.5) * log1p(select(test, fma(t, tmp, t), tmp)));
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> atanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            batch_type w = atan(batch_type(-z.imag(), z.real()));
            w = batch_type(w.imag(), -w.real());
            return w;
        }

        // atan2
        template <class A, class T>
        inline batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            const batch_type q = abs(self / other);
            const batch_type z = detail::kernel_atan(q, batch_type(1.) / q);
            return select(other > batch_type(0.), z, constants::pi<batch_type>() - z) * signnz(self);
        }
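        // Worked example for the quadrant fix-up above (editorial sketch,
        // not part of the original header). For self = 1 and other = -1:
        // q = |1 / -1| = 1, so z = kernel_atan(1, 1) = pi/4; other > 0 is
        // false, so the result is (pi - pi/4) * signnz(1) = 3*pi/4,
        // matching std::atan2(1., -1.). For self = -1 the same magnitude
        // comes back negated, since signnz(-1) = -1; signnz treats +0 as
        // positive, so atan2(+0., -1.) yields pi rather than 0.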
|
||||
// cos
|
||||
namespace detail
|
||||
{
|
||||
template <class T, class A>
|
||||
inline batch<T, A> quadrant(const batch<T, A>& x) noexcept
|
||||
{
|
||||
return x & batch<T, A>(3);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<float, A> quadrant(const batch<float, A>& x) noexcept
|
||||
{
|
||||
return to_float(quadrant(to_int(x)));
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> quadrant(const batch<double, A>& x) noexcept
|
||||
{
|
||||
using batch_type = batch<double, A>;
|
||||
batch_type a = x * batch_type(0.25);
|
||||
return (a - floor(a)) * batch_type(4.);
|
||||
}
|
||||
/* origin: boost/simd/arch/common/detail/simd/f_trig_evaluation.hpp */
|
||||
/*
|
||||
* ====================================================
|
||||
* copyright 2016 NumScale SAS
|
||||
*
|
||||
* Distributed under the Boost Software License, Version 1.0.
|
||||
* (See copy at http://boost.org/LICENSE_1_0.txt)
|
||||
* ====================================================
|
||||
*/
|
||||
|
||||
template <class A>
|
||||
inline batch<float, A> cos_eval(const batch<float, A>& z) noexcept
|
||||
{
|
||||
using batch_type = batch<float, A>;
|
||||
batch_type y = detail::horner<batch_type,
|
||||
0x3d2aaaa5,
|
||||
0xbab60619,
|
||||
0x37ccf5ce>(z);
|
||||
return batch_type(1.) + fma(z, batch_type(-0.5), y * z * z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<float, A> sin_eval(const batch<float, A>& z, const batch<float, A>& x) noexcept
|
||||
{
|
||||
using batch_type = batch<float, A>;
|
||||
batch_type y = detail::horner<batch_type,
|
||||
0xbe2aaaa2,
|
||||
0x3c08839d,
|
||||
0xb94ca1f9>(z);
|
||||
return fma(y * z, x, x);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
static inline batch<float, A> base_tancot_eval(const batch<float, A>& z) noexcept
|
||||
{
|
||||
using batch_type = batch<float, A>;
|
||||
batch_type zz = z * z;
|
||||
batch_type y = detail::horner<batch_type,
|
||||
0x3eaaaa6f,
|
||||
0x3e0896dd,
|
||||
0x3d5ac5c9,
|
||||
0x3cc821b5,
|
||||
0x3b4c779c,
|
||||
0x3c19c53b>(zz);
|
||||
return fma(y, zz * z, z);
|
||||
}
|
||||
|
||||
template <class A, class BB>
|
||||
static inline batch<float, A> tan_eval(const batch<float, A>& z, const BB& test) noexcept
|
||||
{
|
||||
using batch_type = batch<float, A>;
|
||||
batch_type y = base_tancot_eval(z);
|
||||
return select(test, y, -batch_type(1.) / y);
|
||||
}
|
||||
|
||||
template <class A, class BB>
|
||||
static inline batch<float, A> cot_eval(const batch<float, A>& z, const BB& test) noexcept
|
||||
{
|
||||
using batch_type = batch<float, A>;
|
||||
batch_type y = base_tancot_eval(z);
|
||||
return select(test, batch_type(1.) / y, -y);
|
||||
}

    /* origin: boost/simd/arch/common/detail/simd/d_trig_evaluation.hpp */
    /*
     * ====================================================
     * copyright 2016 NumScale SAS
     *
     * Distributed under the Boost Software License, Version 1.0.
     * (See copy at http://boost.org/LICENSE_1_0.txt)
     * ====================================================
     */
    template <class A>
    static inline batch<double, A> cos_eval(const batch<double, A>& z) noexcept
    {
        using batch_type = batch<double, A>;
        batch_type y = detail::horner<batch_type,
                                      0x3fe0000000000000ull,
                                      0xbfa5555555555551ull,
                                      0x3f56c16c16c15d47ull,
                                      0xbefa01a019ddbcd9ull,
                                      0x3e927e4f8e06d9a5ull,
                                      0xbe21eea7c1e514d4ull,
                                      0x3da8ff831ad9b219ull>(z);
        return batch_type(1.) - y * z;
    }

    template <class A>
    static inline batch<double, A> sin_eval(const batch<double, A>& z, const batch<double, A>& x) noexcept
    {
        using batch_type = batch<double, A>;
        batch_type y = detail::horner<batch_type,
                                      0xbfc5555555555548ull,
                                      0x3f8111111110f7d0ull,
                                      0xbf2a01a019bfdf03ull,
                                      0x3ec71de3567d4896ull,
                                      0xbe5ae5e5a9291691ull,
                                      0x3de5d8fd1fcf0ec1ull>(z);
        return fma(y * z, x, x);
    }

    template <class A>
    static inline batch<double, A> base_tancot_eval(const batch<double, A>& z) noexcept
    {
        using batch_type = batch<double, A>;
        batch_type zz = z * z;
        batch_type num = detail::horner<batch_type,
                                        0xc1711fead3299176ull,
                                        0x413199eca5fc9dddull,
                                        0xc0c992d8d24f3f38ull>(zz);
        batch_type den = detail::horner1<batch_type,
                                         0xc189afe03cbe5a31ull,
                                         0x4177d98fc2ead8efull,
                                         0xc13427bc582abc96ull,
                                         0x40cab8a5eeb36572ull>(zz);
        return fma(z, (zz * (num / den)), z);
    }

    template <class A, class BB>
    static inline batch<double, A> tan_eval(const batch<double, A>& z, const BB& test) noexcept
    {
        using batch_type = batch<double, A>;
        batch_type y = base_tancot_eval(z);
        return select(test, y, -batch_type(1.) / y);
    }

    template <class A, class BB>
    static inline batch<double, A> cot_eval(const batch<double, A>& z, const BB& test) noexcept
    {
        using batch_type = batch<double, A>;
        batch_type y = base_tancot_eval(z);
        return select(test, batch_type(1.) / y, -y);
    }
    /* origin: boost/simd/arch/common/detail/simd/trig_reduction.hpp */
    /*
     * ====================================================
     * copyright 2016 NumScale SAS
     *
     * Distributed under the Boost Software License, Version 1.0.
     * (See copy at http://boost.org/LICENSE_1_0.txt)
     * ====================================================
     */

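    // The reducer maps an angle x to a reduced argument xr in [-pi/4, pi/4] and
    // returns the quadrant of x as a batch in {0, 1, 2, 3}. It picks the
    // cheapest scheme that is accurate for the whole batch: a no-op below pi/4,
    // one Cody-Waite step below pi/2, a multi-term Cody-Waite step below 20*pi,
    // an extended-precision variant below mediumpi, and a scalar loop through
    // __ieee754_rem_pio2 for arbitrarily large (or infinite) inputs.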
    struct trigo_radian_tag
    {
    };
    struct trigo_pi_tag
    {
    };

    template <class B, class Tag = trigo_radian_tag>
    struct trigo_reducer
    {
        static inline B reduce(const B& x, B& xr) noexcept
        {
            if (all(x <= constants::pio4<B>()))
            {
                xr = x;
                return B(0.);
            }
            else if (all(x <= constants::pio2<B>()))
            {
                auto test = x > constants::pio4<B>();
                xr = x - constants::pio2_1<B>();
                xr -= constants::pio2_2<B>();
                xr -= constants::pio2_3<B>();
                xr = select(test, xr, x);
                return select(test, B(1.), B(0.));
            }
            else if (all(x <= constants::twentypi<B>()))
            {
                B xi = nearbyint(x * constants::twoopi<B>());
                xr = fnma(xi, constants::pio2_1<B>(), x);
                xr -= xi * constants::pio2_2<B>();
                xr -= xi * constants::pio2_3<B>();
                return quadrant(xi);
            }
            else if (all(x <= constants::mediumpi<B>()))
            {
                B fn = nearbyint(x * constants::twoopi<B>());
                B r = x - fn * constants::pio2_1<B>();
                B w = fn * constants::pio2_1t<B>();
                B t = r;
                w = fn * constants::pio2_2<B>();
                r = t - w;
                w = fn * constants::pio2_2t<B>() - ((t - r) - w);
                t = r;
                w = fn * constants::pio2_3<B>();
                r = t - w;
                w = fn * constants::pio2_3t<B>() - ((t - r) - w);
                xr = r - w;
                return quadrant(fn);
            }
            else
            {
                static constexpr std::size_t size = B::size;
                using value_type = typename B::value_type;
                alignas(B) std::array<value_type, size> tmp;
                alignas(B) std::array<value_type, size> txr;
                alignas(B) std::array<value_type, size> args;
                x.store_aligned(args.data());

                for (std::size_t i = 0; i < size; ++i)
                {
                    double arg = args[i];
                    if (arg == std::numeric_limits<value_type>::infinity())
                    {
                        tmp[i] = 0.;
                        txr[i] = std::numeric_limits<value_type>::quiet_NaN();
                    }
                    else
                    {
                        double y[2];
                        std::int32_t n = ::xsimd::detail::__ieee754_rem_pio2(arg, y);
                        tmp[i] = value_type(n & 3);
                        txr[i] = value_type(y[0]);
                    }
                }
                xr = B::load_aligned(&txr[0]);
                B res = B::load_aligned(&tmp[0]);
                return res;
            }
        }
    };

    template <class B>
    struct trigo_reducer<B, trigo_pi_tag>
    {
        static inline B reduce(const B& x, B& xr) noexcept
        {
            B xi = nearbyint(x * B(2.));
            B x2 = x - xi * B(0.5);
            xr = x2 * constants::pi<B>();
            return quadrant(xi);
        }
    };

}
template <class A, class T>
inline batch<T, A> cos(batch<T, A> const& self, requires_arch<generic>) noexcept
{
    using batch_type = batch<T, A>;
    const batch_type x = abs(self);
    batch_type xr = constants::nan<batch_type>();
    const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
    auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
    auto swap_bit = fma(batch_type(-2.), tmp, n);
    auto sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
    const batch_type z = xr * xr;
    const batch_type se = detail::sin_eval(z, xr);
    const batch_type ce = detail::cos_eval(z);
    const batch_type z1 = select(swap_bit != batch_type(0.), se, ce);
    return z1 ^ sign_bit;
}
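// With n the quadrant index in {0, 1, 2, 3}: swap_bit = n mod 2 picks the sin
// or cos polynomial for the reduced argument, and sign_bit flips the result
// exactly when swap_bit differs from (n >= 2), i.e. in quadrants 1 and 2 where
// cos(pi/2 + t) = -sin(t) and cos(pi + t) = -cos(t) contribute a minus sign.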

template <class A, class T>
inline batch<std::complex<T>, A> cos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
    return { cos(z.real()) * cosh(z.imag()), -sin(z.real()) * sinh(z.imag()) };
}

// cosh

/* origin: boost/simd/arch/common/simd/function/cosh.hpp */
/*
 * ====================================================
 * copyright 2016 NumScale SAS
 *
 * Distributed under the Boost Software License, Version 1.0.
 * (See copy at http://boost.org/LICENSE_1_0.txt)
 * ====================================================
 */

template <class A, class T>
inline batch<T, A> cosh(batch<T, A> const& self, requires_arch<generic>) noexcept
{
    using batch_type = batch<T, A>;
    batch_type x = abs(self);
    auto test1 = x > (constants::maxlog<batch_type>() - constants::log_2<batch_type>());
    batch_type fac = select(test1, batch_type(0.5), batch_type(1.));
    batch_type tmp = exp(x * fac);
    batch_type tmp1 = batch_type(0.5) * tmp;
    return select(test1, tmp1 * tmp, detail::average(tmp, batch_type(1.) / tmp));
}
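// For moderate x, cosh(x) = (e^x + e^-x) / 2 is the average of tmp and 1/tmp.
// Near maxlog, e^x itself would overflow, so e^(x/2) is computed instead and
// cosh(x) is recovered as (0.5 * e^(x/2)) * e^(x/2).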
template <class A, class T>
inline batch<std::complex<T>, A> cosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
    auto x = z.real();
    auto y = z.imag();
    return { cosh(x) * cos(y), sinh(x) * sin(y) };
}

// sin
namespace detail
{
    template <class A, class T, class Tag = trigo_radian_tag>
    inline batch<T, A> sin(batch<T, A> const& self, Tag = Tag()) noexcept
    {
        using batch_type = batch<T, A>;
        const batch_type x = abs(self);
        batch_type xr = constants::nan<batch_type>();
        const batch_type n = detail::trigo_reducer<batch_type, Tag>::reduce(x, xr);
        auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
        auto swap_bit = fma(batch_type(-2.), tmp, n);
        auto sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
        const batch_type z = xr * xr;
        const batch_type se = detail::sin_eval(z, xr);
        const batch_type ce = detail::cos_eval(z);
        const batch_type z1 = select(swap_bit == batch_type(0.), se, ce);
        return z1 ^ sign_bit;
    }
}

template <class A, class T>
inline batch<T, A> sin(batch<T, A> const& self, requires_arch<generic>) noexcept
{
    return detail::sin(self);
}

template <class A, class T>
inline batch<std::complex<T>, A> sin(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
    return { sin(z.real()) * cosh(z.imag()), cos(z.real()) * sinh(z.imag()) };
}

// sincos
template <class A, class T>
inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self, requires_arch<generic>) noexcept
{
    using batch_type = batch<T, A>;
    const batch_type x = abs(self);
    batch_type xr = constants::nan<batch_type>();
    const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
    auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
    auto swap_bit = fma(batch_type(-2.), tmp, n);
    const batch_type z = xr * xr;
    const batch_type se = detail::sin_eval(z, xr);
    const batch_type ce = detail::cos_eval(z);
    auto sin_sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
    const batch_type sin_z1 = select(swap_bit == batch_type(0.), se, ce);
    auto cos_sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
    const batch_type cos_z1 = select(swap_bit != batch_type(0.), se, ce);
    return std::make_pair(sin_z1 ^ sin_sign_bit, cos_z1 ^ cos_sign_bit);
}

template <class A, class T>
inline std::pair<batch<std::complex<T>, A>, batch<std::complex<T>, A>>
sincos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
    using batch_type = batch<std::complex<T>, A>;
    using real_batch = typename batch_type::real_batch;
    real_batch rcos = cos(z.real());
    real_batch rsin = sin(z.real());
    real_batch icosh = cosh(z.imag());
    real_batch isinh = sinh(z.imag());
    return std::make_pair(batch_type(rsin * icosh, rcos * isinh), batch_type(rcos * icosh, -rsin * isinh));
}

// sinh
namespace detail
{
    /* origin: boost/simd/arch/common/detail/generic/sinh_kernel.hpp */
    /*
     * ====================================================
     * copyright 2016 NumScale SAS
     *
     * Distributed under the Boost Software License, Version 1.0.
     * (See copy at http://boost.org/LICENSE_1_0.txt)
     * ====================================================
     */
    template <class A>
    inline batch<float, A> sinh_kernel(batch<float, A> const& self) noexcept
    {
        using batch_type = batch<float, A>;
        batch_type sqr_self = self * self;
        return detail::horner<batch_type,
                              0x3f800000, // 1.0f
                              0x3e2aaacc, // 1.66667160211E-1f
                              0x3c087bbe, // 8.33028376239E-3f
                              0x39559e2f // 2.03721912945E-4f
                              >(sqr_self)
            * self;
    }

    template <class A>
    inline batch<double, A> sinh_kernel(batch<double, A> const& self) noexcept
    {
        using batch_type = batch<double, A>;
        batch_type sqrself = self * self;
        return fma(self, (detail::horner<batch_type,
                                         0xc115782bdbf6ab05ull, // -3.51754964808151394800E5
                                         0xc0c694b8c71d6182ull, // -1.15614435765005216044E4,
                                         0xc064773a398ff4feull, // -1.63725857525983828727E2,
                                         0xbfe9435fe8bb3cd6ull // -7.89474443963537015605E-1
                                         >(sqrself)
                          / detail::horner1<batch_type,
                                            0xc1401a20e4f90044ull, // -2.11052978884890840399E6
                                            0x40e1a7ba7ed72245ull, // 3.61578279834431989373E4,
                                            0xc0715b6096e96484ull // -2.77711081420602794433E2,
                                            >(sqrself))
                       * sqrself,
                   self);
    }
}
/* origin: boost/simd/arch/common/simd/function/sinh.hpp */
/*
 * ====================================================
 * copyright 2016 NumScale SAS
 *
 * Distributed under the Boost Software License, Version 1.0.
 * (See copy at http://boost.org/LICENSE_1_0.txt)
 * ====================================================
 */
template <class A, class T>
inline batch<T, A> sinh(batch<T, A> const& a, requires_arch<generic>) noexcept
{
    using batch_type = batch<T, A>;
    batch_type half(0.5);
    batch_type x = abs(a);
    auto lt1 = x < batch_type(1.);
    batch_type bts = bitofsign(a);
    batch_type z(0.);
    if (any(lt1))
    {
        z = detail::sinh_kernel(x);
        if (all(lt1))
            return z ^ bts;
    }
    auto test1 = x > (constants::maxlog<batch_type>() - constants::log_2<batch_type>());
    batch_type fac = select(test1, half, batch_type(1.));
    batch_type tmp = exp(x * fac);
    batch_type tmp1 = half * tmp;
    batch_type r = select(test1, tmp1 * tmp, tmp1 - half / tmp);
    return select(lt1, z, r) ^ bts;
}
template <class A, class T>
inline batch<std::complex<T>, A> sinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
    auto x = z.real();
    auto y = z.imag();
    return { sinh(x) * cos(y), cosh(x) * sin(y) };
}

// tan
template <class A, class T>
inline batch<T, A> tan(batch<T, A> const& self, requires_arch<generic>) noexcept
{
    using batch_type = batch<T, A>;
    const batch_type x = abs(self);
    batch_type xr = constants::nan<batch_type>();
    const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
    auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
    auto swap_bit = fma(batch_type(-2.), tmp, n);
    auto test = (swap_bit == batch_type(0.));
    const batch_type y = detail::tan_eval(xr, test);
    return y ^ bitofsign(self);
}
template <class A, class T>
inline batch<std::complex<T>, A> tan(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
    using batch_type = batch<std::complex<T>, A>;
    using real_batch = typename batch_type::real_batch;
    real_batch d = cos(2 * z.real()) + cosh(2 * z.imag());
    batch_type winf(constants::infinity<real_batch>(), constants::infinity<real_batch>());
    real_batch wreal = sin(2 * z.real()) / d;
    real_batch wimag = sinh(2 * z.imag());
    batch_type wres = select(isinf(wimag), batch_type(wreal, real_batch(1.)), batch_type(wreal, wimag / d));
    return select(d == real_batch(0.), winf, wres);
}

// tanh
namespace detail
{
    /* origin: boost/simd/arch/common/detail/generic/tanh_kernel.hpp */
    /*
     * ====================================================
     * copyright 2016 NumScale SAS
     *
     * Distributed under the Boost Software License, Version 1.0.
     * (See copy at http://boost.org/LICENSE_1_0.txt)
     * ====================================================
     */
    template <class B>
    struct tanh_kernel;

    template <class A>
    struct tanh_kernel<batch<float, A>>
    {
        using batch_type = batch<float, A>;
        static inline batch_type tanh(const batch_type& x) noexcept
        {
            batch_type sqrx = x * x;
            return fma(detail::horner<batch_type,
                                      0xbeaaaa99, // -3.33332819422E-1F
                                      0x3e088393, // +1.33314422036E-1F
                                      0xbd5c1e2d, // -5.37397155531E-2F
                                      0x3ca9134e, // +2.06390887954E-2F
                                      0xbbbaf0ea // -5.70498872745E-3F
                                      >(sqrx)
                           * sqrx,
                       x, x);
        }

        static inline batch_type cotanh(const batch_type& x) noexcept
        {
            return batch_type(1.) / tanh(x);
        }
    };

    template <class A>
    struct tanh_kernel<batch<double, A>>
    {
        using batch_type = batch<double, A>;
        static inline batch_type tanh(const batch_type& x) noexcept
        {
            batch_type sqrx = x * x;
            return fma(sqrx * p(sqrx) / q(sqrx), x, x);
        }

        static inline batch_type cotanh(const batch_type& x) noexcept
        {
            batch_type sqrx = x * x;
            batch_type qval = q(sqrx);
            return qval / (x * fma(p(sqrx), sqrx, qval));
        }

        static inline batch_type p(const batch_type& x) noexcept
        {
            return detail::horner<batch_type,
                                  0xc0993ac030580563, // -1.61468768441708447952E3
                                  0xc058d26a0e26682d, // -9.92877231001918586564E1,
                                  0xbfeedc5baafd6f4b // -9.64399179425052238628E-1
                                  >(x);
        }

        static inline batch_type q(const batch_type& x) noexcept
        {
            return detail::horner1<batch_type,
                                   0x40b2ec102442040c, // 4.84406305325125486048E3
                                   0x40a176fa0e5535fa, // 2.23548839060100448583E3,
                                   0x405c33f28a581b86 // 1.12811678491632931402E2,
                                   >(x);
        }
    };

}
/* origin: boost/simd/arch/common/simd/function/tanh.hpp */
/*
 * ====================================================
 * copyright 2016 NumScale SAS
 *
 * Distributed under the Boost Software License, Version 1.0.
 * (See copy at http://boost.org/LICENSE_1_0.txt)
 * ====================================================
 */
template <class A, class T>
inline batch<T, A> tanh(batch<T, A> const& self, requires_arch<generic>) noexcept
{
    using batch_type = batch<T, A>;
    batch_type one(1.);
    batch_type x = abs(self);
    auto test = x < (batch_type(5.) / batch_type(8.));
    batch_type bts = bitofsign(self);
    batch_type z = one;
    if (any(test))
    {
        z = detail::tanh_kernel<batch_type>::tanh(x);
        if (all(test))
            return z ^ bts;
    }
    batch_type r = fma(batch_type(-2.), one / (one + exp(x + x)), one);
    return select(test, z, r) ^ bts;
}
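// For |x| >= 5/8 the identity tanh(x) = 1 - 2 / (1 + e^(2x)) is used, evaluated
// as a single fused multiply-add; smaller arguments go through the polynomial /
// rational kernel, where this form would lose accuracy near zero.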
template <class A, class T>
inline batch<std::complex<T>, A> tanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
    using real_batch = typename batch<std::complex<T>, A>::real_batch;
    auto x = z.real();
    auto y = z.imag();
    real_batch two(2);
    auto d = cosh(two * x) + cos(two * y);
    return { sinh(two * x) / d, sin(two * y) / d };
}

} // namespace kernel

} // namespace xsimd

#endif
The diff for this file is not shown because of its size.
@ -1,940 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX2_HPP
#define XSIMD_AVX2_HPP

#include <complex>
#include <type_traits>

#include "../types/xsimd_avx2_register.hpp"

namespace xsimd
{

namespace kernel
{
    using namespace types;

    // abs
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx2>) noexcept
    {
        if (std::is_signed<T>::value)
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_abs_epi8(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_abs_epi16(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_abs_epi32(self);
            }
            else
            {
                return abs(self, avx {});
            }
        }
        return self;
    }

    // add
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
        {
            return _mm256_add_epi8(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
        {
            return _mm256_add_epi16(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            return _mm256_add_epi32(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            return _mm256_add_epi64(self, other);
        }
        else
        {
            return add(self, other, avx {});
        }
    }

    // bitwise_and
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        return _mm256_and_si256(self, other);
    }
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
    {
        return _mm256_and_si256(self, other);
    }

    // bitwise_andnot
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        return _mm256_andnot_si256(other, self);
    }
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
    {
        return _mm256_andnot_si256(other, self);
    }

    // bitwise_not
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx2>) noexcept
    {
        return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
    }
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
    {
        return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
    }

    // bitwise_lshift
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
        {
            return _mm256_slli_epi16(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            return _mm256_slli_epi32(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            return _mm256_slli_epi64(self, other);
        }
        else
        {
            return bitwise_lshift(self, other, avx {});
        }
    }

    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            return _mm256_sllv_epi32(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            return _mm256_sllv_epi64(self, other);
        }
        else
        {
            return bitwise_lshift(self, other, avx {});
        }
    }

    // bitwise_or
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        return _mm256_or_si256(self, other);
    }
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
    {
        return _mm256_or_si256(self, other);
    }

    // bitwise_rshift
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
    {
        if (std::is_signed<T>::value)
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF);
                __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self);
                __m256i res = _mm256_srai_epi16(self, other);
                return _mm256_or_si256(
                    detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
                                       { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
                                       sign_mask, cmp_is_negative),
                    _mm256_andnot_si256(sign_mask, res));
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_srai_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_srai_epi32(self, other);
            }
            else
            {
                return bitwise_rshift(self, other, avx {});
            }
        }
        else
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_srli_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_srli_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_srli_epi64(self, other);
            }
            else
            {
                return bitwise_rshift(self, other, avx {});
            }
        }
    }
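    // AVX2 has no _mm256_srai_epi8, so the signed 8-bit case shifts 16-bit lanes
    // instead: bits dragged in from the neighbouring byte are cleared with
    // sign_mask, and the same mask re-extends the sign for bytes that compare
    // negative.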

    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        if (std::is_signed<T>::value)
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_srav_epi32(self, other);
            }
            else
            {
                return bitwise_rshift(self, other, avx {});
            }
        }
        else
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_srlv_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_srlv_epi64(self, other);
            }
            else
            {
                return bitwise_rshift(self, other, avx {});
            }
        }
    }

    // bitwise_xor
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        return _mm256_xor_si256(self, other);
    }
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
    {
        return _mm256_xor_si256(self, other);
    }

    // complex_low
    template <class A>
    inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
    {
        __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0));
        __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0));
        return _mm256_blend_pd(tmp0, tmp1, 10);
    }

    // complex_high
    template <class A>
    inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
    {
        __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2));
        __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0));
        return _mm256_blend_pd(tmp0, tmp1, 10);
    }

    // fast_cast
    namespace detail
    {

        template <class A>
        inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<avx2>) noexcept
        {
            // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
            __m256i msk_lo = _mm256_set1_epi32(0xFFFF);
            __m256 cnst65536f = _mm256_set1_ps(65536.0f);

            __m256i v_lo = _mm256_and_si256(v, msk_lo); /* extract the 16 lowest significant bits of self */
            __m256i v_hi = _mm256_srli_epi32(v, 16); /* 16 most significant bits of v */
            __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding */
            __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */
            v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */
            return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
        }

        template <class A>
        inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
        {
            // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
            // adapted to avx
            __m256i xH = _mm256_srli_epi64(x, 32);
            xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84
            __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000,
                                             0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
            __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
            __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
            return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
        }

        template <class A>
        inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
        {
            // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
            // adapted to avx
            __m256i xH = _mm256_srai_epi32(x, 16);
            xH = _mm256_and_si256(xH, _mm256_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
            xH = _mm256_add_epi64(xH, _mm256_castpd_si256(_mm256_set1_pd(442721857769029238784.))); // 3*2^67
            __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000,
                                             0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
            __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
            __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
            return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
        }
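        // Both 64-bit conversions rely on the bias trick: the high 32 bits are
        // tagged with a large power-of-two constant and the low 32 bits with
        // 2^52, so each half is an exactly representable double; subtracting
        // the combined constant removes the tags and the final add performs
        // the single rounding step.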
    }

    // eq
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
        {
            return _mm256_cmpeq_epi8(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
        {
            return _mm256_cmpeq_epi16(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            return _mm256_cmpeq_epi32(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            return _mm256_cmpeq_epi64(self, other);
        }
        else
        {
            return eq(self, other, avx {});
        }
    }

    // gather
    template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
    inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
                              kernel::requires_arch<avx2>) noexcept
    {
        // scatter for this one is AVX512F+AVX512VL
        return _mm256_i32gather_epi32(reinterpret_cast<const int*>(src), index, sizeof(T));
    }

    template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
    inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
                              kernel::requires_arch<avx2>) noexcept
    {
        // scatter for this one is AVX512F+AVX512VL
        return _mm256_i64gather_epi64(reinterpret_cast<const long long int*>(src), index, sizeof(T));
    }

    template <class A, class U,
              detail::enable_sized_integral_t<U, 4> = 0>
    inline batch<float, A> gather(batch<float, A> const&, float const* src,
                                  batch<U, A> const& index,
                                  kernel::requires_arch<avx2>) noexcept
    {
        // scatter for this one is AVX512F+AVX512VL
        return _mm256_i32gather_ps(src, index, sizeof(float));
    }

    template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
    inline batch<double, A> gather(batch<double, A> const&, double const* src,
                                   batch<U, A> const& index,
                                   requires_arch<avx2>) noexcept
    {
        // scatter for this one is AVX512F+AVX512VL
        return _mm256_i64gather_pd(src, index, sizeof(double));
    }

    // gather: handmade conversions
    template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
    inline batch<float, A> gather(batch<float, A> const&, double const* src,
                                  batch<V, A> const& index,
                                  requires_arch<avx2>) noexcept
    {
        const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
        const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
        return detail::merge_sse(_mm256_cvtpd_ps(low.data), _mm256_cvtpd_ps(high.data));
    }

    template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
    inline batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
                                    batch<V, A> const& index,
                                    requires_arch<avx2>) noexcept
    {
        const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
        const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
        return detail::merge_sse(_mm256_cvtpd_epi32(low.data), _mm256_cvtpd_epi32(high.data));
    }

    // lt
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        if (std::is_signed<T>::value)
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_cmpgt_epi8(other, self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_cmpgt_epi16(other, self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_cmpgt_epi32(other, self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_cmpgt_epi64(other, self);
            }
            else
            {
                return lt(self, other, avx {});
            }
        }
        else
        {
            return lt(self, other, avx {});
        }
    }

    // load_complex
    template <class A>
    inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx2>) noexcept
    {
        using batch_type = batch<float, A>;
        batch_type real = _mm256_castpd_ps(
            _mm256_permute4x64_pd(
                _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))),
                _MM_SHUFFLE(3, 1, 2, 0)));
        batch_type imag = _mm256_castpd_ps(
            _mm256_permute4x64_pd(
                _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))),
                _MM_SHUFFLE(3, 1, 2, 0)));
        return { real, imag };
    }
    template <class A>
    inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx2>) noexcept
    {
        using batch_type = batch<double, A>;
        batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
        batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
        return { real, imag };
    }
    // mask
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
        {
            return 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
        {
            uint64_t mask8 = 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
            return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4) | (detail::mask_lut(mask8 >> 16) << 8) | (detail::mask_lut(mask8 >> 24) << 12);
        }
        else
        {
            return mask(self, avx {});
        }
    }
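    // There is no movemask variant for 16-bit lanes, so the 8-bit mask (two
    // identical bits per 16-bit lane) is compacted with detail::mask_lut, one
    // byte (four lanes) at a time, into one bit per lane.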

    // max
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        if (std::is_signed<T>::value)
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_max_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_max_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_max_epi32(self, other);
            }
            else
            {
                return max(self, other, avx {});
            }
        }
        else
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_max_epu8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_max_epu16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_max_epu32(self, other);
            }
            else
            {
                return max(self, other, avx {});
            }
        }
    }

    // min
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        if (std::is_signed<T>::value)
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_min_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_min_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_min_epi32(self, other);
            }
            else
            {
                return min(self, other, avx {});
            }
        }
        else
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_min_epu8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_min_epu16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_min_epu32(self, other);
            }
            else
            {
                return min(self, other, avx {});
            }
        }
    }

    // mul
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
        {
            return _mm256_mullo_epi16(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            return _mm256_mullo_epi32(self, other);
        }
        else
        {
            return mul(self, other, avx {});
        }
    }

    // reduce_add
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            __m256i tmp1 = _mm256_hadd_epi32(self, self);
            __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1);
            __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
            __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3);
            return _mm_cvtsi128_si32(tmp4);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            __m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E);
            __m256i tmp2 = _mm256_add_epi64(self, tmp1);
            __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
            __m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3);
#if defined(__x86_64__)
            return _mm_cvtsi128_si64(res);
#else
            __m128i m;
            _mm_storel_epi64(&m, res);
            int64_t i;
            std::memcpy(&i, &m, sizeof(i));
            return i;
#endif
        }
        else
        {
            return reduce_add(self, avx {});
        }
    }
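    // The 32-bit reduction sums within each 128-bit lane with two hadd passes,
    // then folds the upper lane into the lower one; the 64-bit path uses a
    // shuffle + add instead, with a memcpy fallback on 32-bit targets where
    // _mm_cvtsi128_si64 is not available.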

    // sadd
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        if (std::is_signed<T>::value)
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_adds_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_adds_epi16(self, other);
            }
            else
            {
                return sadd(self, other, avx {});
            }
        }
        else
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_adds_epu8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_adds_epu16(self, other);
            }
            else
            {
                return sadd(self, other, avx {});
            }
        }
    }

    // select
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
        {
            return _mm256_blendv_epi8(false_br, true_br, cond);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
        {
            return _mm256_blendv_epi8(false_br, true_br, cond);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            return _mm256_blendv_epi8(false_br, true_br, cond);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            return _mm256_blendv_epi8(false_br, true_br, cond);
        }
        else
        {
            return select(cond, true_br, false_br, avx {});
        }
    }
    template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
    {
        constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
        // FIXME: for some reason mask here is not considered as an immediate,
        // but it's okay for _mm256_blend_epi32
        // case 2: return _mm256_blend_epi16(false_br, true_br, mask);
        XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            return _mm256_blend_epi32(false_br, true_br, mask);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            constexpr int imask = detail::interleave(mask);
            return _mm256_blend_epi32(false_br, true_br, imask);
        }
        else
        {
            return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
        }
    }

    // slide_left
    template <size_t N, class A, class T>
    inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx2>) noexcept
    {
        constexpr unsigned BitCount = N * 8;
        if (BitCount == 0)
        {
            return x;
        }
        if (BitCount >= 256)
        {
            return batch<T, A>(T(0));
        }
        if (BitCount > 128)
        {
            constexpr unsigned M = (BitCount - 128) / 8;
            auto y = _mm256_bslli_epi128(x, M);
            return _mm256_permute2x128_si256(y, y, 0x28);
        }
        if (BitCount == 128)
        {
            return _mm256_permute2x128_si256(x, x, 0x28);
        }
        // shifting by [0, 128[ bits
        constexpr unsigned M = BitCount / 8;
        auto y = _mm256_bslli_epi128(x, M);
        auto z = _mm256_bsrli_epi128(x, 16 - M);
        auto w = _mm256_permute2x128_si256(z, z, 0x28);
        return _mm256_or_si256(y, w);
    }
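    // _mm256_bslli_epi128 shifts each 128-bit lane independently, so bytes that
    // should cross into the upper lane are recovered separately: bsrli extracts
    // them, permute2x128 with 0x28 moves them into the upper lane (zeroing the
    // lower one), and the final OR merges both parts. Shifts of 128 bits or
    // more reduce to a lane permute alone.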

    // slide_right
    template <size_t N, class A, class T>
    inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx2>) noexcept
    {
        constexpr unsigned BitCount = N * 8;
        if (BitCount == 0)
        {
            return x;
        }
        if (BitCount >= 256)
        {
            return batch<T, A>(T(0));
        }
        if (BitCount > 128)
        {
            constexpr unsigned M = (BitCount - 128) / 8;
            auto y = _mm256_bsrli_epi128(x, M);
            return _mm256_permute2x128_si256(y, y, 0x81);
        }
        if (BitCount == 128)
        {
            return _mm256_permute2x128_si256(x, x, 0x81);
        }
        // shifting by [0, 128[ bits
        constexpr unsigned M = BitCount / 8;
        auto y = _mm256_bsrli_epi128(x, M);
        auto z = _mm256_bslli_epi128(x, 16 - M);
        auto w = _mm256_permute2x128_si256(z, z, 0x81);
        return _mm256_or_si256(y, w);
    }

    // ssub
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        if (std::is_signed<T>::value)
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_subs_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_subs_epi16(self, other);
            }
            else
            {
                return ssub(self, other, avx {});
            }
        }
        else
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_subs_epu8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_subs_epu16(self, other);
            }
            else
            {
                return ssub(self, other, avx {});
            }
        }
    }

    // sub
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
        {
            return _mm256_sub_epi8(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
        {
            return _mm256_sub_epi16(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            return _mm256_sub_epi32(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            return _mm256_sub_epi64(self, other);
        }
        else
        {
            return sub(self, other, avx {});
        }
    }

    // swizzle
    template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
    inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
    {
        return _mm256_permutevar8x32_ps(self, (batch<uint32_t, A>)mask);
    }

    template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
    inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
    {
        constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
        return _mm256_permute4x64_pd(self, mask);
    }

    template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
    inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
    {
        constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
        return _mm256_permute4x64_epi64(self, mask);
    }
    template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
    inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
    {
        return bitwise_cast<batch<int64_t, A>>(swizzle(bitwise_cast<batch<uint64_t, A>>(self), mask, avx2 {}));
    }
    template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
    inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
    {
        return _mm256_permutevar8x32_epi32(self, (batch<uint32_t, A>)mask);
    }
    template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
    inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
    {
        return bitwise_cast<batch<int32_t, A>>(swizzle(bitwise_cast<batch<uint32_t, A>>(self), mask, avx2 {}));
    }

    // zip_hi
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
        {
            auto lo = _mm256_unpacklo_epi8(self, other);
            auto hi = _mm256_unpackhi_epi8(self, other);
            return _mm256_permute2f128_si256(lo, hi, 0x31);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
        {
            auto lo = _mm256_unpacklo_epi16(self, other);
            auto hi = _mm256_unpackhi_epi16(self, other);
            return _mm256_permute2f128_si256(lo, hi, 0x31);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            auto lo = _mm256_unpacklo_epi32(self, other);
            auto hi = _mm256_unpackhi_epi32(self, other);
            return _mm256_permute2f128_si256(lo, hi, 0x31);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            auto lo = _mm256_unpacklo_epi64(self, other);
            auto hi = _mm256_unpackhi_epi64(self, other);
            return _mm256_permute2f128_si256(lo, hi, 0x31);
        }
        else
        {
            assert(false && "unsupported arch/op combination");
            return {};
        }
    }

    // zip_lo
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
        {
            auto lo = _mm256_unpacklo_epi8(self, other);
            auto hi = _mm256_unpackhi_epi8(self, other);
            return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
        {
            auto lo = _mm256_unpacklo_epi16(self, other);
            auto hi = _mm256_unpackhi_epi16(self, other);
            return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            auto lo = _mm256_unpacklo_epi32(self, other);
            auto hi = _mm256_unpackhi_epi32(self, other);
            return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            auto lo = _mm256_unpacklo_epi64(self, other);
            auto hi = _mm256_unpackhi_epi64(self, other);
            return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
        }
        else
        {
            assert(false && "unsupported arch/op combination");
            return {};
        }
    }
} // namespace kernel
} // namespace xsimd

#endif

@ -1,627 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX512BW_HPP
#define XSIMD_AVX512BW_HPP

#include <array>
#include <type_traits>

#include "../types/xsimd_avx512bw_register.hpp"

namespace xsimd
{

namespace kernel
{
    using namespace types;

    namespace detail
    {
        template <class A, class T, int Cmp>
        inline batch_bool<T, A> compare_int_avx512bw(batch<T, A> const& self, batch<T, A> const& other) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return (register_type)_mm512_cmp_epi8_mask(self, other, Cmp);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return (register_type)_mm512_cmp_epi16_mask(self, other, Cmp);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                {
                    return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp);
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return (register_type)_mm512_cmp_epu8_mask(self, other, Cmp);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return (register_type)_mm512_cmp_epu16_mask(self, other, Cmp);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                {
                    return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp);
                }
            }
        }
|
||||
}
|
||||
|
||||
// abs
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_unsigned<T>::value)
|
||||
{
|
||||
return self;
|
||||
}
|
||||
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_abs_epi8(self);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_abs_epi16(self);
|
||||
}
|
||||
else
|
||||
{
|
||||
return abs(self, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// add
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_add_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_add_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return add(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// bitwise_lshift
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_sllv_epi16(self, _mm512_set1_epi16(other));
|
||||
#else
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_slli_epi16(self, other);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
return bitwise_lshift(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// bitwise_rshift
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
__m512i sign_mask = _mm512_set1_epi16((0xFF00 >> other) & 0x00FF);
|
||||
__m512i zeros = _mm512_setzero_si512();
|
||||
__mmask64 cmp_is_negative_mask = _mm512_cmpgt_epi8_mask(zeros, self);
|
||||
__m512i cmp_sign_mask = _mm512_mask_blend_epi8(cmp_is_negative_mask, zeros, sign_mask);
|
||||
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
|
||||
__m512i res = _mm512_srav_epi16(self, _mm512_set1_epi16(other));
|
||||
#else
|
||||
__m512i res = _mm512_srai_epi16(self, other);
|
||||
#endif
|
||||
return _mm512_or_si512(cmp_sign_mask, _mm512_andnot_si512(sign_mask, res));
|
||||
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_srav_epi16(self, _mm512_set1_epi16(other));
|
||||
#else
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_srai_epi16(self, other);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
return bitwise_rshift(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_srlv_epi16(self, _mm512_set1_epi16(other));
|
||||
#else
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_srli_epi16(self, other);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
return bitwise_rshift(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// eq
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_EQ>(self, other);
|
||||
}
|
||||
|
||||
// ge
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GE>(self, other);
|
||||
}
|
||||
|
||||
// gt
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GT>(self, other);
|
||||
}
|
||||
|
||||
// le
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LE>(self, other);
|
||||
}
|
||||
|
||||
// lt
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LT>(self, other);
|
||||
}
|
||||
|
||||
// max
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_max_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_max_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return max(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_max_epu8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_max_epu16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return max(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// min
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_min_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_min_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return min(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_min_epu8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_min_epu16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return min(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// mul
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
__m512i upper = _mm512_and_si512(_mm512_mullo_epi16(self, other), _mm512_srli_epi16(_mm512_set1_epi16(-1), 8));
|
||||
__m512i lower = _mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(self, 8), _mm512_srli_epi16(other, 8)), 8);
|
||||
return _mm512_or_si512(upper, lower);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_mullo_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return mul(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// neq
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_NE>(self, other);
|
||||
}
|
||||
|
||||
// sadd
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_adds_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_adds_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return sadd(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_adds_epu8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_adds_epu16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return sadd(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// select
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_mask_blend_epi8(cond, false_br.data, true_br.data);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_mask_blend_epi16(cond, false_br.data, true_br.data);
|
||||
}
|
||||
else
|
||||
{
|
||||
return select(cond, true_br, false_br, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// slide_left
|
||||
namespace detail
|
||||
{
|
||||
template <size_t... Is>
|
||||
constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_hi(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is == 0 ? 8 : Is - 1)... };
|
||||
}
|
||||
|
||||
template <size_t N, size_t... Is>
|
||||
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_pattern(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is >= N ? Is - N : 0)... };
|
||||
}
|
||||
template <size_t N, size_t... Is>
|
||||
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_mask(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is >= N ? 0xFFFF : 0x0000)... };
|
||||
}
|
||||
}
|
||||
|
||||
template <size_t N, class A, class T>
|
||||
inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
constexpr unsigned BitCount = N * 8;
|
||||
if (BitCount == 0)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
if (BitCount >= 512)
|
||||
{
|
||||
return batch<T, A>(T(0));
|
||||
}
|
||||
batch<T, A> xx;
|
||||
if (N & 1)
|
||||
{
|
||||
alignas(A::alignment()) uint64_t buffer[8];
|
||||
_mm512_store_epi64(&buffer[0], x);
|
||||
for (int i = 7; i > 0; --i)
|
||||
buffer[i] = (buffer[i] << 8) | (buffer[i - 1] >> 56);
|
||||
buffer[0] = buffer[0] << 8;
|
||||
xx = _mm512_load_epi64(&buffer[0]);
|
||||
|
||||
alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_hi(::xsimd::detail::make_index_sequence<512 / 64>());
|
||||
__m512i xl = _mm512_slli_epi64(x, 8);
|
||||
__m512i xr = _mm512_srli_epi64(x, 56);
|
||||
xr = _mm512_permutex2var_epi64(xr, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
|
||||
xx = _mm512_or_si512(xr, xl);
|
||||
if (N == 1)
|
||||
return xx;
|
||||
}
|
||||
else
|
||||
{
|
||||
xx = x;
|
||||
}
|
||||
alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
|
||||
alignas(A::alignment()) auto slide_mask = detail::make_slide_left_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
|
||||
return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
|
||||
}
|
||||
|
||||
// slide_right
|
||||
namespace detail
|
||||
{
|
||||
template <size_t... Is>
|
||||
constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_low(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is + 1)... };
|
||||
}
|
||||
|
||||
template <size_t N, size_t... Is>
|
||||
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_pattern(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is < (32 - N) ? Is + N : 0)... };
|
||||
}
|
||||
template <size_t N, size_t... Is>
|
||||
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_mask(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is < 32 - N ? 0xFFFF : 0x0000)... };
|
||||
}
|
||||
}
|
||||
template <size_t N, class A, class T>
|
||||
inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
constexpr unsigned BitCount = N * 8;
|
||||
if (BitCount == 0)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
if (BitCount >= 512)
|
||||
{
|
||||
return batch<T, A>(T(0));
|
||||
}
|
||||
batch<T, A> xx;
|
||||
if (N & 1)
|
||||
{
|
||||
alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_low(::xsimd::detail::make_index_sequence<512 / 64>());
|
||||
__m512i xr = _mm512_srli_epi64(x, 8);
|
||||
__m512i xl = _mm512_slli_epi64(x, 56);
|
||||
xl = _mm512_permutex2var_epi64(xl, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
|
||||
xx = _mm512_or_si512(xr, xl);
|
||||
if (N == 1)
|
||||
return xx;
|
||||
}
|
||||
else
|
||||
{
|
||||
xx = x;
|
||||
}
|
||||
alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
|
||||
alignas(A::alignment()) auto slide_mask = detail::make_slide_right_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
|
||||
return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
|
||||
}
|
||||
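
        // Note on the two slide kernels above: AVX-512 has no cross-lane byte
        // shift, so an odd byte count is handled in two phases. Each 64-bit
        // lane is shifted by one byte and the byte that spills out of a lane
        // is carried into the neighbouring lane with
        // _mm512_permutex2var_epi64; the remaining even count is then applied
        // as a 16-bit-lane permute (_mm512_permutexvar_epi16) masked so that
        // vacated lanes read as zero.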

        // ssub
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_subs_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_subs_epi16(self, other);
                }
                else
                {
                    return ssub(self, other, avx512dq {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_subs_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_subs_epu16(self, other);
                }
                else
                {
                    return ssub(self, other, avx512dq {});
                }
            }
        }

        // sub
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm512_sub_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm512_sub_epi16(self, other);
            }
            else
            {
                return sub(self, other, avx512dq {});
            }
        }

        // swizzle

        template <class A, uint16_t... Vs>
        inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
        {
            return _mm512_permutexvar_epi16((batch<uint16_t, A>)mask, self);
        }

        template <class A, uint16_t... Vs>
        inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
        {
            return bitwise_cast<batch<int16_t, A>>(swizzle(bitwise_cast<batch<uint16_t, A>>(self), mask, avx512bw {}));
        }

        template <class A, uint8_t... Vs>
        inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
        {
            return _mm512_shuffle_epi8(self, (batch<uint8_t, A>)mask);
        }

        template <class A, uint8_t... Vs>
        inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
        {
            return bitwise_cast<batch<int8_t, A>>(swizzle(bitwise_cast<batch<uint8_t, A>>(self), mask, avx512bw {}));
        }

        // zip_hi
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            __m512i lo, hi;
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                lo = _mm512_unpacklo_epi8(self, other);
                hi = _mm512_unpackhi_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                lo = _mm512_unpacklo_epi16(self, other);
                hi = _mm512_unpackhi_epi16(self, other);
            }
            else
            {
                return zip_hi(self, other, avx512f {});
            }
            return _mm512_inserti32x4(
                _mm512_inserti32x4(
                    _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0),
                    _mm512_extracti32x4_epi32(lo, 3),
                    2),
                _mm512_extracti32x4_epi32(hi, 2),
                1);
        }

        // zip_lo
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            __m512i lo, hi;
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                lo = _mm512_unpacklo_epi8(self, other);
                hi = _mm512_unpackhi_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                lo = _mm512_unpacklo_epi16(self, other);
                hi = _mm512_unpackhi_epi16(self, other);
            }
            else
            {
                return zip_lo(self, other, avx512f {});
            }
            return _mm512_inserti32x4(
                _mm512_inserti32x4(
                    _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1),
                    _mm512_extracti32x4_epi32(hi, 1),
                    3),
                _mm512_extracti32x4_epi32(lo, 1),
                2);
        }
    }
}

#endif
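For reference, sadd and ssub in the file above are saturating rather than wrapping: out-of-range results clamp to the type's limits, which is what _mm512_adds_epi8 and friends do per lane. A scalar model of one signed 8-bit lane, with illustrative names:

    #include <algorithm>
    #include <cstdint>

    // Scalar model of _mm512_adds_epi8 on one lane: clamp instead of wrap.
    int8_t sadd_ref(int8_t a, int8_t b)
    {
        int wide = int(a) + int(b); // compute in a wider type so nothing wraps
        wide = std::max(-128, std::min(127, wide));
        return static_cast<int8_t>(wide);
    }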
@@ -1,28 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX512CD_HPP
#define XSIMD_AVX512CD_HPP

#include "../types/xsimd_avx512cd_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        // Nothing there yet.

    }

}

#endif
@@ -1,212 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX512_DQ_HPP
#define XSIMD_AVX512_DQ_HPP

#include "../types/xsimd_avx512dq_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // bitwise_and
        template <class A>
        inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_and_ps(self, other);
        }
        template <class A>
        inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_and_pd(self, other);
        }

        // bitwise_andnot
        template <class A>
        inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_andnot_ps(other, self);
        }
        template <class A>
        inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_andnot_pd(other, self);
        }

        // bitwise_not
        template <class A>
        inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512f>) noexcept
        {
            return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
        }
        template <class A>
        inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512f>) noexcept
        {
            return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1)));
        }

        // bitwise_or
        template <class A>
        inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_or_ps(self, other);
        }
        template <class A>
        inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_or_pd(self, other);
        }

        template <class A, class T>
        inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512dq>) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            return register_type(self.data | other.data);
        }

        // bitwise_xor
        template <class A>
        inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_xor_ps(self, other);
        }
        template <class A>
        inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_xor_pd(self, other);
        }

        // haddp
        template <class A>
        inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512dq>) noexcept
        {
// The following folds over the vector once:
// tmp1 = [a0..8, b0..8]
// tmp2 = [a8..f, b8..f]
#define XSIMD_AVX512_HADDP_STEP1(I, a, b)                                \
    batch<float, avx512f> res##I;                                        \
    {                                                                    \
        auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
        auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
        res##I = _mm512_add_ps(tmp1, tmp2);                              \
    }

            XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
            XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]);
            XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]);
            XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]);
            XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]);
            XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]);
            XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]);
            XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]);

#undef XSIMD_AVX512_HADDP_STEP1

// The following folds the code and shuffles so that hadd_ps produces the correct result
// tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3)
// tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4)
// tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ...
#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d)                               \
    batch<float, avx2> halfx##I;                                              \
    {                                                                         \
        auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0));      \
        auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1));      \
                                                                              \
        auto resx1 = _mm512_add_ps(tmp1, tmp2);                               \
                                                                              \
        auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0));      \
        auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1));      \
                                                                              \
        auto resx2 = _mm512_add_ps(tmp3, tmp4);                               \
                                                                              \
        auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \
        auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \
                                                                              \
        auto resx3 = _mm512_add_ps(tmp5, tmp6);                               \
                                                                              \
        halfx##I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0),           \
                                  _mm512_extractf32x8_ps(resx3, 1));          \
    }

            XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3);
            XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7);

#undef XSIMD_AVX512_HADDP_STEP2

            auto concat = _mm512_castps256_ps512(halfx0);
            concat = _mm512_insertf32x8(concat, halfx1, 1);
            return concat;
        }

        // ldexp
        template <class A>
        inline batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_scalef_pd(self, _mm512_cvtepi64_pd(other));
        }

        // mul
        template <class A>
        inline batch<uint64_t, A> mul(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_mullo_epi64(self, other);
        }

        template <class A>
        inline batch<int64_t, A> mul(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_mullo_epi64(self, other);
        }

        // nearbyint_as_int
        template <class A>
        inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
                                                  requires_arch<avx512dq>) noexcept
        {
            return _mm512_cvtpd_epi64(self);
        }

        // reduce_add
        template <class A>
        inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
        {
            __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
            __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
            __m256 res1 = _mm256_add_ps(tmp1, tmp2);
            return reduce_add(batch<float, avx2>(res1), avx2 {});
        }

        // convert
        namespace detail
        {
            template <class A>
            inline batch<double, A> fast_cast(batch<int64_t, A> const& self, batch<double, A> const&, requires_arch<avx512dq>) noexcept
            {
                return _mm512_cvtepi64_pd(self);
            }

            template <class A>
            inline batch<int64_t, A> fast_cast(batch<double, A> const& self, batch<int64_t, A> const&, requires_arch<avx512dq>) noexcept
            {
                return _mm512_cvttpd_epi64(self);
            }

        }

    }

}

#endif
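The ldexp kernel above is _mm512_scalef_pd applied to the converted exponents, i.e. self * 2^other per lane, with no multiplication error. A one-lane scalar model with an illustrative name:

    #include <cmath>

    // Scalar model of the avx512dq ldexp kernel: scale by a power of two.
    double ldexp_ref(double x, long long e)
    {
        return std::ldexp(x, static_cast<int>(e)); // x * 2^e
    }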
(File diff suppressed because it is too large)
@@ -1,384 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_NUMERICAL_CONSTANT_HPP
#define XSIMD_NUMERICAL_CONSTANT_HPP

#include <limits>

#include "../types/xsimd_utils.hpp"

namespace xsimd
{

    namespace constants
    {

#define XSIMD_DEFINE_CONSTANT(NAME, SINGLE, DOUBLE) \
    template <class T>                              \
    inline T NAME() noexcept                        \
    {                                               \
        return T(NAME<typename T::value_type>());   \
    }                                               \
    template <>                                     \
    inline float NAME<float>() noexcept             \
    {                                               \
        return SINGLE;                              \
    }                                               \
    template <>                                     \
    inline double NAME<double>() noexcept           \
    {                                               \
        return DOUBLE;                              \
    }

#define XSIMD_DEFINE_CONSTANT_HEX(NAME, SINGLE, DOUBLE) \
    template <class T>                                  \
    inline T NAME() noexcept                            \
    {                                                   \
        return T(NAME<typename T::value_type>());       \
    }                                                   \
    template <>                                         \
    inline float NAME<float>() noexcept                 \
    {                                                   \
        return bit_cast<float>((uint32_t)SINGLE);       \
    }                                                   \
    template <>                                         \
    inline double NAME<double>() noexcept               \
    {                                                   \
        return bit_cast<double>((uint64_t)DOUBLE);      \
    }

        XSIMD_DEFINE_CONSTANT(infinity, (std::numeric_limits<float>::infinity()), (std::numeric_limits<double>::infinity()))
        XSIMD_DEFINE_CONSTANT(invlog_2, 1.442695040888963407359924681001892137426645954152986f, 1.442695040888963407359924681001892137426645954152986)
        XSIMD_DEFINE_CONSTANT_HEX(invlog_2hi, 0x3fb8b000, 0x3ff7154765200000)
        XSIMD_DEFINE_CONSTANT_HEX(invlog_2lo, 0xb9389ad4, 0x3de705fc2eefa200)
        XSIMD_DEFINE_CONSTANT(invlog10_2, 3.32192809488736234787031942949f, 3.32192809488736234787031942949)
        XSIMD_DEFINE_CONSTANT_HEX(invpi, 0x3ea2f983, 0x3fd45f306dc9c883)
        XSIMD_DEFINE_CONSTANT(log_2, 0.6931471805599453094172321214581765680755001343602553f, 0.6931471805599453094172321214581765680755001343602553)
        XSIMD_DEFINE_CONSTANT_HEX(log_2hi, 0x3f318000, 0x3fe62e42fee00000)
        XSIMD_DEFINE_CONSTANT_HEX(log_2lo, 0xb95e8083, 0x3dea39ef35793c76)
        XSIMD_DEFINE_CONSTANT_HEX(log10_2hi, 0x3e9a0000, 0x3fd3440000000000)
        XSIMD_DEFINE_CONSTANT_HEX(log10_2lo, 0x39826a14, 0x3ed3509f79fef312)
        XSIMD_DEFINE_CONSTANT_HEX(logeps, 0xc17f1402, 0xc04205966f2b4f12)
        XSIMD_DEFINE_CONSTANT_HEX(logpi, 0x3f928682, 0x3ff250d048e7a1bd)
        XSIMD_DEFINE_CONSTANT_HEX(logsqrt2pi, 0x3f6b3f8e, 0x3fed67f1c864beb5)
        XSIMD_DEFINE_CONSTANT(maxflint, 16777216.0f, 9007199254740992.0)
        XSIMD_DEFINE_CONSTANT(maxlog, 88.3762626647949f, 709.78271289338400)
        XSIMD_DEFINE_CONSTANT(maxlog2, 127.0f, 1023.)
        XSIMD_DEFINE_CONSTANT(maxlog10, 38.23080825805664f, 308.2547155599167)
        XSIMD_DEFINE_CONSTANT_HEX(mediumpi, 0x43490fdb, 0x412921fb54442d18)
        XSIMD_DEFINE_CONSTANT(minlog, -88.3762626647949f, -708.3964185322641)
        XSIMD_DEFINE_CONSTANT(minlog2, -127.0f, -1023.)
        XSIMD_DEFINE_CONSTANT(minlog10, -37.89999771118164f, -308.2547155599167)
        XSIMD_DEFINE_CONSTANT(minusinfinity, (-infinity<float>()), (-infinity<double>()))
        XSIMD_DEFINE_CONSTANT(minuszero, -0.0f, -0.0)
        XSIMD_DEFINE_CONSTANT_HEX(nan, 0xffffffff, 0xffffffffffffffff)
        XSIMD_DEFINE_CONSTANT_HEX(oneosqrteps, 0x453504f3, 0x4190000000000000)
        XSIMD_DEFINE_CONSTANT_HEX(oneotwoeps, 0x4a800000, 0x4320000000000000)
        XSIMD_DEFINE_CONSTANT_HEX(pi, 0x40490fdb, 0x400921fb54442d18)
        XSIMD_DEFINE_CONSTANT_HEX(pio_2lo, 0xb33bbd2e, 0x3c91a62633145c07)
        XSIMD_DEFINE_CONSTANT_HEX(pio_4lo, 0xb2bbbd2e, 0x3c81a62633145c07)
        XSIMD_DEFINE_CONSTANT_HEX(pio2, 0x3fc90fdb, 0x3ff921fb54442d18)
        XSIMD_DEFINE_CONSTANT_HEX(pio2_1, 0x3fc90f80, 0x3ff921fb54400000)
        XSIMD_DEFINE_CONSTANT_HEX(pio2_1t, 0x37354443, 0x3dd0b4611a626331)
        XSIMD_DEFINE_CONSTANT_HEX(pio2_2, 0x37354400, 0x3dd0b4611a600000)
        XSIMD_DEFINE_CONSTANT_HEX(pio2_2t, 0x2e85a308, 0x3ba3198a2e037073)
        XSIMD_DEFINE_CONSTANT_HEX(pio2_3, 0x2e85a300, 0x3ba3198a2e000000)
        XSIMD_DEFINE_CONSTANT_HEX(pio2_3t, 0x248d3132, 0x397b839a252049c1)
        XSIMD_DEFINE_CONSTANT_HEX(pio4, 0x3f490fdb, 0x3fe921fb54442d18)
        XSIMD_DEFINE_CONSTANT_HEX(signmask, 0x80000000, 0x8000000000000000)
        XSIMD_DEFINE_CONSTANT(smallestposval, std::numeric_limits<float>::min(), std::numeric_limits<double>::min())
        XSIMD_DEFINE_CONSTANT_HEX(sqrt_2pi, 0x40206c99, 0x40040d931ff62704)
        XSIMD_DEFINE_CONSTANT_HEX(sqrteps, 0x39b504f3, 0x3e50000000000000)
        XSIMD_DEFINE_CONSTANT_HEX(tanpio8, 0x3ed413cd, 0x3fda827999fcef31)
        XSIMD_DEFINE_CONSTANT_HEX(tan3pio8, 0x401a827a, 0x4003504f333f9de6)
        XSIMD_DEFINE_CONSTANT_HEX(twentypi, 0x427b53d1, 0x404f6a7a2955385e)
        XSIMD_DEFINE_CONSTANT_HEX(twoopi, 0x3f22f983, 0x3fe45f306dc9c883)
        XSIMD_DEFINE_CONSTANT(twotonmb, 8388608.0f, 4503599627370496.0)
        XSIMD_DEFINE_CONSTANT_HEX(twotonmbo3, 0x3ba14518, 0x3ed428a2f98d7286)

#undef XSIMD_DEFINE_CONSTANT
#undef XSIMD_DEFINE_CONSTANT_HEX

        template <class T>
        constexpr T allbits() noexcept;

        template <class T>
        constexpr as_integer_t<T> mask1frexp() noexcept;

        template <class T>
        constexpr as_integer_t<T> mask2frexp() noexcept;

        template <class T>
        constexpr as_integer_t<T> maxexponent() noexcept;

        template <class T>
        constexpr as_integer_t<T> maxexponentm1() noexcept;

        template <class T>
        constexpr int32_t nmb() noexcept;

        template <class T>
        constexpr T zero() noexcept;

        template <class T>
        constexpr T minvalue() noexcept;

        template <class T>
        constexpr T maxvalue() noexcept;

        /**************************
         * allbits implementation *
         **************************/

        namespace detail
        {
            template <class T, bool = std::is_integral<T>::value>
            struct allbits_impl
            {
                static constexpr T get_value() noexcept
                {
                    return T(~0);
                }
            };

            template <class T>
            struct allbits_impl<T, false>
            {
                static constexpr T get_value() noexcept
                {
                    return nan<T>();
                }
            };
        }

        template <class T>
        inline constexpr T allbits() noexcept
        {
            return T(detail::allbits_impl<typename T::value_type>::get_value());
        }

        /*****************************
         * mask1frexp implementation *
         *****************************/

        template <class T>
        inline constexpr as_integer_t<T> mask1frexp() noexcept
        {
            return as_integer_t<T>(mask1frexp<typename T::value_type>());
        }

        template <>
        inline constexpr int32_t mask1frexp<float>() noexcept
        {
            return 0x7f800000;
        }

        template <>
        inline constexpr int64_t mask1frexp<double>() noexcept
        {
            return 0x7ff0000000000000;
        }

        /*****************************
         * mask2frexp implementation *
         *****************************/

        template <class T>
        inline constexpr as_integer_t<T> mask2frexp() noexcept
        {
            return as_integer_t<T>(mask2frexp<typename T::value_type>());
        }

        template <>
        inline constexpr int32_t mask2frexp<float>() noexcept
        {
            return 0x3f000000;
        }

        template <>
        inline constexpr int64_t mask2frexp<double>() noexcept
        {
            return 0x3fe0000000000000;
        }

        /******************************
         * maxexponent implementation *
         ******************************/

        template <class T>
        inline constexpr as_integer_t<T> maxexponent() noexcept
        {
            return as_integer_t<T>(maxexponent<typename T::value_type>());
        }

        template <>
        inline constexpr int32_t maxexponent<float>() noexcept
        {
            return 127;
        }

        template <>
        inline constexpr int64_t maxexponent<double>() noexcept
        {
            return 1023;
        }

        /********************************
         * maxexponentm1 implementation *
         ********************************/

        template <class T>
        inline constexpr as_integer_t<T> maxexponentm1() noexcept
        {
            return as_integer_t<T>(maxexponentm1<typename T::value_type>());
        }

        template <>
        inline constexpr int32_t maxexponentm1<float>() noexcept
        {
            return 126;
        }

        template <>
        inline constexpr int64_t maxexponentm1<double>() noexcept
        {
            return 1022;
        }

        /**********************
         * nmb implementation *
         **********************/

        template <class T>
        inline constexpr int32_t nmb() noexcept
        {
            return nmb<typename T::value_type>();
        }

        template <>
        inline constexpr int32_t nmb<float>() noexcept
        {
            return 23;
        }

        template <>
        inline constexpr int32_t nmb<double>() noexcept
        {
            return 52;
        }

        /***********************
         * zero implementation *
         ***********************/

        template <class T>
        inline constexpr T zero() noexcept
        {
            return T(typename T::value_type(0));
        }

        /***************************
         * minvalue implementation *
         ***************************/

        namespace detail
        {
            template <class T>
            struct minvalue_impl
            {
                static constexpr T get_value() noexcept
                {
                    return std::numeric_limits<typename T::value_type>::min();
                }
            };

            template <class T>
            struct minvalue_common
            {
                static constexpr T get_value() noexcept
                {
                    return std::numeric_limits<T>::min();
                }
            };

            template <>
            struct minvalue_impl<int8_t> : minvalue_common<int8_t>
            {
            };
            template <>
            struct minvalue_impl<uint8_t> : minvalue_common<uint8_t>
            {
            };
            template <>
            struct minvalue_impl<int16_t> : minvalue_common<int16_t>
            {
            };
            template <>
            struct minvalue_impl<uint16_t> : minvalue_common<uint16_t>
            {
            };
            template <>
            struct minvalue_impl<int32_t> : minvalue_common<int32_t>
            {
            };
            template <>
            struct minvalue_impl<uint32_t> : minvalue_common<uint32_t>
            {
            };
            template <>
            struct minvalue_impl<int64_t> : minvalue_common<int64_t>
            {
            };
            template <>
            struct minvalue_impl<uint64_t> : minvalue_common<uint64_t>
            {
            };

            template <>
            struct minvalue_impl<float>
            {
                static float get_value() noexcept
                {
                    return bit_cast<float>((uint32_t)0xff7fffff);
                }
            };

            template <>
            struct minvalue_impl<double>
            {
                static double get_value() noexcept
                {
                    return bit_cast<double>((uint64_t)0xffefffffffffffff);
                }
            };
        }

        template <class T>
        inline constexpr T minvalue() noexcept
        {
            return T(detail::minvalue_impl<typename T::value_type>::get_value());
        }

        /***************************
         * maxvalue implementation *
         ***************************/

        template <class T>
        inline constexpr T maxvalue() noexcept
        {
            return T(std::numeric_limits<typename T::value_type>::max());
        }
    }

}

#endif
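XSIMD_DEFINE_CONSTANT_HEX above pins each constant to an exact IEEE-754 bit pattern and reinterprets it through bit_cast, so values such as pi<float>() are reproducible bit for bit across compilers. A scalar model of that reinterpretation, with an illustrative name (memcpy is the portable way to type-pun):

    #include <cstdint>
    #include <cstring>

    // Reinterpret an exact IEEE-754 bit pattern as a float; 0x40490fdb is the
    // pattern used for pi<float>() in the header above.
    float bits_to_float(uint32_t bits)
    {
        float out;
        std::memcpy(&out, &bits, sizeof out); // well-defined type punning
        return out;
    }
    // bits_to_float(0x40490fdb) == 3.14159274f, the float nearest to pi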
@@ -1,80 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_FMA3_AVX_HPP
#define XSIMD_FMA3_AVX_HPP

#include "../types/xsimd_fma3_avx_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // fnma
        template <class A>
        inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
        {
            return _mm256_fnmadd_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
        {
            return _mm256_fnmadd_pd(x, y, z);
        }

        // fnms
        template <class A>
        inline batch<float, A> fnms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
        {
            return _mm256_fnmsub_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fnms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
        {
            return _mm256_fnmsub_pd(x, y, z);
        }

        // fma
        template <class A>
        inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
        {
            return _mm256_fmadd_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
        {
            return _mm256_fmadd_pd(x, y, z);
        }

        // fms
        template <class A>
        inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
        {
            return _mm256_fmsub_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
        {
            return _mm256_fmsub_pd(x, y, z);
        }

    }

}

#endif
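The four kernel families above map to single-rounding fused operations: fma(x, y, z) = x*y + z, fms = x*y - z, fnma = -(x*y) + z, and fnms = -(x*y) - z. Scalar models via std::fma, which also rounds once, with illustrative names:

    #include <cmath>

    // One-lane models of the four fused kernels.
    double fma_ref(double x, double y, double z) { return std::fma(x, y, z); }
    double fms_ref(double x, double y, double z) { return std::fma(x, y, -z); }
    double fnma_ref(double x, double y, double z) { return std::fma(-x, y, z); }
    double fnms_ref(double x, double y, double z) { return std::fma(-x, y, -z); }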
@@ -1,46 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_FMA3_AVX2_HPP
#define XSIMD_FMA3_AVX2_HPP

#include "../types/xsimd_fma3_avx2_register.hpp"

// Allow inclusion of xsimd_fma3_avx.hpp
#ifdef XSIMD_FMA3_AVX_HPP
#undef XSIMD_FMA3_AVX_HPP
#define XSIMD_FORCE_FMA3_AVX_HPP
#endif

// Disallow inclusion of ./xsimd_fma3_avx_register.hpp
#ifndef XSIMD_FMA3_AVX_REGISTER_HPP
#define XSIMD_FMA3_AVX_REGISTER_HPP
#define XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
#endif

// Include ./xsimd_fma3_avx.hpp but s/avx/avx2
#define avx avx2
#include "./xsimd_fma3_avx.hpp"
#undef avx
#undef XSIMD_FMA3_AVX_HPP

// Carefully restore guards
#ifdef XSIMD_FORCE_FMA3_AVX_HPP
#define XSIMD_FMA3_AVX_HPP
#undef XSIMD_FORCE_FMA3_AVX_HPP
#endif

#ifdef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
#undef XSIMD_FMA3_AVX_REGISTER_HPP
#undef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
#endif

#endif
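This header stamps out the AVX2 kernel variants by re-including xsimd_fma3_avx.hpp with the token avx redefined to avx2, saving and restoring the include guards around the trick. A stripped-down sketch of the same pattern, with invented file and macro names:

    // kernel_impl.hpp defines kernels in terms of the token AVX; including it
    // twice with the token redefined stamps out a second architecture.

    #ifdef KERNEL_IMPL_HPP        // impl was already included for plain avx:
    #undef KERNEL_IMPL_HPP        // drop its guard so it can be included again,
    #define FORCE_KERNEL_IMPL_HPP // and remember to restore it afterwards.
    #endif

    #define avx avx2              // textual s/avx/avx2/ for the re-inclusion
    #include "kernel_impl.hpp"
    #undef avx
    #undef KERNEL_IMPL_HPP

    #ifdef FORCE_KERNEL_IMPL_HPP  // restore the original guard state
    #define KERNEL_IMPL_HPP
    #undef FORCE_KERNEL_IMPL_HPP
    #endif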
@@ -1,79 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_FMA3_SSE_HPP
#define XSIMD_FMA3_SSE_HPP

#include "../types/xsimd_fma3_sse_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;
        // fnma
        template <class A>
        inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
        {
            return _mm_fnmadd_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
        {
            return _mm_fnmadd_pd(x, y, z);
        }

        // fnms
        template <class A>
        inline batch<float, A> fnms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
        {
            return _mm_fnmsub_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fnms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
        {
            return _mm_fnmsub_pd(x, y, z);
        }

        // fma
        template <class A>
        inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
        {
            return _mm_fmadd_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
        {
            return _mm_fmadd_pd(x, y, z);
        }

        // fms
        template <class A>
        inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
        {
            return _mm_fmsub_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
        {
            return _mm_fmsub_pd(x, y, z);
        }

    }

}

#endif
@@ -1,79 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_FMA4_HPP
#define XSIMD_FMA4_HPP

#include "../types/xsimd_fma4_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // fnma
        template <class A>
        inline batch<float, A> fnma(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
        {
            return _mm_nmacc_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fnma(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
        {
            return _mm_nmacc_pd(x, y, z);
        }

        // fnms
        template <class A>
        inline batch<float, A> fnms(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
        {
            return _mm_nmsub_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fnms(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
        {
            return _mm_nmsub_pd(x, y, z);
        }

        // fma
        template <class A>
        inline batch<float, A> fma(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
        {
            return _mm_macc_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fma(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
        {
            return _mm_macc_pd(x, y, z);
        }

        // fms
        template <class A>
        inline batch<float, A> fms(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
        {
            return _mm_msub_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fms(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
        {
            return _mm_msub_pd(x, y, z);
        }
    }

}

#endif
@@ -1,23 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_HPP
#define XSIMD_GENERIC_HPP

#include "./generic/xsimd_generic_arithmetic.hpp"
#include "./generic/xsimd_generic_complex.hpp"
#include "./generic/xsimd_generic_logical.hpp"
#include "./generic/xsimd_generic_math.hpp"
#include "./generic/xsimd_generic_memory.hpp"
#include "./generic/xsimd_generic_rounding.hpp"
#include "./generic/xsimd_generic_trigo.hpp"

#endif
@@ -1,38 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_FWD_HPP
#define XSIMD_GENERIC_FWD_HPP

#include "../types/xsimd_batch_constant.hpp"

#include <type_traits>

namespace xsimd
{
    namespace kernel
    {
        // forward declaration
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> abs(batch<T, A> const& self, requires_arch<generic>) noexcept;
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
        template <class A, class T>
        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;

    }
}

#endif
@@ -1,86 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_ISA_HPP
#define XSIMD_ISA_HPP

#include "../config/xsimd_arch.hpp"

#include "./xsimd_generic_fwd.hpp"

#if XSIMD_WITH_SSE2
#include "./xsimd_sse2.hpp"
#endif

#if XSIMD_WITH_SSE3
#include "./xsimd_sse3.hpp"
#endif

#if XSIMD_WITH_SSSE3
#include "./xsimd_ssse3.hpp"
#endif

#if XSIMD_WITH_SSE4_1
#include "./xsimd_sse4_1.hpp"
#endif

#if XSIMD_WITH_SSE4_2
#include "./xsimd_sse4_2.hpp"
#endif

#if XSIMD_WITH_FMA3_SSE
#include "./xsimd_fma3_sse.hpp"
#endif

#if XSIMD_WITH_FMA4
#include "./xsimd_fma4.hpp"
#endif

#if XSIMD_WITH_AVX
#include "./xsimd_avx.hpp"
#endif

#if XSIMD_WITH_FMA3_AVX
#include "./xsimd_fma3_avx.hpp"
#endif

#if XSIMD_WITH_AVX2
#include "./xsimd_avx2.hpp"
#endif

#if XSIMD_WITH_FMA3_AVX2
#include "./xsimd_fma3_avx2.hpp"
#endif

#if XSIMD_WITH_AVX512F
#include "./xsimd_avx512f.hpp"
#endif

#if XSIMD_WITH_AVX512BW
#include "./xsimd_avx512bw.hpp"
#endif

#if XSIMD_WITH_NEON
#include "./xsimd_neon.hpp"
#endif

#if XSIMD_WITH_NEON64
#include "./xsimd_neon64.hpp"
#endif

#if XSIMD_WITH_SVE
#include "./xsimd_sve.hpp"
#endif

// Must come last to have access to all conversion specializations.
#include "./xsimd_generic.hpp"

#endif
(File diff suppressed because it is too large)
(File diff suppressed because it is too large)
(File diff suppressed because it is too large)
(File diff suppressed because it is too large)
@@ -1,64 +0,0 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_SSE3_HPP
#define XSIMD_SSE3_HPP

#include "../types/xsimd_sse3_register.hpp"
#include <type_traits>

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // haddp
        template <class A>
        inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse3>) noexcept
        {
            return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]),
                               _mm_hadd_ps(row[2], row[3]));
        }
        template <class A>
        inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse3>) noexcept
        {
            return _mm_hadd_pd(row[0], row[1]);
        }

        // load_unaligned
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse3>) noexcept
        {
            return _mm_lddqu_si128((__m128i const*)mem);
        }

        // reduce_add
        template <class A>
        inline float reduce_add(batch<float, A> const& self, requires_arch<sse3>) noexcept
        {
            __m128 tmp0 = _mm_hadd_ps(self, self);
            __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0);
            return _mm_cvtss_f32(tmp1);
        }
        template <class A>
        inline double reduce_add(batch<double, A> const& self, requires_arch<sse3>) noexcept
        {
            __m128d tmp0 = _mm_hadd_pd(self, self);
            return _mm_cvtsd_f64(tmp0);
        }

    }

}

#endif
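For context (not part of the diff): the SSE3 reduce_add kernel above chains two _mm_hadd_ps calls, each of which sums adjacent lane pairs, so two rounds collapse four lanes into one. A standalone sketch using the identical intrinsics (the function name is illustrative):

    #include <pmmintrin.h> // SSE3 intrinsics

    // {a, b, c, d} -> {a+b, c+d, a+b, c+d} -> {a+b+c+d, ...}, then extract lane 0.
    inline float reduce_add_sse3(__m128 v)
    {
        __m128 tmp0 = _mm_hadd_ps(v, v);
        __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0);
        return _mm_cvtss_f32(tmp1);
    }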
@@ -1,350 +0,0 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_SSE4_1_HPP
#define XSIMD_SSE4_1_HPP

#include <type_traits>

#include "../types/xsimd_sse4_1_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;
        // any
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline bool any(batch<T, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return !_mm_testz_si128(self, self);
        }
        // ceil
        template <class A>
        inline batch<float, A> ceil(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_ceil_ps(self);
        }
        template <class A>
        inline batch<double, A> ceil(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_ceil_pd(self);
        }

        // fast_cast
        namespace detail
        {
            template <class A>
            inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
            {
                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
                __m128i xH = _mm_srai_epi32(x, 16);
                xH = _mm_blend_epi16(xH, _mm_setzero_si128(), 0x33);
                xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67
                __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0x88); // 2^52
                __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
                return _mm_add_pd(f, _mm_castsi128_pd(xL));
            }

            template <class A>
            inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
            {
                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
                __m128i xH = _mm_srli_epi64(x, 32);
                xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
                __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
                __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
                return _mm_add_pd(f, _mm_castsi128_pd(xL));
            }

            template <class A>
            inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse4_1>) noexcept
            {
                return _mm_castps_si128(
                    _mm_blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(self)),
                                  _mm_castsi128_ps(_mm_xor_si128(
                                      _mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
                                      _mm_set1_epi32(1u << 31))),
                                  _mm_cmpge_ps(self, _mm_set1_ps(1u << 31))));
            }
        }

        // eq
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm_cmpeq_epi64(self, other);
            }
            else
            {
                return eq(self, other, ssse3 {});
            }
        }

        // floor
        template <class A>
        inline batch<float, A> floor(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_floor_ps(self);
        }
        template <class A>
        inline batch<double, A> floor(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_floor_pd(self);
        }

        // insert
        template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse4_1>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_insert_epi8(self, val, I);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_insert_epi32(self, val, I);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
#if (!defined(_MSC_VER) && __x86_64__) || (_MSC_VER > 1900 && defined(_M_X64))
                return _mm_insert_epi64(self, val, I);
#else
                uint32_t lo, hi;
                memcpy(&lo, (reinterpret_cast<uint32_t*>(&val)), sizeof(lo));
                memcpy(&hi, (reinterpret_cast<uint32_t*>(&val)) + 1, sizeof(hi));
                return _mm_insert_epi32(_mm_insert_epi32(self, lo, 2 * I), hi, 2 * I + 1);
#endif
            }
            else
            {
                return insert(self, val, pos, ssse3 {});
            }
        }

        // max
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_max_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_max_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_max_epi32(self, other);
                }
                else
                {
                    return max(self, other, ssse3 {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_max_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_max_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_max_epu32(self, other);
                }
                else
                {
                    return max(self, other, ssse3 {});
                }
            }
        }

        // min
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_min_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_min_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_min_epi32(self, other);
                }
                else
                {
                    return min(self, other, ssse3 {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_min_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_min_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_min_epu32(self, other);
                }
                else
                {
                    return min(self, other, ssse3 {});
                }
            }
        }

        // mul
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_or_si128(
                    _mm_and_si128(_mm_mullo_epi16(self, other), _mm_srli_epi16(_mm_cmpeq_epi8(self, self), 8)),
                    _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_epi16(self, 8), _mm_srli_epi16(other, 8)), 8));
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_mullo_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_mullo_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm_add_epi64(
                    _mm_mul_epu32(self, other),
                    _mm_slli_epi64(
                        _mm_add_epi64(
                            _mm_mul_epu32(other, _mm_shuffle_epi32(self, _MM_SHUFFLE(2, 3, 0, 1))),
                            _mm_mul_epu32(self, _mm_shuffle_epi32(other, _MM_SHUFFLE(2, 3, 0, 1)))),
                        32));
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }

        // nearbyint
        template <class A>
        inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_ps(self, _MM_FROUND_TO_NEAREST_INT);
        }
        template <class A>
        inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_pd(self, _MM_FROUND_TO_NEAREST_INT);
        }

        // select
        namespace detail
        {
            template <class T>
            inline constexpr T interleave(T const& cond) noexcept
            {
                return (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 49) & 0x5555) | (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 48) & 0xAAAA);
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            return _mm_blendv_epi8(false_br, true_br, cond);
        }
        template <class A>
        inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            return _mm_blendv_ps(false_br, true_br, cond);
        }
        template <class A>
        inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            return _mm_blendv_pd(false_br, true_br, cond);
        }

        template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_blend_epi16(false_br, true_br, mask);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                constexpr int imask = detail::interleave(mask);
                return _mm_blend_epi16(false_br, true_br, imask);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                constexpr int imask = detail::interleave(mask);
                constexpr int imask2 = detail::interleave(imask);
                return _mm_blend_epi16(false_br, true_br, imask2);
            }
            else
            {
                return select(batch_bool_constant<batch<T, A>, Values...>(), true_br, false_br, ssse3 {});
            }
        }
        template <class A, bool... Values>
        inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            constexpr int mask = batch_bool_constant<batch<float, A>, Values...>::mask();
            return _mm_blend_ps(false_br, true_br, mask);
        }
        template <class A, bool... Values>
        inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            constexpr int mask = batch_bool_constant<batch<double, A>, Values...>::mask();
            return _mm_blend_pd(false_br, true_br, mask);
        }

        // trunc
        template <class A>
        inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_ps(self, _MM_FROUND_TO_ZERO);
        }
        template <class A>
        inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_pd(self, _MM_FROUND_TO_ZERO);
        }

    }

}

#endif
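For context (not part of the diff): the sizeof(T) == 8 branch of the SSE4.1 mul kernel above synthesizes a 64-bit lane product from 32x32->64 multiplies, because SSE has no packed 64-bit multiply. A scalar sketch of the same decomposition (the function name is illustrative):

    #include <cstdint>

    // a*b mod 2^64 == lo(a)*lo(b) + ((hi(a)*lo(b) + lo(a)*hi(b)) << 32),
    // which is what the _mm_mul_epu32 / _mm_slli_epi64 sequence computes per lane.
    inline uint64_t mul64_via_32(uint64_t a, uint64_t b)
    {
        uint64_t a_lo = a & 0xFFFFFFFFu, a_hi = a >> 32;
        uint64_t b_lo = b & 0xFFFFFFFFu, b_hi = b >> 32;
        return a_lo * b_lo + ((a_hi * b_lo + a_lo * b_hi) << 32);
    }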
@@ -1,44 +0,0 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_SSE4_2_HPP
#define XSIMD_SSE4_2_HPP

#include <limits>

#include "../types/xsimd_sse4_2_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // lt
        template <class A>
        inline batch_bool<int64_t, A> lt(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<sse4_2>) noexcept
        {
            return _mm_cmpgt_epi64(other, self);
        }
        template <class A>
        inline batch_bool<uint64_t, A> lt(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<sse4_2>) noexcept
        {
            auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
            auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
            return _mm_cmpgt_epi64(xother, xself);
        }

    }

}

#endif
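For context (not part of the diff): SSE4.2 only provides a signed 64-bit compare (_mm_cmpgt_epi64), so the unsigned lt kernel above first XORs both operands with the smallest int64_t value, which flips the sign bit and maps unsigned order onto signed order. A scalar sketch of that trick (the function name is illustrative):

    #include <cstdint>

    // a < b as unsigned iff (a ^ 2^63) < (b ^ 2^63) as signed.
    inline bool ult_via_signed(uint64_t a, uint64_t b)
    {
        const uint64_t bias = 0x8000000000000000ull; // sign bit
        return (int64_t)(a ^ bias) < (int64_t)(b ^ bias);
    }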
@@ -1,142 +0,0 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_SSSE3_HPP
#define XSIMD_SSSE3_HPP

#include <cstddef>
#include <type_traits>

#include "../types/xsimd_ssse3_register.hpp"
#include "../types/xsimd_utils.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // abs
        template <class A, class T, typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
        inline batch<T, A> abs(batch<T, A> const& self, requires_arch<ssse3>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_abs_epi8(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_abs_epi16(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_abs_epi32(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm_abs_epi64(self);
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }

        // extract_pair
        namespace detail
        {

            template <class T, class A>
            inline batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& other, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
            {
                return other;
            }

            template <class T, class A, std::size_t I, std::size_t... Is>
            inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, ::xsimd::detail::index_sequence<I, Is...>) noexcept
            {
                if (i == I)
                {
                    return _mm_alignr_epi8(self, other, sizeof(T) * I);
                }
                else
                    return extract_pair(self, other, i, ::xsimd::detail::index_sequence<Is...>());
            }
        }

        template <class A, class T, class _ = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<ssse3>) noexcept
        {
            constexpr std::size_t size = batch<T, A>::size;
            assert(0 <= i && i < size && "index in bounds");
            return detail::extract_pair(self, other, i, ::xsimd::detail::make_index_sequence<size>());
        }

        // reduce_add
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline T reduce_add(batch<T, A> const& self, requires_arch<ssse3>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                __m128i tmp1 = _mm_hadd_epi16(self, self);
                __m128i tmp2 = _mm_hadd_epi16(tmp1, tmp1);
                __m128i tmp3 = _mm_hadd_epi16(tmp2, tmp2);
                return _mm_cvtsi128_si32(tmp3) & 0xFFFF;
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                __m128i tmp1 = _mm_hadd_epi32(self, self);
                __m128i tmp2 = _mm_hadd_epi32(tmp1, tmp1);
                return _mm_cvtsi128_si32(tmp2);
            }
            else
            {
                return reduce_add(self, sse3 {});
            }
        }

        // swizzle
        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
        inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<ssse3>) noexcept
        {
            constexpr batch_constant<batch<uint8_t, A>, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1,
                                     2 * V4, 2 * V4 + 1, 2 * V5, 2 * V5 + 1, 2 * V6, 2 * V6 + 1, 2 * V7, 2 * V7 + 1>
                mask8;
            return _mm_shuffle_epi8(self, (batch<uint8_t, A>)mask8);
        }

        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
        inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept
        {
            return bitwise_cast<batch<int16_t, A>>(swizzle(bitwise_cast<batch<uint16_t, A>>(self), mask, ssse3 {}));
        }

        template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
                  uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
        inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
        {
            return _mm_shuffle_epi8(self, (batch<uint8_t, A>)mask);
        }

        template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
                  uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
        inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
        {
            return bitwise_cast<batch<int8_t, A>>(swizzle(bitwise_cast<batch<uint8_t, A>>(self), mask, ssse3 {}));
        }

    }

}

#endif
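For context (not part of the diff): the SSSE3 16-bit swizzle kernel above lowers a lane permutation to _mm_shuffle_epi8 by expanding each 16-bit lane index V into the byte pair {2V, 2V+1}. A runtime sketch of that mask expansion (the kernel does it at compile time via batch_constant; the function name is illustrative):

    #include <tmmintrin.h> // SSSE3 intrinsics
    #include <cstdint>

    inline __m128i swizzle_u16(__m128i v, const uint8_t idx[8])
    {
        alignas(16) uint8_t bytes[16];
        for (int i = 0; i < 8; ++i)
        {
            bytes[2 * i] = (uint8_t)(2 * idx[i]);         // low byte of lane idx[i]
            bytes[2 * i + 1] = (uint8_t)(2 * idx[i] + 1); // high byte of lane idx[i]
        }
        return _mm_shuffle_epi8(v, _mm_load_si128((const __m128i*)bytes));
    }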
(Diff of 1 file not shown because of its large size.)
@@ -1,249 +0,0 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_ARCH_HPP
#define XSIMD_ARCH_HPP

#include <initializer_list>
#include <type_traits>
#include <utility>

#include "../types/xsimd_all_registers.hpp"
#include "./xsimd_config.hpp"
#include "./xsimd_cpuid.hpp"

namespace xsimd
{

    namespace detail
    {
        // Checks whether T appears in Tys.
        template <class T, class... Tys>
        struct contains;

        template <class T>
        struct contains<T> : std::false_type
        {
        };

        template <class T, class Ty, class... Tys>
        struct contains<T, Ty, Tys...>
            : std::conditional<std::is_same<Ty, T>::value, std::true_type,
                               contains<T, Tys...>>::type
        {
        };

        template <class... Archs>
        struct is_sorted;

        template <>
        struct is_sorted<> : std::true_type
        {
        };

        template <class Arch>
        struct is_sorted<Arch> : std::true_type
        {
        };

        template <class A0, class A1, class... Archs>
        struct is_sorted<A0, A1, Archs...>
            : std::conditional<(A0::version() >= A1::version()), is_sorted<Archs...>,
                               std::false_type>::type
        {
        };

        template <typename T>
        inline constexpr T max_of(T value) noexcept
        {
            return value;
        }

        template <typename T, typename... Ts>
        inline constexpr T max_of(T head0, T head1, Ts... tail) noexcept
        {
            return max_of((head0 > head1 ? head0 : head1), tail...);
        }

    } // namespace detail

    // An arch_list is a list of architectures, sorted by version number.
    template <class... Archs>
    struct arch_list
    {
#ifndef NDEBUG
        static_assert(detail::is_sorted<Archs...>::value,
                      "architecture list must be sorted by version");
#endif

        template <class Arch>
        using add = arch_list<Archs..., Arch>;

        template <class... OtherArchs>
        using extend = arch_list<Archs..., OtherArchs...>;

        template <class Arch>
        static constexpr bool contains() noexcept
        {
            return detail::contains<Arch, Archs...>::value;
        }

        template <class F>
        static void for_each(F&& f) noexcept
        {
            (void)std::initializer_list<bool> { (f(Archs {}), true)... };
        }

        static constexpr std::size_t alignment() noexcept
        {
            // all alignments are a power of two
            return detail::max_of(Archs::alignment()..., static_cast<size_t>(0));
        }
    };

    struct unavailable
    {
        static constexpr bool supported() noexcept { return false; }
        static constexpr bool available() noexcept { return false; }
        static constexpr unsigned version() noexcept { return 0; }
        static constexpr std::size_t alignment() noexcept { return 0; }
        static constexpr bool requires_alignment() noexcept { return false; }
        static constexpr char const* name() noexcept { return "<none>"; }
    };

    namespace detail
    {
        // Pick the best architecture in arch_list L, which is the last
        // because architectures are sorted by version.
        template <class L>
        struct best;

        template <>
        struct best<arch_list<>>
        {
            using type = unavailable;
        };

        template <class Arch, class... Archs>
        struct best<arch_list<Arch, Archs...>>
        {
            using type = Arch;
        };

        // Filter archlists Archs, picking only supported archs and adding
        // them to L.
        template <class L, class... Archs>
        struct supported_helper;

        template <class L>
        struct supported_helper<L, arch_list<>>
        {
            using type = L;
        };

        template <class L, class Arch, class... Archs>
        struct supported_helper<L, arch_list<Arch, Archs...>>
            : supported_helper<
                  typename std::conditional<Arch::supported(),
                                            typename L::template add<Arch>, L>::type,
                  arch_list<Archs...>>
        {
        };

        template <class... Archs>
        struct supported : supported_helper<arch_list<>, Archs...>
        {
        };

        // Joins all arch_list Archs in a single arch_list.
        template <class... Archs>
        struct join;

        template <class Arch>
        struct join<Arch>
        {
            using type = Arch;
        };

        template <class Arch, class... Archs, class... Args>
        struct join<Arch, arch_list<Archs...>, Args...>
            : join<typename Arch::template extend<Archs...>, Args...>
        {
        };
    } // namespace detail

    struct unsupported
    {
    };
    using all_x86_architectures = arch_list<avx512bw, avx512dq, avx512cd, avx512f, fma3<avx2>, avx2, fma3<avx>, avx, fma4, fma3<sse4_2>, sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>;
    using all_sve_architectures = arch_list<detail::sve<512>, detail::sve<256>, detail::sve<128>>;
    using all_arm_architectures = typename detail::join<all_sve_architectures, arch_list<neon64, neon>>::type;
    using all_architectures = typename detail::join<all_arm_architectures, all_x86_architectures>::type;

    using supported_architectures = typename detail::supported<all_architectures>::type;

    using x86_arch = typename detail::best<typename detail::supported<all_x86_architectures>::type>::type;
    using arm_arch = typename detail::best<typename detail::supported<all_arm_architectures>::type>::type;
    // using default_arch = typename detail::best<typename detail::supported<arch_list</*arm_arch,*/ x86_arch>>::type>::type;
    using default_arch = typename std::conditional<std::is_same<x86_arch, unavailable>::value,
                                                   arm_arch,
                                                   x86_arch>::type;

    namespace detail
    {
        template <class F, class ArchList>
        class dispatcher
        {

            const unsigned best_arch;
            F functor;

            template <class Arch, class... Tys>
            auto walk_archs(arch_list<Arch>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
            {
                assert(Arch::available() && "At least one arch must be supported during dispatch");
                return functor(Arch {}, std::forward<Tys>(args)...);
            }

            template <class Arch, class ArchNext, class... Archs, class... Tys>
            auto walk_archs(arch_list<Arch, ArchNext, Archs...>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
            {
                if (Arch::version() <= best_arch)
                    return functor(Arch {}, std::forward<Tys>(args)...);
                else
                    return walk_archs(arch_list<ArchNext, Archs...> {}, std::forward<Tys>(args)...);
            }

        public:
            dispatcher(F f) noexcept
                : best_arch(available_architectures().best)
                , functor(f)
            {
            }

            template <class... Tys>
            auto operator()(Tys&&... args) noexcept -> decltype(functor(default_arch {}, std::forward<Tys>(args)...))
            {
                return walk_archs(ArchList {}, std::forward<Tys>(args)...);
            }
        };
    }

    // Generic function dispatch, à la ifunc
    template <class ArchList = supported_architectures, class F>
    inline detail::dispatcher<F, ArchList> dispatch(F&& f) noexcept
    {
        return { std::forward<F>(f) };
    }

} // namespace xsimd

#endif
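For context (not part of the diff): the dispatcher above walks the sorted arch_list and calls the functor with the first architecture tag whose version does not exceed the best one detected at runtime. A usage sketch under that API (the functor and all names are illustrative):

    #include <xsimd/xsimd.hpp>
    #include <cstddef>

    struct sum_functor
    {
        // Called as functor(Arch{}, args...) by the dispatcher above.
        template <class Arch>
        float operator()(Arch, const float* data, std::size_t n) const
        {
            using batch = xsimd::batch<float, Arch>;
            batch acc(0.0f);
            std::size_t vec_end = n - n % batch::size;
            for (std::size_t i = 0; i < vec_end; i += batch::size)
                acc += batch::load_unaligned(data + i);
            float total = xsimd::reduce_add(acc);
            for (std::size_t i = vec_end; i < n; ++i) // scalar tail
                total += data[i];
            return total;
        }
    };

    // auto sum = xsimd::dispatch(sum_functor{});
    // float s = sum(data, n); // runs the best kernel available on this CPU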
@@ -1,341 +0,0 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_CONFIG_HPP
#define XSIMD_CONFIG_HPP

#define XSIMD_VERSION_MAJOR 10
#define XSIMD_VERSION_MINOR 0
#define XSIMD_VERSION_PATCH 0

/**
 * high level free functions
 *
 * @defgroup xsimd_config_macro Instruction Set Detection
 */

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE2 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE2__
#define XSIMD_WITH_SSE2 1
#else
#define XSIMD_WITH_SSE2 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE3 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE3__
#define XSIMD_WITH_SSE3 1
#else
#define XSIMD_WITH_SSE3 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSSE3 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSSE3__
#define XSIMD_WITH_SSSE3 1
#else
#define XSIMD_WITH_SSSE3 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE4.1 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE4_1__
#define XSIMD_WITH_SSE4_1 1
#else
#define XSIMD_WITH_SSE4_1 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE4.2 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE4_2__
#define XSIMD_WITH_SSE4_2 1
#else
#define XSIMD_WITH_SSE4_2 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX__
#define XSIMD_WITH_AVX 1
#else
#define XSIMD_WITH_AVX 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX2 is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX2__
#define XSIMD_WITH_AVX2 1
#else
#define XSIMD_WITH_AVX2 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if FMA3 for SSE is available at compile-time, to 0 otherwise.
 */
#ifdef __FMA__

#if defined(__SSE__)
#ifndef XSIMD_WITH_FMA3_SSE // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_SSE 1
#endif
#else

#if XSIMD_WITH_FMA3_SSE
#error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_SSE 0
#endif

#else

#if XSIMD_WITH_FMA3_SSE
#error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_SSE 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if FMA3 for AVX is available at compile-time, to 0 otherwise.
 */
#ifdef __FMA__

#if defined(__AVX__)
#ifndef XSIMD_WITH_FMA3_AVX // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_AVX 1
#endif
#else

#if XSIMD_WITH_FMA3_AVX
#error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_AVX 0
#endif

#if defined(__AVX2__)
#ifndef XSIMD_WITH_FMA3_AVX2 // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_AVX2 1
#endif
#else

#if XSIMD_WITH_FMA3_AVX2
#error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_AVX2 0
#endif

#else

#if XSIMD_WITH_FMA3_AVX
#error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags"
#endif

#if XSIMD_WITH_FMA3_AVX2
#error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_AVX 0
#define XSIMD_WITH_FMA3_AVX2 0

#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if FMA4 is available at compile-time, to 0 otherwise.
 */
#ifdef __FMA4__
#define XSIMD_WITH_FMA4 1
#else
#define XSIMD_WITH_FMA4 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512F is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512F__
// AVX512 instructions are supported starting with gcc 6
// see https://www.gnu.org/software/gcc/gcc-6/changes.html
// check clang first, newer clang always defines __GNUC__ = 4
#if defined(__clang__) && __clang_major__ >= 6
#define XSIMD_WITH_AVX512F 1
#elif defined(__GNUC__) && __GNUC__ < 6
#define XSIMD_WITH_AVX512F 0
#else
#define XSIMD_WITH_AVX512F 1
#if __GNUC__ == 6
#define XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY 1
#endif
#endif
#else
#define XSIMD_WITH_AVX512F 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512CD is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512CD__
// Avoids repeating the GCC workaround over and over
#define XSIMD_WITH_AVX512CD XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512CD 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512DQ is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512DQ__
#define XSIMD_WITH_AVX512DQ XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512DQ 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512BW is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512BW__
#define XSIMD_WITH_AVX512BW XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512BW 0
#endif

#ifdef __ARM_NEON

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if NEON is available at compile-time, to 0 otherwise.
 */
#if __ARM_ARCH >= 7
#define XSIMD_WITH_NEON 1
#else
#define XSIMD_WITH_NEON 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if NEON64 is available at compile-time, to 0 otherwise.
 */
#ifdef __aarch64__
#define XSIMD_WITH_NEON64 1
#else
#define XSIMD_WITH_NEON64 0
#endif
#else
#define XSIMD_WITH_NEON 0
#define XSIMD_WITH_NEON64 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SVE is available and bit width is pre-set at compile-time, to 0 otherwise.
 */
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
#define XSIMD_WITH_SVE 1
#define XSIMD_SVE_BITS __ARM_FEATURE_SVE_BITS
#else
#define XSIMD_WITH_SVE 0
#define XSIMD_SVE_BITS 0
#endif

// Workaround for MSVC compiler
#ifdef _MSC_VER

#if XSIMD_WITH_AVX512
#undef XSIMD_WITH_AVX2
#define XSIMD_WITH_AVX2 1
#endif

#if XSIMD_WITH_AVX2
#undef XSIMD_WITH_AVX
#define XSIMD_WITH_AVX 1
#undef XSIMD_WITH_FMA3_AVX
#define XSIMD_WITH_FMA3_AVX 1
#undef XSIMD_WITH_FMA3_AVX2
#define XSIMD_WITH_FMA3_AVX2 1
#endif

#if XSIMD_WITH_AVX
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif

#if !defined(__clang__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif

#if XSIMD_WITH_SSE4_2
#undef XSIMD_WITH_SSE4_1
#define XSIMD_WITH_SSE4_1 1
#endif

#if XSIMD_WITH_SSE4_1
#undef XSIMD_WITH_SSSE3
#define XSIMD_WITH_SSSE3 1
#endif

#if XSIMD_WITH_SSSE3
#undef XSIMD_WITH_SSE3
#define XSIMD_WITH_SSE3 1
#endif

#if XSIMD_WITH_SSE3 || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#undef XSIMD_WITH_SSE2
#define XSIMD_WITH_SSE2 1
#endif

#endif

#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE
#define XSIMD_NO_SUPPORTED_ARCHITECTURE
#endif

#endif
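For context (not part of the diff): the XSIMD_WITH_* macros above let client code branch at compile time on what the toolchain actually enabled. A minimal sketch (the alias name preferred_arch is illustrative):

    #include <xsimd/xsimd.hpp>

    #if XSIMD_WITH_AVX2
    using preferred_arch = xsimd::avx2;
    #elif XSIMD_WITH_SSE2
    using preferred_arch = xsimd::sse2;
    #elif XSIMD_WITH_NEON64
    using preferred_arch = xsimd::neon64;
    #else
    // XSIMD_NO_SUPPORTED_ARCHITECTURE is defined in this case; fall back to scalar code.
    #endif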
@@ -1,180 +0,0 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_CPUID_HPP
#define XSIMD_CPUID_HPP

#include <algorithm>
#include <cstring>

#if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM))
#include <asm/hwcap.h>
#include <sys/auxv.h>
#endif

#if defined(_MSC_VER)
// Contains the definition of __cpuidex
#include <intrin.h>
#endif

#include "../types/xsimd_all_registers.hpp"

namespace xsimd
{
    namespace detail
    {
        struct supported_arch
        {
            unsigned sse2 : 1;
            unsigned sse3 : 1;
            unsigned ssse3 : 1;
            unsigned sse4_1 : 1;
            unsigned sse4_2 : 1;
            unsigned sse4a : 1;
            unsigned fma3_sse : 1;
            unsigned fma4 : 1;
            unsigned xop : 1;
            unsigned avx : 1;
            unsigned fma3_avx : 1;
            unsigned avx2 : 1;
            unsigned fma3_avx2 : 1;
            unsigned avx512f : 1;
            unsigned avx512cd : 1;
            unsigned avx512dq : 1;
            unsigned avx512bw : 1;
            unsigned neon : 1;
            unsigned neon64 : 1;

            // version number of the best arch available
            unsigned best;

            supported_arch() noexcept
            {
                memset(this, 0, sizeof(supported_arch));

#if defined(__aarch64__) || defined(_M_ARM64)
                neon = 1;
                neon64 = 1;
                best = neon64::version();
#elif defined(__ARM_NEON) || defined(_M_ARM)
#if defined(__linux__)
                neon = bool(getauxval(AT_HWCAP) & HWCAP_NEON);
#else
                // that's very conservative :-/
                neon = 0;
#endif
                neon64 = 0;
                best = neon::version() * neon;

#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)
                auto get_cpuid = [](int reg[4], int func_id) noexcept
                {

#if defined(_MSC_VER)
                    __cpuidex(reg, func_id, 0);

#elif defined(__INTEL_COMPILER)
                    __cpuid(reg, func_id);

#elif defined(__GNUC__) || defined(__clang__)

#if defined(__i386__) && defined(__PIC__)
                    // %ebx may be the PIC register
                    __asm__("xchg{l}\t{%%}ebx, %1\n\t"
                            "cpuid\n\t"
                            "xchg{l}\t{%%}ebx, %1\n\t"
                            : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]),
                              "=d"(reg[3])
                            : "a"(func_id), "c"(0));

#else
                    __asm__("cpuid\n\t"
                            : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]),
                              "=d"(reg[3])
                            : "a"(func_id), "c"(0));
#endif

#else
#error "Unsupported configuration"
#endif
                };

                int regs[4];

                get_cpuid(regs, 0x1);

                sse2 = regs[3] >> 26 & 1;
                best = std::max(best, sse2::version() * sse2);

                sse3 = regs[2] >> 0 & 1;
                best = std::max(best, sse3::version() * sse3);

                ssse3 = regs[2] >> 9 & 1;
                best = std::max(best, ssse3::version() * ssse3);

                sse4_1 = regs[2] >> 19 & 1;
                best = std::max(best, sse4_1::version() * sse4_1);

                sse4_2 = regs[2] >> 20 & 1;
                best = std::max(best, sse4_2::version() * sse4_2);

                fma3_sse = regs[2] >> 12 & 1;
                if (sse4_2)
                    best = std::max(best, fma3<xsimd::sse4_2>::version() * fma3_sse);

                get_cpuid(regs, 0x80000001);
                fma4 = regs[2] >> 16 & 1;
                best = std::max(best, fma4::version() * fma4);

                // sse4a = regs[2] >> 6 & 1;
                // best = std::max(best, XSIMD_X86_AMD_SSE4A_VERSION * sse4a);

                // xop = regs[2] >> 11 & 1;
                // best = std::max(best, XSIMD_X86_AMD_XOP_VERSION * xop);

                avx = regs[2] >> 28 & 1;
                best = std::max(best, avx::version() * avx);

                fma3_avx = avx && fma3_sse;
                best = std::max(best, fma3<xsimd::avx>::version() * fma3_avx);

                get_cpuid(regs, 0x7);
                avx2 = regs[1] >> 5 & 1;
                best = std::max(best, avx2::version() * avx2);

                fma3_avx2 = avx2 && fma3_sse;
                best = std::max(best, fma3<xsimd::avx2>::version() * fma3_avx2);

                avx512f = regs[1] >> 16 & 1;
                best = std::max(best, avx512f::version() * avx512f);

                avx512cd = regs[1] >> 28 & 1;
                best = std::max(best, avx512cd::version() * avx512cd * avx512f);

                avx512dq = regs[1] >> 17 & 1;
                best = std::max(best, avx512dq::version() * avx512dq * avx512cd * avx512f);

                avx512bw = regs[1] >> 30 & 1;
                best = std::max(best, avx512bw::version() * avx512bw * avx512dq * avx512cd * avx512f);

#endif
            }
        };
    }

    inline detail::supported_arch available_architectures() noexcept
    {
        static detail::supported_arch supported;
        return supported;
    }
}

#endif
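For context (not part of the diff): available_architectures() above runs CPUID (or reads Linux hwcaps on ARM) once and caches the result in a function-local static. A sketch that queries it at runtime (the function name is illustrative; the casts are there because the fields are bit-fields):

    #include <xsimd/xsimd.hpp>
    #include <cstdio>

    void print_simd_support()
    {
        auto arch = xsimd::available_architectures();
        std::printf("sse2=%u avx=%u avx2=%u avx512f=%u neon=%u best-version=%u\n",
                    (unsigned)arch.sse2, (unsigned)arch.avx, (unsigned)arch.avx2,
                    (unsigned)arch.avx512f, (unsigned)arch.neon, arch.best);
    }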
@ -1,719 +0,0 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
namespace xsimd
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
|
||||
/* origin: boost/simd/arch/common/scalar/function/rem_pio2.hpp */
|
||||
/*
|
||||
* ====================================================
|
||||
* copyright 2016 NumScale SAS
|
||||
*
|
||||
* Distributed under the Boost Software License, Version 1.0.
|
||||
* (See copy at http://boost.org/LICENSE_1_0.txt)
|
||||
* ====================================================
|
||||
*/
|
||||
#if defined(_MSC_VER)
|
||||
#define ONCE0 \
|
||||
__pragma(warning(push)) \
|
||||
__pragma(warning(disable : 4127)) while (0) \
|
||||
__pragma(warning(pop)) /**/
|
||||
#else
|
||||
#define ONCE0 while (0)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* ====================================================
|
||||
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
|
||||
*
|
||||
* Developed at SunPro, a Sun Microsystems, Inc. business.
|
||||
* Permission to use, copy, modify, and distribute this
|
||||
* software is freely granted, provided that this notice
|
||||
* is preserved.
|
||||
* ====================================================
|
||||
*/
|
||||
|
||||
#if defined(__GNUC__) && defined(__BYTE_ORDER__)
|
||||
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
||||
#define XSIMD_LITTLE_ENDIAN
|
||||
#endif
|
||||
#elif defined(_WIN32)
|
||||
// We can safely assume that Windows is always little endian
|
||||
#define XSIMD_LITTLE_ENDIAN
|
||||
#elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
|
||||
#define XSIMD_LITTLE_ENDIAN
|
||||
#endif
|
||||
|
||||
#ifdef XSIMD_LITTLE_ENDIAN
|
||||
#define LOW_WORD_IDX 0
|
||||
#define HIGH_WORD_IDX sizeof(std::uint32_t)
|
||||
#else
|
||||
#define LOW_WORD_IDX sizeof(std::uint32_t)
|
||||
#define HIGH_WORD_IDX 0
|
||||
#endif
|
||||
|
||||
#define GET_HIGH_WORD(i, d) \
|
||||
do \
|
||||
{ \
|
||||
double f = (d); \
|
||||
std::memcpy(&(i), reinterpret_cast<char*>(&f) + HIGH_WORD_IDX, \
|
||||
sizeof(std::uint32_t)); \
|
||||
} \
|
||||
ONCE0 \
|
||||
/**/
|
||||
|
||||
#define GET_LOW_WORD(i, d) \
|
||||
do \
|
||||
{ \
|
||||
double f = (d); \
|
||||
std::memcpy(&(i), reinterpret_cast<char*>(&f) + LOW_WORD_IDX, \
|
||||
sizeof(std::uint32_t)); \
|
||||
} \
|
||||
ONCE0 \
|
||||
/**/
|
||||
|
||||
#define SET_HIGH_WORD(d, v) \
|
||||
do \
|
||||
{ \
|
||||
double f = (d); \
|
||||
std::uint32_t value = (v); \
|
||||
std::memcpy(reinterpret_cast<char*>(&f) + HIGH_WORD_IDX, \
|
||||
&value, sizeof(std::uint32_t)); \
|
||||
(d) = f; \
|
||||
} \
|
||||
ONCE0 \
|
||||
/**/
|
||||
|
||||
#define SET_LOW_WORD(d, v) \
|
||||
do \
|
||||
{ \
|
||||
double f = (d); \
|
||||
std::uint32_t value = (v); \
|
||||
std::memcpy(reinterpret_cast<char*>(&f) + LOW_WORD_IDX, \
|
||||
&value, sizeof(std::uint32_t)); \
|
||||
(d) = f; \
|
||||
} \
|
||||
ONCE0 \
|
||||
/**/
|
||||
|
||||
/*
|
||||
* __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
|
||||
* double x[],y[]; int e0,nx,prec; int ipio2[];
|
||||
*
|
||||
* __kernel_rem_pio2 return the last three digits of N with
|
||||
* y = x - N*pi/2
|
||||
* so that |y| < pi/2.
|
||||
*
|
||||
* The method is to compute the integer (mod 8) and fraction parts of
|
||||
* (2/pi)*x without doing the full multiplication. In general we
|
||||
* skip the part of the product that are known to be a huge integer (
|
||||
* more accurately, = 0 mod 8 ). Thus the number of operations are
|
||||
* independent of the exponent of the input.
|
||||
*
|
||||
* (2/pi) is represented by an array of 24-bit integers in ipio2[].
|
||||
*
|
||||
* Input parameters:
|
||||
* x[] The input value (must be positive) is broken into nx
|
||||
* pieces of 24-bit integers in double precision format.
|
||||
* x[i] will be the i-th 24 bit of x. The scaled exponent
|
||||
* of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
|
||||
* match x's up to 24 bits.
|
||||
*
|
||||
* Example of breaking a double positive z into x[0]+x[1]+x[2]:
|
||||
* e0 = ilogb(z)-23
|
||||
* z = scalbn(z,-e0)
|
||||
* for i = 0,1,2
|
||||
* x[i] = floor(z)
|
||||
* z = (z-x[i])*2**24
|
||||
*
|
||||
*
|
||||
* y[] ouput result in an array of double precision numbers.
|
||||
* The dimension of y[] is:
|
||||
* 24-bit precision 1
|
||||
* 53-bit precision 2
|
||||
* 64-bit precision 2
|
||||
* 113-bit precision 3
|
||||
* The actual value is the sum of them. Thus for 113-bit
|
||||
* precison, one may have to do something like:
|
||||
*
|
||||
* long double t,w,r_head, r_tail;
|
||||
* t = (long double)y[2] + (long double)y[1];
|
||||
* w = (long double)y[0];
|
||||
* r_head = t+w;
|
||||
* r_tail = w - (r_head - t);
|
||||
*
|
||||
* e0 The exponent of x[0]
|
||||
*
|
||||
* nx dimension of x[]
|
||||
*
|
||||
* prec an integer indicating the precision:
|
||||
* 0 24 bits (single)
|
||||
* 1 53 bits (double)
|
||||
* 2 64 bits (extended)
|
||||
* 3 113 bits (quad)
|
||||
*
|
||||
* ipio2[]
|
||||
* integer array, contains the (24*i)-th to (24*i+23)-th
|
||||
* bit of 2/pi after binary point. The corresponding
|
||||
* floating value is
|
||||
*
|
||||
* ipio2[i] * 2^(-24(i+1)).
|
||||
*
|
||||
* External function:
|
||||
* double scalbn(), floor();
|
||||
*
|
||||
*
|
||||
* Here is the description of some local variables:
|
||||
*
|
||||
* jk jk+1 is the initial number of terms of ipio2[] needed
|
||||
* in the computation. The recommended value is 2,3,4,
|
||||
* 6 for single, double, extended,and quad.
|
||||
*
|
||||
* jz local integer variable indicating the number of
|
||||
* terms of ipio2[] used.
|
||||
*
|
||||
* jx nx - 1
|
||||
*
|
||||
* jv index for pointing to the suitable ipio2[] for the
|
||||
* computation. In general, we want
|
||||
* ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
|
||||
* is an integer. Thus
|
||||
* e0-3-24*jv >= 0 or (e0-3)/24 >= jv
|
||||
* Hence jv = max(0,(e0-3)/24).
|
||||
*
|
||||
* jp jp+1 is the number of terms in PIo2[] needed, jp = jk.
|
||||
*
|
||||
* q[] double array with integral value, representing the
|
||||
* 24-bits chunk of the product of x and 2/pi.
|
||||
*
|
||||
* q0 the corresponding exponent of q[0]. Note that the
|
||||
* exponent for q[i] would be q0-24*i.
|
||||
*
|
||||
* PIo2[] double precision array, obtained by cutting pi/2
|
||||
* into 24 bits chunks.
|
||||
*
|
||||
* f[] ipio2[] in floating point
|
||||
*
|
||||
* iq[] integer array by breaking up q[] in 24-bits chunk.
|
||||
*
|
||||
* fq[] final product of x*(2/pi) in fq[0],..,fq[jk]
|
||||
*
|
||||
* ih integer. If >0 it indicates q[] is >= 0.5, hence
|
||||
* it also indicates the *sign* of the result.
|
||||
*
|
||||
*/
|
||||
|
||||
inline int32_t __kernel_rem_pio2(double* x, double* y, int32_t e0, int32_t nx, int32_t prec, const int32_t* ipio2) noexcept
|
||||
{
|
||||
static const int32_t init_jk[] = { 2, 3, 4, 6 }; /* initial value for jk */
|
||||
|
||||
static const double PIo2[] = {
|
||||
1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
|
||||
7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
|
||||
5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
|
||||
3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
|
||||
1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
|
||||
1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
|
||||
2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
|
||||
2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
|
||||
};
|
||||
|
||||
static const double
|
||||
zero
|
||||
= 0.0,
|
||||
one = 1.0,
|
||||
two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
|
||||
twon24 = 5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */
|
||||
|
||||
    int32_t jz, jx, jv, jp, jk, carry, n, iq[20], i, j, k, m, q0, ih;
    double z, fw, f[20], fq[20], q[20];

    /* initialize jk */
    jk = init_jk[prec];
    jp = jk;

    /* determine jx,jv,q0, note that 3>q0 */
    jx = nx - 1;
    jv = (e0 - 3) / 24;
    if (jv < 0)
        jv = 0;
    q0 = e0 - 24 * (jv + 1);

    /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
    j = jv - jx;
    m = jx + jk;
    for (i = 0; i <= m; i++, j++)
        f[i] = (j < 0) ? zero : (double)ipio2[j];

    /* compute q[0],q[1],...q[jk] */
    for (i = 0; i <= jk; i++)
    {
        for (j = 0, fw = 0.0; j <= jx; j++)
            fw += x[j] * f[jx + i - j];
        q[i] = fw;
    }

    jz = jk;

recompute:
    /* distill q[] into iq[] reversingly */
    for (i = 0, j = jz, z = q[jz]; j > 0; i++, j--)
    {
        fw = (double)((int32_t)(twon24 * z));
        iq[i] = (int32_t)(z - two24 * fw);
        z = q[j - 1] + fw;
    }

    /* compute n */
    z = std::scalbn(z, q0);           /* actual value of z */
    z -= 8.0 * std::floor(z * 0.125); /* trim off integer >= 8 */
    n = (int32_t)z;
    z -= (double)n;
    ih = 0;
    if (q0 > 0)
    { /* need iq[jz-1] to determine n */
        i = (iq[jz - 1] >> (24 - q0));
        n += i;
        iq[jz - 1] -= i << (24 - q0);
        ih = iq[jz - 1] >> (23 - q0);
    }
    else if (q0 == 0)
        ih = iq[jz - 1] >> 23;
    else if (z >= 0.5)
        ih = 2;

    if (ih > 0)
    { /* q > 0.5 */
        n += 1;
        carry = 0;
        for (i = 0; i < jz; i++)
        { /* compute 1-q */
            j = iq[i];
            if (carry == 0)
            {
                if (j != 0)
                {
                    carry = 1;
                    iq[i] = 0x1000000 - j;
                }
            }
            else
                iq[i] = 0xffffff - j;
        }
        if (q0 > 0)
        { /* rare case: chance is 1 in 12 */
            switch (q0)
            {
            case 1:
                iq[jz - 1] &= 0x7fffff;
                break;
            case 2:
                iq[jz - 1] &= 0x3fffff;
                break;
            }
        }
        if (ih == 2)
        {
            z = one - z;
            if (carry != 0)
                z -= std::scalbn(one, q0);
        }
    }

    /* check if recomputation is needed */
    if (z == zero)
    {
        j = 0;
        for (i = jz - 1; i >= jk; i--)
            j |= iq[i];
        if (j == 0)
        { /* need recomputation */
            for (k = 1; iq[jk - k] == 0; k++)
                ; /* k = number of terms needed */

            for (i = jz + 1; i <= jz + k; i++)
            { /* add q[jz+1] to q[jz+k] */
                f[jx + i] = (double)ipio2[jv + i];
                for (j = 0, fw = 0.0; j <= jx; j++)
                    fw += x[j] * f[jx + i - j];
                q[i] = fw;
            }
            jz += k;
            goto recompute;
        }
    }

    /* chop off zero terms */
    if (z == 0.0)
    {
        jz -= 1;
        q0 -= 24;
        while (iq[jz] == 0)
        {
            jz--;
            q0 -= 24;
        }
    }
    else
    { /* break z into 24-bit chunks if necessary */
        z = std::scalbn(z, -q0);
        if (z >= two24)
        {
            fw = (double)((int32_t)(twon24 * z));
            iq[jz] = (int32_t)(z - two24 * fw);
            jz += 1;
            q0 += 24;
            iq[jz] = (int32_t)fw;
        }
        else
            iq[jz] = (int32_t)z;
    }

    /* convert integer "bit" chunks to floating-point values */
    fw = std::scalbn(one, q0);
    for (i = jz; i >= 0; i--)
    {
        q[i] = fw * (double)iq[i];
        fw *= twon24;
    }

    /* compute PIo2[0,...,jp]*q[jz,...,0] */
    for (i = jz; i >= 0; i--)
    {
        for (fw = 0.0, k = 0; k <= jp && k <= jz - i; k++)
            fw += PIo2[k] * q[i + k];
        fq[jz - i] = fw;
    }

    /* compress fq[] into y[] */
    switch (prec)
    {
    case 0:
        fw = 0.0;
        for (i = jz; i >= 0; i--)
            fw += fq[i];
        y[0] = (ih == 0) ? fw : -fw;
        break;
    case 1:
    case 2:
        fw = 0.0;
        for (i = jz; i >= 0; i--)
            fw += fq[i];
        y[0] = (ih == 0) ? fw : -fw;
        fw = fq[0] - fw;
        for (i = 1; i <= jz; i++)
            fw += fq[i];
        y[1] = (ih == 0) ? fw : -fw;
        break;
    case 3: /* painful */
        for (i = jz; i > 0; i--)
        {
            fw = fq[i - 1] + fq[i];
            fq[i] += fq[i - 1] - fw;
            fq[i - 1] = fw;
        }
        for (i = jz; i > 1; i--)
        {
            fw = fq[i - 1] + fq[i];
            fq[i] += fq[i - 1] - fw;
            fq[i - 1] = fw;
        }
        for (fw = 0.0, i = jz; i >= 2; i--)
            fw += fq[i];
        if (ih == 0)
        {
            y[0] = fq[0];
            y[1] = fq[1];
            y[2] = fw;
        }
        else
        {
            y[0] = -fq[0];
            y[1] = -fq[1];
            y[2] = -fw;
        }
    }
    return n & 7;
}

inline std::int32_t __ieee754_rem_pio2(double x, double* y) noexcept
{
    static const std::int32_t two_over_pi[] = {
        0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62,
        0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A,
        0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
        0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41,
        0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8,
        0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
        0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5,
        0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08,
        0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
        0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880,
        0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B,
    };

    static const std::int32_t npio2_hw[] = {
        0x3FF921FB, 0x400921FB, 0x4012D97C, 0x401921FB,
        0x401F6A7A, 0x4022D97C, 0x4025FDBB, 0x402921FB,
        0x402C463A, 0x402F6A7A, 0x4031475C, 0x4032D97C,
        0x40346B9C, 0x4035FDBB, 0x40378FDB, 0x403921FB,
        0x403AB41B, 0x403C463A, 0x403DD85A, 0x403F6A7A,
        0x40407E4C, 0x4041475C, 0x4042106C, 0x4042D97C,
        0x4043A28C, 0x40446B9C, 0x404534AC, 0x4045FDBB,
        0x4046C6CB, 0x40478FDB, 0x404858EB, 0x404921FB,
    };

    /*
     * invpio2: 53 bits of 2/pi
     * pio2_1:  first 33 bits of pi/2
     * pio2_1t: pi/2 - pio2_1
     * pio2_2:  second 33 bits of pi/2
     * pio2_2t: pi/2 - (pio2_1+pio2_2)
     * pio2_3:  third 33 bits of pi/2
     * pio2_3t: pi/2 - (pio2_1+pio2_2+pio2_3)
     */

    static const double
        zero = 0.00000000000000000000e+00,    /* 0x00000000, 0x00000000 */
        half = 5.00000000000000000000e-01,    /* 0x3FE00000, 0x00000000 */
        two24 = 1.67772160000000000000e+07,   /* 0x41700000, 0x00000000 */
        invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
        pio2_1 = 1.57079632673412561417e+00,  /* 0x3FF921FB, 0x54400000 */
        pio2_1t = 6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
        pio2_2 = 6.07710050630396597660e-11,  /* 0x3DD0B461, 0x1A600000 */
        pio2_2t = 2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 */
        pio2_3 = 2.02226624871116645580e-21,  /* 0x3BA3198A, 0x2E000000 */
        pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */

    double z = 0., w, t, r, fn;
    double tx[3];
    std::int32_t e0, i, j, nx, n, ix, hx;
    std::uint32_t low;

    GET_HIGH_WORD(hx, x); /* high word of x */
    ix = hx & 0x7fffffff;
    if (ix <= 0x3fe921fb) /* |x| ~<= pi/4, no need for reduction */
    {
        y[0] = x;
        y[1] = 0;
        return 0;
    }
    if (ix < 0x4002d97c)
    { /* |x| < 3pi/4, special case with n=+-1 */
        if (hx > 0)
        {
            z = x - pio2_1;
            if (ix != 0x3ff921fb)
            { /* 33+53 bit pi is good enough */
                y[0] = z - pio2_1t;
                y[1] = (z - y[0]) - pio2_1t;
            }
            else
            { /* near pi/2, use 33+33+53 bit pi */
                z -= pio2_2;
                y[0] = z - pio2_2t;
                y[1] = (z - y[0]) - pio2_2t;
            }
            return 1;
        }
        else
        { /* negative x */
            z = x + pio2_1;
            if (ix != 0x3ff921fb)
            { /* 33+53 bit pi is good enough */
                y[0] = z + pio2_1t;
                y[1] = (z - y[0]) + pio2_1t;
            }
            else
            { /* near pi/2, use 33+33+53 bit pi */
                z += pio2_2;
                y[0] = z + pio2_2t;
                y[1] = (z - y[0]) + pio2_2t;
            }

            return -1;
        }
    }
    if (ix <= 0x413921fb)
    { /* |x| ~<= 2^19*(pi/2), medium size */
        t = std::fabs(x);
        n = (std::int32_t)(t * invpio2 + half);
        fn = (double)n;
        r = t - fn * pio2_1;
        w = fn * pio2_1t; /* 1st round good to 85 bits */
        if ((n < 32) && (n > 0) && (ix != npio2_hw[n - 1]))
        {
            y[0] = r - w; /* quick check: no cancellation */
        }
        else
        {
            std::uint32_t high;
            j = ix >> 20;
            y[0] = r - w;
            GET_HIGH_WORD(high, y[0]);
            i = j - static_cast<std::int32_t>((high >> 20) & 0x7ff);
            if (i > 16)
            { /* 2nd iteration needed, good to 118 bits */
                t = r;
                w = fn * pio2_2;
                r = t - w;
                w = fn * pio2_2t - ((t - r) - w);
                y[0] = r - w;
                GET_HIGH_WORD(high, y[0]);
                i = j - static_cast<std::int32_t>((high >> 20) & 0x7ff);
                if (i > 49)
                { /* 3rd iteration needed, 151 bits of accuracy */
                    t = r; /* will cover all possible cases */
                    w = fn * pio2_3;
                    r = t - w;
                    w = fn * pio2_3t - ((t - r) - w);
                    y[0] = r - w;
                }
            }
        }
        y[1] = (r - y[0]) - w;
        if (hx < 0)
        {
            y[0] = -y[0];
            y[1] = -y[1];
            return -n;
        }
        else
            return n;
    }
    /*
     * all other (large) arguments
     */
    if (ix >= 0x7ff00000)
    { /* x is inf or NaN */
        y[0] = y[1] = x - x;
        return 0;
    }
    /* set z = scalbn(|x|, ilogb(x) - 23) */
    GET_LOW_WORD(low, x);
    SET_LOW_WORD(z, low);
    e0 = (ix >> 20) - 1046; /* e0 = ilogb(z) - 23 */
    SET_HIGH_WORD(z, static_cast<uint32_t>(ix - (e0 << 20)));
    for (i = 0; i < 2; i++)
    {
        tx[i] = (double)((std::int32_t)(z));
        z = (z - tx[i]) * two24;
    }
    tx[2] = z;
    nx = 3;
    while (tx[nx - 1] == zero)
        nx--; /* skip zero terms */
    n = __kernel_rem_pio2(tx, y, e0, nx, 2, two_over_pi);
    if (hx < 0)
    {
        y[0] = -y[0];
        y[1] = -y[1];
        return -n;
    }
    return n;
}
}

#undef XSIMD_LITTLE_ENDIAN
#undef SET_LOW_WORD
#undef SET_HIGH_WORD
#undef GET_LOW_WORD
#undef GET_HIGH_WORD
#undef HIGH_WORD_IDX
#undef LOW_WORD_IDX
#undef ONCE0
}
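
For reference, a minimal sketch of how a trig kernel typically consumes this reduction (the caller and the kernel_sin/kernel_cos helpers are hypothetical, not part of this diff): __ieee754_rem_pio2 writes the reduced argument into y[0] + y[1] and returns n, whose low two bits select the quadrant.

    // Hypothetical caller; assumes the usual fdlibm convention
    // x = n * (pi/2) + (y[0] + y[1]) with the tail in y[1].
    double sin_via_reduction(double x)
    {
        double y[2];
        std::int32_t n = __ieee754_rem_pio2(x, y);
        switch (n & 3) // quadrant of the original argument
        {
        case 0: return kernel_sin(y[0], y[1]);  // hypothetical helper
        case 1: return kernel_cos(y[0], y[1]);  // hypothetical helper
        case 2: return -kernel_sin(y[0], y[1]);
        default: return -kernel_cos(y[0], y[1]);
        }
    }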

@@ -1,349 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_ALIGNED_ALLOCATOR_HPP
#define XSIMD_ALIGNED_ALLOCATOR_HPP

#include <algorithm>
#include <cstddef>
#include <utility>
#ifdef _WIN32
#include <malloc.h>
#else
#include <cstdlib>
#endif

#include <cassert>
#include <memory>

#include "../config/xsimd_arch.hpp"

namespace xsimd
{

    /**
     * @class aligned_allocator
     * @brief Allocator for aligned memory
     *
     * The aligned_allocator class template is an allocator that
     * performs memory allocation aligned by the specified value.
     *
     * @tparam T type of objects to allocate.
     * @tparam Align alignment in bytes.
     */
    template <class T, size_t Align = default_arch::alignment()>
    class aligned_allocator
    {
    public:
        using value_type = T;
        using pointer = T*;
        using const_pointer = const T*;
        using reference = T&;
        using const_reference = const T&;
        using size_type = size_t;
        using difference_type = ptrdiff_t;

        static constexpr size_t alignment = Align;

        template <class U>
        struct rebind
        {
            using other = aligned_allocator<U, Align>;
        };

        aligned_allocator() noexcept;
        aligned_allocator(const aligned_allocator& rhs) noexcept;

        template <class U>
        aligned_allocator(const aligned_allocator<U, Align>& rhs) noexcept;

        ~aligned_allocator();

        pointer address(reference) noexcept;
        const_pointer address(const_reference) const noexcept;

        pointer allocate(size_type n, const void* hint = 0);
        void deallocate(pointer p, size_type n);

        size_type max_size() const noexcept;
        size_type size_max() const noexcept;

        template <class U, class... Args>
        void construct(U* p, Args&&... args);

        template <class U>
        void destroy(U* p);
    };

    template <class T1, size_t Align1, class T2, size_t Align2>
    bool operator==(const aligned_allocator<T1, Align1>& lhs,
                    const aligned_allocator<T2, Align2>& rhs) noexcept;

    template <class T1, size_t Align1, class T2, size_t Align2>
    bool operator!=(const aligned_allocator<T1, Align1>& lhs,
                    const aligned_allocator<T2, Align2>& rhs) noexcept;

    void* aligned_malloc(size_t size, size_t alignment);
    void aligned_free(void* ptr);

    template <class T>
    size_t get_alignment_offset(const T* p, size_t size, size_t block_size);

    /************************************
     * aligned_allocator implementation *
     ************************************/

    /**
     * Default constructor.
     */
    template <class T, size_t A>
    inline aligned_allocator<T, A>::aligned_allocator() noexcept
    {
    }

    /**
     * Copy constructor.
     */
    template <class T, size_t A>
    inline aligned_allocator<T, A>::aligned_allocator(const aligned_allocator&) noexcept
    {
    }

    /**
     * Extended copy constructor.
     */
    template <class T, size_t A>
    template <class U>
    inline aligned_allocator<T, A>::aligned_allocator(const aligned_allocator<U, A>&) noexcept
    {
    }

    /**
     * Destructor.
     */
    template <class T, size_t A>
    inline aligned_allocator<T, A>::~aligned_allocator()
    {
    }

    /**
     * Returns the actual address of \c r even in the presence of an overloaded \c operator&.
     * @param r the object to acquire the address of.
     * @return the actual address of \c r.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::address(reference r) noexcept -> pointer
    {
        return &r;
    }

    /**
     * Returns the actual address of \c r even in the presence of an overloaded \c operator&.
     * @param r the object to acquire the address of.
     * @return the actual address of \c r.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::address(const_reference r) const noexcept -> const_pointer
    {
        return &r;
    }

    /**
     * Allocates <tt>n * sizeof(T)</tt> bytes of uninitialized memory, aligned by \c A.
     * The alignment may require some extra memory allocation.
     * @param n the number of objects to allocate storage for.
     * @param hint unused parameter provided for standard compliance.
     * @return a pointer to the first byte of a memory block suitably aligned and sufficient to
     * hold an array of \c n objects of type \c T.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::allocate(size_type n, const void*) -> pointer
    {
        pointer res = reinterpret_cast<pointer>(aligned_malloc(sizeof(T) * n, A));
#if defined(_CPPUNWIND) || defined(__cpp_exceptions)
        if (res == nullptr)
            throw std::bad_alloc();
#endif
        return res;
    }

    /**
     * Deallocates the storage referenced by the pointer p, which must be a pointer obtained by
     * an earlier call to allocate(). The argument \c n must be equal to the first argument of the
     * call to allocate() that originally produced \c p; otherwise, the behavior is undefined.
     * @param p pointer obtained from allocate().
     * @param n number of objects earlier passed to allocate().
     */
    template <class T, size_t A>
    inline void aligned_allocator<T, A>::deallocate(pointer p, size_type)
    {
        aligned_free(p);
    }

    /**
     * Returns the maximum theoretically possible value of \c n, for which the
     * call allocate(n, 0) could succeed.
     * @return the maximum supported allocation size.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::max_size() const noexcept -> size_type
    {
        return size_type(-1) / sizeof(T);
    }

    /**
     * This method is deprecated; use max_size() instead.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::size_max() const noexcept -> size_type
    {
        return size_type(-1) / sizeof(T);
    }

    /**
     * Constructs an object of type \c T in the allocated uninitialized memory
     * pointed to by \c p, using placement-new.
     * @param p pointer to allocated uninitialized memory.
     * @param args the constructor arguments to use.
     */
    template <class T, size_t A>
    template <class U, class... Args>
    inline void aligned_allocator<T, A>::construct(U* p, Args&&... args)
    {
        new ((void*)p) U(std::forward<Args>(args)...);
    }

    /**
     * Calls the destructor of the object pointed to by \c p.
     * @param p pointer to the object that is going to be destroyed.
     */
    template <class T, size_t A>
    template <class U>
    inline void aligned_allocator<T, A>::destroy(U* p)
    {
        p->~U();
    }

    /**
     * @defgroup allocator_comparison Comparison operators
     */

    /**
     * @ingroup allocator_comparison
     * Compares two aligned memory allocators for equality. Since allocators
     * are stateless, returns \c true iff <tt>A1 == A2</tt>.
     * @param lhs aligned_allocator to compare.
     * @param rhs aligned_allocator to compare.
     * @return true if the allocators have the same alignment.
     */
    template <class T1, size_t A1, class T2, size_t A2>
    inline bool operator==(const aligned_allocator<T1, A1>& lhs,
                           const aligned_allocator<T2, A2>& rhs) noexcept
    {
        return lhs.alignment == rhs.alignment;
    }

    /**
     * @ingroup allocator_comparison
     * Compares two aligned memory allocators for inequality. Since allocators
     * are stateless, returns \c true iff <tt>A1 != A2</tt>.
     * @param lhs aligned_allocator to compare.
     * @param rhs aligned_allocator to compare.
     * @return true if the allocators have different alignments.
     */
    template <class T1, size_t A1, class T2, size_t A2>
    inline bool operator!=(const aligned_allocator<T1, A1>& lhs,
                           const aligned_allocator<T2, A2>& rhs) noexcept
    {
        return !(lhs == rhs);
    }

    /****************************************
     * aligned malloc / free implementation *
     ****************************************/

    namespace detail
    {
        inline void* xaligned_malloc(size_t size, size_t alignment)
        {
            assert(((alignment & (alignment - 1)) == 0) && "alignment must be a power of two");
            assert((alignment >= sizeof(void*)) && "alignment must be at least the size of a pointer");
            void* res = nullptr;
#ifdef _WIN32
            res = _aligned_malloc(size, alignment);
#else
            if (posix_memalign(&res, alignment, size) != 0)
            {
                res = nullptr;
            }
#endif
            return res;
        }

        inline void xaligned_free(void* ptr)
        {
#ifdef _WIN32
            _aligned_free(ptr);
#else
            free(ptr);
#endif
        }
    }

    inline void* aligned_malloc(size_t size, size_t alignment)
    {
        return detail::xaligned_malloc(size, alignment);
    }

    inline void aligned_free(void* ptr)
    {
        detail::xaligned_free(ptr);
    }

    template <class T>
    inline size_t get_alignment_offset(const T* p, size_t size, size_t block_size)
    {
        // size_t block_size = simd_traits<T>::size;
        if (block_size == 1)
        {
            // The simd_block consists of exactly one scalar, so all
            // elements of the array are "well" aligned.
            return 0;
        }
        else if (size_t(p) & (sizeof(T) - 1))
        {
            // The array is not aligned to the size of a single element,
            // so no element of the array is well aligned.
            return size;
        }
        else
        {
            size_t block_mask = block_size - 1;
            return std::min<size_t>(
                (block_size - ((size_t(p) / sizeof(T)) & block_mask)) & block_mask,
                size);
        }
    }

    template <class T, class A = default_arch>
    using default_allocator = typename std::conditional<A::requires_alignment(),
                                                        aligned_allocator<T, A::alignment()>,
                                                        std::allocator<T>>::type;
}

#endif
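
A minimal usage sketch for the allocator above (the include path and the exact alignment value are assumptions, not part of this diff): pairing it with std::vector makes the buffer start suitably aligned for SIMD loads.

    #include <vector>
    // assumed location of the header inside xsimd
    #include "xsimd/memory/xsimd_aligned_allocator.hpp"

    std::vector<float, xsimd::aligned_allocator<float>> v(1024);
    // v.data() is aligned to xsimd::default_arch::alignment() bytes,
    // so aligned loads/stores over the whole vector are valid.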

@@ -1,76 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_ALIGNMENT_HPP
#define XSIMD_ALIGNMENT_HPP

#include "../types/xsimd_utils.hpp"
#include "xsimd_aligned_allocator.hpp"

namespace xsimd
{
    /**
     * @struct aligned_mode
     * @brief tag for load and store of aligned memory.
     */
    struct aligned_mode
    {
    };

    /**
     * @struct unaligned_mode
     * @brief tag for load and store of unaligned memory.
     */
    struct unaligned_mode
    {
    };

    /***********************
     * Allocator alignment *
     ***********************/

    template <class A>
    struct allocator_alignment
    {
        using type = unaligned_mode;
    };

    template <class T>
    struct allocator_alignment<aligned_allocator<T>>
    {
        using type = aligned_mode;
    };

    template <class A>
    using allocator_alignment_t = typename allocator_alignment<A>::type;

    /***********************
     * Container alignment *
     ***********************/

    template <class C, class = void>
    struct container_alignment
    {
        using type = unaligned_mode;
    };

    template <class C>
    struct container_alignment<C, detail::void_t<typename C::allocator_type>>
    {
        using type = allocator_alignment_t<typename C::allocator_type>;
    };

    template <class C>
    using container_alignment_t = typename container_alignment<C>::type;

}

#endif
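
The two tags feed compile-time dispatch: a container whose allocator_type is the aligned allocator advertises aligned_mode, everything else falls back to unaligned_mode. A small sketch of the resulting mapping (assumed usage, not from this diff):

    #include <type_traits>
    #include <vector>

    using aligned_vec = std::vector<float, xsimd::aligned_allocator<float>>;
    static_assert(std::is_same<xsimd::container_alignment_t<aligned_vec>,
                               xsimd::aligned_mode>::value,
                  "the aligned allocator yields aligned_mode");
    static_assert(std::is_same<xsimd::container_alignment_t<std::vector<float>>,
                               xsimd::unaligned_mode>::value,
                  "plain std::allocator falls back to unaligned_mode");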

@@ -1,32 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#include "xsimd_fma3_sse_register.hpp"
#include "xsimd_fma4_register.hpp"
#include "xsimd_sse2_register.hpp"
#include "xsimd_sse3_register.hpp"
#include "xsimd_sse4_1_register.hpp"
#include "xsimd_sse4_2_register.hpp"

#include "xsimd_avx2_register.hpp"
#include "xsimd_avx_register.hpp"
#include "xsimd_fma3_avx2_register.hpp"
#include "xsimd_fma3_avx_register.hpp"

#include "xsimd_avx512bw_register.hpp"
#include "xsimd_avx512cd_register.hpp"
#include "xsimd_avx512dq_register.hpp"
#include "xsimd_avx512f_register.hpp"

#include "xsimd_neon64_register.hpp"
#include "xsimd_neon_register.hpp"

#include "xsimd_sve_register.hpp"

[The diff for this file is not shown because of its large size.]

@@ -1,40 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX2_REGISTER_HPP
#define XSIMD_AVX2_REGISTER_HPP

#include "./xsimd_avx_register.hpp"

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * AVX2 instructions
     */
    struct avx2 : avx
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX2; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(2, 2, 0); }
        static constexpr char const* name() noexcept { return "avx2"; }
    };

#if XSIMD_WITH_AVX2
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx2, avx);
    }
#endif
}

#endif

@@ -1,48 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX512BW_REGISTER_HPP
#define XSIMD_AVX512BW_REGISTER_HPP

#include "./xsimd_avx512dq_register.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX512BW instructions
     */
    struct avx512bw : avx512dq
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512BW; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(3, 4, 0); }
        static constexpr char const* name() noexcept { return "avx512bw"; }
    };

#if XSIMD_WITH_AVX512BW

    namespace types
    {
        template <class T>
        struct get_bool_simd_register<T, avx512bw>
        {
            using type = simd_avx512_bool_register<T>;
        };

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512bw, avx512dq);

    }
#endif
}

#endif

@@ -1,48 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX512CD_REGISTER_HPP
#define XSIMD_AVX512CD_REGISTER_HPP

#include "./xsimd_avx512f_register.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX512CD instructions
     */
    struct avx512cd : avx512f
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512CD; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(3, 2, 0); }
        static constexpr char const* name() noexcept { return "avx512cd"; }
    };

#if XSIMD_WITH_AVX512CD

    namespace types
    {
        template <class T>
        struct get_bool_simd_register<T, avx512cd>
        {
            using type = simd_avx512_bool_register<T>;
        };

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512cd, avx512f);

    }
#endif
}

#endif

@@ -1,48 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX512DQ_REGISTER_HPP
#define XSIMD_AVX512DQ_REGISTER_HPP

#include "./xsimd_avx512cd_register.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX512DQ instructions
     */
    struct avx512dq : avx512cd
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512DQ; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(3, 3, 0); }
        static constexpr char const* name() noexcept { return "avx512dq"; }
    };

#if XSIMD_WITH_AVX512DQ

    namespace types
    {
        template <class T>
        struct get_bool_simd_register<T, avx512dq>
        {
            using type = simd_avx512_bool_register<T>;
        };

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512dq, avx512cd);

    }
#endif
}

#endif

@@ -1,75 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX512F_REGISTER_HPP
#define XSIMD_AVX512F_REGISTER_HPP

#include "./xsimd_generic_arch.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX512F instructions
     */
    struct avx512f : generic
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512F; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(3, 1, 0); }
        static constexpr std::size_t alignment() noexcept { return 64; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr char const* name() noexcept { return "avx512f"; }
    };

#if XSIMD_WITH_AVX512F

    namespace types
    {
        template <class T>
        struct simd_avx512_bool_register
        {
            using register_type = typename std::conditional<
                (sizeof(T) < 4), std::conditional<(sizeof(T) == 1), __mmask64, __mmask32>,
                std::conditional<(sizeof(T) == 4), __mmask16, __mmask8>>::type::type;
            register_type data;
            simd_avx512_bool_register() = default;
            simd_avx512_bool_register(register_type r) { data = r; }
            operator register_type() const noexcept { return data; }
        };
        template <class T>
        struct get_bool_simd_register<T, avx512f>
        {
            using type = simd_avx512_bool_register<T>;
        };

        XSIMD_DECLARE_SIMD_REGISTER(bool, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(signed char, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(char, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(short, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(long int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(float, avx512f, __m512);
        XSIMD_DECLARE_SIMD_REGISTER(double, avx512f, __m512d);

    }
#endif
}

#endif
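
The nested std::conditional above sizes the mask register from the lane width: 64 one-byte lanes need __mmask64, down to 8 eight-byte lanes needing only __mmask8. A sketch of the resulting mapping (assumes an AVX512F target so the mask types exist):

    #if XSIMD_WITH_AVX512F
    #include <type_traits>
    static_assert(std::is_same<
                      xsimd::types::simd_avx512_bool_register<float>::register_type,
                      __mmask16>::value,
                  "16 float lanes in a 512-bit register use a 16-bit mask");
    #endif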

@@ -1,62 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX_REGISTER_HPP
#define XSIMD_AVX_REGISTER_HPP

#include "./xsimd_generic_arch.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX instructions
     */
    struct avx : generic
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(2, 1, 0); }
        static constexpr std::size_t alignment() noexcept { return 32; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr char const* name() noexcept { return "avx"; }
    };
}

#if XSIMD_WITH_AVX

#include <immintrin.h>

namespace xsimd
{
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER(bool, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(signed char, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(char, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(short, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(long int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(float, avx, __m256);
        XSIMD_DECLARE_SIMD_REGISTER(double, avx, __m256d);
    }
}
#endif
#endif

[The diff for this file is not shown because of its large size.]
@@ -1,147 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_BATCH_CONSTANT_HPP
#define XSIMD_BATCH_CONSTANT_HPP

#include "./xsimd_batch.hpp"
#include "./xsimd_utils.hpp"

namespace xsimd
{
    /**
     * @brief batch of boolean constants
     *
     * Abstract representation of a batch of boolean constants.
     *
     * @tparam batch_type the type of the associated batch values.
     * @tparam Values boolean constants represented by this batch
     **/
    template <class batch_type, bool... Values>
    struct batch_bool_constant
    {
        static constexpr std::size_t size = sizeof...(Values);
        using arch_type = typename batch_type::arch_type;
        using value_type = bool;
        static_assert(sizeof...(Values) == batch_type::size, "consistent batch size");

        operator batch_bool<typename batch_type::value_type, arch_type>() const noexcept { return { Values... }; }

        bool get(size_t i) const noexcept
        {
            return std::array<value_type, size> { { Values... } }[i];
        }

        static constexpr int mask() noexcept
        {
            return mask_helper(0, static_cast<int>(Values)...);
        }

    private:
        static constexpr int mask_helper(int acc) noexcept { return acc; }
        template <class... Tys>
        static constexpr int mask_helper(int acc, int mask, Tys... masks) noexcept
        {
            return mask_helper(acc | mask, (masks << 1)...);
        }
    };

    /**
     * @brief batch of integral constants
     *
     * Abstract representation of a batch of integral constants.
     *
     * @tparam batch_type the type of the associated batch values.
     * @tparam Values constants represented by this batch
     **/
    template <class batch_type, typename batch_type::value_type... Values>
    struct batch_constant
    {
        static constexpr std::size_t size = sizeof...(Values);
        using arch_type = typename batch_type::arch_type;
        using value_type = typename batch_type::value_type;
        static_assert(sizeof...(Values) == batch_type::size, "consistent batch size");

        /**
         * @brief Generate a batch of @p batch_type from this @p batch_constant
         */
        operator batch_type() const noexcept { return { Values... }; }

        /**
         * @brief Get the @p i th element of this @p batch_constant
         */
        constexpr value_type get(size_t i) const noexcept
        {
            return get(i, std::array<value_type, size> { Values... });
        }

    private:
        constexpr value_type get(size_t i, std::array<value_type, size> const& values) const noexcept
        {
            return values[i];
        }
    };

    namespace detail
    {
        template <class batch_type, class G, std::size_t... Is>
        inline constexpr auto make_batch_constant(detail::index_sequence<Is...>) noexcept
            -> batch_constant<batch_type, (typename batch_type::value_type)G::get(Is, sizeof...(Is))...>
        {
            return {};
        }
        template <class batch_type, class G, std::size_t... Is>
        inline constexpr auto make_batch_bool_constant(detail::index_sequence<Is...>) noexcept
            -> batch_bool_constant<batch_type, G::get(Is, sizeof...(Is))...>
        {
            return {};
        }

    } // namespace detail

    /**
     * @brief Build a @c batch_constant out of a generator function
     *
     * @tparam batch_type type of the (non-constant) batch to build
     * @tparam G type used to generate that batch. That type must have a static
     * member @c get that's used to generate the batch constant. Concretely, the
     * generated batch_constant has value `{G::get(0, batch_size), ... , G::get(batch_size - 1, batch_size)}`
     *
     * The following generator produces a batch of `(n - 1, 0, 1, ..., n - 2)`:
     *
     * @code
     * struct Rot
     * {
     *     static constexpr unsigned get(unsigned i, unsigned n)
     *     {
     *         return (i + n - 1) % n;
     *     }
     * };
     * @endcode
     */
    template <class batch_type, class G>
    inline constexpr auto make_batch_constant() noexcept -> decltype(detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>()))
    {
        return detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>());
    }

    template <class batch_type, class G>
    inline constexpr auto make_batch_bool_constant() noexcept
        -> decltype(detail::make_batch_bool_constant<batch_type, G>(
            detail::make_index_sequence<batch_type::size>()))
    {
        return detail::make_batch_bool_constant<batch_type, G>(
            detail::make_index_sequence<batch_type::size>());
    }

} // namespace xsimd

#endif
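
A usage sketch for the Rot generator documented above (the batch type, the sse2 architecture tag, and an included xsimd.hpp are assumptions): with a four-lane batch the generated constant holds {3, 0, 1, 2}.

    #include <cstdint>

    struct Rot
    {
        static constexpr unsigned get(unsigned i, unsigned n)
        {
            return (i + n - 1) % n;
        }
    };

    #if XSIMD_WITH_SSE2
    // four uint32_t lanes in a 128-bit register => values {3, 0, 1, 2}
    auto rot = xsimd::make_batch_constant<xsimd::batch<std::uint32_t, xsimd::sse2>, Rot>();
    static_assert(decltype(rot)::size == 4, "an sse2 batch of uint32_t has 4 lanes");
    #endif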

@@ -1,46 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_FMA3_AVX2_REGISTER_HPP
#define XSIMD_FMA3_AVX2_REGISTER_HPP

#include "./xsimd_avx2_register.hpp"

namespace xsimd
{
    template <typename arch>
    struct fma3;

    /**
     * @ingroup arch
     *
     * AVX2 + FMA instructions
     */
    template <>
    struct fma3<avx2> : avx2
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_AVX2; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(2, 2, 1); }
        static constexpr char const* name() noexcept { return "fma3+avx2"; }
    };

#if XSIMD_WITH_FMA3_AVX2
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<avx2>, avx2);

    }
#endif

}
#endif

@@ -1,46 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_FMA3_AVX_REGISTER_HPP
#define XSIMD_FMA3_AVX_REGISTER_HPP

#include "./xsimd_avx_register.hpp"

namespace xsimd
{
    template <typename arch>
    struct fma3;

    /**
     * @ingroup arch
     *
     * AVX + FMA instructions
     */
    template <>
    struct fma3<avx> : avx
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_AVX; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(2, 1, 1); }
        static constexpr char const* name() noexcept { return "fma3+avx"; }
    };

#if XSIMD_WITH_FMA3_AVX
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<avx>, avx);

    }
#endif

}
#endif

@@ -1,46 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_FMA3_SSE_REGISTER_HPP
#define XSIMD_FMA3_SSE_REGISTER_HPP

#include "./xsimd_sse4_2_register.hpp"

namespace xsimd
{
    template <typename arch>
    struct fma3;

    /**
     * @ingroup arch
     *
     * SSE4.2 + FMA instructions
     */
    template <>
    struct fma3<sse4_2> : sse4_2
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_SSE; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 4, 3); }
        static constexpr char const* name() noexcept { return "fma3+sse4.2"; }
    };

#if XSIMD_WITH_FMA3_SSE
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<sse4_2>, sse4_2);

    }
#endif

}
#endif
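
Because fma3 is a class template over its baseline, the same tag composes with several architectures (fma3<sse4_2>, fma3<avx>, fma3<avx2>), and each variant ranks strictly above its baseline in the numeric version ordering used for dispatch. A sketch (assumes the FMA3+SSE build configuration):

    using fast_sse = xsimd::fma3<xsimd::sse4_2>; // SSE4.2 baseline + FMA3
    static_assert(fast_sse::version() > xsimd::sse2::version(),
                  "the FMA3 variant outranks the plain SSE tiers");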

@@ -1,42 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_FMA4_REGISTER_HPP
#define XSIMD_FMA4_REGISTER_HPP

#include "./xsimd_sse4_2_register.hpp"

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * FMA4 instructions
     */
    struct fma4 : sse4_2
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_FMA4; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 4, 4); }
        static constexpr char const* name() noexcept { return "fma4"; }
    };

#if XSIMD_WITH_FMA4
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma4, sse4_2);

    }
#endif

}
#endif

@@ -1,35 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_ARCH_HPP
#define XSIMD_GENERIC_ARCH_HPP

#include "../config/xsimd_config.hpp"

/**
 * @defgroup arch Architecture description
 */
namespace xsimd
{
    struct generic
    {
        static constexpr bool supported() noexcept { return true; }
        static constexpr bool available() noexcept { return true; }
        static constexpr std::size_t alignment() noexcept { return 0; }
        static constexpr bool requires_alignment() noexcept { return false; }
        static constexpr unsigned version() noexcept { return generic::version(0, 0, 0); }

    protected:
        static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch) noexcept { return major * 10000u + minor * 100u + patch; }
    };
}

#endif
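
version() packs major/minor/patch into a single comparable integer (major * 10000 + minor * 100 + patch), so architecture tags can be ordered numerically. A sketch using tags declared elsewhere in this same diff:

    static_assert(xsimd::avx::version() == 20100, "avx encodes 2.1.0");
    static_assert(xsimd::avx2::version() == 20200, "avx2 encodes 2.2.0");
    static_assert(xsimd::avx::version() < xsimd::avx2::version(),
                  "a higher number means a more capable extension");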

@@ -1,52 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_NEON64_REGISTER_HPP
#define XSIMD_NEON64_REGISTER_HPP

#include "xsimd_neon_register.hpp"

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * NEON instructions for arm64
     */
    struct neon64 : neon
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_NEON64; }
        static constexpr bool available() noexcept { return true; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr std::size_t alignment() noexcept { return 16; }
        static constexpr unsigned version() noexcept { return generic::version(8, 1, 0); }
        static constexpr char const* name() noexcept { return "arm64+neon"; }
    };

#if XSIMD_WITH_NEON64

    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(neon64, neon);
        XSIMD_DECLARE_SIMD_REGISTER(double, neon64, float64x2_t);

        template <class T>
        struct get_bool_simd_register<T, neon64>
            : detail::neon_bool_simd_register<T, neon64>
        {
        };
    }

#endif

}

#endif

@@ -1,155 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_NEON_REGISTER_HPP
#define XSIMD_NEON_REGISTER_HPP

#include "xsimd_generic_arch.hpp"
#include "xsimd_register.hpp"

#if XSIMD_WITH_NEON
#include <arm_neon.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * NEON instructions for arm32
     */
    struct neon : generic
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_NEON; }
        static constexpr bool available() noexcept { return true; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr std::size_t alignment() noexcept { return 16; }
        static constexpr unsigned version() noexcept { return generic::version(7, 0, 0); }
        static constexpr char const* name() noexcept { return "arm32+neon"; }
    };

#if XSIMD_WITH_NEON
    namespace types
    {
        namespace detail
        {
            template <size_t S>
            struct neon_vector_type_impl;

            template <>
            struct neon_vector_type_impl<8>
            {
                using signed_type = int8x16_t;
                using unsigned_type = uint8x16_t;
            };

            template <>
            struct neon_vector_type_impl<16>
            {
                using signed_type = int16x8_t;
                using unsigned_type = uint16x8_t;
            };

            template <>
            struct neon_vector_type_impl<32>
            {
                using signed_type = int32x4_t;
                using unsigned_type = uint32x4_t;
            };

            template <>
            struct neon_vector_type_impl<64>
            {
                using signed_type = int64x2_t;
                using unsigned_type = uint64x2_t;
            };

            template <class T>
            using signed_neon_vector_type = typename neon_vector_type_impl<8 * sizeof(T)>::signed_type;

            template <class T>
            using unsigned_neon_vector_type = typename neon_vector_type_impl<8 * sizeof(T)>::unsigned_type;

            template <class T>
            using neon_vector_type = typename std::conditional<std::is_signed<T>::value,
                                                               signed_neon_vector_type<T>,
                                                               unsigned_neon_vector_type<T>>::type;

            using char_neon_vector_type = typename std::conditional<std::is_signed<char>::value,
                                                                    signed_neon_vector_type<char>,
                                                                    unsigned_neon_vector_type<char>>::type;
        }

        XSIMD_DECLARE_SIMD_REGISTER(signed char, neon, detail::neon_vector_type<signed char>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, neon, detail::neon_vector_type<unsigned char>);
        XSIMD_DECLARE_SIMD_REGISTER(char, neon, detail::char_neon_vector_type);
        XSIMD_DECLARE_SIMD_REGISTER(short, neon, detail::neon_vector_type<short>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, neon, detail::neon_vector_type<unsigned short>);
        XSIMD_DECLARE_SIMD_REGISTER(int, neon, detail::neon_vector_type<int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, neon, detail::neon_vector_type<unsigned int>);
        XSIMD_DECLARE_SIMD_REGISTER(long int, neon, detail::neon_vector_type<long int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, neon, detail::neon_vector_type<unsigned long int>);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, neon, detail::neon_vector_type<long long int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, neon, detail::neon_vector_type<unsigned long long int>);
        XSIMD_DECLARE_SIMD_REGISTER(float, neon, float32x4_t);
        XSIMD_DECLARE_INVALID_SIMD_REGISTER(double, neon);

        namespace detail
        {
            template <size_t S>
            struct get_unsigned_type;

            template <>
            struct get_unsigned_type<1>
            {
                using type = uint8_t;
            };

            template <>
            struct get_unsigned_type<2>
            {
                using type = uint16_t;
            };

            template <>
            struct get_unsigned_type<4>
            {
                using type = uint32_t;
            };

            template <>
            struct get_unsigned_type<8>
            {
                using type = uint64_t;
            };

            template <size_t S>
            using get_unsigned_type_t = typename get_unsigned_type<S>::type;

            template <class T, class A>
            struct neon_bool_simd_register
            {
                using type = simd_register<get_unsigned_type_t<sizeof(T)>, A>;
            };
        }

        template <class T>
        struct get_bool_simd_register<T, neon>
            : detail::neon_bool_simd_register<T, neon>
        {
        };

    }
#endif

}

#endif
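
The trait ladder above resolves each scalar type to the NEON vector of matching lane width and signedness, for example (assumes a NEON target so the vector types exist):

    #if XSIMD_WITH_NEON
    #include <type_traits>
    static_assert(std::is_same<xsimd::types::detail::neon_vector_type<int>,
                               int32x4_t>::value,
                  "4-byte signed lanes map to int32x4_t");
    static_assert(std::is_same<xsimd::types::detail::neon_vector_type<unsigned short>,
                               uint16x8_t>::value,
                  "2-byte unsigned lanes map to uint16x8_t");
    #endif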

@@ -1,94 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_REGISTER_HPP
#define XSIMD_REGISTER_HPP

#include <type_traits>

namespace xsimd
{
    namespace types
    {
        template <class T, class A>
        struct has_simd_register : std::false_type
        {
        };

        template <class T, class Arch>
        struct simd_register
        {
            struct register_type
            {
            };
        };

#define XSIMD_DECLARE_SIMD_REGISTER(SCALAR_TYPE, ISA, VECTOR_TYPE) \
    template <>                                                    \
    struct simd_register<SCALAR_TYPE, ISA>                         \
    {                                                              \
        using register_type = VECTOR_TYPE;                         \
        register_type data;                                        \
        operator register_type() const noexcept                    \
        {                                                          \
            return data;                                           \
        }                                                          \
    };                                                             \
    template <>                                                    \
    struct has_simd_register<SCALAR_TYPE, ISA> : std::true_type    \
    {                                                              \
    }

#define XSIMD_DECLARE_INVALID_SIMD_REGISTER(SCALAR_TYPE, ISA)    \
    template <>                                                  \
    struct has_simd_register<SCALAR_TYPE, ISA> : std::false_type \
    {                                                            \
    }

#define XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ISA, ISA_BASE)                           \
    template <class T>                                                             \
    struct simd_register<T, ISA> : simd_register<T, ISA_BASE>                      \
    {                                                                              \
        using register_type = typename simd_register<T, ISA_BASE>::register_type; \
        simd_register(register_type reg) noexcept                                  \
            : simd_register<T, ISA_BASE> { reg }                                   \
        {                                                                          \
        }                                                                          \
        simd_register() = default;                                                 \
    };                                                                             \
    template <class T>                                                             \
    struct has_simd_register<T, ISA> : has_simd_register<T, ISA_BASE>              \
    {                                                                              \
    }

        template <class T, class Arch>
        struct get_bool_simd_register
        {
            using type = simd_register<T, Arch>;
        };

        template <class T, class Arch>
        using get_bool_simd_register_t = typename get_bool_simd_register<T, Arch>::type;
    }

    namespace kernel
    {
        // requires_arch<A> is simply A const&: a tag parameter used to select overloads
        template <class A>
        using requires_arch = typename std::add_lvalue_reference<typename std::add_const<A>::type>::type;
        template <class T>
        struct convert
        {
        };
    }
}

#endif
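
What XSIMD_DECLARE_SIMD_REGISTER expands to, written out for one (scalar, arch, vector) triple as an illustration; the actual sse2/__m128 pairing is declared in xsimd_sse2_register.hpp below.

    template <>
    struct simd_register<float, sse2>
    {
        using register_type = __m128;
        register_type data;
        operator register_type() const noexcept { return data; }
    };
    template <>
    struct has_simd_register<float, sse2> : std::true_type
    {
    };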
|
@@ -1,61 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSE2_REGISTER_HPP
#define XSIMD_SSE2_REGISTER_HPP

#include "./xsimd_generic_arch.hpp"
#include "./xsimd_register.hpp"

#if XSIMD_WITH_SSE2
#include <emmintrin.h>
#include <xmmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSE2 instructions
     */
    struct sse2 : generic
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE2; }
        static constexpr bool available() noexcept { return true; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 2, 0); }
        static constexpr std::size_t alignment() noexcept { return 16; }
        static constexpr char const* name() noexcept { return "sse2"; }
    };

#if XSIMD_WITH_SSE2
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER(bool, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(signed char, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(char, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(short, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(long int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(float, sse2, __m128);
        XSIMD_DECLARE_SIMD_REGISTER(double, sse2, __m128d);
    }
#endif
}

#endif
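The arch struct above is a pure compile-time descriptor: every member is a static constexpr function, so querying it costs nothing at run time. A small hedged sketch of how such a descriptor is typically consumed (the struct is a local stand-in mirroring the shape shown above, re-declared so it compiles without the library):

#include <cstddef>
#include <cstdio>

// local stand-in mirroring xsimd::sse2's interface, for illustration only
struct sse2_like
{
    static constexpr bool supported() noexcept { return true; }
    static constexpr bool requires_alignment() noexcept { return true; }
    static constexpr std::size_t alignment() noexcept { return 16; }
    static constexpr char const* name() noexcept { return "sse2"; }
};

int main()
{
    // all of these fold to constants at compile time
    static_assert(sse2_like::alignment() == 16, "SSE2 registers are 16 bytes wide");
    if (sse2_like::supported() && sse2_like::requires_alignment())
        std::printf("%s: align loads to %zu bytes\n",
                    sse2_like::name(), sse2_like::alignment());
    return 0;
}

The supported()/available() split matters: supported() is a build-time fact (was the compiler allowed to emit SSE2?), while available() answers whether the running CPU can execute it.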
@@ -1,45 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSE3_REGISTER_HPP
#define XSIMD_SSE3_REGISTER_HPP

#include "./xsimd_sse2_register.hpp"

#if XSIMD_WITH_SSE3
#include <pmmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSE3 instructions
     */
    struct sse3 : sse2
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE3; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 3, 0); }
        static constexpr char const* name() noexcept { return "sse3"; }
    };

#if XSIMD_WITH_SSE3
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse3, sse2);
    }
#endif
}

#endif
@@ -1,44 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSE4_1_REGISTER_HPP
#define XSIMD_SSE4_1_REGISTER_HPP

#include "./xsimd_ssse3_register.hpp"

#if XSIMD_WITH_SSE4_1
#include <smmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSE4.1 instructions
     */
    struct sse4_1 : ssse3
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_1; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 4, 1); }
        static constexpr char const* name() noexcept { return "sse4.1"; }
    };

#if XSIMD_WITH_SSE4_1
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse4_1, ssse3);
    }
#endif
}

#endif
@@ -1,44 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSE4_2_REGISTER_HPP
#define XSIMD_SSE4_2_REGISTER_HPP

#include "./xsimd_sse4_1_register.hpp"

#if XSIMD_WITH_SSE4_2
#include <nmmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSE4.2 instructions
     */
    struct sse4_2 : sse4_1
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_2; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 4, 2); }
        static constexpr char const* name() noexcept { return "sse4.2"; }
    };

#if XSIMD_WITH_SSE4_2
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse4_2, sse4_1);
    }
#endif
}

#endif
@@ -1,44 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSSE3_REGISTER_HPP
#define XSIMD_SSSE3_REGISTER_HPP

#include "./xsimd_sse3_register.hpp"

#if XSIMD_WITH_SSSE3
#include <tmmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSSE3 instructions
     */
    struct ssse3 : sse3
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSSE3; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 3, 1); }
        static constexpr char const* name() noexcept { return "ssse3"; }
    };

#if XSIMD_WITH_SSSE3
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ssse3, sse3);
    }
#endif
}

#endif
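Taken together, the five SSE headers above form an inheritance chain sse2 < sse3 < ssse3 < sse4.1 < sse4.2, and each level reports a strictly larger version(), which is what lets the dispatcher pick "the best available arch" by a simple max. A hedged sketch of that ordering idea, with local stand-ins (the version encoding is an assumption made for the sketch, not the actual generic::version formula):

// assumed encoding for this sketch: major * 10000 + minor * 100 + patch
constexpr unsigned version(unsigned major, unsigned minor, unsigned patch)
{
    return major * 10000u + minor * 100u + patch;
}

struct sse2_v  { static constexpr unsigned version() { return ::version(1, 2, 0); } };
struct sse3_v  { static constexpr unsigned version() { return ::version(1, 3, 0); } };
struct ssse3_v { static constexpr unsigned version() { return ::version(1, 3, 1); } };
struct sse41_v { static constexpr unsigned version() { return ::version(1, 4, 1); } };
struct sse42_v { static constexpr unsigned version() { return ::version(1, 4, 2); } };

// the chain is totally ordered, so arch selection reduces to a max over constants
static_assert(sse2_v::version() < sse3_v::version(), "sse3 extends sse2");
static_assert(sse3_v::version() < ssse3_v::version(), "ssse3 extends sse3");
static_assert(ssse3_v::version() < sse41_v::version(), "sse4.1 extends ssse3");
static_assert(sse41_v::version() < sse42_v::version(), "sse4.2 extends sse4.1");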
@@ -1,155 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 * Copyright (c) Yibo Cai                                                  *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SVE_REGISTER_HPP
#define XSIMD_SVE_REGISTER_HPP

#include "xsimd_generic_arch.hpp"
#include "xsimd_register.hpp"

#if XSIMD_WITH_SVE
#include <arm_sve.h>
#endif

namespace xsimd
{
    namespace detail
    {
        /**
         * @ingroup arch
         *
         * SVE instructions (fixed vector size) for arm64
         */
        template <size_t Width>
        struct sve : xsimd::generic
        {
            static constexpr bool supported() noexcept { return Width == XSIMD_SVE_BITS; }
            static constexpr bool available() noexcept { return true; }
            static constexpr bool requires_alignment() noexcept { return true; }
            static constexpr std::size_t alignment() noexcept { return 16; }
            static constexpr unsigned version() noexcept { return generic::version(9, 0, 0); }
            static constexpr char const* name() noexcept { return "arm64+sve"; }
        };
    }

#if XSIMD_WITH_SVE

    using sve = detail::sve<__ARM_FEATURE_SVE_BITS>;

    namespace types
    {
        namespace detail
        {
// define fixed size alias per SVE sizeless type
#define SVE_TO_FIXED_SIZE(ty) ty __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)))
            using sve_int8_t = SVE_TO_FIXED_SIZE(svint8_t);
            using sve_uint8_t = SVE_TO_FIXED_SIZE(svuint8_t);
            using sve_int16_t = SVE_TO_FIXED_SIZE(svint16_t);
            using sve_uint16_t = SVE_TO_FIXED_SIZE(svuint16_t);
            using sve_int32_t = SVE_TO_FIXED_SIZE(svint32_t);
            using sve_uint32_t = SVE_TO_FIXED_SIZE(svuint32_t);
            using sve_int64_t = SVE_TO_FIXED_SIZE(svint64_t);
            using sve_uint64_t = SVE_TO_FIXED_SIZE(svuint64_t);
            using sve_float32_t = SVE_TO_FIXED_SIZE(svfloat32_t);
            using sve_float64_t = SVE_TO_FIXED_SIZE(svfloat64_t);
            using sve_bool_t = SVE_TO_FIXED_SIZE(svbool_t);
#undef SVE_TO_FIXED_SIZE

            template <size_t S>
            struct sve_vector_type_impl;

            template <>
            struct sve_vector_type_impl<8>
            {
                using signed_type = sve_int8_t;
                using unsigned_type = sve_uint8_t;
                using floating_point_type = void;
            };

            template <>
            struct sve_vector_type_impl<16>
            {
                using signed_type = sve_int16_t;
                using unsigned_type = sve_uint16_t;
                using floating_point_type = void;
            };

            template <>
            struct sve_vector_type_impl<32>
            {
                using signed_type = sve_int32_t;
                using unsigned_type = sve_uint32_t;
                using floating_point_type = sve_float32_t;
            };

            template <>
            struct sve_vector_type_impl<64>
            {
                using signed_type = sve_int64_t;
                using unsigned_type = sve_uint64_t;
                using floating_point_type = sve_float64_t;
            };

            template <class T>
            using signed_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::signed_type;

            template <class T>
            using unsigned_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::unsigned_type;

            template <class T>
            using floating_point_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::floating_point_type;

            template <class T>
            using signed_int_or_floating_point_sve_vector_type = typename std::conditional<std::is_floating_point<T>::value,
                                                                                           floating_point_sve_vector_type<T>,
                                                                                           signed_int_sve_vector_type<T>>::type;

            template <class T>
            using sve_vector_type = typename std::conditional<std::is_signed<T>::value,
                                                              signed_int_or_floating_point_sve_vector_type<T>,
                                                              unsigned_int_sve_vector_type<T>>::type;
        } // namespace detail

        XSIMD_DECLARE_SIMD_REGISTER(signed char, sve, detail::sve_vector_type<signed char>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, sve, detail::sve_vector_type<unsigned char>);
        XSIMD_DECLARE_SIMD_REGISTER(char, sve, detail::sve_vector_type<char>);
        XSIMD_DECLARE_SIMD_REGISTER(short, sve, detail::sve_vector_type<short>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, sve, detail::sve_vector_type<unsigned short>);
        XSIMD_DECLARE_SIMD_REGISTER(int, sve, detail::sve_vector_type<int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, sve, detail::sve_vector_type<unsigned int>);
        XSIMD_DECLARE_SIMD_REGISTER(long int, sve, detail::sve_vector_type<long int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, sve, detail::sve_vector_type<unsigned long int>);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, sve, detail::sve_vector_type<long long int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, sve, detail::sve_vector_type<unsigned long long int>);
        XSIMD_DECLARE_SIMD_REGISTER(float, sve, detail::sve_vector_type<float>);
        XSIMD_DECLARE_SIMD_REGISTER(double, sve, detail::sve_vector_type<double>);

        namespace detail
        {
            struct sve_bool_simd_register
            {
                using register_type = sve_bool_t;
                register_type data;
                operator register_type() const noexcept { return data; }
            };
        } // namespace detail

        template <class T>
        struct get_bool_simd_register<T, sve>
        {
            using type = detail::sve_bool_simd_register;
        };
    } // namespace types
#endif
} // namespace xsimd

#endif
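The detail::sve_vector_type_impl machinery above is a width-indexed type map: the element size in bits selects the matching fixed-size SVE vector alias, and std::conditional then picks the signed, unsigned, or floating-point flavor. A compiler-agnostic sketch of the same dispatch pattern, with plain integer types substituted for the SVE ones so it builds anywhere (the names here are invented for the sketch):

#include <cstddef>
#include <cstdint>
#include <type_traits>

template <std::size_t Bits>
struct width_map; // primary left undefined: unsupported widths fail to compile

template <> struct width_map<8>  { using signed_type = std::int8_t;  using unsigned_type = std::uint8_t; };
template <> struct width_map<16> { using signed_type = std::int16_t; using unsigned_type = std::uint16_t; };
template <> struct width_map<32> { using signed_type = std::int32_t; using unsigned_type = std::uint32_t; };
template <> struct width_map<64> { using signed_type = std::int64_t; using unsigned_type = std::uint64_t; };

// mirrors sve_vector_type: pick the signed/unsigned flavor by the input type's signedness
template <class T>
using vector_type = typename std::conditional<std::is_signed<T>::value,
                                              typename width_map<8 * sizeof(T)>::signed_type,
                                              typename width_map<8 * sizeof(T)>::unsigned_type>::type;

// assumes the usual 16-bit short / 32-bit int found on mainstream platforms
static_assert(std::is_same<vector_type<short>, std::int16_t>::value, "16-bit signed maps to int16");
static_assert(std::is_same<vector_type<unsigned int>, std::uint32_t>::value, "32-bit unsigned maps to uint32");

The arm_sve_vector_bits attribute in the real header is what turns the sizeless svint32_t and friends into fixed-width types the register machinery can store by value.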
@@ -1,251 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_TRAITS_HPP
#define XSIMD_TRAITS_HPP

#include <type_traits>

#include "xsimd_batch.hpp"

namespace xsimd
{

    /**************************************
     * simd_traits and revert_simd_traits *
     **************************************/

    template <class T, class A = default_arch>
    struct has_simd_register : types::has_simd_register<T, A>
    {
    };

    namespace detail
    {
        template <class T, bool>
        struct simd_traits_impl;

        template <class T>
        struct simd_traits_impl<T, false>
        {
            using type = T;
            using bool_type = bool;
            static constexpr size_t size = 1;
        };

        template <class T>
        constexpr size_t simd_traits_impl<T, false>::size;

        template <class T>
        struct simd_traits_impl<T, true>
        {
            using type = batch<T>;
            using bool_type = typename type::batch_bool_type;
            static constexpr size_t size = type::size;
        };

        template <class T>
        constexpr size_t simd_traits_impl<T, true>::size;

        template <class T, class A>
        struct static_check_supported_config_emitter
        {
            static_assert(A::supported(),
                          "usage of batch type with unsupported architecture");
            static_assert(!A::supported() || xsimd::has_simd_register<T, A>::value,
                          "usage of batch type with unsupported type");
        };

        template <class T, class A>
        struct static_check_supported_config_emitter<std::complex<T>, A> : static_check_supported_config_emitter<T, A>
        {
        };

#ifdef XSIMD_ENABLE_XTL_COMPLEX
        template <class T, class A, bool i3ec>
        struct static_check_supported_config_emitter<xtl::xcomplex<T, T, i3ec>, A> : static_check_supported_config_emitter<T, A>
        {
        };
#endif

        // consistency checker
        template <class T, class A>
        void static_check_supported_config()
        {
            (void)static_check_supported_config_emitter<T, A>();
        }
    }

    template <class T>
    struct simd_traits : detail::simd_traits_impl<T, xsimd::has_simd_register<T>::value>
    {
    };

    template <class T>
    struct simd_traits<std::complex<T>>
        : detail::simd_traits_impl<std::complex<T>, xsimd::has_simd_register<T>::value>
    {
    };

#ifdef XSIMD_ENABLE_XTL_COMPLEX
    template <class T, bool i3ec>
    struct simd_traits<xtl::xcomplex<T, T, i3ec>>
        : detail::simd_traits_impl<std::complex<T>, xsimd::has_simd_register<T>::value>
    {
    };
#endif

    template <class T>
    struct revert_simd_traits
    {
        using type = T;
        static constexpr size_t size = simd_traits<type>::size;
    };

    template <class T>
    constexpr size_t revert_simd_traits<T>::size;

    template <class T>
    struct revert_simd_traits<batch<T>>
    {
        using type = T;
        static constexpr size_t size = batch<T>::size;
    };

    template <class T>
    constexpr size_t revert_simd_traits<batch<T>>::size;

    template <class T>
    using simd_type = typename simd_traits<T>::type;

    template <class T>
    using simd_bool_type = typename simd_traits<T>::bool_type;

    template <class T>
    using revert_simd_type = typename revert_simd_traits<T>::type;

    /********************
     * simd_return_type *
     ********************/

    namespace detail
    {
        template <class T1, class T2>
        struct simd_condition
        {
            static constexpr bool value = (std::is_same<T1, T2>::value && !std::is_same<T1, bool>::value)
                || (std::is_same<T1, bool>::value && !std::is_same<T2, bool>::value)
                || std::is_same<T1, float>::value
                || std::is_same<T1, double>::value
                || std::is_same<T1, int8_t>::value
                || std::is_same<T1, uint8_t>::value
                || std::is_same<T1, int16_t>::value
                || std::is_same<T1, uint16_t>::value
                || std::is_same<T1, int32_t>::value
                || std::is_same<T1, uint32_t>::value
                || std::is_same<T1, int64_t>::value
                || std::is_same<T1, uint64_t>::value
                || std::is_same<T1, char>::value
                || detail::is_complex<T1>::value;
        };

        template <class T1, class T2, class A>
        struct simd_return_type_impl
            : std::enable_if<simd_condition<T1, T2>::value, batch<T2, A>>
        {
        };

        template <class T2, class A>
        struct simd_return_type_impl<bool, T2, A>
            : std::enable_if<simd_condition<bool, T2>::value, batch_bool<T2, A>>
        {
        };

        template <class T2, class A>
        struct simd_return_type_impl<bool, std::complex<T2>, A>
            : std::enable_if<simd_condition<bool, T2>::value, batch_bool<T2, A>>
        {
        };

        template <class T1, class T2, class A>
        struct simd_return_type_impl<std::complex<T1>, T2, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

        template <class T1, class T2, class A>
        struct simd_return_type_impl<std::complex<T1>, std::complex<T2>, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

#ifdef XSIMD_ENABLE_XTL_COMPLEX
        template <class T1, class T2, bool I3EC, class A>
        struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, T2, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

        template <class T1, class T2, bool I3EC, class A>
        struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, std::complex<T2>, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

        template <class T1, class T2, bool I3EC, class A>
        struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, xtl::xcomplex<T2, T2, I3EC>, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

        template <class T1, class T2, bool I3EC, class A>
        struct simd_return_type_impl<std::complex<T1>, xtl::xcomplex<T2, T2, I3EC>, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };
#endif
    }

    template <class T1, class T2, class A = default_arch>
    using simd_return_type = typename detail::simd_return_type_impl<T1, T2, A>::type;

    /************
     * is_batch *
     ************/

    template <class V>
    struct is_batch : std::false_type
    {
    };

    template <class T, class A>
    struct is_batch<batch<T, A>> : std::true_type
    {
    };

    /*****************
     * is_batch_bool *
     *****************/

    template <class V>
    struct is_batch_bool : std::false_type
    {
    };

    template <class T, class A>
    struct is_batch_bool<batch_bool<T, A>> : std::true_type
    {
    };

    /********************
     * is_batch_complex *
     ********************/

    template <class V>
    struct is_batch_complex : std::false_type
    {
    };

    template <class T, class A>
    struct is_batch_complex<batch<std::complex<T>, A>> : std::true_type
    {
    };
}

#endif
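simd_traits above degrades gracefully: when no SIMD register exists for T, `type` is just T and `size` is 1, so the same generic code runs on vectorizable and non-vectorizable types alike. A hedged standalone sketch of that fallback dispatch, with fake_batch standing in for xsimd::batch and a contrived "only float has a register" predicate (all names invented for the sketch):

#include <cstddef>
#include <type_traits>

template <class T>
struct fake_batch // stand-in for xsimd::batch<T>, illustration only
{
    static constexpr std::size_t size = 4;
};

// primary template is the scalar fallback (HasRegister == false)
template <class T, bool HasRegister>
struct traits_impl
{
    using type = T;
    static constexpr std::size_t size = 1;
};

// partial specialization is the vector path
template <class T>
struct traits_impl<T, true>
{
    using type = fake_batch<T>;
    static constexpr std::size_t size = fake_batch<T>::size;
};

// pretend only float has a register, as has_simd_register would report
template <class T>
struct my_traits : traits_impl<T, std::is_same<T, float>::value>
{
};

static_assert(my_traits<float>::size == 4, "float vectorizes");
static_assert(my_traits<long double>::size == 1, "long double falls back to scalar");
static_assert(std::is_same<my_traits<long double>::type, long double>::value, "scalar type preserved");

The static_check_supported_config_emitter in the real header plays a complementary role: it turns "batch of an unsupported type/arch" into a readable static_assert failure instead of a wall of substitution errors.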
@@ -1,530 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_UTILS_HPP
#define XSIMD_UTILS_HPP

#include <complex>
#include <cstdint>
#include <cstring>
#include <tuple>
#include <type_traits>

#ifdef XSIMD_ENABLE_XTL_COMPLEX
#include "xtl/xcomplex.hpp"
#endif

namespace xsimd
{

    template <class T, class A>
    class batch;

    template <class T, class A>
    class batch_bool;

    /**************
     * index *
     **************/

    template <size_t I>
    using index = std::integral_constant<size_t, I>;

    /**************
     * as_integer *
     **************/

    template <class T>
    struct as_integer : std::make_signed<T>
    {
    };

    template <>
    struct as_integer<float>
    {
        using type = int32_t;
    };

    template <>
    struct as_integer<double>
    {
        using type = int64_t;
    };

    template <class T, class A>
    struct as_integer<batch<T, A>>
    {
        using type = batch<typename as_integer<T>::type, A>;
    };

    template <class B>
    using as_integer_t = typename as_integer<B>::type;

    /***********************
     * as_unsigned_integer *
     ***********************/

    template <class T>
    struct as_unsigned_integer : std::make_unsigned<T>
    {
    };

    template <>
    struct as_unsigned_integer<float>
    {
        using type = uint32_t;
    };

    template <>
    struct as_unsigned_integer<double>
    {
        using type = uint64_t;
    };

    template <class T, class A>
    struct as_unsigned_integer<batch<T, A>>
    {
        using type = batch<typename as_unsigned_integer<T>::type, A>;
    };

    template <class T>
    using as_unsigned_integer_t = typename as_unsigned_integer<T>::type;

    /*********************
     * as_signed_integer *
     *********************/

    template <class T>
    struct as_signed_integer : std::make_signed<T>
    {
    };

    template <class T>
    using as_signed_integer_t = typename as_signed_integer<T>::type;

    /******************
     * flip_sign_type *
     ******************/

    namespace detail
    {
        template <class T, bool is_signed>
        struct flipped_sign_type_impl : std::make_signed<T>
        {
        };

        template <class T>
        struct flipped_sign_type_impl<T, true> : std::make_unsigned<T>
        {
        };
    }

    template <class T>
    struct flipped_sign_type
        : detail::flipped_sign_type_impl<T, std::is_signed<T>::value>
    {
    };

    template <class T>
    using flipped_sign_type_t = typename flipped_sign_type<T>::type;

    /***********
     * as_float *
     ************/

    template <class T>
    struct as_float;

    template <>
    struct as_float<int32_t>
    {
        using type = float;
    };

    template <>
    struct as_float<int64_t>
    {
        using type = double;
    };

    template <class T, class A>
    struct as_float<batch<T, A>>
    {
        using type = batch<typename as_float<T>::type, A>;
    };

    template <class T>
    using as_float_t = typename as_float<T>::type;

    /**************
     * as_logical *
     **************/

    template <class T>
    struct as_logical;

    template <class T, class A>
    struct as_logical<batch<T, A>>
    {
        using type = batch_bool<T, A>;
    };

    template <class T>
    using as_logical_t = typename as_logical<T>::type;

    /********************
     * bit_cast *
     ********************/

    template <class To, class From>
    inline To bit_cast(From val) noexcept
    {
        static_assert(sizeof(From) == sizeof(To), "casting between compatible layout");
        // FIXME: Some old version of GCC don't support that trait
        // static_assert(std::is_trivially_copyable<From>::value, "input type is trivially copyable");
        // static_assert(std::is_trivially_copyable<To>::value, "output type is trivially copyable");
        To res;
        std::memcpy(&res, &val, sizeof(val));
        return res;
    }

    namespace kernel
    {
        namespace detail
        {
            /**************************************
             * enabling / disabling metafunctions *
             **************************************/

            template <class T>
            using enable_integral_t = typename std::enable_if<std::is_integral<T>::value, int>::type;

            template <class T, size_t S>
            using enable_sized_signed_t = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value && sizeof(T) == S, int>::type;

            template <class T, size_t S>
            using enable_sized_unsigned_t = typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value && sizeof(T) == S, int>::type;

            template <class T, size_t S>
            using enable_sized_integral_t = typename std::enable_if<std::is_integral<T>::value && sizeof(T) == S, int>::type;

            template <class T, size_t S>
            using enable_sized_t = typename std::enable_if<sizeof(T) == S, int>::type;

            template <class T, size_t S>
            using enable_max_sized_integral_t = typename std::enable_if<std::is_integral<T>::value && sizeof(T) <= S, int>::type;

            /********************************
             * Matching & mismatching sizes *
             ********************************/

            template <class T, class U, class B = int>
            using sizes_match_t = typename std::enable_if<sizeof(T) == sizeof(U), B>::type;

            template <class T, class U, class B = int>
            using sizes_mismatch_t = typename std::enable_if<sizeof(T) != sizeof(U), B>::type;

            template <class T, class U, class B = int>
            using stride_match_t = typename std::enable_if<!std::is_same<T, U>::value && sizeof(T) == sizeof(U), B>::type;
        } // namespace detail
    } // namespace kernel

    /*****************************************
     * Backport of index_sequence from c++14 *
     *****************************************/

    // TODO: Remove this once we drop C++11 support
    namespace detail
    {
        template <typename T>
        struct identity
        {
            using type = T;
        };

#ifdef __cpp_lib_integer_sequence
        using std::index_sequence;
        using std::integer_sequence;
        using std::make_index_sequence;
        using std::make_integer_sequence;

        using std::index_sequence_for;
#else
        template <typename T, T... Is>
        struct integer_sequence
        {
            using value_type = T;
            static constexpr std::size_t size() noexcept { return sizeof...(Is); }
        };

        template <typename Lhs, typename Rhs>
        struct make_integer_sequence_concat;

        template <typename T, T... Lhs, T... Rhs>
        struct make_integer_sequence_concat<integer_sequence<T, Lhs...>,
                                            integer_sequence<T, Rhs...>>
            : identity<integer_sequence<T, Lhs..., (sizeof...(Lhs) + Rhs)...>>
        {
        };

        template <typename T>
        struct make_integer_sequence_impl;

        template <typename T>
        struct make_integer_sequence_impl<std::integral_constant<T, (T)0>> : identity<integer_sequence<T>>
        {
        };

        template <typename T>
        struct make_integer_sequence_impl<std::integral_constant<T, (T)1>> : identity<integer_sequence<T, 0>>
        {
        };

        template <typename T, T N>
        struct make_integer_sequence_impl<std::integral_constant<T, N>>
            : make_integer_sequence_concat<typename make_integer_sequence_impl<std::integral_constant<T, N / 2>>::type,
                                           typename make_integer_sequence_impl<std::integral_constant<T, N - (N / 2)>>::type>
        {
        };

        template <typename T, T N>
        using make_integer_sequence = typename make_integer_sequence_impl<std::integral_constant<T, N>>::type;

        template <std::size_t... Is>
        using index_sequence = integer_sequence<std::size_t, Is...>;

        template <std::size_t N>
        using make_index_sequence = make_integer_sequence<std::size_t, N>;

        template <typename... Ts>
        using index_sequence_for = make_index_sequence<sizeof...(Ts)>;

#endif

        template <int... Is>
        using int_sequence = integer_sequence<int, Is...>;

        template <int N>
        using make_int_sequence = make_integer_sequence<int, N>;

        template <typename... Ts>
        using int_sequence_for = make_int_sequence<(int)sizeof...(Ts)>;

        // Type-casted index sequence.
        template <class P, size_t... Is>
        inline P indexes_from(index_sequence<Is...>) noexcept
        {
            return { static_cast<typename P::value_type>(Is)... };
        }

        template <class P>
        inline P make_sequence_as_batch() noexcept
        {
            return indexes_from<P>(make_index_sequence<P::size>());
        }
    }

    /***********************************
     * Backport of std::get from C++14 *
     ***********************************/

    namespace detail
    {
        template <class T, class... Types, size_t I, size_t... Is>
        inline const T& get_impl(const std::tuple<Types...>& t, std::is_same<T, T>, index_sequence<I, Is...>) noexcept
        {
            return std::get<I>(t);
        }

        template <class T, class U, class... Types, size_t I, size_t... Is>
        inline const T& get_impl(const std::tuple<Types...>& t, std::is_same<T, U>, index_sequence<I, Is...>) noexcept
        {
            using tuple_elem = typename std::tuple_element<I + 1, std::tuple<Types...>>::type;
            return get_impl<T>(t, std::is_same<T, tuple_elem>(), index_sequence<Is...>());
        }

        template <class T, class... Types>
        inline const T& get(const std::tuple<Types...>& t) noexcept
        {
            using tuple_elem = typename std::tuple_element<0, std::tuple<Types...>>::type;
            return get_impl<T>(t, std::is_same<T, tuple_elem>(), make_index_sequence<sizeof...(Types)>());
        }
    }

    /*********************************
     * Backport of void_t from C++17 *
     *********************************/

    namespace detail
    {
        template <class... T>
        struct make_void
        {
            using type = void;
        };

        template <class... T>
        using void_t = typename make_void<T...>::type;
    }

    /**************************************************
     * Equivalent of void_t but with size_t parameter *
     **************************************************/

    namespace detail
    {
        template <std::size_t>
        struct check_size
        {
            using type = void;
        };

        template <std::size_t S>
        using check_size_t = typename check_size<S>::type;
    }

    /*****************************************
     * Supplementary std::array constructors *
     *****************************************/

    namespace detail
    {
        // std::array constructor from scalar value ("broadcast")
        template <typename T, std::size_t... Is>
        inline constexpr std::array<T, sizeof...(Is)>
        array_from_scalar_impl(const T& scalar, index_sequence<Is...>) noexcept
        {
            // You can safely ignore this silly ternary, the "scalar" is all
            // that matters. The rest is just a dirty workaround...
            return std::array<T, sizeof...(Is)> { (Is + 1) ? scalar : T()... };
        }

        template <typename T, std::size_t N>
        inline constexpr std::array<T, N>
        array_from_scalar(const T& scalar) noexcept
        {
            return array_from_scalar_impl(scalar, make_index_sequence<N>());
        }

        // std::array constructor from C-style pointer (handled as an array)
        template <typename T, std::size_t... Is>
        inline constexpr std::array<T, sizeof...(Is)>
        array_from_pointer_impl(const T* c_array, index_sequence<Is...>) noexcept
        {
            return std::array<T, sizeof...(Is)> { c_array[Is]... };
        }

        template <typename T, std::size_t N>
        inline constexpr std::array<T, N>
        array_from_pointer(const T* c_array) noexcept
        {
            return array_from_pointer_impl(c_array, make_index_sequence<N>());
        }
    }

    /************************
     * is_array_initializer *
     ************************/

    namespace detail
    {
        template <bool...>
        struct bool_pack;

        template <bool... bs>
        using all_true = std::is_same<
            bool_pack<bs..., true>, bool_pack<true, bs...>>;

        template <typename T, typename... Args>
        using is_all_convertible = all_true<std::is_convertible<Args, T>::value...>;

        template <typename T, std::size_t N, typename... Args>
        using is_array_initializer = std::enable_if<
            (sizeof...(Args) == N) && is_all_convertible<T, Args...>::value>;

        // Check that a variadic argument pack is a list of N values of type T,
        // as usable for instantiating a value of type std::array<T, N>.
        template <typename T, std::size_t N, typename... Args>
        using is_array_initializer_t = typename is_array_initializer<T, N, Args...>::type;
    }

    /**************
     * is_complex *
     **************/

    // This is used in both xsimd_complex_base.hpp and xsimd_traits.hpp
    // However xsimd_traits.hpp indirectly includes xsimd_complex_base.hpp
    // so we cannot define is_complex in xsimd_traits.hpp. Besides, if
    // no file defining batches is included, we still need this definition
    // in xsimd_traits.hpp, so let's define it here.

    namespace detail
    {
        template <class T>
        struct is_complex : std::false_type
        {
        };

        template <class T>
        struct is_complex<std::complex<T>> : std::true_type
        {
        };

#ifdef XSIMD_ENABLE_XTL_COMPLEX
        template <class T, bool i3ec>
        struct is_complex<xtl::xcomplex<T, T, i3ec>> : std::true_type
        {
        };
#endif
    }

    /*******************
     * real_batch_type *
     *******************/

    template <class B>
    struct real_batch_type
    {
        using type = B;
    };

    template <class T, class A>
    struct real_batch_type<batch<std::complex<T>, A>>
    {
        using type = batch<T, A>;
    };

    template <class B>
    using real_batch_type_t = typename real_batch_type<B>::type;

    /**********************
     * complex_batch_type *
     **********************/

    template <class B>
    struct complex_batch_type
    {
        using real_value_type = typename B::value_type;
        using arch_type = typename B::arch_type;
        using type = batch<std::complex<real_value_type>, arch_type>;
    };

    template <class T, class A>
    struct complex_batch_type<batch<std::complex<T>, A>>
    {
        using type = batch<std::complex<T>, A>;
    };

    template <class B>
    using complex_batch_type_t = typename complex_batch_type<B>::type;
}

#endif
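The bit_cast helper in the file above is the standard pre-C++20 memcpy idiom for type punning: copying the bytes is well-defined where reading through a reinterpret_cast pointer is not. A usage sketch, with the helper reproduced from the header (the float constant and the expected bit pattern are just example values):

#include <cstdint>
#include <cstdio>
#include <cstring>

template <class To, class From>
inline To bit_cast(From val) noexcept
{
    static_assert(sizeof(From) == sizeof(To), "casting between compatible layout");
    To res;
    std::memcpy(&res, &val, sizeof(val)); // well-defined, unlike a reinterpret_cast read
    return res;
}

int main()
{
    // inspect the IEEE-754 bit pattern of 1.0f: sign 0, biased exponent 127, mantissa 0
    std::uint32_t bits = bit_cast<std::uint32_t>(1.0f);
    std::printf("0x%08X\n", bits); // prints 0x3F800000
    return 0;
}

Compilers recognize the memcpy and compile it down to a plain register move, which is why SIMD code can afford to use it on hot paths.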
@@ -1,68 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_HPP
#define XSIMD_HPP

#if defined(__has_cpp_attribute)
// if this check passes, then the compiler supports feature test macros
#if __has_cpp_attribute(nodiscard) >= 201603L
// if this check passes, then the compiler supports [[nodiscard]] without a message
#define XSIMD_NO_DISCARD [[nodiscard]]
#endif
#endif

#if !defined(XSIMD_NO_DISCARD) && __cplusplus >= 201703L
// this means that the previous tests failed, but we are using C++17 or higher
#define XSIMD_NO_DISCARD [[nodiscard]]
#endif

#if !defined(XSIMD_NO_DISCARD) && (defined(__GNUC__) || defined(__clang__))
// this means that the previous checks failed, but we are using GCC or Clang
#define XSIMD_NO_DISCARD __attribute__((warn_unused_result))
#endif

#if !defined(XSIMD_NO_DISCARD)
// this means that all the previous checks failed, so we fall back to doing nothing
#define XSIMD_NO_DISCARD
#endif

#ifdef __cpp_if_constexpr
// this means that the compiler supports the `if constexpr` construct
#define XSIMD_IF_CONSTEXPR if constexpr
#endif

#if !defined(XSIMD_IF_CONSTEXPR) && __cplusplus >= 201703L
// this means that the previous test failed, but we are using C++17 or higher
#define XSIMD_IF_CONSTEXPR if constexpr
#endif

#if !defined(XSIMD_IF_CONSTEXPR)
// this means that all the previous checks failed, so we fall back to a normal `if`
#define XSIMD_IF_CONSTEXPR if
#endif

#include "config/xsimd_config.hpp"

#include "arch/xsimd_scalar.hpp"
#include "memory/xsimd_aligned_allocator.hpp"

#if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
// no type definitions or anything apart from the scalar definitions and the aligned allocator
#else
#include "types/xsimd_batch.hpp"
#include "types/xsimd_batch_constant.hpp"
#include "types/xsimd_traits.hpp"

// This include must come last
#include "types/xsimd_api.hpp"
#endif
#endif
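The cascade above defines each macro at the first check that passes, and every later block is disabled by its !defined(...) guard, so the fallback chain degrades from the exact feature test down to a harmless no-op. A minimal sketch of the same pattern reduced to two steps, plus a use of the resulting macro (the SKETCH_IF_CONSTEXPR name and the load function are hypothetical, not part of xsimd):

#include <cstdio>

// same fallback idea as in xsimd.hpp, reduced to two steps for the sketch
#ifdef __cpp_if_constexpr
#define SKETCH_IF_CONSTEXPR if constexpr
#else
#define SKETCH_IF_CONSTEXPR if
#endif

template <bool Aligned>
void load(const float* p)
{
    // under C++17 the dead branch is discarded at compile time;
    // under C++11/14 this degrades to an ordinary runtime if
    SKETCH_IF_CONSTEXPR (Aligned)
        std::printf("aligned load at %p\n", static_cast<const void*>(p));
    else
        std::printf("unaligned load at %p\n", static_cast<const void*>(p));
}

int main()
{
    float buf[4] = { 0 };
    load<true>(buf);
    load<false>(buf);
    return 0;
}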
@@ -1,37 +0,0 @@
schema: 1

bugzilla:
  product: Toolkit
  component: "General"

origin:
  name: xsimd
  description: C++ wrappers for SIMD intrinsics

  url: https://github.com/QuantStack/xsimd

  release: 5186173c33515769d49bae8cb8bc8469770427b8 (2022-12-06T11:35:51Z).
  revision: 5186173c33515769d49bae8cb8bc8469770427b8

  license: BSD-3-Clause

vendoring:
  url: https://github.com/QuantStack/xsimd
  source-hosting: github
  tracking: commit

  exclude:
    - ".*"
    - "*.md"
    - "*.yml"
    - "*.txt"
    - "*.in"
    - "*.sh"
    - benchmark
    - cmake
    - docs
    - examples
    - test

  keep:
    - include/