Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1801557 - import xsimd to third_party r=glandium
Differential Revision: https://phabricator.services.mozilla.com/D162537
This commit is contained in:
Parent
ec72d27e4d
Commit
46a6cbf6ca
@@ -40,6 +40,9 @@ with Files('rust/**'):
with Files('webkit/**'):
    BUG_COMPONENT = ('Firefox Build System', 'General')

with Files('xsimd/**'):
    BUG_COMPONENT = ('Firefox Build System', 'General')

with Files('prio/**'):
    BUG_COMPONENT = ('Firefox Build System', 'General')
@@ -0,0 +1,29 @@
Copyright (c) 2016, Johan Mabille, Sylvain Corlay, Wolf Vollprecht and Martin Renou
Copyright (c) 2016, QuantStack
Copyright (c) 2018, Serge Guelton
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
152 third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp vendored Normal file
@@ -0,0 +1,152 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_ARITHMETIC_HPP
#define XSIMD_GENERIC_ARITHMETIC_HPP

#include <complex>
#include <type_traits>

#include "./xsimd_generic_details.hpp"

namespace xsimd
{

    namespace kernel
    {

        using namespace types;

        // bitwise_lshift
        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept
                                 { return x << y; },
                                 self, other);
        }

        // bitwise_rshift
        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept
                                 { return x >> y; },
                                 self, other);
        }

        // div
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> div(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept -> T
                                 { return x / y; },
                                 self, other);
        }

        // fma
        template <class A, class T>
        inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
        {
            return x * y + z;
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> fma(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            auto res_r = fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real()));
            auto res_i = fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag()));
            return { res_r, res_i };
        }

        // fms
        template <class A, class T>
        inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
        {
            return x * y - z;
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> fms(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            auto res_r = fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real()));
            auto res_i = fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag()));
            return { res_r, res_i };
        }

        // fnma
        template <class A, class T>
        inline batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
        {
            return -x * y + z;
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> fnma(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            auto res_r = -fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real()));
            auto res_i = -fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag()));
            return { res_r, res_i };
        }

        // fnms
        template <class A, class T>
        inline batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
        {
            return -x * y - z;
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> fnms(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            auto res_r = -fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real()));
            auto res_i = -fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag()));
            return { res_r, res_i };
        }

        // mul
        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept -> T
                                 { return x * y; },
                                 self, other);
        }

        // sadd
        template <class A>
        inline batch<float, A> sadd(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
        {
            return add(self, other); // no saturated arithmetic on floating point numbers
        }
        template <class A>
        inline batch<double, A> sadd(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
        {
            return add(self, other); // no saturated arithmetic on floating point numbers
        }

        // ssub
        template <class A>
        inline batch<float, A> ssub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
        {
            return sub(self, other); // no saturated arithmetic on floating point numbers
        }
        template <class A>
        inline batch<double, A> ssub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
        {
            return sub(self, other); // no saturated arithmetic on floating point numbers
        }

    }

}

#endif
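
For reference, the complex fma/fms overloads above just expand the complex product: (a+bi)(c+di) + (e+fi) = (ac - bd + e) + (ad + bc + f)i, with the fms/fma nesting folding the additions into fused operations. A minimal scalar check of that identity (an editor's illustration, not part of the imported sources):

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
    std::complex<double> x(1.5, -2.0), y(0.5, 3.0), z(-1.0, 4.0);
    // res_r = fms(xr, yr, fms(xi, yi, zr)) = xr*yr - (xi*yi - zr)
    double res_r = x.real() * y.real() - (x.imag() * y.imag() - z.real());
    // res_i = fma(xr, yi, fma(xi, yr, zi)) = xr*yi + (xi*yr + zi)
    double res_i = x.real() * y.imag() + (x.imag() * y.real() + z.imag());
    std::complex<double> expected = x * y + z;
    assert(std::abs(expected.real() - res_r) < 1e-12);
    assert(std::abs(expected.imag() - res_i) < 1e-12);
    return 0;
}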
@@ -0,0 +1,96 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_COMPLEX_HPP
#define XSIMD_GENERIC_COMPLEX_HPP

#include <complex>

#include "./xsimd_generic_details.hpp"

namespace xsimd
{

    namespace kernel
    {

        using namespace types;

        // real
        template <class A, class T>
        inline batch<T, A> real(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return self;
        }

        template <class A, class T>
        inline batch<T, A> real(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
        {
            return self.real();
        }

        // imag
        template <class A, class T>
        inline batch<T, A> imag(batch<T, A> const& /*self*/, requires_arch<generic>) noexcept
        {
            return batch<T, A>(T(0));
        }

        template <class A, class T>
        inline batch<T, A> imag(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
        {
            return self.imag();
        }

        // arg
        template <class A, class T>
        inline real_batch_type_t<batch<T, A>> arg(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return atan2(imag(self), real(self));
        }

        // conj
        template <class A, class T>
        inline complex_batch_type_t<batch<T, A>> conj(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return { real(self), -imag(self) };
        }

        // norm
        template <class A, class T>
        inline real_batch_type_t<batch<T, A>> norm(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return { fma(real(self), real(self), imag(self) * imag(self)) };
        }

        // proj
        template <class A, class T>
        inline complex_batch_type_t<batch<T, A>> proj(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = complex_batch_type_t<batch<T, A>>;
            using real_batch = typename batch_type::real_batch;
            using real_value_type = typename real_batch::value_type;
            auto cond = xsimd::isinf(real(self)) || xsimd::isinf(imag(self));
            return select(cond,
                          batch_type(constants::infinity<real_batch>(),
                                     copysign(real_batch(real_value_type(0)), imag(self))),
                          batch_type(self));
        }

        template <class A, class T>
        inline batch_bool<T, A> isnan(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
        {
            return batch_bool<T, A>(isnan(self.real()) || isnan(self.imag()));
        }
    }
}

#endif
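
Note that norm above is the squared magnitude (the analogue of std::norm), not the modulus: for a real batch it reduces to self * self, and for a complex batch fma(re, re, im * im) yields re² + im². A short scalar sketch of the distinction (an editor's illustration, not part of the imported sources):

#include <cassert>
#include <complex>

int main()
{
    std::complex<double> z(3.0, 4.0);
    assert(std::norm(z) == 25.0); // re*re + im*im, what xsimd's norm computes
    assert(std::abs(z) == 5.0);   // the modulus, a different function
    return 0;
}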
@@ -0,0 +1,239 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_DETAILS_HPP
#define XSIMD_GENERIC_DETAILS_HPP

#include <complex>

#include "../../math/xsimd_rem_pio2.hpp"
#include "../../types/xsimd_generic_arch.hpp"
#include "../../types/xsimd_utils.hpp"
#include "../xsimd_constants.hpp"

namespace xsimd
{
    // Forward declarations. Should we put them in a separate file?
    template <class T, class A>
    inline batch<T, A> abs(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> abs(batch<std::complex<T>, A> const& self) noexcept;
    template <class T, class A>
    inline bool any(batch_bool<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other) noexcept;
    template <class A, class T_out, class T_in>
    inline batch<T_out, A> batch_cast(batch<T_in, A> const&, batch<T_out, A> const& out) noexcept;
    template <class T, class A>
    inline batch<T, A> bitofsign(batch<T, A> const& self) noexcept;
    template <class B, class T, class A>
    inline B bitwise_cast(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> cos(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> cosh(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> exp(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
    template <class T, class A>
    inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
    template <class T, class A>
    inline batch<T, A> frexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
    template <class T, class A, uint64_t... Coefs>
    inline batch<T, A> horner(const batch<T, A>& self) noexcept;
    template <class T, class A>
    inline batch<T, A> hypot(const batch<T, A>& self) noexcept;
    template <class T, class A>
    inline batch_bool<T, A> is_even(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch_bool<T, A> is_flint(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch_bool<T, A> is_odd(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch_bool<T, A> isinf(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline typename batch<T, A>::batch_bool_type isnan(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
    template <class T, class A>
    inline batch<T, A> log(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> nearbyint(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<as_integer_t<T>, A> nearbyint_as_int(const batch<T, A>& x) noexcept;
    template <class T, class A>
    inline T reduce_add(batch<T, A> const&) noexcept;
    template <class T, class A>
    inline batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
    template <class T, class A>
    inline batch<std::complex<T>, A> select(batch_bool<T, A> const&, batch<std::complex<T>, A> const&, batch<std::complex<T>, A> const&) noexcept;
    template <class T, class A>
    inline batch<T, A> sign(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> signnz(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> sin(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> sinh(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> sqrt(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> tan(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<as_float_t<T>, A> to_float(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<as_integer_t<T>, A> to_int(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> trunc(batch<T, A> const& self) noexcept;

    namespace kernel
    {

        namespace detail
        {
            template <class F, class A, class T, class... Batches>
            inline batch<T, A> apply(F&& func, batch<T, A> const& self, batch<T, A> const& other) noexcept
            {
                constexpr std::size_t size = batch<T, A>::size;
                alignas(A::alignment()) T self_buffer[size];
                alignas(A::alignment()) T other_buffer[size];
                self.store_aligned(&self_buffer[0]);
                other.store_aligned(&other_buffer[0]);
                for (std::size_t i = 0; i < size; ++i)
                {
                    self_buffer[i] = func(self_buffer[i], other_buffer[i]);
                }
                return batch<T, A>::load_aligned(self_buffer);
            }

            template <class U, class F, class A, class T>
            inline batch<U, A> apply_transform(F&& func, batch<T, A> const& self) noexcept
            {
                static_assert(batch<T, A>::size == batch<U, A>::size,
                              "Source and destination sizes must match");
                constexpr std::size_t src_size = batch<T, A>::size;
                constexpr std::size_t dest_size = batch<U, A>::size;
                alignas(A::alignment()) T self_buffer[src_size];
                alignas(A::alignment()) U other_buffer[dest_size];
                self.store_aligned(&self_buffer[0]);
                for (std::size_t i = 0; i < src_size; ++i)
                {
                    other_buffer[i] = func(self_buffer[i]);
                }
                return batch<U, A>::load_aligned(other_buffer);
            }
        }

        namespace detail
        {
            // Generic conversion handling machinery. Each architecture must define
            // a conversion function when such a conversion exists in the form of an
            // intrinsic. Then we use that information to automatically decide whether
            // to use scalar or vector conversion when doing load / store / batch_cast.
            struct with_fast_conversion
            {
            };
            struct with_slow_conversion
            {
            };

            template <class A, class From, class To, class = void>
            struct conversion_type_impl
            {
                using type = with_slow_conversion;
            };

            using xsimd::detail::void_t;

            template <class A, class From, class To>
            struct conversion_type_impl<A, From, To,
                                        void_t<decltype(fast_cast(std::declval<const batch<From, A>&>(),
                                                                  std::declval<const batch<To, A>&>(),
                                                                  std::declval<const A&>()))>>
            {
                using type = with_fast_conversion;
            };

            template <class A, class From, class To>
            using conversion_type = typename conversion_type_impl<A, From, To>::type;
        }
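
        // Note (editor): a minimal illustration of the detection above, assuming
        // some architecture `Arch` provides an overload
        //   fast_cast(batch<int32_t, Arch> const&, batch<float, Arch> const&, Arch)
        // in this namespace. Then
        //   conversion_type<Arch, int32_t, float>  ->  with_fast_conversion
        // because void_t<decltype(fast_cast(...))> is well-formed and the partial
        // specialization is selected; for a pair with no such overload the primary
        // template yields with_slow_conversion, and the element-by-element copy
        // path in load_aligned / load_unaligned (xsimd_generic_memory.hpp) is
        // taken instead.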

        namespace detail
        {
            /* origin: boost/simdfunction/horn.hpp*/
            /*
             * ====================================================
             * copyright 2016 NumScale SAS
             *
             * Distributed under the Boost Software License, Version 1.0.
             * (See copy at http://boost.org/LICENSE_1_0.txt)
             * ====================================================
             */
            template <class B, uint64_t c>
            inline B coef() noexcept
            {
                using value_type = typename B::value_type;
                return B(bit_cast<value_type>(as_unsigned_integer_t<value_type>(c)));
            }
            template <class B>
            inline B horner(const B&) noexcept
            {
                return B(typename B::value_type(0.));
            }

            template <class B, uint64_t c0>
            inline B horner(const B&) noexcept
            {
                return coef<B, c0>();
            }

            template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
            inline B horner(const B& self) noexcept
            {
                return fma(self, horner<B, c1, args...>(self), coef<B, c0>());
            }
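
            // Note (editor): these overloads implement Horner's scheme. With the
            // template arguments c0..cn taken as bit patterns of the coefficients
            // (decoded by coef<B, c>()), horner<B, c0, ..., cn>(x) evaluates
            //   p(x) = c0 + x*(c1 + x*(... + x*cn)) = c0 + c1*x + ... + cn*x^n
            // using one fma per coefficient. horner1 below is the variant whose
            // highest-degree coefficient is implicitly 1.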

            /* origin: boost/simdfunction/horn1.hpp*/
            /*
             * ====================================================
             * copyright 2016 NumScale SAS
             *
             * Distributed under the Boost Software License, Version 1.0.
             * (See copy at http://boost.org/LICENSE_1_0.txt)
             * ====================================================
             */
            template <class B>
            inline B horner1(const B&) noexcept
            {
                return B(1.);
            }

            template <class B, uint64_t c0>
            inline B horner1(const B& x) noexcept
            {
                return x + detail::coef<B, c0>();
            }

            template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
            inline B horner1(const B& x) noexcept
            {
                return fma(x, horner1<B, c1, args...>(x), detail::coef<B, c0>());
            }
        }

    }

}

#endif
@@ -0,0 +1,163 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_LOGICAL_HPP
#define XSIMD_GENERIC_LOGICAL_HPP

#include "./xsimd_generic_details.hpp"

namespace xsimd
{

    namespace kernel
    {

        using namespace types;

        // from mask
        template <class A, class T>
        inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
            // This is inefficient but should never be called. It's just a
            // temporary implementation until arm support is added.
            for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
                buffer[i] = mask & (1ull << i);
            return batch_bool<T, A>::load_aligned(buffer);
        }

        // ge
        template <class A, class T>
        inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return other <= self;
        }

        // gt
        template <class A, class T>
        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return other < self;
        }

        // is_even
        template <class A, class T>
        inline batch_bool<T, A> is_even(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return is_flint(self * T(0.5));
        }

        // is_flint
        template <class A, class T>
        inline batch_bool<T, A> is_flint(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            auto frac = select(isnan(self - self), constants::nan<batch<T, A>>(), self - trunc(self));
            return frac == T(0.);
        }

        // is_odd
        template <class A, class T>
        inline batch_bool<T, A> is_odd(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return is_even(self - T(1.));
        }

        // isinf
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> isinf(batch<T, A> const&, requires_arch<generic>) noexcept
        {
            return batch_bool<T, A>(false);
        }
        template <class A>
        inline batch_bool<float, A> isinf(batch<float, A> const& self, requires_arch<generic>) noexcept
        {
            return abs(self) == std::numeric_limits<float>::infinity();
        }
        template <class A>
        inline batch_bool<double, A> isinf(batch<double, A> const& self, requires_arch<generic>) noexcept
        {
            return abs(self) == std::numeric_limits<double>::infinity();
        }

        // isfinite
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> isfinite(batch<T, A> const&, requires_arch<generic>) noexcept
        {
            return batch_bool<T, A>(true);
        }
        template <class A>
        inline batch_bool<float, A> isfinite(batch<float, A> const& self, requires_arch<generic>) noexcept
        {
            return (self - self) == 0.f;
        }
        template <class A>
        inline batch_bool<double, A> isfinite(batch<double, A> const& self, requires_arch<generic>) noexcept
        {
            return (self - self) == 0.;
        }

        // isnan
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> isnan(batch<T, A> const&, requires_arch<generic>) noexcept
        {
            return batch_bool<T, A>(false);
        }

        // le
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return (self < other) || (self == other);
        }

        // neq
        template <class A, class T>
        inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return !(other == self);
        }

        // logical_and
        template <class A, class T>
        inline batch<T, A> logical_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept
                                 { return x && y; },
                                 self, other);
        }

        // logical_or
        template <class A, class T>
        inline batch<T, A> logical_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept
                                 { return x || y; },
                                 self, other);
        }

        // mask
        template <class A, class T>
        inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
            self.store_aligned(buffer);
            // This is inefficient but should never be called. It's just a
            // temporary implementation until arm support is added.
            uint64_t res = 0;
            for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
                if (buffer[i])
                    res |= 1ul << i;
            return res;
        }
    }
}

#endif
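
The two bitmask helpers above, from_mask and mask, are inverses of each other: bit i of the integer corresponds to lane i of the predicate. A minimal sketch of the round trip through the public batch_bool API, assuming it dispatches to these generic kernels on the host architecture (an editor's illustration, not part of the imported sources):

#include <cassert>
#include <xsimd/xsimd.hpp>

int main()
{
    using batch_bool = xsimd::batch_bool<float>;
    // Lane i is set iff bit i of the mask is set (float batches have >= 4 lanes).
    batch_bool b = batch_bool::from_mask(0b0101);
    // mask() packs the lanes back into an integer, recovering the input bits.
    assert((b.mask() & 0b1111) == 0b0101);
    return 0;
}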
The diff for one file is not shown because of its large size.
@@ -0,0 +1,397 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_MEMORY_HPP
#define XSIMD_GENERIC_MEMORY_HPP

#include <algorithm>
#include <complex>
#include <stdexcept>

#include "../../types/xsimd_batch_constant.hpp"
#include "./xsimd_generic_details.hpp"

namespace xsimd
{
    template <class batch_type, typename batch_type::value_type... Values>
    struct batch_constant;

    namespace kernel
    {

        using namespace types;

        // extract_pair
        template <class A, class T>
        inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<generic>) noexcept
        {
            constexpr std::size_t size = batch<T, A>::size;
            assert(i < size && "index in bounds");

            alignas(A::alignment()) T self_buffer[size];
            self.store_aligned(self_buffer);

            alignas(A::alignment()) T other_buffer[size];
            other.store_aligned(other_buffer);

            alignas(A::alignment()) T concat_buffer[size];

            for (std::size_t j = 0; j < (size - i); ++j)
            {
                concat_buffer[j] = other_buffer[i + j];
                if (j < i)
                {
                    concat_buffer[size - 1 - j] = self_buffer[i - 1 - j];
                }
            }
            return batch<T, A>::load_aligned(concat_buffer);
        }
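
        // Note (editor): worked example of extract_pair, assuming size == 4,
        // self = [s0, s1, s2, s3], other = [o0, o1, o2, o3], i == 1:
        //   concat_buffer = [o1, o2, o3, s0]
        // i.e. the result is the concatenation (other, self) shifted left
        // by i lanes.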

        // gather
        namespace detail
        {
            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
            inline batch<T, A> gather(U const* src, batch<V, A> const& index,
                                      ::xsimd::index<N> I) noexcept
            {
                return insert(batch<T, A> {}, static_cast<T>(src[index.get(I)]), I);
            }

            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
            inline batch<T, A>
            gather(U const* src, batch<V, A> const& index, ::xsimd::index<N> I) noexcept
            {
                static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");

                const auto test = gather<N - 1, T, A>(src, index, {});
                return insert(test, static_cast<T>(src[index.get(I)]), I);
            }
        } // namespace detail

        template <typename T, typename A, typename V>
        inline batch<T, A>
        gather(batch<T, A> const&, T const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
        }

        // Gather with runtime indexes and mismatched strides.
        template <typename T, typename A, typename U, typename V>
        inline detail::sizes_mismatch_t<T, U, batch<T, A>>
        gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
        }

        // Gather with runtime indexes and matching strides.
        template <typename T, typename A, typename U, typename V>
        inline detail::stride_match_t<T, U, batch<T, A>>
        gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return batch_cast<T>(kernel::gather(batch<U, A> {}, src, index, A {}));
        }
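
        // Note (editor): the detail::gather recursion above unrolls at compile
        // time, one insert per lane. For a hypothetical 4-lane batch with
        //   double src[] = { 10., 20., 30., 40. };  index = [3, 0, 2, 1]
        // the generic gather produces [src[3], src[0], src[2], src[1]]
        // = [40., 10., 30., 20.]. The scatter kernels below are the inverse
        // operation, writing lane I of src to dst[index.get(I)].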

        // insert
        template <class A, class T, size_t I>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept
        {
            struct index_mask
            {
                static constexpr bool get(size_t index, size_t /* size*/)
                {
                    return index != I;
                }
            };
            batch<T, A> tmp(val);
            return select(make_batch_bool_constant<batch<T, A>, index_mask>(), self, tmp);
        }

        // get
        template <class A, size_t I, class T>
        inline T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, size_t I, class T>
        inline T get(batch_bool<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch_bool<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, size_t I, class T>
        inline auto get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
        {
            alignas(A::alignment()) T buffer[batch<std::complex<T>, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, class T>
        inline T get(batch<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        template <class A, class T>
        inline T get(batch_bool<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        template <class A, class T>
        inline auto get(batch<std::complex<T>, A> const& self, std::size_t i, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
        {
            using T2 = typename batch<std::complex<T>, A>::value_type;
            alignas(A::alignment()) T2 buffer[batch<std::complex<T>, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        // load_aligned
        namespace detail
        {
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
            {
                using batch_type_in = batch<T_in, A>;
                using batch_type_out = batch<T_out, A>;
                return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A {});
            }
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_slow_conversion) noexcept
            {
                static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
                using batch_type_out = batch<T_out, A>;
                alignas(A::alignment()) T_out buffer[batch_type_out::size];
                std::copy(mem, mem + batch_type_out::size, std::begin(buffer));
                return batch_type_out::load_aligned(buffer);
            }
        }
        template <class A, class T_in, class T_out>
        inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
        {
            return detail::load_aligned<A>(mem, cvt, A {}, detail::conversion_type<A, T_in, T_out> {});
        }

        // load_unaligned
        namespace detail
        {
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
            {
                using batch_type_in = batch<T_in, A>;
                using batch_type_out = batch<T_out, A>;
                return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A {});
            }

            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>, with_slow_conversion) noexcept
            {
                static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
                return load_aligned<A>(mem, cvt, generic {}, with_slow_conversion {});
            }
        }
        template <class A, class T_in, class T_out>
        inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
        {
            return detail::load_unaligned<A>(mem, cvt, generic {}, detail::conversion_type<A, T_in, T_out> {});
        }

        namespace detail
        {
            // Scatter with runtime indexes.
            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
            inline void scatter(batch<T, A> const& src, U* dst,
                                batch<V, A> const& index,
                                ::xsimd::index<N> I) noexcept
            {
                dst[index.get(I)] = static_cast<U>(src.get(I));
            }

            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
            inline void
            scatter(batch<T, A> const& src, U* dst, batch<V, A> const& index,
                    ::xsimd::index<N> I) noexcept
            {
                static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");

                kernel::detail::scatter<N - 1, T, A, U, V>(
                    src, dst, index, {});
                dst[index.get(I)] = static_cast<U>(src.get(I));
            }
        } // namespace detail

        template <typename A, typename T, typename V>
        inline void
        scatter(batch<T, A> const& src, T* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            kernel::detail::scatter<batch<V, A>::size - 1, T, A, T, V>(
                src, dst, index, {});
        }

        template <typename A, typename T, typename U, typename V>
        inline detail::sizes_mismatch_t<T, U, void>
        scatter(batch<T, A> const& src, U* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            kernel::detail::scatter<batch<V, A>::size - 1, T, A, U, V>(
                src, dst, index, {});
        }

        template <typename A, typename T, typename U, typename V>
        inline detail::stride_match_t<T, U, void>
        scatter(batch<T, A> const& src, U* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            const auto tmp = batch_cast<U>(src);
            kernel::scatter<A>(tmp, dst, index, A {});
        }

        // store
        template <class T, class A>
        inline void store(batch_bool<T, A> const& self, bool* mem, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            constexpr auto size = batch_bool<T, A>::size;
            alignas(A::alignment()) T buffer[size];
            kernel::store_aligned<A>(&buffer[0], batch_type(self), A {});
            for (std::size_t i = 0; i < size; ++i)
                mem[i] = bool(buffer[i]);
        }

        // store_aligned
        template <class A, class T_in, class T_out>
        inline void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
        {
            static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
            alignas(A::alignment()) T_in buffer[batch<T_in, A>::size];
            store_aligned(&buffer[0], self);
            std::copy(std::begin(buffer), std::end(buffer), mem);
        }

        // store_unaligned
        template <class A, class T_in, class T_out>
        inline void store_unaligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
        {
            static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
            return store_aligned<A>(mem, self, generic {});
        }

        // swizzle
        template <class A, class T, class ITy, ITy... Vs>
        inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<batch<ITy, A>, Vs...> mask, requires_arch<generic>) noexcept
        {
            return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
        }

        namespace detail
        {
            template <class A, class T>
            inline batch<std::complex<T>, A> load_complex(batch<T, A> const& /*hi*/, batch<T, A> const& /*lo*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "load_complex not implemented for the required architecture");
            }

            template <class A, class T>
            inline batch<T, A> complex_high(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "complex_high not implemented for the required architecture");
            }

            template <class A, class T>
            inline batch<T, A> complex_low(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "complex_low not implemented for the required architecture");
            }
        }

        // load_complex_aligned
        template <class A, class T_out, class T_in>
        inline batch<std::complex<T_out>, A> load_complex_aligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_out, A>;
            T_in const* buffer = reinterpret_cast<T_in const*>(mem);
            real_batch hi = real_batch::load_aligned(buffer),
                       lo = real_batch::load_aligned(buffer + real_batch::size);
            return detail::load_complex(hi, lo, A {});
        }

        // load_complex_unaligned
        template <class A, class T_out, class T_in>
        inline batch<std::complex<T_out>, A> load_complex_unaligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_out, A>;
            T_in const* buffer = reinterpret_cast<T_in const*>(mem);
            real_batch hi = real_batch::load_unaligned(buffer),
                       lo = real_batch::load_unaligned(buffer + real_batch::size);
            return detail::load_complex(hi, lo, A {});
        }

        // store_complex_aligned
        template <class A, class T_out, class T_in>
        inline void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_in, A>;
            real_batch hi = detail::complex_high(src, A {});
            real_batch lo = detail::complex_low(src, A {});
            T_out* buffer = reinterpret_cast<T_out*>(dst);
            lo.store_aligned(buffer);
            hi.store_aligned(buffer + real_batch::size);
        }

        // store_complex_unaligned
        template <class A, class T_out, class T_in>
        inline void store_complex_unaligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_in, A>;
            real_batch hi = detail::complex_high(src, A {});
            real_batch lo = detail::complex_low(src, A {});
            T_out* buffer = reinterpret_cast<T_out*>(dst);
            lo.store_unaligned(buffer);
            hi.store_unaligned(buffer + real_batch::size);
        }

    }

}

#endif
@@ -0,0 +1,72 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_ROUNDING_HPP
#define XSIMD_GENERIC_ROUNDING_HPP

#include "./xsimd_generic_details.hpp"

namespace xsimd
{

    namespace kernel
    {

        using namespace types;

        // ceil
        template <class A, class T>
        inline batch<T, A> ceil(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            batch<T, A> truncated_self = trunc(self);
            return select(truncated_self < self, truncated_self + 1, truncated_self);
        }

        // floor
        template <class A, class T>
        inline batch<T, A> floor(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            batch<T, A> truncated_self = trunc(self);
            return select(truncated_self > self, truncated_self - 1, truncated_self);
        }

        // round
        template <class A, class T>
        inline batch<T, A> round(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            auto v = abs(self);
            auto c = ceil(v);
            auto cp = select(c - 0.5 > v, c - 1, c);
            return select(v > constants::maxflint<batch<T, A>>(), self, copysign(cp, self));
        }
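
        // Note (editor): round implements round-half-away-from-zero on |self|,
        // then restores the sign. Worked values: v = 2.3 -> c = 3, c - 0.5 > v,
        // so cp = 2; v = 2.5 -> c = 3, c - 0.5 == v (not >), so cp = 3. Inputs
        // beyond maxflint are already exact integers and are passed through.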

        // trunc
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> trunc(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return self;
        }
        template <class A>
        inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<generic>) noexcept
        {
            return select(abs(self) < constants::maxflint<batch<float, A>>(), to_float(to_int(self)), self);
        }
        template <class A>
        inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<generic>) noexcept
        {
            return select(abs(self) < constants::maxflint<batch<double, A>>(), to_float(to_int(self)), self);
        }

    }

}

#endif
@@ -0,0 +1,969 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_TRIGO_HPP
#define XSIMD_GENERIC_TRIGO_HPP

#include "./xsimd_generic_details.hpp"

#include <array>

namespace xsimd
{

    namespace kernel
    {
        /* origin: boost/simd/arch/common/detail/simd/trig_base.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */

        using namespace types;

        // acos
        template <class A, class T>
        inline batch<T, A> acos(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            batch_type x = abs(self);
            auto x_larger_05 = x > batch_type(0.5);
            x = select(x_larger_05, sqrt(fma(batch_type(-0.5), x, batch_type(0.5))), self);
            x = asin(x);
            x = select(x_larger_05, x + x, x);
            x = select(self < batch_type(-0.5), constants::pi<batch_type>() - x, x);
            return select(x_larger_05, x, constants::pio2<batch_type>() - x);
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> acos(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            using real_batch = typename batch_type::real_batch;
            batch_type tmp = asin(z);
            return { constants::pio2<real_batch>() - tmp.real(), -tmp.imag() };
        }

        // acosh
        /* origin: boost/simd/arch/common/simd/function/acosh.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */
        template <class A, class T>
        inline batch<T, A> acosh(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            batch_type x = self - batch_type(1.);
            auto test = x > constants::oneotwoeps<batch_type>();
            batch_type z = select(test, self, x + sqrt(x + x + x * x));
            batch_type l1pz = log1p(z);
            return select(test, l1pz + constants::log_2<batch_type>(), l1pz);
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> acosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            batch_type w = acos(z);
            w = batch_type(-w.imag(), w.real());
            return w;
        }

        // asin
        template <class A>
        inline batch<float, A> asin(batch<float, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<float, A>;
            batch_type x = abs(self);
            batch_type sign = bitofsign(self);
            auto x_larger_05 = x > batch_type(0.5);
            batch_type z = select(x_larger_05, batch_type(0.5) * (batch_type(1.) - x), x * x);
            x = select(x_larger_05, sqrt(z), x);
            batch_type z1 = detail::horner<batch_type,
                                           0x3e2aaae4,
                                           0x3d9980f6,
                                           0x3d3a3ec7,
                                           0x3cc617e3,
                                           0x3d2cb352>(z);
            z1 = fma(z1, z * x, x);
            z = select(x_larger_05, constants::pio2<batch_type>() - (z1 + z1), z1);
            return z ^ sign;
        }
        template <class A>
        inline batch<double, A> asin(batch<double, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<double, A>;
            batch_type x = abs(self);
            auto small_cond = x < constants::sqrteps<batch_type>();
            batch_type ct1 = batch_type(bit_cast<double>(int64_t(0x3fe4000000000000)));
            batch_type zz1 = batch_type(1.) - x;
            batch_type vp = zz1 * detail::horner<batch_type, 0x403c896240f3081dull, 0xc03991aaac01ab68ull, 0x401bdff5baf33e6aull, 0xbfe2079259f9290full, 0x3f684fc3988e9f08ull>(zz1) / detail::horner1<batch_type, 0x40756709b0b644beull, 0xc077fe08959063eeull, 0x40626219af6a7f42ull, 0xc035f2a2b6bf5d8cull>(zz1);
            zz1 = sqrt(zz1 + zz1);
            batch_type z = constants::pio4<batch_type>() - zz1;
            zz1 = fms(zz1, vp, constants::pio_2lo<batch_type>());
            z = z - zz1;
            zz1 = z + constants::pio4<batch_type>();
            batch_type zz2 = self * self;
            z = zz2 * detail::horner<batch_type, 0xc020656c06ceafd5ull, 0x40339007da779259ull, 0xc0304331de27907bull, 0x4015c74b178a2dd9ull, 0xbfe34341333e5c16ull, 0x3f716b9b0bd48ad3ull>(zz2) / detail::horner1<batch_type, 0xc04898220a3607acull, 0x4061705684ffbf9dull, 0xc06265bb6d3576d7ull, 0x40519fc025fe9054ull, 0xc02d7b590b5e0eabull>(zz2);
            zz2 = fma(x, z, x);
            return select(x > batch_type(1.), constants::nan<batch_type>(),
                          select(small_cond, x,
                                 select(x > ct1, zz1, zz2))
                              ^ bitofsign(self));
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> asin(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            using real_batch = typename batch_type::real_batch;
            real_batch x = z.real();
            real_batch y = z.imag();

            batch_type ct(-y, x);
            batch_type zz(real_batch(1.) - (x - y) * (x + y), -2 * x * y);
            zz = log(ct + sqrt(zz));
            batch_type resg(zz.imag(), -zz.real());

            return select(y == real_batch(0.),
                          select(fabs(x) > real_batch(1.),
                                 batch_type(constants::pio2<real_batch>(), real_batch(0.)),
                                 batch_type(asin(x), real_batch(0.))),
                          resg);
        }

        // asinh
        /* origin: boost/simd/arch/common/simd/function/asinh.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */
        namespace detail
        {
            template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
            inline batch<T, A>
            average(const batch<T, A>& x1, const batch<T, A>& x2) noexcept
            {
                return (x1 & x2) + ((x1 ^ x2) >> 1);
            }
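
            // Note (editor): this is the classic overflow-free integer average:
            // (x1 & x2) keeps the bits both operands share, and (x1 ^ x2) >> 1
            // adds half of the differing bits, so the intermediate value never
            // exceeds the type's range. E.g. x1 = 6 (0b110), x2 = 3 (0b011):
            // 2 + (0b101 >> 1) = 2 + 2 = 4, which is floor((6 + 3) / 2).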
|
||||
|
||||
template <class A, class T>
|
||||
inline batch<T, A>
|
||||
averagef(const batch<T, A>& x1, const batch<T, A>& x2) noexcept
|
||||
{
|
||||
using batch_type = batch<T, A>;
|
||||
return fma(x1, batch_type(0.5), x2 * batch_type(0.5));
|
||||
}
|
||||
template <class A>
|
||||
inline batch<float, A> average(batch<float, A> const& x1, batch<float, A> const& x2) noexcept
|
||||
{
|
||||
return averagef(x1, x2);
|
||||
}
|
||||
template <class A>
|
||||
inline batch<double, A> average(batch<double, A> const& x1, batch<double, A> const& x2) noexcept
|
||||
{
|
||||
return averagef(x1, x2);
|
||||
}
|
||||
}
|
||||
template <class A>
|
||||
inline batch<float, A> asinh(batch<float, A> const& self, requires_arch<generic>) noexcept
|
||||
{
|
||||
using batch_type = batch<float, A>;
|
||||
batch_type x = abs(self);
|
||||
auto lthalf = x < batch_type(0.5);
|
||||
batch_type x2 = x * x;
|
||||
batch_type bts = bitofsign(self);
|
||||
batch_type z(0.);
|
||||
if (any(lthalf))
|
||||
{
|
||||
z = detail::horner<batch_type,
|
||||
0x3f800000,
|
||||
0xbe2aa9ad,
|
||||
0x3d9949b1,
|
||||
0xbd2ee581,
|
||||
0x3ca4d6e6>(x2)
|
||||
* x;
|
||||
if (all(lthalf))
|
||||
return z ^ bts;
|
||||
}
|
||||
batch_type tmp = select(x > constants::oneosqrteps<batch_type>(), x, detail::average(x, hypot(batch_type(1.), x)));
|
||||
#ifndef XSIMD_NO_NANS
|
||||
return select(isnan(self), constants::nan<batch_type>(), select(lthalf, z, log(tmp) + constants::log_2<batch_type>()) ^ bts);
|
||||
#else
|
||||
return select(lthalf, z, log(tmp) + constants::log_2<batch_type>()) ^ bts;
|
||||
#endif
|
||||
}
|
||||
template <class A>
|
||||
inline batch<double, A> asinh(batch<double, A> const& self, requires_arch<generic>) noexcept
|
||||
{
|
||||
using batch_type = batch<double, A>;
|
||||
batch_type x = abs(self);
|
||||
auto test = x > constants::oneosqrteps<batch_type>();
|
||||
batch_type z = select(test, x - batch_type(1.), x + x * x / (batch_type(1.) + hypot(batch_type(1.), x)));
|
||||
#ifndef XSIMD_NO_INFINITIES
|
||||
z = select(x == constants::infinity<batch_type>(), x, z);
|
||||
#endif
|
||||
batch_type l1pz = log1p(z);
|
||||
z = select(test, l1pz + constants::log_2<batch_type>(), l1pz);
|
||||
return bitofsign(self) ^ z;
|
||||
}
|
||||
template <class A, class T>
|
||||
inline batch<std::complex<T>, A> asinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
|
||||
{
|
||||
using batch_type = batch<std::complex<T>, A>;
|
||||
batch_type w = asin(batch_type(-z.imag(), z.real()));
|
||||
w = batch_type(w.imag(), -w.real());
|
||||
return w;
|
||||
}
|
||||
|
||||
        // atan
        namespace detail
        {
            template <class A>
            static inline batch<float, A> kernel_atan(const batch<float, A>& x, const batch<float, A>& recx) noexcept
            {
                using batch_type = batch<float, A>;
                const auto flag1 = x < constants::tan3pio8<batch_type>();
                const auto flag2 = (x >= batch_type(bit_cast<float>((uint32_t)0x3ed413cd))) && flag1;
                batch_type yy = select(flag1, batch_type(0.), constants::pio2<batch_type>());
                yy = select(flag2, constants::pio4<batch_type>(), yy);
                batch_type xx = select(flag1, x, -recx);
                xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx);
                const batch_type z = xx * xx;
                batch_type z1 = detail::horner<batch_type,
                                               0xbeaaaa2aul,
                                               0x3e4c925ful,
                                               0xbe0e1b85ul,
                                               0x3da4f0d1ul>(z);
                z1 = fma(xx, z1 * z, xx);
                z1 = select(flag2, z1 + constants::pio_4lo<batch_type>(), z1);
                z1 = select(!flag1, z1 + constants::pio_2lo<batch_type>(), z1);
                return yy + z1;
            }
            template <class A>
            static inline batch<double, A> kernel_atan(const batch<double, A>& x, const batch<double, A>& recx) noexcept
            {
                using batch_type = batch<double, A>;
                const auto flag1 = x < constants::tan3pio8<batch_type>();
                const auto flag2 = (x >= constants::tanpio8<batch_type>()) && flag1;
                batch_type yy = select(flag1, batch_type(0.), constants::pio2<batch_type>());
                yy = select(flag2, constants::pio4<batch_type>(), yy);
                batch_type xx = select(flag1, x, -recx);
                xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx);
                batch_type z = xx * xx;
                z *= detail::horner<batch_type,
                                    0xc0503669fd28ec8eull,
                                    0xc05eb8bf2d05ba25ull,
                                    0xc052c08c36880273ull,
                                    0xc03028545b6b807aull,
                                    0xbfec007fa1f72594ull>(z)
                    / detail::horner1<batch_type,
                                      0x4068519efbbd62ecull,
                                      0x407e563f13b049eaull,
                                      0x407b0e18d2e2be3bull,
                                      0x4064a0dd43b8fa25ull,
                                      0x4038dbc45b14603cull>(z);
                z = fma(xx, z, xx);
                z = select(flag2, z + constants::pio_4lo<batch_type>(), z);
                z = z + select(flag1, batch_type(0.), constants::pio_2lo<batch_type>());
                return yy + z;
            }
        }
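
        // Note (editorial, assumption): kernel_atan reduces its argument with
        // the classic identities
        //     atan(x) = pi/2 - atan(1/x)                  for x >= tan(3*pi/8)
        //     atan(x) = pi/4 + atan((x - 1) / (x + 1))    for tan(pi/8) <= x < tan(3*pi/8)
        // so the polynomial only ever sees |xx| <= tan(pi/8); pio_4lo/pio_2lo
        // add the low-order halves of the split pi constants back in.
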
        template <class A, class T>
        inline batch<T, A> atan(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            const batch_type absa = abs(self);
            const batch_type x = detail::kernel_atan(absa, batch_type(1.) / absa);
            return x ^ bitofsign(self);
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> atan(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            using real_batch = typename batch_type::real_batch;
            real_batch x = z.real();
            real_batch y = z.imag();
            real_batch x2 = x * x;
            real_batch one(1.);
            real_batch a = one - x2 - (y * y);
            real_batch w = 0.5 * atan2(2. * x, a);
            real_batch num = y + one;
            num = x2 + num * num;
            real_batch den = y - one;
            den = x2 + den * den;
            batch_type res = select((x == real_batch(0.)) && (y == real_batch(1.)),
                                    batch_type(real_batch(0.), constants::infinity<real_batch>()),
                                    batch_type(w, 0.25 * log(num / den)));
            return res;
        }

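        // Note (editorial, assumption): the complex branch evaluates
        //     atan(z) = 0.5 * atan2(2x, 1 - x^2 - y^2)
        //               + i * 0.25 * log((x^2 + (y + 1)^2) / (x^2 + (y - 1)^2))
        // and pins the pole at z = i to (0, +inf) explicitly.
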
        // atanh
        /* origin: boost/simd/arch/common/simd/function/atanh.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */
        template <class A, class T>
        inline batch<T, A> atanh(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            batch_type x = abs(self);
            batch_type t = x + x;
            batch_type z = batch_type(1.) - x;
            auto test = x < batch_type(0.5);
            batch_type tmp = select(test, x, t) / z;
            return bitofsign(self) ^ (batch_type(0.5) * log1p(select(test, fma(t, tmp, t), tmp)));
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> atanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            batch_type w = atan(batch_type(-z.imag(), z.real()));
            w = batch_type(w.imag(), -w.real());
            return w;
        }

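        // Note (editorial, assumption): atanh(x) = 0.5 * log((1 + x) / (1 - x))
        // is rewritten as 0.5 * log1p(2x / (1 - x)); for x < 0.5 the argument is
        // expanded as fma(t, x / (1 - x), t) with t = 2x to preserve precision
        // near zero. The complex overload reuses atan via atanh(z) = -i * atan(i * z).
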
        // atan2
        template <class A, class T>
        inline batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            const batch_type q = abs(self / other);
            const batch_type z = detail::kernel_atan(q, batch_type(1.) / q);
            return select(other > batch_type(0.), z, constants::pi<batch_type>() - z) * signnz(self);
        }

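        // Note (editorial, assumption): atan2 maps the unsigned kernel result
        // back to the full circle: z when x > 0, pi - z otherwise, and the sign
        // of y (via signnz, which never returns zero) selects the half-plane.
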
        // cos
        namespace detail
        {
            template <class T, class A>
            inline batch<T, A> quadrant(const batch<T, A>& x) noexcept
            {
                return x & batch<T, A>(3);
            }

            template <class A>
            inline batch<float, A> quadrant(const batch<float, A>& x) noexcept
            {
                return to_float(quadrant(to_int(x)));
            }

            template <class A>
            inline batch<double, A> quadrant(const batch<double, A>& x) noexcept
            {
                using batch_type = batch<double, A>;
                batch_type a = x * batch_type(0.25);
                return (a - floor(a)) * batch_type(4.);
            }
            /* origin: boost/simd/arch/common/detail/simd/f_trig_evaluation.hpp */
            /*
             * ====================================================
             * copyright 2016 NumScale SAS
             *
             * Distributed under the Boost Software License, Version 1.0.
             * (See copy at http://boost.org/LICENSE_1_0.txt)
             * ====================================================
             */

            template <class A>
            inline batch<float, A> cos_eval(const batch<float, A>& z) noexcept
            {
                using batch_type = batch<float, A>;
                batch_type y = detail::horner<batch_type,
                                              0x3d2aaaa5,
                                              0xbab60619,
                                              0x37ccf5ce>(z);
                return batch_type(1.) + fma(z, batch_type(-0.5), y * z * z);
            }

            template <class A>
            inline batch<float, A> sin_eval(const batch<float, A>& z, const batch<float, A>& x) noexcept
            {
                using batch_type = batch<float, A>;
                batch_type y = detail::horner<batch_type,
                                              0xbe2aaaa2,
                                              0x3c08839d,
                                              0xb94ca1f9>(z);
                return fma(y * z, x, x);
            }

            template <class A>
            static inline batch<float, A> base_tancot_eval(const batch<float, A>& z) noexcept
            {
                using batch_type = batch<float, A>;
                batch_type zz = z * z;
                batch_type y = detail::horner<batch_type,
                                              0x3eaaaa6f,
                                              0x3e0896dd,
                                              0x3d5ac5c9,
                                              0x3cc821b5,
                                              0x3b4c779c,
                                              0x3c19c53b>(zz);
                return fma(y, zz * z, z);
            }

            template <class A, class BB>
            static inline batch<float, A> tan_eval(const batch<float, A>& z, const BB& test) noexcept
            {
                using batch_type = batch<float, A>;
                batch_type y = base_tancot_eval(z);
                return select(test, y, -batch_type(1.) / y);
            }

            template <class A, class BB>
            static inline batch<float, A> cot_eval(const batch<float, A>& z, const BB& test) noexcept
            {
                using batch_type = batch<float, A>;
                batch_type y = base_tancot_eval(z);
                return select(test, batch_type(1.) / y, -y);
            }

            /* origin: boost/simd/arch/common/detail/simd/d_trig_evaluation.hpp */
            /*
             * ====================================================
             * copyright 2016 NumScale SAS
             *
             * Distributed under the Boost Software License, Version 1.0.
             * (See copy at http://boost.org/LICENSE_1_0.txt)
             * ====================================================
             */
            template <class A>
            static inline batch<double, A> cos_eval(const batch<double, A>& z) noexcept
            {
                using batch_type = batch<double, A>;
                batch_type y = detail::horner<batch_type,
                                              0x3fe0000000000000ull,
                                              0xbfa5555555555551ull,
                                              0x3f56c16c16c15d47ull,
                                              0xbefa01a019ddbcd9ull,
                                              0x3e927e4f8e06d9a5ull,
                                              0xbe21eea7c1e514d4ull,
                                              0x3da8ff831ad9b219ull>(z);
                return batch_type(1.) - y * z;
            }

            template <class A>
            static inline batch<double, A> sin_eval(const batch<double, A>& z, const batch<double, A>& x) noexcept
            {
                using batch_type = batch<double, A>;
                batch_type y = detail::horner<batch_type,
                                              0xbfc5555555555548ull,
                                              0x3f8111111110f7d0ull,
                                              0xbf2a01a019bfdf03ull,
                                              0x3ec71de3567d4896ull,
                                              0xbe5ae5e5a9291691ull,
                                              0x3de5d8fd1fcf0ec1ull>(z);
                return fma(y * z, x, x);
            }

            template <class A>
            static inline batch<double, A> base_tancot_eval(const batch<double, A>& z) noexcept
            {
                using batch_type = batch<double, A>;
                batch_type zz = z * z;
                batch_type num = detail::horner<batch_type,
                                                0xc1711fead3299176ull,
                                                0x413199eca5fc9dddull,
                                                0xc0c992d8d24f3f38ull>(zz);
                batch_type den = detail::horner1<batch_type,
                                                 0xc189afe03cbe5a31ull,
                                                 0x4177d98fc2ead8efull,
                                                 0xc13427bc582abc96ull,
                                                 0x40cab8a5eeb36572ull>(zz);
                return fma(z, (zz * (num / den)), z);
            }

            template <class A, class BB>
            static inline batch<double, A> tan_eval(const batch<double, A>& z, const BB& test) noexcept
            {
                using batch_type = batch<double, A>;
                batch_type y = base_tancot_eval(z);
                return select(test, y, -batch_type(1.) / y);
            }

            template <class A, class BB>
            static inline batch<double, A> cot_eval(const batch<double, A>& z, const BB& test) noexcept
            {
                using batch_type = batch<double, A>;
                batch_type y = base_tancot_eval(z);
                return select(test, batch_type(1.) / y, -y);
            }
            /* origin: boost/simd/arch/common/detail/simd/trig_reduction.hpp */
            /*
             * ====================================================
             * copyright 2016 NumScale SAS
             *
             * Distributed under the Boost Software License, Version 1.0.
             * (See copy at http://boost.org/LICENSE_1_0.txt)
             * ====================================================
             */

            struct trigo_radian_tag
            {
            };
            struct trigo_pi_tag
            {
            };

            template <class B, class Tag = trigo_radian_tag>
            struct trigo_reducer
            {
                static inline B reduce(const B& x, B& xr) noexcept
                {
                    if (all(x <= constants::pio4<B>()))
                    {
                        xr = x;
                        return B(0.);
                    }
                    else if (all(x <= constants::pio2<B>()))
                    {
                        auto test = x > constants::pio4<B>();
                        xr = x - constants::pio2_1<B>();
                        xr -= constants::pio2_2<B>();
                        xr -= constants::pio2_3<B>();
                        xr = select(test, xr, x);
                        return select(test, B(1.), B(0.));
                    }
                    else if (all(x <= constants::twentypi<B>()))
                    {
                        B xi = nearbyint(x * constants::twoopi<B>());
                        xr = fnma(xi, constants::pio2_1<B>(), x);
                        xr -= xi * constants::pio2_2<B>();
                        xr -= xi * constants::pio2_3<B>();
                        return quadrant(xi);
                    }
                    else if (all(x <= constants::mediumpi<B>()))
                    {
                        B fn = nearbyint(x * constants::twoopi<B>());
                        B r = x - fn * constants::pio2_1<B>();
                        B w = fn * constants::pio2_1t<B>();
                        B t = r;
                        w = fn * constants::pio2_2<B>();
                        r = t - w;
                        w = fn * constants::pio2_2t<B>() - ((t - r) - w);
                        t = r;
                        w = fn * constants::pio2_3<B>();
                        r = t - w;
                        w = fn * constants::pio2_3t<B>() - ((t - r) - w);
                        xr = r - w;
                        return quadrant(fn);
                    }
                    else
                    {
                        static constexpr std::size_t size = B::size;
                        using value_type = typename B::value_type;
                        alignas(B) std::array<value_type, size> tmp;
                        alignas(B) std::array<value_type, size> txr;
                        alignas(B) std::array<value_type, size> args;
                        x.store_aligned(args.data());

                        for (std::size_t i = 0; i < size; ++i)
                        {
                            double arg = args[i];
                            if (arg == std::numeric_limits<value_type>::infinity())
                            {
                                tmp[i] = 0.;
                                txr[i] = std::numeric_limits<value_type>::quiet_NaN();
                            }
                            else
                            {
                                double y[2];
                                std::int32_t n = ::xsimd::detail::__ieee754_rem_pio2(arg, y);
                                tmp[i] = value_type(n & 3);
                                txr[i] = value_type(y[0]);
                            }
                        }
                        xr = B::load_aligned(&txr[0]);
                        B res = B::load_aligned(&tmp[0]);
                        return res;
                    }
                }
            };

            template <class B>
            struct trigo_reducer<B, trigo_pi_tag>
            {
                static inline B reduce(const B& x, B& xr) noexcept
                {
                    B xi = nearbyint(x * B(2.));
                    B x2 = x - xi * B(0.5);
                    xr = x2 * constants::pi<B>();
                    return quadrant(xi);
                }
            };

        }
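
        // Note (editorial, assumption): trigo_reducer brings x into
        // xr in [-pi/4, pi/4] and returns the quadrant n = round(x * 2/pi) mod 4,
        // picking progressively heavier machinery (split-constant subtraction,
        // then a scalar __ieee754_rem_pio2 loop) as |x| grows. The callers then
        // combine sin_eval/cos_eval according to n; e.g. for cos:
        //     n = 0 -> cos(xr),  n = 1 -> -sin(xr),
        //     n = 2 -> -cos(xr), n = 3 -> sin(xr)
        // which is what the swap_bit/sign_bit arithmetic below encodes.
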
        template <class A, class T>
        inline batch<T, A> cos(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            const batch_type x = abs(self);
            batch_type xr = constants::nan<batch_type>();
            const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
            auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
            auto swap_bit = fma(batch_type(-2.), tmp, n);
            auto sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
            const batch_type z = xr * xr;
            const batch_type se = detail::sin_eval(z, xr);
            const batch_type ce = detail::cos_eval(z);
            const batch_type z1 = select(swap_bit != batch_type(0.), se, ce);
            return z1 ^ sign_bit;
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> cos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            return { cos(z.real()) * cosh(z.imag()), -sin(z.real()) * sinh(z.imag()) };
        }

        // cosh

        /* origin: boost/simd/arch/common/simd/function/cosh.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */

        template <class A, class T>
        inline batch<T, A> cosh(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            batch_type x = abs(self);
            auto test1 = x > (constants::maxlog<batch_type>() - constants::log_2<batch_type>());
            batch_type fac = select(test1, batch_type(0.5), batch_type(1.));
            batch_type tmp = exp(x * fac);
            batch_type tmp1 = batch_type(0.5) * tmp;
            return select(test1, tmp1 * tmp, detail::average(tmp, batch_type(1.) / tmp));
        }
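
        // Note (editorial, assumption): cosh(x) = (e^x + e^-x) / 2 is taken as
        // average(e^x, 1 / e^x). When x approaches maxlog, e^x would overflow,
        // so the test1 path evaluates e^(x/2) and forms
        // 0.5 * e^(x/2) * e^(x/2) = 0.5 * e^x without the intermediate overflow.
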
        template <class A, class T>
        inline batch<std::complex<T>, A> cosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            auto x = z.real();
            auto y = z.imag();
            return { cosh(x) * cos(y), sinh(x) * sin(y) };
        }

        // sin
        namespace detail
        {
            template <class A, class T, class Tag = trigo_radian_tag>
            inline batch<T, A> sin(batch<T, A> const& self, Tag = Tag()) noexcept
            {
                using batch_type = batch<T, A>;
                const batch_type x = abs(self);
                batch_type xr = constants::nan<batch_type>();
                const batch_type n = detail::trigo_reducer<batch_type, Tag>::reduce(x, xr);
                auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
                auto swap_bit = fma(batch_type(-2.), tmp, n);
                auto sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
                const batch_type z = xr * xr;
                const batch_type se = detail::sin_eval(z, xr);
                const batch_type ce = detail::cos_eval(z);
                const batch_type z1 = select(swap_bit == batch_type(0.), se, ce);
                return z1 ^ sign_bit;
            }
        }

        template <class A, class T>
        inline batch<T, A> sin(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return detail::sin(self);
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> sin(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            return { sin(z.real()) * cosh(z.imag()), cos(z.real()) * sinh(z.imag()) };
        }

        // sincos
        template <class A, class T>
        inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            const batch_type x = abs(self);
            batch_type xr = constants::nan<batch_type>();
            const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
            auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
            auto swap_bit = fma(batch_type(-2.), tmp, n);
            const batch_type z = xr * xr;
            const batch_type se = detail::sin_eval(z, xr);
            const batch_type ce = detail::cos_eval(z);
            auto sin_sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
            const batch_type sin_z1 = select(swap_bit == batch_type(0.), se, ce);
            auto cos_sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
            const batch_type cos_z1 = select(swap_bit != batch_type(0.), se, ce);
            return std::make_pair(sin_z1 ^ sin_sign_bit, cos_z1 ^ cos_sign_bit);
        }

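        // Note (editorial, assumption): sincos pays for one range reduction and
        // one pair of polynomial evaluations, then derives both results purely
        // by swapping se/ce and flipping sign bits, making it cheaper than
        // calling sin and cos separately.
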
        template <class A, class T>
        inline std::pair<batch<std::complex<T>, A>, batch<std::complex<T>, A>>
        sincos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            using real_batch = typename batch_type::real_batch;
            real_batch rcos = cos(z.real());
            real_batch rsin = sin(z.real());
            real_batch icosh = cosh(z.imag());
            real_batch isinh = sinh(z.imag());
            return std::make_pair(batch_type(rsin * icosh, rcos * isinh), batch_type(rcos * icosh, -rsin * isinh));
        }

        // sinh
        namespace detail
        {
            /* origin: boost/simd/arch/common/detail/generic/sinh_kernel.hpp */
            /*
             * ====================================================
             * copyright 2016 NumScale SAS
             *
             * Distributed under the Boost Software License, Version 1.0.
             * (See copy at http://boost.org/LICENSE_1_0.txt)
             * ====================================================
             */
            template <class A>
            inline batch<float, A> sinh_kernel(batch<float, A> const& self) noexcept
            {
                using batch_type = batch<float, A>;
                batch_type sqr_self = self * self;
                return detail::horner<batch_type,
                                      0x3f800000, // 1.0f
                                      0x3e2aaacc, // 1.66667160211E-1f
                                      0x3c087bbe, // 8.33028376239E-3f
                                      0x39559e2f // 2.03721912945E-4f
                                      >(sqr_self)
                    * self;
            }

            template <class A>
            inline batch<double, A> sinh_kernel(batch<double, A> const& self) noexcept
            {
                using batch_type = batch<double, A>;
                batch_type sqrself = self * self;
                return fma(self, (detail::horner<batch_type,
                                                 0xc115782bdbf6ab05ull, // -3.51754964808151394800E5
                                                 0xc0c694b8c71d6182ull, // -1.15614435765005216044E4,
                                                 0xc064773a398ff4feull, // -1.63725857525983828727E2,
                                                 0xbfe9435fe8bb3cd6ull // -7.89474443963537015605E-1
                                                 >(sqrself)
                                  / detail::horner1<batch_type,
                                                    0xc1401a20e4f90044ull, // -2.11052978884890840399E6
                                                    0x40e1a7ba7ed72245ull, // 3.61578279834431989373E4,
                                                    0xc0715b6096e96484ull // -2.77711081420602794433E2,
                                                    >(sqrself))
                               * sqrself,
                           self);
            }
        }
        /* origin: boost/simd/arch/common/simd/function/sinh.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */
        template <class A, class T>
        inline batch<T, A> sinh(batch<T, A> const& a, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            batch_type half(0.5);
            batch_type x = abs(a);
            auto lt1 = x < batch_type(1.);
            batch_type bts = bitofsign(a);
            batch_type z(0.);
            if (any(lt1))
            {
                z = detail::sinh_kernel(x);
                if (all(lt1))
                    return z ^ bts;
            }
            auto test1 = x > (constants::maxlog<batch_type>() - constants::log_2<batch_type>());
            batch_type fac = select(test1, half, batch_type(1.));
            batch_type tmp = exp(x * fac);
            batch_type tmp1 = half * tmp;
            batch_type r = select(test1, tmp1 * tmp, tmp1 - half / tmp);
            return select(lt1, z, r) ^ bts;
        }
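
        // Note (editorial, assumption): for |x| >= 1, sinh(x) = (e^x - e^-x) / 2
        // is formed as tmp1 - half / tmp with tmp = e^x; near maxlog the same
        // e^(x/2) squaring trick as in cosh avoids overflow, and the |x| < 1
        // polynomial kernel sidesteps the cancellation of e^x - e^-x near zero.
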
        template <class A, class T>
        inline batch<std::complex<T>, A> sinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            auto x = z.real();
            auto y = z.imag();
            return { sinh(x) * cos(y), cosh(x) * sin(y) };
        }

        // tan
        template <class A, class T>
        inline batch<T, A> tan(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            const batch_type x = abs(self);
            batch_type xr = constants::nan<batch_type>();
            const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
            auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
            auto swap_bit = fma(batch_type(-2.), tmp, n);
            auto test = (swap_bit == batch_type(0.));
            const batch_type y = detail::tan_eval(xr, test);
            return y ^ bitofsign(self);
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> tan(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            using real_batch = typename batch_type::real_batch;
            real_batch d = cos(2 * z.real()) + cosh(2 * z.imag());
            batch_type winf(constants::infinity<real_batch>(), constants::infinity<real_batch>());
            real_batch wreal = sin(2 * z.real()) / d;
            real_batch wimag = sinh(2 * z.imag());
            batch_type wres = select(isinf(wimag), batch_type(wreal, real_batch(1.)), batch_type(wreal, wimag / d));
            return select(d == real_batch(0.), winf, wres);
        }

        // tanh
        namespace detail
        {
            /* origin: boost/simd/arch/common/detail/generic/tanh_kernel.hpp */
            /*
             * ====================================================
             * copyright 2016 NumScale SAS
             *
             * Distributed under the Boost Software License, Version 1.0.
             * (See copy at http://boost.org/LICENSE_1_0.txt)
             * ====================================================
             */
            template <class B>
            struct tanh_kernel;

            template <class A>
            struct tanh_kernel<batch<float, A>>
            {
                using batch_type = batch<float, A>;
                static inline batch_type tanh(const batch_type& x) noexcept
                {
                    batch_type sqrx = x * x;
                    return fma(detail::horner<batch_type,
                                              0xbeaaaa99, // -3.33332819422E-1F
                                              0x3e088393, // +1.33314422036E-1F
                                              0xbd5c1e2d, // -5.37397155531E-2F
                                              0x3ca9134e, // +2.06390887954E-2F
                                              0xbbbaf0ea // -5.70498872745E-3F
                                              >(sqrx)
                                   * sqrx,
                               x, x);
                }

                static inline batch_type cotanh(const batch_type& x) noexcept
                {
                    return batch_type(1.) / tanh(x);
                }
            };

            template <class A>
            struct tanh_kernel<batch<double, A>>
            {
                using batch_type = batch<double, A>;
                static inline batch_type tanh(const batch_type& x) noexcept
                {
                    batch_type sqrx = x * x;
                    return fma(sqrx * p(sqrx) / q(sqrx), x, x);
                }

                static inline batch_type cotanh(const batch_type& x) noexcept
                {
                    batch_type sqrx = x * x;
                    batch_type qval = q(sqrx);
                    return qval / (x * fma(p(sqrx), sqrx, qval));
                }

                static inline batch_type p(const batch_type& x) noexcept
                {
                    return detail::horner<batch_type,
                                          0xc0993ac030580563, // -1.61468768441708447952E3
                                          0xc058d26a0e26682d, // -9.92877231001918586564E1,
                                          0xbfeedc5baafd6f4b // -9.64399179425052238628E-1
                                          >(x);
                }

                static inline batch_type q(const batch_type& x) noexcept
                {
                    return detail::horner1<batch_type,
                                           0x40b2ec102442040c, // 4.84406305325125486048E3
                                           0x40a176fa0e5535fa, // 2.23548839060100448583E3,
                                           0x405c33f28a581B86 // 1.12811678491632931402E2,
                                           >(x);
                }
            };

        }
        /* origin: boost/simd/arch/common/simd/function/tanh.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */
        template <class A, class T>
        inline batch<T, A> tanh(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            batch_type one(1.);
            batch_type x = abs(self);
            auto test = x < (batch_type(5.) / batch_type(8.));
            batch_type bts = bitofsign(self);
            batch_type z = one;
            if (any(test))
            {
                z = detail::tanh_kernel<batch_type>::tanh(x);
                if (all(test))
                    return z ^ bts;
            }
            batch_type r = fma(batch_type(-2.), one / (one + exp(x + x)), one);
            return select(test, z, r) ^ bts;
        }
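
        // Note (editorial, assumption): the large-|x| branch uses
        //     tanh(x) = 1 - 2 / (1 + e^(2x))
        // expressed as a single fma, while |x| < 5/8 goes through the rational
        // kernel to dodge the cancellation of that expression near zero.
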
        template <class A, class T>
        inline batch<std::complex<T>, A> tanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using real_batch = typename batch<std::complex<T>, A>::real_batch;
            auto x = z.real();
            auto y = z.imag();
            real_batch two(2);
            auto d = cosh(two * x) + cos(two * y);
            return { sinh(two * x) / d, sin(two * y) / d };
        }

    }

}

#endif
(Diff for one file not shown because of its size.)
@@ -0,0 +1,940 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ****************************************************************************/

#ifndef XSIMD_AVX2_HPP
#define XSIMD_AVX2_HPP

#include <complex>
#include <type_traits>

#include "../types/xsimd_avx2_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // abs
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_abs_epi8(self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_abs_epi16(self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_abs_epi32(self);
                }
                else
                {
                    return abs(self, avx {});
                }
            }
            return self;
        }
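
        // Note (editorial, assumption): throughout this file XSIMD_IF_CONSTEXPR
        // dispatches on sizeof(T) at compile time, so exactly one intrinsic
        // survives per instantiation; element widths AVX2 has no instruction
        // for fall back to the avx {} implementation. For instance, the
        // int32_t instantiation above compiles down to a single
        // _mm256_abs_epi32.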

        // add
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_add_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_add_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_add_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_add_epi64(self, other);
            }
            else
            {
                return add(self, other, avx {});
            }
        }

        // bitwise_and
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_and_si256(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_and_si256(self, other);
        }

        // bitwise_andnot
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_andnot_si256(other, self);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_andnot_si256(other, self);
        }
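
        // Note (editorial): _mm256_andnot_si256(a, b) computes (~a) & b, so the
        // operands are passed as (other, self) to obtain self & ~other, which is
        // the semantics xsimd gives bitwise_andnot.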

        // bitwise_not
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx2>) noexcept
        {
            return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
        {
            return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
        }

        // bitwise_lshift
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_slli_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_slli_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_slli_epi64(self, other);
            }
            else
            {
                return bitwise_lshift(self, other, avx {});
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_sllv_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_sllv_epi64(self, other);
            }
            else
            {
                return bitwise_lshift(self, other, avx {});
            }
        }

        // bitwise_or
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_or_si256(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_or_si256(self, other);
        }

        // bitwise_rshift
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF);
                    __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self);
                    __m256i res = _mm256_srai_epi16(self, other);
                    return _mm256_or_si256(
                        detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
                                           { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
                                           sign_mask, cmp_is_negative),
                        _mm256_andnot_si256(sign_mask, res));
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_srai_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_srai_epi32(self, other);
                }
                else
                {
                    return bitwise_rshift(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_srli_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_srli_epi32(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                {
                    return _mm256_srli_epi64(self, other);
                }
                else
                {
                    return bitwise_rshift(self, other, avx {});
                }
            }
        }

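        // Note (editorial, assumption): AVX2 has no 8-bit arithmetic shift, so
        // the sizeof(T) == 1 path shifts 16-bit lanes and then repairs the low
        // byte of each pair: sign_mask marks the bit positions an epi8 shift
        // would have filled with the sign, and those bits are OR-ed in only for
        // bytes that compare negative.
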
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_srav_epi32(self, other);
                }
                else
                {
                    return bitwise_rshift(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_srlv_epi32(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                {
                    return _mm256_srlv_epi64(self, other);
                }
                else
                {
                    return bitwise_rshift(self, other, avx {});
                }
            }
        }

        // bitwise_xor
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_xor_si256(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_xor_si256(self, other);
        }

        // complex_low
        template <class A>
        inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
        {
            __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0));
            __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0));
            return _mm256_blend_pd(tmp0, tmp1, 10);
        }

        // complex_high
        template <class A>
        inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
        {
            __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2));
            __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0));
            return _mm256_blend_pd(tmp0, tmp1, 10);
        }

        // fast_cast
        namespace detail
        {

            template <class A>
            inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<avx2>) noexcept
            {
                // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
                __m256i msk_lo = _mm256_set1_epi32(0xFFFF);
                __m256 cnst65536f = _mm256_set1_ps(65536.0f);

                __m256i v_lo = _mm256_and_si256(v, msk_lo); /* extract the 16 lowest significant bits of self */
                __m256i v_hi = _mm256_srli_epi32(v, 16); /* 16 most significant bits of v */
                __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding */
                __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */
                v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */
                return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
            }
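
            // Note (editorial, assumption): each 16-bit half converts to float
            // exactly (values < 2^16 fit in a 24-bit mantissa), and the scaling
            // by 65536 is a power of two, so at most one rounding happens, in
            // the final add that recombines hi * 65536 + lo.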

            template <class A>
            inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
            {
                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
                // adapted to avx
                __m256i xH = _mm256_srli_epi64(x, 32);
                xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84
                __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000,
                                                 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
                __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
                __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
                return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
            }
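
            // Note (editorial, assumption): this is the standard bit trick. The
            // high 32 bits are planted in the mantissa of a double whose
            // exponent encodes 2^84 and the low 32 bits in one encoding 2^52, so
            //     (xH_as_double - (2^84 + 2^52)) + xL_as_double
            // reassembles the 64-bit value; the compensating constant removes
            // both implicit leading-one offsets in a single subtraction.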

            template <class A>
            inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
            {
                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
                // adapted to avx
                __m256i xH = _mm256_srai_epi32(x, 16);
                xH = _mm256_and_si256(xH, _mm256_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
                xH = _mm256_add_epi64(xH, _mm256_castpd_si256(_mm256_set1_pd(442721857769029238784.))); // 3*2^67
                __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000,
                                                 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
                __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
                __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
                return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
            }
        }

        // eq
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_cmpeq_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_cmpeq_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_cmpeq_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_cmpeq_epi64(self, other);
            }
            else
            {
                return eq(self, other, avx {});
            }
        }

        // gather
        template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
        inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
                                  kernel::requires_arch<avx2>) noexcept
        {
            // scatter for this one is AVX512F+AVX512VL
            return _mm256_i32gather_epi32(reinterpret_cast<const int*>(src), index, sizeof(T));
        }

        template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
        inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
                                  kernel::requires_arch<avx2>) noexcept
        {
            // scatter for this one is AVX512F+AVX512VL
            return _mm256_i64gather_epi64(reinterpret_cast<const long long int*>(src), index, sizeof(T));
        }

        template <class A, class U,
                  detail::enable_sized_integral_t<U, 4> = 0>
        inline batch<float, A> gather(batch<float, A> const&, float const* src,
                                      batch<U, A> const& index,
                                      kernel::requires_arch<avx2>) noexcept
        {
            // scatter for this one is AVX512F+AVX512VL
            return _mm256_i32gather_ps(src, index, sizeof(float));
        }

        template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
        inline batch<double, A> gather(batch<double, A> const&, double const* src,
                                       batch<U, A> const& index,
                                       requires_arch<avx2>) noexcept
        {
            // scatter for this one is AVX512F+AVX512VL
            return _mm256_i64gather_pd(src, index, sizeof(double));
        }

        // gather: handmade conversions
        template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
        inline batch<float, A> gather(batch<float, A> const&, double const* src,
                                      batch<V, A> const& index,
                                      requires_arch<avx2>) noexcept
        {
            const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
            const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
            return detail::merge_sse(_mm256_cvtpd_ps(low.data), _mm256_cvtpd_ps(high.data));
        }

        template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
        inline batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
                                        batch<V, A> const& index,
                                        requires_arch<avx2>) noexcept
        {
            const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
            const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
            return detail::merge_sse(_mm256_cvtpd_epi32(low.data), _mm256_cvtpd_epi32(high.data));
        }
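
        // Note (editorial, assumption): the last argument of the
        // _mm256_i32gather_* / _mm256_i64gather_* intrinsics is a byte scale
        // applied to every index, so passing sizeof(T) lets the index batch
        // hold element indices rather than byte offsets. The "handmade
        // conversions" gather doubles in two 128-bit halves and narrow the
        // results, since no gather-with-conversion instruction exists.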

        // lt
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_cmpgt_epi8(other, self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_cmpgt_epi16(other, self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_cmpgt_epi32(other, self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                {
                    return _mm256_cmpgt_epi64(other, self);
                }
                else
                {
                    return lt(self, other, avx {});
                }
            }
            else
            {
                return lt(self, other, avx {});
            }
        }

        // load_complex
        template <class A>
        inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx2>) noexcept
        {
            using batch_type = batch<float, A>;
            batch_type real = _mm256_castpd_ps(
                _mm256_permute4x64_pd(
                    _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))),
                    _MM_SHUFFLE(3, 1, 2, 0)));
            batch_type imag = _mm256_castpd_ps(
                _mm256_permute4x64_pd(
                    _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))),
                    _MM_SHUFFLE(3, 1, 2, 0)));
            return { real, imag };
        }
        template <class A>
        inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx2>) noexcept
        {
            using batch_type = batch<double, A>;
            batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
            batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
            return { real, imag };
        }
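
        // Note (editorial, assumption): load_complex deinterleaves (re, im)
        // pairs: the shuffles collect even-indexed lanes into `real` and
        // odd-indexed lanes into `imag`, and _mm256_permute4x64_pd repairs the
        // cross-128-bit-lane ordering that the in-lane shuffles leave behind.
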
        // mask
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                uint64_t mask8 = 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
                return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4) | (detail::mask_lut(mask8 >> 16) << 8) | (detail::mask_lut(mask8 >> 24) << 12);
            }
            else
            {
                return mask(self, avx {});
            }
        }

        // max
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_max_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_max_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_max_epi32(self, other);
                }
                else
                {
                    return max(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_max_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_max_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_max_epu32(self, other);
                }
                else
                {
                    return max(self, other, avx {});
                }
            }
        }

        // min
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_min_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_min_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_min_epi32(self, other);
                }
                else
                {
                    return min(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_min_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_min_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_min_epu32(self, other);
                }
                else
                {
                    return min(self, other, avx {});
                }
            }
        }

        // mul
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_mullo_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_mullo_epi32(self, other);
            }
            else
            {
                return mul(self, other, avx {});
            }
        }
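
        // Note (editorial): AVX2 provides _mm256_mullo_epi16/epi32 but no 8-bit
        // or 64-bit low multiply (the 64-bit one arrives with AVX512DQ), so
        // those widths fall through to the avx {} generic implementation.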

        // reduce_add
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                __m256i tmp1 = _mm256_hadd_epi32(self, self);
                __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1);
                __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
                __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3);
                return _mm_cvtsi128_si32(tmp4);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                __m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E);
                __m256i tmp2 = _mm256_add_epi64(self, tmp1);
                __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
                __m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3);
#if defined(__x86_64__)
                return _mm_cvtsi128_si64(res);
#else
                __m128i m;
                _mm_storel_epi64(&m, res);
                int64_t i;
                std::memcpy(&i, &m, sizeof(i));
                return i;
#endif
            }
            else
            {
                return reduce_add(self, avx {});
            }
        }

        // sadd
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_adds_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_adds_epi16(self, other);
                }
                else
                {
                    return sadd(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_adds_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_adds_epu16(self, other);
                }
                else
                {
                    return sadd(self, other, avx {});
                }
            }
        }

        // select
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_blendv_epi8(false_br, true_br, cond);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_blendv_epi8(false_br, true_br, cond);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_blendv_epi8(false_br, true_br, cond);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_blendv_epi8(false_br, true_br, cond);
            }
            else
            {
                return select(cond, true_br, false_br, avx {});
            }
        }
        template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
        {
            constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
            // FIXME: for some reason mask here is not considered as an immediate,
            // but it's okay for _mm256_blend_epi32
            // case 2: return _mm256_blend_epi16(false_br, true_br, mask);
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_blend_epi32(false_br, true_br, mask);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                constexpr int imask = detail::interleave(mask);
                return _mm256_blend_epi32(false_br, true_br, imask);
            }
            else
            {
                return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
            }
        }

        // slide_left
        template <size_t N, class A, class T>
        inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx2>) noexcept
        {
            constexpr unsigned BitCount = N * 8;
            if (BitCount == 0)
            {
                return x;
            }
            if (BitCount >= 256)
            {
                return batch<T, A>(T(0));
            }
            if (BitCount > 128)
            {
                constexpr unsigned M = (BitCount - 128) / 8;
                auto y = _mm256_bslli_epi128(x, M);
                return _mm256_permute2x128_si256(y, y, 0x28);
            }
            if (BitCount == 128)
            {
                return _mm256_permute2x128_si256(x, x, 0x28);
            }
            // shifting by [0, 128[ bits
            constexpr unsigned M = BitCount / 8;
            auto y = _mm256_bslli_epi128(x, M);
            auto z = _mm256_bsrli_epi128(x, 16 - M);
            auto w = _mm256_permute2x128_si256(z, z, 0x28);
            return _mm256_or_si256(y, w);
        }

        // slide_right
        template <size_t N, class A, class T>
        inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx2>) noexcept
        {
            constexpr unsigned BitCount = N * 8;
            if (BitCount == 0)
            {
                return x;
            }
            if (BitCount >= 256)
            {
                return batch<T, A>(T(0));
            }
            if (BitCount > 128)
            {
                constexpr unsigned M = (BitCount - 128) / 8;
                auto y = _mm256_bsrli_epi128(x, M);
                return _mm256_permute2x128_si256(y, y, 0x81);
            }
            if (BitCount == 128)
            {
                return _mm256_permute2x128_si256(x, x, 0x81);
            }
            // shifting by [0, 128[ bits
            constexpr unsigned M = BitCount / 8;
            auto y = _mm256_bsrli_epi128(x, M);
            auto z = _mm256_bslli_epi128(x, 16 - M);
            auto w = _mm256_permute2x128_si256(z, z, 0x81);
            return _mm256_or_si256(y, w);
        }
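
        // Note (editorial, assumption): _mm256_bslli_epi128/_mm256_bsrli_epi128
        // shift each 128-bit lane independently, so the [0, 128) case also
        // computes the bytes that cross the lane boundary (z) and moves them
        // into the neighbouring lane with _mm256_permute2x128_si256 before
        // OR-ing them back in.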

        // ssub
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_subs_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_subs_epi16(self, other);
                }
                else
                {
                    return ssub(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_subs_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_subs_epu16(self, other);
                }
                else
                {
                    return ssub(self, other, avx {});
                }
            }
        }

        // sub
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_sub_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_sub_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_sub_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_sub_epi64(self, other);
            }
            else
            {
                return sub(self, other, avx {});
            }
        }

        // swizzle
        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
        inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
        {
            return _mm256_permutevar8x32_ps(self, (batch<uint32_t, A>)mask);
        }

        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
        inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
        {
            constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
            return _mm256_permute4x64_pd(self, mask);
        }

        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
        inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
        {
            constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
            return _mm256_permute4x64_epi64(self, mask);
        }
        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
        inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
        {
            return bitwise_cast<batch<int64_t, A>>(swizzle(bitwise_cast<batch<uint64_t, A>>(self), mask, avx2 {}));
        }
        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
        inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
        {
            return _mm256_permutevar8x32_epi32(self, (batch<uint32_t, A>)mask);
        }
        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
        inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
        {
            return bitwise_cast<batch<int32_t, A>>(swizzle(bitwise_cast<batch<uint32_t, A>>(self), mask, avx2 {}));
        }
|
||||
// zip_hi
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
auto lo = _mm256_unpacklo_epi8(self, other);
|
||||
auto hi = _mm256_unpackhi_epi8(self, other);
|
||||
return _mm256_permute2f128_si256(lo, hi, 0x31);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
auto lo = _mm256_unpacklo_epi16(self, other);
|
||||
auto hi = _mm256_unpackhi_epi16(self, other);
|
||||
return _mm256_permute2f128_si256(lo, hi, 0x31);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
|
||||
{
|
||||
auto lo = _mm256_unpacklo_epi32(self, other);
|
||||
auto hi = _mm256_unpackhi_epi32(self, other);
|
||||
return _mm256_permute2f128_si256(lo, hi, 0x31);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
|
||||
{
|
||||
auto lo = _mm256_unpacklo_epi64(self, other);
|
||||
auto hi = _mm256_unpackhi_epi64(self, other);
|
||||
return _mm256_permute2f128_si256(lo, hi, 0x31);
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(false && "unsupported arch/op combination");
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
// zip_lo
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
auto lo = _mm256_unpacklo_epi8(self, other);
|
||||
auto hi = _mm256_unpackhi_epi8(self, other);
|
||||
return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
auto lo = _mm256_unpacklo_epi16(self, other);
|
||||
auto hi = _mm256_unpackhi_epi16(self, other);
|
||||
return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
|
||||
{
|
||||
auto lo = _mm256_unpacklo_epi32(self, other);
|
||||
auto hi = _mm256_unpackhi_epi32(self, other);
|
||||
return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
|
||||
{
|
||||
auto lo = _mm256_unpacklo_epi64(self, other);
|
||||
auto hi = _mm256_unpackhi_epi64(self, other);
|
||||
return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(false && "unsupported arch/op combination");
|
||||
return {};
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
|
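The AVX2 slide kernels above must stitch the two 128-bit lanes together because _mm256_bslli_epi128 / _mm256_bsrli_epi128 shift each lane independently. A minimal sketch of the resulting whole-register semantics, assuming the public xsimd::slide_right entry point and the aligned load/store API:

#include <cstdint>
#include <cstdio>
#include "xsimd/xsimd.hpp"

int main()
{
    // 32 byte lanes: slide_right<4> drops the four lowest bytes, shifts the
    // rest down across the lane boundary, and zero-fills the top.
    using b8 = xsimd::batch<uint8_t, xsimd::avx2>;
    alignas(32) uint8_t in[32];
    alignas(32) uint8_t out[32];
    for (int i = 0; i < 32; ++i)
        in[i] = static_cast<uint8_t>(i + 1);
    auto x = b8::load_aligned(in);
    xsimd::slide_right<4>(x).store_aligned(out);
    std::printf("%u %u\n", unsigned(out[0]), unsigned(out[31])); // expected: 5 0
}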
@@ -0,0 +1,627 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_AVX512BW_HPP
|
||||
#define XSIMD_AVX512BW_HPP
|
||||
|
||||
#include <array>
|
||||
#include <type_traits>
|
||||
|
||||
#include "../types/xsimd_avx512bw_register.hpp"
|
||||
|
||||
namespace xsimd
|
||||
{
|
||||
|
||||
namespace kernel
|
||||
{
|
||||
using namespace types;
|
||||
|
||||
namespace detail
|
||||
{
|
||||
template <class A, class T, int Cmp>
|
||||
inline batch_bool<T, A> compare_int_avx512bw(batch<T, A> const& self, batch<T, A> const& other) noexcept
|
||||
{
|
||||
using register_type = typename batch_bool<T, A>::register_type;
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return (register_type)_mm512_cmp_epi8_mask(self, other, Cmp);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return (register_type)_mm512_cmp_epi16_mask(self, other, Cmp);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
|
||||
{
|
||||
return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
|
||||
{
|
||||
return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return (register_type)_mm512_cmp_epu8_mask(self, other, Cmp);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return (register_type)_mm512_cmp_epu16_mask(self, other, Cmp);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
|
||||
{
|
||||
return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
|
||||
{
|
||||
return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
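// Note: on avx512, batch_bool<T, A>::register_type is a k-mask register
// (e.g. __mmask64 for 8-bit lanes), so the comparisons above yield one bit
// per lane rather than a full-width vector. A sketch of consuming such a
// mask, assuming the public batch_bool::mask() accessor:
//   uint64_t bits = eq(a, b, avx512bw {}).mask(); // bit i set <=> lanes equal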
|
||||
// abs
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_unsigned<T>::value)
|
||||
{
|
||||
return self;
|
||||
}
|
||||
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_abs_epi8(self);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_abs_epi16(self);
|
||||
}
|
||||
else
|
||||
{
|
||||
return abs(self, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// add
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_add_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_add_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return add(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// bitwise_lshift
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_sllv_epi16(self, _mm512_set1_epi16(other));
|
||||
#else
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_slli_epi16(self, other);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
return bitwise_lshift(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// bitwise_rshift
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
__m512i sign_mask = _mm512_set1_epi16((0xFF00 >> other) & 0x00FF);
|
||||
__m512i zeros = _mm512_setzero_si512();
|
||||
__mmask64 cmp_is_negative_mask = _mm512_cmpgt_epi8_mask(zeros, self);
|
||||
__m512i cmp_sign_mask = _mm512_mask_blend_epi8(cmp_is_negative_mask, zeros, sign_mask);
|
||||
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
|
||||
__m512i res = _mm512_srav_epi16(self, _mm512_set1_epi16(other));
|
||||
#else
|
||||
__m512i res = _mm512_srai_epi16(self, other);
|
||||
#endif
|
||||
return _mm512_or_si512(cmp_sign_mask, _mm512_andnot_si512(sign_mask, res));
|
||||
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_srav_epi16(self, _mm512_set1_epi16(other));
|
||||
#else
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_srai_epi16(self, other);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
return bitwise_rshift(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_srlv_epi16(self, _mm512_set1_epi16(other));
|
||||
#else
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_srli_epi16(self, other);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
return bitwise_rshift(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
}
|
||||
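// The signed 8-bit branch above emulates the missing _mm512_srai_epi8: it
// shifts 16-bit units arithmetically, masks out the bits that bled in from
// the neighbouring byte, and re-injects sign bits through a byte-granular
// blend. Scalar reference for what each lane computes (illustration only):
//   int8_t sra8(int8_t v, int32_t s) { return static_cast<int8_t>(v >> s); }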
|
||||
// eq
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_EQ>(self, other);
|
||||
}
|
||||
|
||||
// ge
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GE>(self, other);
|
||||
}
|
||||
|
||||
// gt
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GT>(self, other);
|
||||
}
|
||||
|
||||
// le
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LE>(self, other);
|
||||
}
|
||||
|
||||
// lt
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LT>(self, other);
|
||||
}
|
||||
|
||||
// max
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_max_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_max_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return max(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_max_epu8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_max_epu16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return max(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// min
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_min_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_min_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return min(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_min_epu8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_min_epu16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return min(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// mul
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
__m512i upper = _mm512_and_si512(_mm512_mullo_epi16(self, other), _mm512_srli_epi16(_mm512_set1_epi16(-1), 8));
|
||||
__m512i lower = _mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(self, 8), _mm512_srli_epi16(other, 8)), 8);
|
||||
return _mm512_or_si512(upper, lower);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_mullo_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return mul(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// neq
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_NE>(self, other);
|
||||
}
|
||||
|
||||
// sadd
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_adds_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_adds_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return sadd(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_adds_epu8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_adds_epu16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return sadd(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// select
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_mask_blend_epi8(cond, false_br.data, true_br.data);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_mask_blend_epi16(cond, false_br.data, true_br.data);
|
||||
}
|
||||
else
|
||||
{
|
||||
return select(cond, true_br, false_br, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// slide_left
|
||||
namespace detail
|
||||
{
|
||||
template <size_t... Is>
|
||||
constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_hi(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is == 0 ? 8 : Is - 1)... };
|
||||
}
|
||||
|
||||
template <size_t N, size_t... Is>
|
||||
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_pattern(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is >= N ? Is - N : 0)... };
|
||||
}
|
||||
template <size_t N, size_t... Is>
|
||||
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_mask(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is >= N ? 0xFFFF : 0x0000)... };
|
||||
}
|
||||
}
|
||||
|
||||
template <size_t N, class A, class T>
|
||||
inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
constexpr unsigned BitCount = N * 8;
|
||||
if (BitCount == 0)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
if (BitCount >= 512)
|
||||
{
|
||||
return batch<T, A>(T(0));
|
||||
}
|
||||
batch<T, A> xx;
|
||||
if (N & 1)
|
||||
{
|
||||
alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_hi(::xsimd::detail::make_index_sequence<512 / 64>());
|
||||
__m512i xl = _mm512_slli_epi64(x, 8);
|
||||
__m512i xr = _mm512_srli_epi64(x, 56);
|
||||
xr = _mm512_permutex2var_epi64(xr, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
|
||||
xx = _mm512_or_si512(xr, xl);
|
||||
if (N == 1)
|
||||
return xx;
|
||||
}
|
||||
else
|
||||
{
|
||||
xx = x;
|
||||
}
|
||||
alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
|
||||
alignas(A::alignment()) auto slide_mask = detail::make_slide_left_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
|
||||
return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
|
||||
}
|
||||
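// Shape of the algorithm above (illustration): an odd byte offset is first
// handled as a one-byte slide built from 64-bit shifts stitched across
// lanes with a permute, after which the remaining even offset reduces to a
// 16-bit lane permute plus zero mask. Scalar reference for slide_left<N>:
//   void slide_left_ref(const uint8_t (&in)[64], uint8_t (&out)[64], size_t n)
//   {
//       for (size_t i = 0; i < 64; ++i)
//           out[i] = (i >= n) ? in[i - n] : uint8_t(0);
//   }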
|
||||
// slide_right
|
||||
namespace detail
|
||||
{
|
||||
template <size_t... Is>
|
||||
constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_low(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is + 1)... };
|
||||
}
|
||||
|
||||
template <size_t N, size_t... Is>
|
||||
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_pattern(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is < (32 - N) ? Is + N : 0)... };
|
||||
}
|
||||
template <size_t N, size_t... Is>
|
||||
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_mask(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is < 32 - N ? 0xFFFF : 0x0000)... };
|
||||
}
|
||||
}
|
||||
template <size_t N, class A, class T>
|
||||
inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
constexpr unsigned BitCount = N * 8;
|
||||
if (BitCount == 0)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
if (BitCount >= 512)
|
||||
{
|
||||
return batch<T, A>(T(0));
|
||||
}
|
||||
batch<T, A> xx;
|
||||
if (N & 1)
|
||||
{
|
||||
alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_low(::xsimd::detail::make_index_sequence<512 / 64>());
|
||||
__m512i xr = _mm512_srli_epi64(x, 8);
|
||||
__m512i xl = _mm512_slli_epi64(x, 56);
|
||||
xl = _mm512_permutex2var_epi64(xl, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
|
||||
xx = _mm512_or_si512(xr, xl);
|
||||
if (N == 1)
|
||||
return xx;
|
||||
}
|
||||
else
|
||||
{
|
||||
xx = x;
|
||||
}
|
||||
alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
|
||||
alignas(A::alignment()) auto slide_mask = detail::make_slide_right_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
|
||||
return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
|
||||
}
|
||||
|
||||
// ssub
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_subs_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_subs_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return ssub(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_subs_epu8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_subs_epu16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return ssub(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sub
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_sub_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_sub_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return sub(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// swizzle
|
||||
|
||||
template <class A, uint16_t... Vs>
|
||||
inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return _mm512_permutexvar_epi16((batch<uint16_t, A>)mask, self);
|
||||
}
|
||||
|
||||
template <class A, uint16_t... Vs>
|
||||
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return bitwise_cast<batch<int16_t, A>>(swizzle(bitwise_cast<batch<uint16_t, A>>(self), mask, avx512bw {}));
|
||||
}
|
||||
|
||||
template <class A, uint8_t... Vs>
|
||||
inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return _mm512_shuffle_epi8(self, (batch<uint8_t, A>)mask);
|
||||
}
|
||||
|
||||
template <class A, uint8_t... Vs>
|
||||
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return bitwise_cast<batch<int8_t, A>>(swizzle(bitwise_cast<batch<uint8_t, A>>(self), mask, avx512bw {}));
|
||||
}
|
||||
|
||||
// zip_hi
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
__m512i lo, hi;
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
lo = _mm512_unpacklo_epi8(self, other);
|
||||
hi = _mm512_unpackhi_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
lo = _mm512_unpacklo_epi16(self, other);
|
||||
hi = _mm512_unpackhi_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return zip_hi(self, other, avx512f {});
|
||||
}
|
||||
return _mm512_inserti32x4(
|
||||
_mm512_inserti32x4(
|
||||
_mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0),
|
||||
_mm512_extracti32x4_epi32(lo, 3),
|
||||
2),
|
||||
_mm512_extracti32x4_epi32(hi, 2),
|
||||
1);
|
||||
}
|
||||
|
||||
// zip_lo
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
__m512i lo, hi;
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
lo = _mm512_unpacklo_epi8(self, other);
|
||||
hi = _mm512_unpackhi_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
lo = _mm512_unpacklo_epi16(self, other);
|
||||
hi = _mm512_unpackhi_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return zip_lo(self, other, avx512f {});
|
||||
}
|
||||
return _mm512_inserti32x4(
|
||||
_mm512_inserti32x4(
|
||||
_mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1),
|
||||
_mm512_extracti32x4_epi32(hi, 1),
|
||||
3),
|
||||
_mm512_extracti32x4_epi32(lo, 1),
|
||||
2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
|
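The swizzle kernels above take their index pattern as a batch_constant, so the pattern is fixed at compile time and lowers to a single permute. A call-site sketch, assuming the public xsimd::make_batch_constant helper whose generator exposes get(index, size):

#include <cstddef>
#include <cstdint>
#include "xsimd/xsimd.hpp"

// Hypothetical generator producing a lane-reversal pattern.
struct reverse_index
{
    static constexpr uint16_t get(size_t index, size_t size)
    {
        return static_cast<uint16_t>(size - 1 - index);
    }
};

inline xsimd::batch<uint16_t, xsimd::avx512bw>
reverse_lanes(xsimd::batch<uint16_t, xsimd::avx512bw> const& x)
{
    auto mask = xsimd::make_batch_constant<xsimd::batch<uint16_t, xsimd::avx512bw>,
                                           reverse_index>();
    // Dispatches to the _mm512_permutexvar_epi16 kernel above.
    return xsimd::swizzle(x, mask);
}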
@@ -0,0 +1,28 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_AVX512CD_HPP
|
||||
#define XSIMD_AVX512CD_HPP
|
||||
|
||||
#include "../types/xsimd_avx512cd_register.hpp"
|
||||
|
||||
namespace xsimd
|
||||
{
|
||||
|
||||
namespace kernel
|
||||
{
|
||||
// Nothing there yet.
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
|
@@ -0,0 +1,212 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_AVX512DQ_HPP
|
||||
#define XSIMD_AVX512DQ_HPP
|
||||
|
||||
#include "../types/xsimd_avx512dq_register.hpp"
|
||||
|
||||
namespace xsimd
|
||||
{
|
||||
|
||||
namespace kernel
|
||||
{
|
||||
using namespace types;
|
||||
|
||||
// bitwise_and
|
||||
template <class A>
|
||||
inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_and_ps(self, other);
|
||||
}
|
||||
template <class A>
|
||||
inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_and_pd(self, other);
|
||||
}
|
||||
|
||||
// bitwise_andnot
|
||||
template <class A>
|
||||
inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_andnot_ps(other, self);
|
||||
}
|
||||
template <class A>
|
||||
inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_andnot_pd(other, self);
|
||||
}
|
||||
|
||||
// bitwise_not
|
||||
template <class A>
|
||||
inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
|
||||
}
|
||||
template <class A>
|
||||
inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1)));
|
||||
}
|
||||
|
||||
// bitwise_or
|
||||
template <class A>
|
||||
inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_or_ps(self, other);
|
||||
}
|
||||
template <class A>
|
||||
inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_or_pd(self, other);
|
||||
}
|
||||
|
||||
template <class A, class T>
|
||||
inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
using register_type = typename batch_bool<T, A>::register_type;
|
||||
return register_type(self.data | other.data);
|
||||
}
|
||||
|
||||
// bitwise_xor
|
||||
template <class A>
|
||||
inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_xor_ps(self, other);
|
||||
}
|
||||
template <class A>
|
||||
inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_xor_pd(self, other);
|
||||
}
|
||||
|
||||
// haddp
|
||||
template <class A>
|
||||
inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
// The following folds over the vector once:
|
||||
// tmp1 = [a0..8, b0..8]
|
||||
// tmp2 = [a8..f, b8..f]
|
||||
#define XSIMD_AVX512_HADDP_STEP1(I, a, b) \
|
||||
batch<float, avx512f> res##I; \
|
||||
{ \
|
||||
auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
|
||||
auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
|
||||
res##I = _mm512_add_ps(tmp1, tmp2); \
|
||||
}
|
||||
|
||||
XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
|
||||
XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]);
|
||||
XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]);
|
||||
XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]);
|
||||
XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]);
|
||||
XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]);
|
||||
XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]);
|
||||
XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]);
|
||||
|
||||
#undef XSIMD_AVX512_HADDP_STEP1
|
||||
|
||||
// The following folds the code and shuffles so that hadd_ps produces the correct result
|
||||
// tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3)
|
||||
// tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4)
|
||||
// tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ...
|
||||
#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \
|
||||
batch<float, avx2> halfx##I; \
|
||||
{ \
|
||||
auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \
|
||||
auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \
|
||||
\
|
||||
auto resx1 = _mm512_add_ps(tmp1, tmp2); \
|
||||
\
|
||||
auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
|
||||
auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
|
||||
\
|
||||
auto resx2 = _mm512_add_ps(tmp3, tmp4); \
|
||||
\
|
||||
auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \
|
||||
auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \
|
||||
\
|
||||
auto resx3 = _mm512_add_ps(tmp5, tmp6); \
|
||||
\
|
||||
halfx##I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \
|
||||
_mm512_extractf32x8_ps(resx3, 1)); \
|
||||
}
|
||||
|
||||
XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3);
|
||||
XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7);
|
||||
|
||||
#undef XSIMD_AVX512_HADDP_STEP2
|
||||
|
||||
auto concat = _mm512_castps256_ps512(halfx0);
|
||||
concat = _mm512_insertf32x8(concat, halfx1, 1);
|
||||
return concat;
|
||||
}
|
||||
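// Net effect of haddp above (illustration): given sixteen rows of sixteen
// floats, lane i of the result holds the horizontal sum of row[i].
// Scalar reference:
//   float r[16] = {};
//   for (size_t i = 0; i < 16; ++i)
//       for (size_t j = 0; j < 16; ++j)
//           r[i] += row[i].get(j);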
|
||||
// ldexp
|
||||
template <class A>
|
||||
inline batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_scalef_pd(self, _mm512_cvtepi64_pd(other));
|
||||
}
|
||||
|
||||
// mul
|
||||
template <class A>
|
||||
inline batch<uint64_t, A> mul(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_mullo_epi64(self, other);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<int64_t, A> mul(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_mullo_epi64(self, other);
|
||||
}
|
||||
|
||||
// nearbyint_as_int
|
||||
template <class A>
|
||||
inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
|
||||
requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_cvtpd_epi64(self);
|
||||
}
|
||||
|
||||
// reduce_add
|
||||
template <class A>
|
||||
inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
__m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
|
||||
__m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
|
||||
__m256 res1 = _mm256_add_ps(tmp1, tmp2);
|
||||
return reduce_add(batch<float, avx2>(res1), avx2 {});
|
||||
}
|
||||
|
||||
// convert
|
||||
namespace detail
|
||||
{
|
||||
template <class A>
|
||||
inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_cvtepi64_pd(x);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<int64_t, A> fast_cast(batch<double, A> const& self, batch<int64_t, A> const&, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_cvttpd_epi64(self);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
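The avx512dq ldexp kernel above maps scaling by a power of two onto a single _mm512_scalef_pd. A minimal usage sketch, assuming the public xsimd::ldexp entry point and the batch::get accessor:

#include <cstdint>
#include <cstdio>
#include "xsimd/xsimd.hpp"

int main()
{
    using bd = xsimd::batch<double, xsimd::avx512dq>;
    using bi = xsimd::batch<int64_t, xsimd::avx512dq>;
    bd x(1.5);
    bi e(3);
    // ldexp(x, e) == x * 2^e per lane; here 1.5 * 8 == 12.
    std::printf("%g\n", xsimd::ldexp(x, e).get(0));
}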
File diff not shown because of its large size.
|
@@ -0,0 +1,384 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_NUMERICAL_CONSTANT_HPP
|
||||
#define XSIMD_NUMERICAL_CONSTANT_HPP
|
||||
|
||||
#include <limits>
|
||||
|
||||
#include "../types/xsimd_utils.hpp"
|
||||
|
||||
namespace xsimd
|
||||
{
|
||||
|
||||
namespace constants
|
||||
{
|
||||
|
||||
#define XSIMD_DEFINE_CONSTANT(NAME, SINGLE, DOUBLE) \
|
||||
template <class T> \
|
||||
inline T NAME() noexcept \
|
||||
{ \
|
||||
return T(NAME<typename T::value_type>()); \
|
||||
} \
|
||||
template <> \
|
||||
inline float NAME<float>() noexcept \
|
||||
{ \
|
||||
return SINGLE; \
|
||||
} \
|
||||
template <> \
|
||||
inline double NAME<double>() noexcept \
|
||||
{ \
|
||||
return DOUBLE; \
|
||||
}
|
||||
|
||||
#define XSIMD_DEFINE_CONSTANT_HEX(NAME, SINGLE, DOUBLE) \
|
||||
template <class T> \
|
||||
inline T NAME() noexcept \
|
||||
{ \
|
||||
return T(NAME<typename T::value_type>()); \
|
||||
} \
|
||||
template <> \
|
||||
inline float NAME<float>() noexcept \
|
||||
{ \
|
||||
return bit_cast<float>((uint32_t)SINGLE); \
|
||||
} \
|
||||
template <> \
|
||||
inline double NAME<double>() noexcept \
|
||||
{ \
|
||||
return bit_cast<double>((uint64_t)DOUBLE); \
|
||||
}
|
||||
|
||||
XSIMD_DEFINE_CONSTANT(infinity, (std::numeric_limits<float>::infinity()), (std::numeric_limits<double>::infinity()))
|
||||
XSIMD_DEFINE_CONSTANT(invlog_2, 1.442695040888963407359924681001892137426645954152986f, 1.442695040888963407359924681001892137426645954152986)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(invlog_2hi, 0x3fb8b000, 0x3ff7154765200000)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(invlog_2lo, 0xb9389ad4, 0x3de705fc2eefa200)
|
||||
XSIMD_DEFINE_CONSTANT(invlog10_2, 3.32192809488736234787031942949f, 3.32192809488736234787031942949)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(invpi, 0x3ea2f983, 0x3fd45f306dc9c883)
|
||||
XSIMD_DEFINE_CONSTANT(log_2, 0.6931471805599453094172321214581765680755001343602553f, 0.6931471805599453094172321214581765680755001343602553)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(log_2hi, 0x3f318000, 0x3fe62e42fee00000)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(log_2lo, 0xb95e8083, 0x3dea39ef35793c76)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(log10_2hi, 0x3e9a0000, 0x3fd3440000000000)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(log10_2lo, 0x39826a14, 0x3ed3509f79fef312)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(logeps, 0xc17f1402, 0xc04205966f2b4f12)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(logpi, 0x3f928682, 0x3ff250d048e7a1bd)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(logsqrt2pi, 0x3f6b3f8e, 0x3fed67f1c864beb5)
|
||||
XSIMD_DEFINE_CONSTANT(maxflint, 16777216.0f, 9007199254740992.0)
|
||||
XSIMD_DEFINE_CONSTANT(maxlog, 88.3762626647949f, 709.78271289338400)
|
||||
XSIMD_DEFINE_CONSTANT(maxlog2, 127.0f, 1023.)
|
||||
XSIMD_DEFINE_CONSTANT(maxlog10, 38.23080825805664f, 308.2547155599167)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(mediumpi, 0x43490fdb, 0x412921fb54442d18)
|
||||
XSIMD_DEFINE_CONSTANT(minlog, -88.3762626647949f, -708.3964185322641)
|
||||
XSIMD_DEFINE_CONSTANT(minlog2, -127.0f, -1023.)
|
||||
XSIMD_DEFINE_CONSTANT(minlog10, -37.89999771118164f, -308.2547155599167)
|
||||
XSIMD_DEFINE_CONSTANT(minusinfinity, (-infinity<float>()), (-infinity<double>()))
|
||||
XSIMD_DEFINE_CONSTANT(minuszero, -0.0f, -0.0)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(nan, 0xffffffff, 0xffffffffffffffff)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(oneosqrteps, 0x453504f3, 0x4190000000000000)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(oneotwoeps, 0x4a800000, 0x4320000000000000)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pi, 0x40490fdb, 0x400921fb54442d18)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio_2lo, 0xb33bbd2e, 0x3c91a62633145c07)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio_4lo, 0xb2bbbd2e, 0x3c81a62633145c07)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio2, 0x3fc90fdb, 0x3ff921fb54442d18)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio2_1, 0x3fc90f80, 0x3ff921fb54400000)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio2_1t, 0x37354443, 0x3dd0b4611a626331)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio2_2, 0x37354400, 0x3dd0b4611a600000)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio2_2t, 0x2e85a308, 0x3ba3198a2e037073)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio2_3, 0x2e85a300, 0x3ba3198a2e000000)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio2_3t, 0x248d3132, 0x397b839a252049c1)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio4, 0x3f490fdb, 0x3fe921fb54442d18)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(signmask, 0x80000000, 0x8000000000000000)
|
||||
XSIMD_DEFINE_CONSTANT(smallestposval, std::numeric_limits<float>::min(), std::numeric_limits<double>::min())
|
||||
XSIMD_DEFINE_CONSTANT_HEX(sqrt_2pi, 0x40206c99, 0x40040d931ff62704)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(sqrteps, 0x39b504f3, 0x3e50000000000000)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(tanpio8, 0x3ed413cd, 0x3fda827999fcef31)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(tan3pio8, 0x401a827a, 0x4003504f333f9de6)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(twentypi, 0x427b53d1, 0x404f6a7a2955385e)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(twoopi, 0x3f22f983, 0x3fe45f306dc9c883)
|
||||
XSIMD_DEFINE_CONSTANT(twotonmb, 8388608.0f, 4503599627370496.0)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(twotonmbo3, 0x3ba14518, 0x3ed428a2f98d7286)
|
||||
|
||||
#undef XSIMD_DEFINE_CONSTANT
|
||||
#undef XSIMD_DEFINE_CONSTANT_HEX
|
||||
|
||||
template <class T>
|
||||
constexpr T allbits() noexcept;
|
||||
|
||||
template <class T>
|
||||
constexpr as_integer_t<T> mask1frexp() noexcept;
|
||||
|
||||
template <class T>
|
||||
constexpr as_integer_t<T> mask2frexp() noexcept;
|
||||
|
||||
template <class T>
|
||||
constexpr as_integer_t<T> maxexponent() noexcept;
|
||||
|
||||
template <class T>
|
||||
constexpr as_integer_t<T> maxexponentm1() noexcept;
|
||||
|
||||
template <class T>
|
||||
constexpr int32_t nmb() noexcept;
|
||||
|
||||
template <class T>
|
||||
constexpr T zero() noexcept;
|
||||
|
||||
template <class T>
|
||||
constexpr T minvalue() noexcept;
|
||||
|
||||
template <class T>
|
||||
constexpr T maxvalue() noexcept;
|
||||
|
||||
/**************************
|
||||
* allbits implementation *
|
||||
**************************/
|
||||
|
||||
namespace detail
|
||||
{
|
||||
template <class T, bool = std::is_integral<T>::value>
|
||||
struct allbits_impl
|
||||
{
|
||||
static constexpr T get_value() noexcept
|
||||
{
|
||||
return T(~0);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct allbits_impl<T, false>
|
||||
{
|
||||
static constexpr T get_value() noexcept
|
||||
{
|
||||
return nan<T>();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline constexpr T allbits() noexcept
|
||||
{
|
||||
return T(detail::allbits_impl<typename T::value_type>::get_value());
|
||||
}
|
||||
|
||||
/*****************************
|
||||
* mask1frexp implementation *
|
||||
*****************************/
|
||||
|
||||
template <class T>
|
||||
inline constexpr as_integer_t<T> mask1frexp() noexcept
|
||||
{
|
||||
return as_integer_t<T>(mask1frexp<typename T::value_type>());
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int32_t mask1frexp<float>() noexcept
|
||||
{
|
||||
return 0x7f800000;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int64_t mask1frexp<double>() noexcept
|
||||
{
|
||||
return 0x7ff0000000000000;
|
||||
}
|
||||
|
||||
/*****************************
|
||||
* mask2frexp implementation *
|
||||
*****************************/
|
||||
|
||||
template <class T>
|
||||
inline constexpr as_integer_t<T> mask2frexp() noexcept
|
||||
{
|
||||
return as_integer_t<T>(mask2frexp<typename T::value_type>());
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int32_t mask2frexp<float>() noexcept
|
||||
{
|
||||
return 0x3f000000;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int64_t mask2frexp<double>() noexcept
|
||||
{
|
||||
return 0x3fe0000000000000;
|
||||
}
|
||||
|
||||
/******************************
|
||||
* maxexponent implementation *
|
||||
******************************/
|
||||
|
||||
template <class T>
|
||||
inline constexpr as_integer_t<T> maxexponent() noexcept
|
||||
{
|
||||
return as_integer_t<T>(maxexponent<typename T::value_type>());
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int32_t maxexponent<float>() noexcept
|
||||
{
|
||||
return 127;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int64_t maxexponent<double>() noexcept
|
||||
{
|
||||
return 1023;
|
||||
}
|
||||
|
||||
/*******************************
|
||||
* maxexponentm1 implementation *
|
||||
*******************************/
|
||||
|
||||
template <class T>
|
||||
inline constexpr as_integer_t<T> maxexponentm1() noexcept
|
||||
{
|
||||
return as_integer_t<T>(maxexponentm1<typename T::value_type>());
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int32_t maxexponentm1<float>() noexcept
|
||||
{
|
||||
return 126;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int64_t maxexponentm1<double>() noexcept
|
||||
{
|
||||
return 1022;
|
||||
}
|
||||
|
||||
/**********************
|
||||
* nmb implementation *
|
||||
**********************/
|
||||
|
||||
template <class T>
|
||||
inline constexpr int32_t nmb() noexcept
|
||||
{
|
||||
return nmb<typename T::value_type>();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int32_t nmb<float>() noexcept
|
||||
{
|
||||
return 23;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int32_t nmb<double>() noexcept
|
||||
{
|
||||
return 52;
|
||||
}
|
||||
|
||||
/***********************
|
||||
* zero implementation *
|
||||
***********************/
|
||||
|
||||
template <class T>
|
||||
inline constexpr T zero() noexcept
|
||||
{
|
||||
return T(typename T::value_type(0));
|
||||
}
|
||||
|
||||
/***************************
|
||||
* minvalue implementation *
|
||||
***************************/
|
||||
|
||||
namespace detail
|
||||
{
|
||||
template <class T>
|
||||
struct minvalue_impl
|
||||
{
|
||||
static constexpr T get_value() noexcept
|
||||
{
|
||||
return std::numeric_limits<typename T::value_type>::min();
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct minvalue_common
|
||||
{
|
||||
static constexpr T get_value() noexcept
|
||||
{
|
||||
return std::numeric_limits<T>::min();
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct minvalue_impl<int8_t> : minvalue_common<int8_t>
|
||||
{
|
||||
};
|
||||
template <>
|
||||
struct minvalue_impl<uint8_t> : minvalue_common<uint8_t>
|
||||
{
|
||||
};
|
||||
template <>
|
||||
struct minvalue_impl<int16_t> : minvalue_common<int16_t>
|
||||
{
|
||||
};
|
||||
template <>
|
||||
struct minvalue_impl<uint16_t> : minvalue_common<uint16_t>
|
||||
{
|
||||
};
|
||||
template <>
|
||||
struct minvalue_impl<int32_t> : minvalue_common<int32_t>
|
||||
{
|
||||
};
|
||||
template <>
|
||||
struct minvalue_impl<uint32_t> : minvalue_common<uint32_t>
|
||||
{
|
||||
};
|
||||
template <>
|
||||
struct minvalue_impl<int64_t> : minvalue_common<int64_t>
|
||||
{
|
||||
};
|
||||
template <>
|
||||
struct minvalue_impl<uint64_t> : minvalue_common<uint64_t>
|
||||
{
|
||||
};
|
||||
|
||||
template <>
|
||||
struct minvalue_impl<float>
|
||||
{
|
||||
static float get_value() noexcept
|
||||
{
|
||||
return bit_cast<float>((uint32_t)0xff7fffff);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct minvalue_impl<double>
|
||||
{
|
||||
static double get_value() noexcept
|
||||
{
|
||||
return bit_cast<double>((uint64_t)0xffefffffffffffff);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline constexpr T minvalue() noexcept
|
||||
{
|
||||
return T(detail::minvalue_impl<typename T::value_type>::get_value());
|
||||
}
|
||||
|
||||
/***************************
|
||||
* maxvalue implementation *
|
||||
***************************/
|
||||
|
||||
template <class T>
|
||||
inline constexpr T maxvalue() noexcept
|
||||
{
|
||||
return T(std::numeric_limits<typename T::value_type>::max());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
|
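XSIMD_DEFINE_CONSTANT_HEX above pins each constant to an exact IEEE-754 bit pattern instead of a decimal literal, so the stored value cannot drift through literal rounding. A standalone sketch of the underlying trick with a memcpy-based bit_cast (the hex value below is the bit pattern of pi, matching the table above):

#include <cstdint>
#include <cstdio>
#include <cstring>

template <class To, class From>
To bit_cast(From from)
{
    static_assert(sizeof(To) == sizeof(From), "size mismatch");
    To to;
    std::memcpy(&to, &from, sizeof(To));
    return to;
}

int main()
{
    double pi = bit_cast<double>(static_cast<uint64_t>(0x400921fb54442d18));
    std::printf("%.17g\n", pi); // 3.1415926535897931
}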
@@ -0,0 +1,80 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_FMA3_AVX_HPP
|
||||
#define XSIMD_FMA3_AVX_HPP
|
||||
|
||||
#include "../types/xsimd_fma3_avx_register.hpp"
|
||||
|
||||
namespace xsimd
|
||||
{
|
||||
|
||||
namespace kernel
|
||||
{
|
||||
using namespace types;
|
||||
|
||||
// fnma
|
||||
template <class A>
|
||||
inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
|
||||
{
|
||||
return _mm256_fnmadd_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
|
||||
{
|
||||
return _mm256_fnmadd_pd(x, y, z);
|
||||
}
|
||||
|
||||
// fnms
|
||||
template <class A>
|
||||
inline batch<float, A> fnms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
|
||||
{
|
||||
return _mm256_fnmsub_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fnms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
|
||||
{
|
||||
return _mm256_fnmsub_pd(x, y, z);
|
||||
}
|
||||
|
||||
// fma
|
||||
template <class A>
|
||||
inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
|
||||
{
|
||||
return _mm256_fmadd_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
|
||||
{
|
||||
return _mm256_fmadd_pd(x, y, z);
|
||||
}
|
||||
|
||||
// fms
|
||||
template <class A>
|
||||
inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
|
||||
{
|
||||
return _mm256_fmsub_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
|
||||
{
|
||||
return _mm256_fmsub_pd(x, y, z);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
|
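The four kernel families above differ only in where the signs land. Scalar equivalents of the naming scheme, plus a small usage sketch assuming the public xsimd::fma and xsimd::fnma entry points:

#include <cstdio>
#include "xsimd/xsimd.hpp"

int main()
{
    // fma(x, y, z)  ==  x * y + z     fms(x, y, z)  ==  x * y - z
    // fnma(x, y, z) == -x * y + z     fnms(x, y, z) == -x * y - z
    using bf = xsimd::batch<float, xsimd::fma3<xsimd::avx>>;
    bf x(2.0f), y(3.0f), z(1.0f);
    std::printf("%g %g\n",
                xsimd::fma(x, y, z).get(0),    // 7
                xsimd::fnma(x, y, z).get(0));  // -5
}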
@@ -0,0 +1,46 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_FMA3_AVX2_HPP
|
||||
#define XSIMD_FMA3_AVX2_HPP
|
||||
|
||||
#include "../types/xsimd_fma3_avx2_register.hpp"
|
||||
|
||||
// Allow inclusion of xsimd_fma3_avx.hpp
|
||||
#ifdef XSIMD_FMA3_AVX_HPP
|
||||
#undef XSIMD_FMA3_AVX_HPP
|
||||
#define XSIMD_FORCE_FMA3_AVX_HPP
|
||||
#endif
|
||||
|
||||
// Disallow inclusion of ./xsimd_fma3_avx_register.hpp
|
||||
#ifndef XSIMD_FMA3_AVX_REGISTER_HPP
|
||||
#define XSIMD_FMA3_AVX_REGISTER_HPP
|
||||
#define XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
|
||||
#endif
|
||||
|
||||
// Include ./xsimd_fma3_avx.hpp but s/avx/avx2
|
||||
#define avx avx2
|
||||
#include "./xsimd_fma3_avx.hpp"
|
||||
#undef avx
|
||||
#undef XSIMD_FMA3_AVX_HPP
|
||||
|
||||
// Carefully restore guards
|
||||
#ifdef XSIMD_FORCE_FMA3_AVX_HPP
|
||||
#define XSIMD_FMA3_AVX_HPP
|
||||
#undef XSIMD_FORCE_FMA3_AVX_HPP
|
||||
#endif
|
||||
|
||||
#ifdef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
|
||||
#undef XSIMD_FMA3_AVX_REGISTER_HPP
|
||||
#undef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
|
||||
#endif
|
||||
|
||||
#endif
|
|
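The guard dance above re-instantiates the fma3<avx> kernels for fma3<avx2> by plain token substitution rather than duplicating the file. A stripped-down sketch of the same pattern, with hypothetical file names:

// impl.hpp (hypothetical), written once against the token ARCH_TAG:
//   inline int lanes(requires_arch<ARCH_TAG>) noexcept { return 8; }
//
// consumer.hpp, instantiating it for two architecture tags:
//   #define ARCH_TAG avx
//   #include "impl.hpp"
//   #undef ARCH_TAG
//   #define ARCH_TAG avx2
//   #include "impl.hpp"
//   #undef ARCH_TAG
// Each re-inclusion must also defeat impl.hpp's include guard, which is
// what the XSIMD_FORCE_* bookkeeping above takes care of.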
@@ -0,0 +1,79 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_FMA3_SSE_HPP
|
||||
#define XSIMD_FMA3_SSE_HPP
|
||||
|
||||
#include "../types/xsimd_fma3_sse_register.hpp"
|
||||
|
||||
namespace xsimd
|
||||
{
|
||||
|
||||
namespace kernel
|
||||
{
|
||||
using namespace types;
|
||||
// fnma
|
||||
template <class A>
|
||||
inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
|
||||
{
|
||||
return _mm_fnmadd_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
|
||||
{
|
||||
return _mm_fnmadd_pd(x, y, z);
|
||||
}
|
||||
|
||||
// fnms
|
||||
template <class A>
|
||||
inline batch<float, A> fnms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
|
||||
{
|
||||
return _mm_fnmsub_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fnms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
|
||||
{
|
||||
return _mm_fnmsub_pd(x, y, z);
|
||||
}
|
||||
|
||||
// fma
|
||||
template <class A>
|
||||
inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
|
||||
{
|
||||
return _mm_fmadd_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
|
||||
{
|
||||
return _mm_fmadd_pd(x, y, z);
|
||||
}
|
||||
|
||||
// fms
|
||||
template <class A>
|
||||
inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
|
||||
{
|
||||
return _mm_fmsub_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
|
||||
{
|
||||
return _mm_fmsub_pd(x, y, z);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
|
@@ -0,0 +1,79 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_FMA4_HPP
|
||||
#define XSIMD_FMA4_HPP
|
||||
|
||||
#include "../types/xsimd_fma4_register.hpp"
|
||||
|
||||
namespace xsimd
|
||||
{
|
||||
|
||||
namespace kernel
|
||||
{
|
||||
using namespace types;
|
||||
|
||||
// fnma
|
||||
template <class A>
|
||||
inline batch<float, A> fnma(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
|
||||
{
|
||||
return _mm_nmacc_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fnma(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
|
||||
{
|
||||
return _mm_nmacc_pd(x, y, z);
|
||||
}
|
||||
|
||||
// fnms
|
||||
template <class A>
|
||||
inline batch<float, A> fnms(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
|
||||
{
|
||||
return _mm_nmsub_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fnms(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
|
||||
{
|
||||
return _mm_nmsub_pd(x, y, z);
|
||||
}
|
||||
|
||||
// fma
|
||||
template <class A>
|
||||
inline batch<float, A> fma(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
|
||||
{
|
||||
return _mm_macc_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fma(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
|
||||
{
|
||||
return _mm_macc_pd(x, y, z);
|
||||
}
|
||||
|
||||
// fms
|
||||
template <class A>
|
||||
inline batch<float, A> fms(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
|
||||
{
|
||||
return _mm_msub_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fms(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
|
||||
{
|
||||
return _mm_msub_pd(x, y, z);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
|
@@ -0,0 +1,23 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_GENERIC_HPP
|
||||
#define XSIMD_GENERIC_HPP
|
||||
|
||||
#include "./generic/xsimd_generic_arithmetic.hpp"
|
||||
#include "./generic/xsimd_generic_complex.hpp"
|
||||
#include "./generic/xsimd_generic_logical.hpp"
|
||||
#include "./generic/xsimd_generic_math.hpp"
|
||||
#include "./generic/xsimd_generic_memory.hpp"
|
||||
#include "./generic/xsimd_generic_rounding.hpp"
|
||||
#include "./generic/xsimd_generic_trigo.hpp"
|
||||
|
||||
#endif
|
|
@@ -0,0 +1,38 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_FWD_HPP
#define XSIMD_GENERIC_FWD_HPP

#include "../types/xsimd_batch_constant.hpp"

#include <type_traits>

namespace xsimd
{
    namespace kernel
    {
        // forward declaration
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> abs(batch<T, A> const& self, requires_arch<generic>) noexcept;
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
        template <class A, class T>
        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;

    }
}

#endif
@@ -0,0 +1,86 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_ISA_HPP
#define XSIMD_ISA_HPP

#include "../config/xsimd_arch.hpp"

#include "./xsimd_generic_fwd.hpp"

#if XSIMD_WITH_SSE2
#include "./xsimd_sse2.hpp"
#endif

#if XSIMD_WITH_SSE3
#include "./xsimd_sse3.hpp"
#endif

#if XSIMD_WITH_SSSE3
#include "./xsimd_ssse3.hpp"
#endif

#if XSIMD_WITH_SSE4_1
#include "./xsimd_sse4_1.hpp"
#endif

#if XSIMD_WITH_SSE4_2
#include "./xsimd_sse4_2.hpp"
#endif

#if XSIMD_WITH_FMA3_SSE
#include "./xsimd_fma3_sse.hpp"
#endif

#if XSIMD_WITH_FMA4
#include "./xsimd_fma4.hpp"
#endif

#if XSIMD_WITH_AVX
#include "./xsimd_avx.hpp"
#endif

#if XSIMD_WITH_FMA3_AVX
#include "./xsimd_fma3_avx.hpp"
#endif

#if XSIMD_WITH_AVX2
#include "./xsimd_avx2.hpp"
#endif

#if XSIMD_WITH_FMA3_AVX2
#include "./xsimd_fma3_avx2.hpp"
#endif

#if XSIMD_WITH_AVX512F
#include "./xsimd_avx512f.hpp"
#endif

#if XSIMD_WITH_AVX512BW
#include "./xsimd_avx512bw.hpp"
#endif

#if XSIMD_WITH_NEON
#include "./xsimd_neon.hpp"
#endif

#if XSIMD_WITH_NEON64
#include "./xsimd_neon64.hpp"
#endif

#if XSIMD_WITH_SVE
#include "./xsimd_sve.hpp"
#endif

// Must come last to have access to all conversion specializations.
#include "./xsimd_generic.hpp"

#endif
(File diff not shown because of its large size)
(File diff not shown because of its large size)
(File diff not shown because of its large size)
(File diff not shown because of its large size)
@@ -0,0 +1,64 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSE3_HPP
#define XSIMD_SSE3_HPP

#include "../types/xsimd_sse3_register.hpp"
#include <type_traits>

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // haddp
        template <class A>
        inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse3>) noexcept
        {
            return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]),
                               _mm_hadd_ps(row[2], row[3]));
        }
        template <class A>
        inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse3>) noexcept
        {
            return _mm_hadd_pd(row[0], row[1]);
        }

        // load_unaligned
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse3>) noexcept
        {
            return _mm_lddqu_si128((__m128i const*)mem);
        }

        // reduce_add
        template <class A>
        inline float reduce_add(batch<float, A> const& self, requires_arch<sse3>) noexcept
        {
            __m128 tmp0 = _mm_hadd_ps(self, self);
            __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0);
            return _mm_cvtss_f32(tmp1);
        }
        template <class A>
        inline double reduce_add(batch<double, A> const& self, requires_arch<sse3>) noexcept
        {
            __m128d tmp0 = _mm_hadd_pd(self, self);
            return _mm_cvtsd_f64(tmp0);
        }

    }

}

#endif
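The reduce_add kernels above rely on pairwise horizontal adds: each _mm_hadd_ps halves the number of distinct partial sums, so two rounds collapse four lanes into one. A worked trace and usage sketch, assuming an SSE3 build (values are illustrative):

#include "xsimd/xsimd.hpp"

int main()
{
    // self = {a, b, c, d}
    // tmp0 = hadd(self, self) -> {a+b, c+d, a+b, c+d}
    // tmp1 = hadd(tmp0, tmp0) -> {a+b+c+d, a+b+c+d, ...}
    xsimd::batch<float, xsimd::sse3> v(1.f, 2.f, 3.f, 4.f);
    return xsimd::reduce_add(v) == 10.f ? 0 : 1;
}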
@@ -0,0 +1,350 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSE4_1_HPP
#define XSIMD_SSE4_1_HPP

#include <type_traits>

#include "../types/xsimd_sse4_1_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;
        // any
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline bool any(batch<T, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return !_mm_testz_si128(self, self);
        }
        // ceil
        template <class A>
        inline batch<float, A> ceil(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_ceil_ps(self);
        }
        template <class A>
        inline batch<double, A> ceil(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_ceil_pd(self);
        }

        // fast_cast
        namespace detail
        {
            template <class A>
            inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
            {
                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
                __m128i xH = _mm_srai_epi32(x, 16);
                xH = _mm_blend_epi16(xH, _mm_setzero_si128(), 0x33);
                xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67
                __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0x88); // 2^52
                __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
                return _mm_add_pd(f, _mm_castsi128_pd(xL));
            }

            template <class A>
            inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
            {
                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
                __m128i xH = _mm_srli_epi64(x, 32);
                xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
                __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
                __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
                return _mm_add_pd(f, _mm_castsi128_pd(xL));
            }

            template <class A>
            inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse4_1>) noexcept
            {
                return _mm_castps_si128(
                    _mm_blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(self)),
                                  _mm_castsi128_ps(_mm_xor_si128(
                                      _mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
                                      _mm_set1_epi32(1u << 31))),
                                  _mm_cmpge_ps(self, _mm_set1_ps(1u << 31))));
            }
        }

        // eq
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm_cmpeq_epi64(self, other);
            }
            else
            {
                return eq(self, other, ssse3 {});
            }
        }

        // floor
        template <class A>
        inline batch<float, A> floor(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_floor_ps(self);
        }
        template <class A>
        inline batch<double, A> floor(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_floor_pd(self);
        }

        // insert
        template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse4_1>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_insert_epi8(self, val, I);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_insert_epi32(self, val, I);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
#if (!defined(_MSC_VER) && __x86_64__) || (_MSC_VER > 1900 && defined(_M_X64))
                return _mm_insert_epi64(self, val, I);
#else
                uint32_t lo, hi;
                memcpy(&lo, (reinterpret_cast<uint32_t*>(&val)), sizeof(lo));
                memcpy(&hi, (reinterpret_cast<uint32_t*>(&val)) + 1, sizeof(hi));
                return _mm_insert_epi32(_mm_insert_epi32(self, lo, 2 * I), hi, 2 * I + 1);
#endif
            }
            else
            {
                return insert(self, val, pos, ssse3 {});
            }
        }

        // max
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_max_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_max_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_max_epi32(self, other);
                }
                else
                {
                    return max(self, other, ssse3 {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_max_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_max_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_max_epu32(self, other);
                }
                else
                {
                    return max(self, other, ssse3 {});
                }
            }
        }

        // min
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_min_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_min_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_min_epi32(self, other);
                }
                else
                {
                    return min(self, other, ssse3 {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_min_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_min_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_min_epu32(self, other);
                }
                else
                {
                    return min(self, other, ssse3 {});
                }
            }
        }

        // mul
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_or_si128(
                    _mm_and_si128(_mm_mullo_epi16(self, other), _mm_srli_epi16(_mm_cmpeq_epi8(self, self), 8)),
                    _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_epi16(self, 8), _mm_srli_epi16(other, 8)), 8));
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_mullo_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_mullo_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm_add_epi64(
                    _mm_mul_epu32(self, other),
                    _mm_slli_epi64(
                        _mm_add_epi64(
                            _mm_mul_epu32(other, _mm_shuffle_epi32(self, _MM_SHUFFLE(2, 3, 0, 1))),
                            _mm_mul_epu32(self, _mm_shuffle_epi32(other, _MM_SHUFFLE(2, 3, 0, 1)))),
                        32));
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }

        // nearbyint
        template <class A>
        inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_ps(self, _MM_FROUND_TO_NEAREST_INT);
        }
        template <class A>
        inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_pd(self, _MM_FROUND_TO_NEAREST_INT);
        }

        // select
        namespace detail
        {
            template <class T>
            inline constexpr T interleave(T const& cond) noexcept
            {
                return (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 49) & 0x5555) | (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 48) & 0xAAAA);
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            return _mm_blendv_epi8(false_br, true_br, cond);
        }
        template <class A>
        inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            return _mm_blendv_ps(false_br, true_br, cond);
        }
        template <class A>
        inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            return _mm_blendv_pd(false_br, true_br, cond);
        }

        template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_blend_epi16(false_br, true_br, mask);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                constexpr int imask = detail::interleave(mask);
                return _mm_blend_epi16(false_br, true_br, imask);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                constexpr int imask = detail::interleave(mask);
                constexpr int imask2 = detail::interleave(imask);
                return _mm_blend_epi16(false_br, true_br, imask2);
            }
            else
            {
                return select(batch_bool_constant<batch<T, A>, Values...>(), true_br, false_br, ssse3 {});
            }
        }
        template <class A, bool... Values>
        inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            constexpr int mask = batch_bool_constant<batch<float, A>, Values...>::mask();
            return _mm_blend_ps(false_br, true_br, mask);
        }
        template <class A, bool... Values>
        inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            constexpr int mask = batch_bool_constant<batch<double, A>, Values...>::mask();
            return _mm_blend_pd(false_br, true_br, mask);
        }

        // trunc
        template <class A>
        inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_ps(self, _MM_FROUND_TO_ZERO);
        }
        template <class A>
        inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_pd(self, _MM_FROUND_TO_ZERO);
        }

    }

}

#endif
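The int64->double fast_cast above is worth unpacking: each 64-bit lane is split into 32-bit halves, each half is smuggled into a double's mantissa by blending in a large power-of-two exponent (3*2^67 for the high half, 2^52 for the low half), and subtracting the bias back out recovers the half's value as a double. A hedged scalar sketch of why splitting into exactly representable halves works (illustrative, not part of xsimd):

#include <cstdint>

// Scalar analogue of the split used by the SSE kernel: both halves convert
// to double exactly, so one final addition reproduces (double)v.
double int64_to_double_ref(int64_t v)
{
    int64_t hi = v >> 32;                   // high half; arithmetic shift keeps the sign
    uint32_t lo = static_cast<uint32_t>(v); // low 32 bits, always non-negative
    return static_cast<double>(hi) * 4294967296.0 // hi * 2^32 is exact in a double
        + static_cast<double>(lo);                // single rounding, as in the kernel
}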
@@ -0,0 +1,44 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSE4_2_HPP
#define XSIMD_SSE4_2_HPP

#include <limits>

#include "../types/xsimd_sse4_2_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // lt
        template <class A>
        inline batch_bool<int64_t, A> lt(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<sse4_2>) noexcept
        {
            return _mm_cmpgt_epi64(other, self);
        }
        template <class A>
        inline batch_bool<uint64_t, A> lt(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<sse4_2>) noexcept
        {
            auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
            auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
            return _mm_cmpgt_epi64(xother, xself);
        }

    }

}

#endif
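SSE4.2 only provides a signed 64-bit compare, so the unsigned lt above first XORs both operands with 2^63: flipping the sign bit maps the unsigned order onto the signed order. A scalar sketch of the identity (illustrative):

#include <cstdint>

// For any a, b: a < b (unsigned)  ==  (a ^ bias) < (b ^ bias) (signed),
// where bias = 0x8000000000000000 flips the sign bit.
bool ult_ref(uint64_t a, uint64_t b)
{
    const uint64_t bias = 1ull << 63;
    return static_cast<int64_t>(a ^ bias) < static_cast<int64_t>(b ^ bias);
}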
@@ -0,0 +1,142 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSSE3_HPP
#define XSIMD_SSSE3_HPP

#include <cstddef>
#include <type_traits>

#include "../types/xsimd_ssse3_register.hpp"
#include "../types/xsimd_utils.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // abs
        template <class A, class T, typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
        inline batch<T, A> abs(batch<T, A> const& self, requires_arch<ssse3>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_abs_epi8(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_abs_epi16(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_abs_epi32(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm_abs_epi64(self);
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }

        // extract_pair
        namespace detail
        {

            template <class T, class A>
            inline batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& other, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
            {
                return other;
            }

            template <class T, class A, std::size_t I, std::size_t... Is>
            inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, ::xsimd::detail::index_sequence<I, Is...>) noexcept
            {
                if (i == I)
                {
                    return _mm_alignr_epi8(self, other, sizeof(T) * I);
                }
                else
                    return extract_pair(self, other, i, ::xsimd::detail::index_sequence<Is...>());
            }
        }

        template <class A, class T, class _ = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<ssse3>) noexcept
        {
            constexpr std::size_t size = batch<T, A>::size;
            assert(0 <= i && i < size && "index in bounds");
            return detail::extract_pair(self, other, i, ::xsimd::detail::make_index_sequence<size>());
        }

        // reduce_add
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline T reduce_add(batch<T, A> const& self, requires_arch<ssse3>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                __m128i tmp1 = _mm_hadd_epi16(self, self);
                __m128i tmp2 = _mm_hadd_epi16(tmp1, tmp1);
                __m128i tmp3 = _mm_hadd_epi16(tmp2, tmp2);
                return _mm_cvtsi128_si32(tmp3) & 0xFFFF;
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                __m128i tmp1 = _mm_hadd_epi32(self, self);
                __m128i tmp2 = _mm_hadd_epi32(tmp1, tmp1);
                return _mm_cvtsi128_si32(tmp2);
            }
            else
            {
                return reduce_add(self, sse3 {});
            }
        }

        // swizzle
        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
        inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<ssse3>) noexcept
        {
            constexpr batch_constant<batch<uint8_t, A>, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1,
                                     2 * V4, 2 * V4 + 1, 2 * V5, 2 * V5 + 1, 2 * V6, 2 * V6 + 1, 2 * V7, 2 * V7 + 1>
                mask8;
            return _mm_shuffle_epi8(self, (batch<uint8_t, A>)mask8);
        }

        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
        inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept
        {
            return bitwise_cast<batch<int16_t, A>>(swizzle(bitwise_cast<batch<uint16_t, A>>(self), mask, ssse3 {}));
        }

        template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
                  uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
        inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
        {
            return _mm_shuffle_epi8(self, (batch<uint8_t, A>)mask);
        }

        template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
                  uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
        inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
        {
            return bitwise_cast<batch<int8_t, A>>(swizzle(bitwise_cast<batch<uint8_t, A>>(self), mask, ssse3 {}));
        }

    }

}

#endif
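The 16-bit swizzle above lowers to _mm_shuffle_epi8, which only shuffles bytes, so each 16-bit lane index V is expanded at compile time into the byte pair (2V, 2V+1). A usage sketch, assuming an SSSE3 build (lane values are illustrative):

#include "xsimd/xsimd.hpp"

int main()
{
    using u16 = xsimd::batch<uint16_t, xsimd::ssse3>;
    // Lane order {7,...,0} expands to the byte mask {14,15, 12,13, ..., 0,1}.
    auto rev = xsimd::batch_constant<u16, 7, 6, 5, 4, 3, 2, 1, 0> {};
    u16 v(0, 1, 2, 3, 4, 5, 6, 7);
    u16 r = xsimd::swizzle(v, rev); // {7, 6, 5, 4, 3, 2, 1, 0}
    return r.get(0) == 7 ? 0 : 1;
}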
(File diff not shown because of its large size)
@@ -0,0 +1,249 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_ARCH_HPP
#define XSIMD_ARCH_HPP

#include <initializer_list>
#include <type_traits>
#include <utility>

#include "../types/xsimd_all_registers.hpp"
#include "./xsimd_config.hpp"
#include "./xsimd_cpuid.hpp"

namespace xsimd
{

    namespace detail
    {
        // Checks whether T appears in Tys.
        template <class T, class... Tys>
        struct contains;

        template <class T>
        struct contains<T> : std::false_type
        {
        };

        template <class T, class Ty, class... Tys>
        struct contains<T, Ty, Tys...>
            : std::conditional<std::is_same<Ty, T>::value, std::true_type,
                               contains<T, Tys...>>::type
        {
        };

        template <class... Archs>
        struct is_sorted;

        template <>
        struct is_sorted<> : std::true_type
        {
        };

        template <class Arch>
        struct is_sorted<Arch> : std::true_type
        {
        };

        template <class A0, class A1, class... Archs>
        struct is_sorted<A0, A1, Archs...>
            : std::conditional<(A0::version() >= A1::version()), is_sorted<Archs...>,
                               std::false_type>::type
        {
        };

        template <typename T>
        inline constexpr T max_of(T value) noexcept
        {
            return value;
        }

        template <typename T, typename... Ts>
        inline constexpr T max_of(T head0, T head1, Ts... tail) noexcept
        {
            return max_of((head0 > head1 ? head0 : head1), tail...);
        }

    } // namespace detail

    // An arch_list is a list of architectures, sorted by version number.
    template <class... Archs>
    struct arch_list
    {
#ifndef NDEBUG
        static_assert(detail::is_sorted<Archs...>::value,
                      "architecture list must be sorted by version");
#endif

        template <class Arch>
        using add = arch_list<Archs..., Arch>;

        template <class... OtherArchs>
        using extend = arch_list<Archs..., OtherArchs...>;

        template <class Arch>
        static constexpr bool contains() noexcept
        {
            return detail::contains<Arch, Archs...>::value;
        }

        template <class F>
        static void for_each(F&& f) noexcept
        {
            (void)std::initializer_list<bool> { (f(Archs {}), true)... };
        }

        static constexpr std::size_t alignment() noexcept
        {
            // all alignments are a power of two
            return detail::max_of(Archs::alignment()..., static_cast<size_t>(0));
        }
    };

    struct unavailable
    {
        static constexpr bool supported() noexcept { return false; }
        static constexpr bool available() noexcept { return false; }
        static constexpr unsigned version() noexcept { return 0; }
        static constexpr std::size_t alignment() noexcept { return 0; }
        static constexpr bool requires_alignment() noexcept { return false; }
        static constexpr char const* name() noexcept { return "<none>"; }
    };

    namespace detail
    {
        // Pick the best architecture in arch_list L, which is the last
        // because architectures are sorted by version.
        template <class L>
        struct best;

        template <>
        struct best<arch_list<>>
        {
            using type = unavailable;
        };

        template <class Arch, class... Archs>
        struct best<arch_list<Arch, Archs...>>
        {
            using type = Arch;
        };

        // Filter archlists Archs, picking only supported archs and adding
        // them to L.
        template <class L, class... Archs>
        struct supported_helper;

        template <class L>
        struct supported_helper<L, arch_list<>>
        {
            using type = L;
        };

        template <class L, class Arch, class... Archs>
        struct supported_helper<L, arch_list<Arch, Archs...>>
            : supported_helper<
                  typename std::conditional<Arch::supported(),
                                            typename L::template add<Arch>, L>::type,
                  arch_list<Archs...>>
        {
        };

        template <class... Archs>
        struct supported : supported_helper<arch_list<>, Archs...>
        {
        };

        // Joins all arch_list Archs in a single arch_list.
        template <class... Archs>
        struct join;

        template <class Arch>
        struct join<Arch>
        {
            using type = Arch;
        };

        template <class Arch, class... Archs, class... Args>
        struct join<Arch, arch_list<Archs...>, Args...>
            : join<typename Arch::template extend<Archs...>, Args...>
        {
        };
    } // namespace detail

    struct unsupported
    {
    };
    using all_x86_architectures = arch_list<avx512bw, avx512dq, avx512cd, avx512f, fma3<avx2>, avx2, fma3<avx>, avx, fma4, fma3<sse4_2>, sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>;
    using all_sve_architectures = arch_list<detail::sve<512>, detail::sve<256>, detail::sve<128>>;
    using all_arm_architectures = typename detail::join<all_sve_architectures, arch_list<neon64, neon>>::type;
    using all_architectures = typename detail::join<all_arm_architectures, all_x86_architectures>::type;

    using supported_architectures = typename detail::supported<all_architectures>::type;

    using x86_arch = typename detail::best<typename detail::supported<all_x86_architectures>::type>::type;
    using arm_arch = typename detail::best<typename detail::supported<all_arm_architectures>::type>::type;
    // using default_arch = typename detail::best<typename detail::supported<arch_list</*arm_arch,*/ x86_arch>>::type>::type;
    using default_arch = typename std::conditional<std::is_same<x86_arch, unavailable>::value,
                                                   arm_arch,
                                                   x86_arch>::type;

    namespace detail
    {
        template <class F, class ArchList>
        class dispatcher
        {

            const unsigned best_arch;
            F functor;

            template <class Arch, class... Tys>
            auto walk_archs(arch_list<Arch>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
            {
                assert(Arch::available() && "At least one arch must be supported during dispatch");
                return functor(Arch {}, std::forward<Tys>(args)...);
            }

            template <class Arch, class ArchNext, class... Archs, class... Tys>
            auto walk_archs(arch_list<Arch, ArchNext, Archs...>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
            {
                if (Arch::version() <= best_arch)
                    return functor(Arch {}, std::forward<Tys>(args)...);
                else
                    return walk_archs(arch_list<ArchNext, Archs...> {}, std::forward<Tys>(args)...);
            }

        public:
            dispatcher(F f) noexcept
                : best_arch(available_architectures().best)
                , functor(f)
            {
            }

            template <class... Tys>
            auto operator()(Tys&&... args) noexcept -> decltype(functor(default_arch {}, std::forward<Tys>(args)...))
            {
                return walk_archs(ArchList {}, std::forward<Tys>(args)...);
            }
        };
    }

    // Generic function dispatch, à la ifunc
    template <class ArchList = supported_architectures, class F>
    inline detail::dispatcher<F, ArchList> dispatch(F&& f) noexcept
    {
        return { std::forward<F>(f) };
    }

} // namespace xsimd

#endif
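dispatch() wraps a functor whose operator() takes the architecture tag as its first argument, then at runtime walks the sorted arch_list until it reaches one whose version the current CPU supports. A minimal usage sketch (the functor name and kernel are illustrative, and n is assumed to be a multiple of the batch size):

#include "xsimd/xsimd.hpp"
#include <cstddef>

// Functor dispatched at runtime: the first parameter selects the kernel.
struct sum_squares
{
    template <class Arch>
    float operator()(Arch, float const* data, std::size_t n)
    {
        using b = xsimd::batch<float, Arch>;
        b acc(0.f);
        for (std::size_t i = 0; i < n; i += b::size)
            acc += b::load_unaligned(data + i) * b::load_unaligned(data + i);
        return xsimd::reduce_add(acc);
    }
};

float sum_squares_best(float const* data, std::size_t n)
{
    // Picks the best available architecture once, then calls the kernel.
    return xsimd::dispatch(sum_squares {})(data, n);
}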
@@ -0,0 +1,341 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_CONFIG_HPP
#define XSIMD_CONFIG_HPP

#define XSIMD_VERSION_MAJOR 10
#define XSIMD_VERSION_MINOR 0
#define XSIMD_VERSION_PATCH 0

/**
 * high level free functions
 *
 * @defgroup xsimd_config_macro Instruction Set Detection
 */

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE2 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE2__
#define XSIMD_WITH_SSE2 1
#else
#define XSIMD_WITH_SSE2 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE3 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE3__
#define XSIMD_WITH_SSE3 1
#else
#define XSIMD_WITH_SSE3 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSSE3 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSSE3__
#define XSIMD_WITH_SSSE3 1
#else
#define XSIMD_WITH_SSSE3 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE4.1 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE4_1__
#define XSIMD_WITH_SSE4_1 1
#else
#define XSIMD_WITH_SSE4_1 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE4.2 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE4_2__
#define XSIMD_WITH_SSE4_2 1
#else
#define XSIMD_WITH_SSE4_2 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX__
#define XSIMD_WITH_AVX 1
#else
#define XSIMD_WITH_AVX 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX2 is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX2__
#define XSIMD_WITH_AVX2 1
#else
#define XSIMD_WITH_AVX2 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if FMA3 for SSE is available at compile-time, to 0 otherwise.
 */
#ifdef __FMA__

#if defined(__SSE__)
#ifndef XSIMD_WITH_FMA3_SSE // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_SSE 1
#endif
#else

#if XSIMD_WITH_FMA3_SSE
#error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_SSE 0
#endif

#else

#if XSIMD_WITH_FMA3_SSE
#error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_SSE 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if FMA3 for AVX is available at compile-time, to 0 otherwise.
 */
#ifdef __FMA__

#if defined(__AVX__)
#ifndef XSIMD_WITH_FMA3_AVX // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_AVX 1
#endif
#else

#if XSIMD_WITH_FMA3_AVX
#error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_AVX 0
#endif

#if defined(__AVX2__)
#ifndef XSIMD_WITH_FMA3_AVX2 // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_AVX2 1
#endif
#else

#if XSIMD_WITH_FMA3_AVX2
#error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_AVX2 0
#endif

#else

#if XSIMD_WITH_FMA3_AVX
#error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags"
#endif

#if XSIMD_WITH_FMA3_AVX2
#error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_AVX 0
#define XSIMD_WITH_FMA3_AVX2 0

#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if FMA4 is available at compile-time, to 0 otherwise.
 */
#ifdef __FMA4__
#define XSIMD_WITH_FMA4 1
#else
#define XSIMD_WITH_FMA4 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512F is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512F__
// AVX512 instructions are supported starting with gcc 6
// see https://www.gnu.org/software/gcc/gcc-6/changes.html
// check clang first, newer clang always defines __GNUC__ = 4
#if defined(__clang__) && __clang_major__ >= 6
#define XSIMD_WITH_AVX512F 1
#elif defined(__GNUC__) && __GNUC__ < 6
#define XSIMD_WITH_AVX512F 0
#else
#define XSIMD_WITH_AVX512F 1
#if __GNUC__ == 6
#define XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY 1
#endif
#endif
#else
#define XSIMD_WITH_AVX512F 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512CD is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512CD__
// Avoids repeating the GCC workaround over and over
#define XSIMD_WITH_AVX512CD XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512CD 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512DQ is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512DQ__
#define XSIMD_WITH_AVX512DQ XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512DQ 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512BW is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512BW__
#define XSIMD_WITH_AVX512BW XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512BW 0
#endif

#ifdef __ARM_NEON

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if NEON is available at compile-time, to 0 otherwise.
 */
#if __ARM_ARCH >= 7
#define XSIMD_WITH_NEON 1
#else
#define XSIMD_WITH_NEON 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if NEON64 is available at compile-time, to 0 otherwise.
 */
#ifdef __aarch64__
#define XSIMD_WITH_NEON64 1
#else
#define XSIMD_WITH_NEON64 0
#endif
#else
#define XSIMD_WITH_NEON 0
#define XSIMD_WITH_NEON64 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SVE is available and bit width is pre-set at compile-time, to 0 otherwise.
 */
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
#define XSIMD_WITH_SVE 1
#define XSIMD_SVE_BITS __ARM_FEATURE_SVE_BITS
#else
#define XSIMD_WITH_SVE 0
#define XSIMD_SVE_BITS 0
#endif

// Workaround for MSVC compiler
#ifdef _MSC_VER

#if XSIMD_WITH_AVX512
#undef XSIMD_WITH_AVX2
#define XSIMD_WITH_AVX2 1
#endif

#if XSIMD_WITH_AVX2
#undef XSIMD_WITH_AVX
#define XSIMD_WITH_AVX 1
#undef XSIMD_WITH_FMA3_AVX
#define XSIMD_WITH_FMA3_AVX 1
#undef XSIMD_WITH_FMA3_AVX2
#define XSIMD_WITH_FMA3_AVX2 1
#endif

#if XSIMD_WITH_AVX
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif

#if !defined(__clang__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif

#if XSIMD_WITH_SSE4_2
#undef XSIMD_WITH_SSE4_1
#define XSIMD_WITH_SSE4_1 1
#endif

#if XSIMD_WITH_SSE4_1
#undef XSIMD_WITH_SSSE3
#define XSIMD_WITH_SSSE3 1
#endif

#if XSIMD_WITH_SSSE3
#undef XSIMD_WITH_SSE3
#define XSIMD_WITH_SSE3 1
#endif

#if XSIMD_WITH_SSE3 || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#undef XSIMD_WITH_SSE2
#define XSIMD_WITH_SSE2 1
#endif

#endif

#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE
#define XSIMD_NO_SUPPORTED_ARCHITECTURE
#endif

#endif
@@ -0,0 +1,341 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_CONFIG_HPP
#define XSIMD_CONFIG_HPP

#define XSIMD_VERSION_MAJOR 10
#define XSIMD_VERSION_MINOR 0
#define XSIMD_VERSION_PATCH 0

/**
 * high level free functions
 *
 * @defgroup xsimd_config_macro Instruction Set Detection
 */

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE2 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE2__
#define XSIMD_WITH_SSE2 1
#else
#define XSIMD_WITH_SSE2 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE3 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE3__
#define XSIMD_WITH_SSE3 1
#else
#define XSIMD_WITH_SSE3 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSSE3 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSSE3__
#define XSIMD_WITH_SSSE3 1
#else
#define XSIMD_WITH_SSSE3 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE4.1 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE4_1__
#define XSIMD_WITH_SSE4_1 1
#else
#define XSIMD_WITH_SSE4_1 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE4.2 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE4_2__
#define XSIMD_WITH_SSE4_2 1
#else
#define XSIMD_WITH_SSE4_2 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX__
#define XSIMD_WITH_AVX 1
#else
#define XSIMD_WITH_AVX 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX2 is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX2__
#define XSIMD_WITH_AVX2 1
#else
#define XSIMD_WITH_AVX2 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if FMA3 for SSE is available at compile-time, to 0 otherwise.
 */
#ifdef __FMA__

#if defined(__SSE__) && !defined(__AVX__)
#ifndef XSIMD_WITH_FMA3_SSE // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_SSE 1
#endif
#else

#if XSIMD_WITH_FMA3_SSE
#error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_SSE 0
#endif

#else

#if XSIMD_WITH_FMA3_SSE
#error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_SSE 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if FMA3 for AVX is available at compile-time, to 0 otherwise.
 */
#ifdef __FMA__

#if defined(__AVX__)
#ifndef XSIMD_WITH_FMA3_AVX // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_AVX 1
#endif
#else

#if XSIMD_WITH_FMA3_AVX
#error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_AVX 0
#endif

#if defined(__AVX2__)
#ifndef XSIMD_WITH_FMA3_AVX2 // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_AVX2 1
#endif
#else

#if XSIMD_WITH_FMA3_AVX2
#error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_AVX2 0
#endif

#else

#if XSIMD_WITH_FMA3_AVX
#error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags"
#endif

#if XSIMD_WITH_FMA3_AVX2
#error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_AVX 0
#define XSIMD_WITH_FMA3_AVX2 0

#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if FMA4 is available at compile-time, to 0 otherwise.
 */
#ifdef __FMA4__
#define XSIMD_WITH_FMA4 1
#else
#define XSIMD_WITH_FMA4 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512F is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512F__
// AVX512 instructions are supported starting with gcc 6
// see https://www.gnu.org/software/gcc/gcc-6/changes.html
// check clang first, newer clang always defines __GNUC__ = 4
#if defined(__clang__) && __clang_major__ >= 6
#define XSIMD_WITH_AVX512F 1
#elif defined(__GNUC__) && __GNUC__ < 6
#define XSIMD_WITH_AVX512F 0
#else
#define XSIMD_WITH_AVX512F 1
#if __GNUC__ == 6
#define XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY 1
#endif
#endif
#else
#define XSIMD_WITH_AVX512F 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512CD is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512CD__
// Avoids repeating the GCC workaround over and over
#define XSIMD_WITH_AVX512CD XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512CD 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512DQ is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512DQ__
#define XSIMD_WITH_AVX512DQ XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512DQ 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512BW is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512BW__
#define XSIMD_WITH_AVX512BW XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512BW 0
#endif

#ifdef __ARM_NEON

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if NEON is available at compile-time, to 0 otherwise.
 */
#if __ARM_ARCH >= 7
#define XSIMD_WITH_NEON 1
#else
#define XSIMD_WITH_NEON 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if NEON64 is available at compile-time, to 0 otherwise.
 */
#ifdef __aarch64__
#define XSIMD_WITH_NEON64 1
#else
#define XSIMD_WITH_NEON64 0
#endif
#else
#define XSIMD_WITH_NEON 0
#define XSIMD_WITH_NEON64 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SVE is available and bit width is pre-set at compile-time, to 0 otherwise.
 */
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
#define XSIMD_WITH_SVE 1
#define XSIMD_SVE_BITS __ARM_FEATURE_SVE_BITS
#else
#define XSIMD_WITH_SVE 0
#define XSIMD_SVE_BITS 0
#endif

// Workaround for MSVC compiler
#ifdef _MSC_VER

#if XSIMD_WITH_AVX512
#undef XSIMD_WITH_AVX2
#define XSIMD_WITH_AVX2 1
#endif

#if XSIMD_WITH_AVX2
#undef XSIMD_WITH_AVX
#define XSIMD_WITH_AVX 1
#undef XSIMD_WITH_FMA3_AVX
#define XSIMD_WITH_FMA3_AVX 1
#undef XSIMD_WITH_FMA3_AVX2
#define XSIMD_WITH_FMA3_AVX2 1
#endif

#if XSIMD_WITH_AVX
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif

#if !defined(__clang__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif

#if XSIMD_WITH_SSE4_2
#undef XSIMD_WITH_SSE4_1
#define XSIMD_WITH_SSE4_1 1
#endif

#if XSIMD_WITH_SSE4_1
#undef XSIMD_WITH_SSSE3
#define XSIMD_WITH_SSSE3 1
#endif

#if XSIMD_WITH_SSSE3
#undef XSIMD_WITH_SSE3
#define XSIMD_WITH_SSE3 1
#endif

#if XSIMD_WITH_SSE3 || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#undef XSIMD_WITH_SSE2
#define XSIMD_WITH_SSE2 1
#endif

#endif

#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE
#define XSIMD_NO_SUPPORTED_ARCHITECTURE
#endif

#endif
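These XSIMD_WITH_* macros are plain 0/1 constants, so both the preprocessor and ordinary if statements can branch on them. A usage sketch (illustrative, not part of this import):

#include "xsimd/xsimd.hpp"
#include <cstdio>

int main()
{
#if XSIMD_WITH_AVX2
    std::puts("compiled with AVX2 kernels");
#elif XSIMD_WITH_SSE2
    std::puts("compiled with SSE2 kernels");
#else
    std::puts("no x86 SIMD kernels compiled in");
#endif
    return 0;
}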
@ -0,0 +1,181 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_CPUID_HPP
#define XSIMD_CPUID_HPP

#include <algorithm>
#include <cstring>

#if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM))
#include <asm/hwcap.h>
#include <sys/auxv.h>
#endif

#if defined(_MSC_VER)
// Contains the definition of __cpuidex
#include <intrin.h>
#endif

#include "../types/xsimd_all_registers.hpp"

namespace xsimd
{
    namespace detail
    {
        struct supported_arch
        {
            unsigned sse2 : 1;
            unsigned sse3 : 1;
            unsigned ssse3 : 1;
            unsigned sse4_1 : 1;
            unsigned sse4_2 : 1;
            unsigned sse4a : 1;
            unsigned fma3_sse : 1;
            unsigned fma4 : 1;
            unsigned xop : 1;
            unsigned avx : 1;
            unsigned fma3_avx : 1;
            unsigned avx2 : 1;
            unsigned fma3_avx2 : 1;
            unsigned avx512f : 1;
            unsigned avx512cd : 1;
            unsigned avx512dq : 1;
            unsigned avx512bw : 1;
            unsigned neon : 1;
            unsigned neon64 : 1;

            // version number of the best arch available
            unsigned best;

            supported_arch() noexcept
            {
                memset(this, 0, sizeof(supported_arch));

#if defined(__aarch64__) || defined(_M_ARM64)
                neon = 1;
                neon64 = 1;
                best = neon64::version();
#elif defined(__ARM_NEON) || defined(_M_ARM)

#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
                neon = bool(getauxval(AT_HWCAP) & HWCAP_NEON);
#else
                // that's very conservative :-/
                neon = 0;
#endif
                neon64 = 0;
                best = neon::version() * neon;

#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)
                auto get_cpuid = [](int reg[4], int func_id) noexcept
                {

#if defined(_MSC_VER)
                    __cpuidex(reg, func_id, 0);

#elif defined(__INTEL_COMPILER)
                    __cpuid(reg, func_id);

#elif defined(__GNUC__) || defined(__clang__)

#if defined(__i386__) && defined(__PIC__)
                    // %ebx may be the PIC register
                    __asm__("xchg{l}\t{%%}ebx, %1\n\t"
                            "cpuid\n\t"
                            "xchg{l}\t{%%}ebx, %1\n\t"
                            : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]),
                              "=d"(reg[3])
                            : "a"(func_id), "c"(0));

#else
                    __asm__("cpuid\n\t"
                            : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]),
                              "=d"(reg[3])
                            : "a"(func_id), "c"(0));
#endif

#else
#error "Unsupported configuration"
#endif
                };

                int regs[4];

                get_cpuid(regs, 0x1);

                sse2 = regs[3] >> 26 & 1;
                best = std::max(best, sse2::version() * sse2);

                sse3 = regs[2] >> 0 & 1;
                best = std::max(best, sse3::version() * sse3);

                ssse3 = regs[2] >> 9 & 1;
                best = std::max(best, ssse3::version() * ssse3);

                sse4_1 = regs[2] >> 19 & 1;
                best = std::max(best, sse4_1::version() * sse4_1);

                sse4_2 = regs[2] >> 20 & 1;
                best = std::max(best, sse4_2::version() * sse4_2);

                fma3_sse = regs[2] >> 12 & 1;
                if (sse4_2)
                    best = std::max(best, fma3<xsimd::sse4_2>::version() * fma3_sse);

                get_cpuid(regs, 0x80000001);
                fma4 = regs[2] >> 16 & 1;
                best = std::max(best, fma4::version() * fma4);

                // sse4a = regs[2] >> 6 & 1;
                // best = std::max(best, XSIMD_X86_AMD_SSE4A_VERSION * sse4a);

                // xop = regs[2] >> 11 & 1;
                // best = std::max(best, XSIMD_X86_AMD_XOP_VERSION * xop);

                avx = regs[2] >> 28 & 1;
                best = std::max(best, avx::version() * avx);

                fma3_avx = avx && fma3_sse;
                best = std::max(best, fma3<xsimd::avx>::version() * fma3_avx);

                get_cpuid(regs, 0x7);
                avx2 = regs[1] >> 5 & 1;
                best = std::max(best, avx2::version() * avx2);

                fma3_avx2 = avx2 && fma3_sse;
                best = std::max(best, fma3<xsimd::avx2>::version() * fma3_avx2);

                avx512f = regs[1] >> 16 & 1;
                best = std::max(best, avx512f::version() * avx512f);

                avx512cd = regs[1] >> 28 & 1;
                best = std::max(best, avx512cd::version() * avx512cd * avx512f);

                avx512dq = regs[1] >> 17 & 1;
                best = std::max(best, avx512dq::version() * avx512dq * avx512cd * avx512f);

                avx512bw = regs[1] >> 30 & 1;
                best = std::max(best, avx512bw::version() * avx512bw * avx512dq * avx512cd * avx512f);

#endif
            }
        };
    }

    inline detail::supported_arch available_architectures() noexcept
    {
        static detail::supported_arch supported;
        return supported;
    }
}

#endif
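Usage sketch (a hypothetical caller, not part of the imported sources): the detection result is cached in a function-local static, so repeated queries only cost a copy of the bitfield struct.

#include <cstdio>

int main()
{
    auto caps = xsimd::available_architectures();
    std::printf("avx2=%u avx512f=%u best=%u\n",
                unsigned(caps.avx2), unsigned(caps.avx512f), caps.best);
    return 0;
}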
@ -0,0 +1,719 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#include <cmath>
#include <cstdint>
#include <cstring>

namespace xsimd
{
    namespace detail
    {

        /* origin: boost/simd/arch/common/scalar/function/rem_pio2.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */
#if defined(_MSC_VER)
#define ONCE0 \
    __pragma(warning(push)) \
    __pragma(warning(disable : 4127)) while (0) \
    __pragma(warning(pop)) /**/
#else
#define ONCE0 while (0)
#endif

        /*
         * ====================================================
         * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
         *
         * Developed at SunPro, a Sun Microsystems, Inc. business.
         * Permission to use, copy, modify, and distribute this
         * software is freely granted, provided that this notice
         * is preserved.
         * ====================================================
         */

#if defined(__GNUC__) && defined(__BYTE_ORDER__)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define XSIMD_LITTLE_ENDIAN
#endif
#elif defined(_WIN32)
// We can safely assume that Windows is always little endian
#define XSIMD_LITTLE_ENDIAN
#elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
#define XSIMD_LITTLE_ENDIAN
#endif

#ifdef XSIMD_LITTLE_ENDIAN
#define LOW_WORD_IDX 0
#define HIGH_WORD_IDX sizeof(std::uint32_t)
#else
#define LOW_WORD_IDX sizeof(std::uint32_t)
#define HIGH_WORD_IDX 0
#endif

#define GET_HIGH_WORD(i, d) \
    do \
    { \
        double f = (d); \
        std::memcpy(&(i), reinterpret_cast<char*>(&f) + HIGH_WORD_IDX, \
                    sizeof(std::uint32_t)); \
    } \
    ONCE0 \
    /**/

#define GET_LOW_WORD(i, d) \
    do \
    { \
        double f = (d); \
        std::memcpy(&(i), reinterpret_cast<char*>(&f) + LOW_WORD_IDX, \
                    sizeof(std::uint32_t)); \
    } \
    ONCE0 \
    /**/

#define SET_HIGH_WORD(d, v) \
    do \
    { \
        double f = (d); \
        std::uint32_t value = (v); \
        std::memcpy(reinterpret_cast<char*>(&f) + HIGH_WORD_IDX, \
                    &value, sizeof(std::uint32_t)); \
        (d) = f; \
    } \
    ONCE0 \
    /**/

#define SET_LOW_WORD(d, v) \
    do \
    { \
        double f = (d); \
        std::uint32_t value = (v); \
        std::memcpy(reinterpret_cast<char*>(&f) + LOW_WORD_IDX, \
                    &value, sizeof(std::uint32_t)); \
        (d) = f; \
    } \
    ONCE0 \
    /**/

        /*
         * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
         * double x[],y[]; int e0,nx,prec; int ipio2[];
         *
         * __kernel_rem_pio2 returns the last three digits of N with
         *      y = x - N*pi/2
         * so that |y| < pi/2.
         *
         * The method is to compute the integer (mod 8) and fraction parts of
         * (2/pi)*x without doing the full multiplication. In general we
         * skip the parts of the product that are known to be a huge integer (
         * more accurately, = 0 mod 8 ). Thus the number of operations is
         * independent of the exponent of the input.
         *
         * (2/pi) is represented by an array of 24-bit integers in ipio2[].
         *
         * Input parameters:
         * x[]    The input value (must be positive) is broken into nx
         *        pieces of 24-bit integers in double precision format.
         *        x[i] will be the i-th 24 bit of x. The scaled exponent
         *        of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
         *        matches x up to 24 bits).
         *
         *        Example of breaking a double positive z into x[0]+x[1]+x[2]:
         *            e0 = ilogb(z)-23
         *            z  = scalbn(z,-e0)
         *            for i = 0,1,2
         *                x[i] = floor(z)
         *                z    = (z-x[i])*2**24
         *
         * y[]    output result in an array of double precision numbers.
         *        The dimension of y[] is:
         *            24-bit precision    1
         *            53-bit precision    2
         *            64-bit precision    2
         *            113-bit precision   3
         *        The actual value is the sum of them. Thus for 113-bit
         *        precision, one may have to do something like:
         *
         *        long double t,w,r_head, r_tail;
         *        t = (long double)y[2] + (long double)y[1];
         *        w = (long double)y[0];
         *        r_head = t+w;
         *        r_tail = w - (r_head - t);
         *
         * e0     The exponent of x[0]
         *
         * nx     dimension of x[]
         *
         * prec   an integer indicating the precision:
         *            0   24 bits (single)
         *            1   53 bits (double)
         *            2   64 bits (extended)
         *            3   113 bits (quad)
         *
         * ipio2[]
         *        integer array, contains the (24*i)-th to (24*i+23)-th
         *        bit of 2/pi after binary point. The corresponding
         *        floating value is
         *
         *            ipio2[i] * 2^(-24(i+1)).
         *
         * External function:
         *        double scalbn(), floor();
         *
         *
         * Here is the description of some local variables:
         *
         * jk     jk+1 is the initial number of terms of ipio2[] needed
         *        in the computation. The recommended value is 2,3,4,
         *        6 for single, double, extended, and quad.
         *
         * jz     local integer variable indicating the number of
         *        terms of ipio2[] used.
         *
         * jx     nx - 1
         *
         * jv     index for pointing to the suitable ipio2[] for the
         *        computation. In general, we want
         *            ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
         *        is an integer. Thus
         *            e0-3-24*jv >= 0 or (e0-3)/24 >= jv
         *        Hence jv = max(0,(e0-3)/24).
         *
         * jp     jp+1 is the number of terms in PIo2[] needed, jp = jk.
         *
         * q[]    double array with integral value, representing the
         *        24-bits chunk of the product of x and 2/pi.
         *
         * q0     the corresponding exponent of q[0]. Note that the
         *        exponent for q[i] would be q0-24*i.
         *
         * PIo2[] double precision array, obtained by cutting pi/2
         *        into 24 bits chunks.
         *
         * f[]    ipio2[] in floating point
         *
         * iq[]   integer array by breaking up q[] in 24-bits chunk.
         *
         * fq[]   final product of x*(2/pi) in fq[0],..,fq[jk]
         *
         * ih     integer. If >0 it indicates q[] is >= 0.5, hence
         *        it also indicates the *sign* of the result.
         *
         */

        inline int32_t __kernel_rem_pio2(double* x, double* y, int32_t e0, int32_t nx, int32_t prec, const int32_t* ipio2) noexcept
        {
            static const int32_t init_jk[] = { 2, 3, 4, 6 }; /* initial value for jk */

            static const double PIo2[] = {
                1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
                7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
                5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
                3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
                1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
                1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
                2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
                2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
            };

            static const double zero = 0.0, one = 1.0,
                                two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
                                twon24 = 5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */

            int32_t jz, jx, jv, jp, jk, carry, n, iq[20], i, j, k, m, q0, ih;
            double z, fw, f[20], fq[20], q[20];

            /* initialize jk */
            jk = init_jk[prec];
            jp = jk;

            /* determine jx,jv,q0, note that 3>q0 */
            jx = nx - 1;
            jv = (e0 - 3) / 24;
            if (jv < 0)
                jv = 0;
            q0 = e0 - 24 * (jv + 1);

            /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
            j = jv - jx;
            m = jx + jk;
            for (i = 0; i <= m; i++, j++)
                f[i] = (j < 0) ? zero : (double)ipio2[j];

            /* compute q[0],q[1],...q[jk] */
            for (i = 0; i <= jk; i++)
            {
                for (j = 0, fw = 0.0; j <= jx; j++)
                    fw += x[j] * f[jx + i - j];
                q[i] = fw;
            }

            jz = jk;

        recompute:
            /* distill q[] into iq[] in reverse order */
            for (i = 0, j = jz, z = q[jz]; j > 0; i++, j--)
            {
                fw = (double)((int32_t)(twon24 * z));
                iq[i] = (int)(z - two24 * fw);
                z = q[j - 1] + fw;
            }

            /* compute n */
            z = std::scalbn(z, q0); /* actual value of z */
            z -= 8.0 * std::floor(z * 0.125); /* trim off integer >= 8 */
            n = (int32_t)z;
            z -= (double)n;
            ih = 0;
            if (q0 > 0)
            { /* need iq[jz-1] to determine n */
                i = (iq[jz - 1] >> (24 - q0));
                n += i;
                iq[jz - 1] -= i << (24 - q0);
                ih = iq[jz - 1] >> (23 - q0);
            }
            else if (q0 == 0)
                ih = iq[jz - 1] >> 23;
            else if (z >= 0.5)
                ih = 2;

            if (ih > 0)
            { /* q > 0.5 */
                n += 1;
                carry = 0;
                for (i = 0; i < jz; i++)
                { /* compute 1-q */
                    j = iq[i];
                    if (carry == 0)
                    {
                        if (j != 0)
                        {
                            carry = 1;
                            iq[i] = 0x1000000 - j;
                        }
                    }
                    else
                        iq[i] = 0xffffff - j;
                }
                if (q0 > 0)
                { /* rare case: chance is 1 in 12 */
                    switch (q0)
                    {
                    case 1:
                        iq[jz - 1] &= 0x7fffff;
                        break;
                    case 2:
                        iq[jz - 1] &= 0x3fffff;
                        break;
                    }
                }
                if (ih == 2)
                {
                    z = one - z;
                    if (carry != 0)
                        z -= std::scalbn(one, q0);
                }
            }

            /* check if recomputation is needed */
            if (z == zero)
            {
                j = 0;
                for (i = jz - 1; i >= jk; i--)
                    j |= iq[i];
                if (j == 0)
                { /* need recomputation */
                    for (k = 1; iq[jk - k] == 0; k++)
                        ; /* k = no. of terms needed */

                    for (i = jz + 1; i <= jz + k; i++)
                    { /* add q[jz+1] to q[jz+k] */
                        f[jx + i] = (double)ipio2[jv + i];
                        for (j = 0, fw = 0.0; j <= jx; j++)
                            fw += x[j] * f[jx + i - j];
                        q[i] = fw;
                    }
                    jz += k;
                    goto recompute;
                }
            }

            /* chop off zero terms */
            if (z == 0.0)
            {
                jz -= 1;
                q0 -= 24;
                while (iq[jz] == 0)
                {
                    jz--;
                    q0 -= 24;
                }
            }
            else
            { /* break z into 24-bit if necessary */
                z = std::scalbn(z, -q0);
                if (z >= two24)
                {
                    fw = (double)((int32_t)(twon24 * z));
                    iq[jz] = (int32_t)(z - two24 * fw);
                    jz += 1;
                    q0 += 24;
                    iq[jz] = (int32_t)fw;
                }
                else
                    iq[jz] = (int32_t)z;
            }

            /* convert integer "bit" chunk to floating-point value */
            fw = scalbn(one, q0);
            for (i = jz; i >= 0; i--)
            {
                q[i] = fw * (double)iq[i];
                fw *= twon24;
            }

            /* compute PIo2[0,...,jp]*q[jz,...,0] */
            for (i = jz; i >= 0; i--)
            {
                for (fw = 0.0, k = 0; k <= jp && k <= jz - i; k++)
                    fw += PIo2[k] * q[i + k];
                fq[jz - i] = fw;
            }

            /* compress fq[] into y[] */
            switch (prec)
            {
            case 0:
                fw = 0.0;
                for (i = jz; i >= 0; i--)
                    fw += fq[i];
                y[0] = (ih == 0) ? fw : -fw;
                break;
            case 1:
            case 2:
                fw = 0.0;
                for (i = jz; i >= 0; i--)
                    fw += fq[i];
                y[0] = (ih == 0) ? fw : -fw;
                fw = fq[0] - fw;
                for (i = 1; i <= jz; i++)
                    fw += fq[i];
                y[1] = (ih == 0) ? fw : -fw;
                break;
            case 3: /* painful */
                for (i = jz; i > 0; i--)
                {
                    fw = fq[i - 1] + fq[i];
                    fq[i] += fq[i - 1] - fw;
                    fq[i - 1] = fw;
                }
                for (i = jz; i > 1; i--)
                {
                    fw = fq[i - 1] + fq[i];
                    fq[i] += fq[i - 1] - fw;
                    fq[i - 1] = fw;
                }
                for (fw = 0.0, i = jz; i >= 2; i--)
                    fw += fq[i];
                if (ih == 0)
                {
                    y[0] = fq[0];
                    y[1] = fq[1];
                    y[2] = fw;
                }
                else
                {
                    y[0] = -fq[0];
                    y[1] = -fq[1];
                    y[2] = -fw;
                }
            }
            return n & 7;
        }

        inline std::int32_t __ieee754_rem_pio2(double x, double* y) noexcept
        {
            static const std::int32_t two_over_pi[] = {
                0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62,
                0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A,
                0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
                0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41,
                0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8,
                0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
                0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5,
                0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08,
                0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
                0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880,
                0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B,
            };

            static const std::int32_t npio2_hw[] = {
                0x3FF921FB, 0x400921FB, 0x4012D97C, 0x401921FB,
                0x401F6A7A, 0x4022D97C, 0x4025FDBB, 0x402921FB,
                0x402C463A, 0x402F6A7A, 0x4031475C, 0x4032D97C,
                0x40346B9C, 0x4035FDBB, 0x40378FDB, 0x403921FB,
                0x403AB41B, 0x403C463A, 0x403DD85A, 0x403F6A7A,
                0x40407E4C, 0x4041475C, 0x4042106C, 0x4042D97C,
                0x4043A28C, 0x40446B9C, 0x404534AC, 0x4045FDBB,
                0x4046C6CB, 0x40478FDB, 0x404858EB, 0x404921FB,
            };

            /*
             * invpio2:  53 bits of 2/pi
             * pio2_1:   first 33 bit of pi/2
             * pio2_1t:  pi/2 - pio2_1
             * pio2_2:   second 33 bit of pi/2
             * pio2_2t:  pi/2 - (pio2_1+pio2_2)
             * pio2_3:   third 33 bit of pi/2
             * pio2_3t:  pi/2 - (pio2_1+pio2_2+pio2_3)
             */

            static const double zero = 0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
                half = 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
                two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
                invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
                pio2_1 = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
                pio2_1t = 6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
                pio2_2 = 6.07710050630396597660e-11, /* 0x3DD0B461, 0x1A600000 */
                pio2_2t = 2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 */
                pio2_3 = 2.02226624871116645580e-21, /* 0x3BA3198A, 0x2E000000 */
                pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */

            double z = 0., w, t, r, fn;
            double tx[3];
            std::int32_t e0, i, j, nx, n, ix, hx;
            std::uint32_t low;

            GET_HIGH_WORD(hx, x); /* high word of x */
            ix = hx & 0x7fffffff;
            if (ix <= 0x3fe921fb) /* |x| ~<= pi/4, no need for reduction */
            {
                y[0] = x;
                y[1] = 0;
                return 0;
            }
            if (ix < 0x4002d97c)
            { /* |x| < 3pi/4, special case with n=+-1 */
                if (hx > 0)
                {
                    z = x - pio2_1;
                    if (ix != 0x3ff921fb)
                    { /* 33+53 bit pi is good enough */
                        y[0] = z - pio2_1t;
                        y[1] = (z - y[0]) - pio2_1t;
                    }
                    else
                    { /* near pi/2, use 33+33+53 bit pi */
                        z -= pio2_2;
                        y[0] = z - pio2_2t;
                        y[1] = (z - y[0]) - pio2_2t;
                    }
                    return 1;
                }
                else
                { /* negative x */
                    z = x + pio2_1;
                    if (ix != 0x3ff921fb)
                    { /* 33+53 bit pi is good enough */
                        y[0] = z + pio2_1t;
                        y[1] = (z - y[0]) + pio2_1t;
                    }
                    else
                    { /* near pi/2, use 33+33+53 bit pi */
                        z += pio2_2;
                        y[0] = z + pio2_2t;
                        y[1] = (z - y[0]) + pio2_2t;
                    }

                    return -1;
                }
            }
            if (ix <= 0x413921fb)
            { /* |x| ~<= 2^19*(pi/2), medium size */
                t = std::fabs(x);
                n = (std::int32_t)(t * invpio2 + half);
                fn = (double)n;
                r = t - fn * pio2_1;
                w = fn * pio2_1t; /* 1st round good to 85 bit */
                if ((n < 32) && (n > 0) && (ix != npio2_hw[n - 1]))
                {
                    y[0] = r - w; /* quick check no cancellation */
                }
                else
                {
                    std::uint32_t high;
                    j = ix >> 20;
                    y[0] = r - w;
                    GET_HIGH_WORD(high, y[0]);
                    i = j - static_cast<int32_t>((high >> 20) & 0x7ff);
                    if (i > 16)
                    { /* 2nd iteration needed, good to 118 */
                        t = r;
                        w = fn * pio2_2;
                        r = t - w;
                        w = fn * pio2_2t - ((t - r) - w);
                        y[0] = r - w;
                        GET_HIGH_WORD(high, y[0]);
                        i = j - static_cast<int32_t>((high >> 20) & 0x7ff);
                        if (i > 49)
                        { /* 3rd iteration needed, 151 bits acc */
                            t = r; /* will cover all possible cases */
                            w = fn * pio2_3;
                            r = t - w;
                            w = fn * pio2_3t - ((t - r) - w);
                            y[0] = r - w;
                        }
                    }
                }
                y[1] = (r - y[0]) - w;
                if (hx < 0)
                {
                    y[0] = -y[0];
                    y[1] = -y[1];
                    return -n;
                }
                else
                    return n;
            }
            /*
             * all other (large) arguments
             */
            if (ix >= 0x7ff00000)
            { /* x is inf or NaN */
                y[0] = y[1] = x - x;
                return 0;
            }
            /* set z = scalbn(|x|,ilogb(x)-23) */
            GET_LOW_WORD(low, x);
            SET_LOW_WORD(z, low);
            e0 = (ix >> 20) - 1046; /* e0 = ilogb(z)-23; */
            SET_HIGH_WORD(z, static_cast<uint32_t>(ix - (e0 << 20)));
            for (i = 0; i < 2; i++)
            {
                tx[i] = (double)((std::int32_t)(z));
                z = (z - tx[i]) * two24;
            }
            tx[2] = z;
            nx = 3;
            while (tx[nx - 1] == zero)
                nx--; /* skip zero term */
            n = __kernel_rem_pio2(tx, y, e0, nx, 2, two_over_pi);
            if (hx < 0)
            {
                y[0] = -y[0];
                y[1] = -y[1];
                return -n;
            }
            return n;
        }
    }

#undef XSIMD_LITTLE_ENDIAN
#undef SET_LOW_WORD
#undef SET_HIGH_WORD
#undef GET_LOW_WORD
#undef GET_HIGH_WORD
#undef HIGH_WORD_IDX
#undef LOW_WORD_IDX
#undef ONCE0
}
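Worked sketch of what the reduction delivers (a hypothetical caller, not part of the import): for a finite x, y[0] + y[1] approximates x - n * pi/2 in double-double precision with |y[0]| <= pi/4, and n modulo 4 selects the quadrant, which is all a sin/cos kernel needs.

#include <cstdio>

int main()
{
    double y[2];
    int n = xsimd::detail::__ieee754_rem_pio2(1.0e6, y);
    // y[0] + y[1] ~= 1.0e6 - n * (pi / 2)
    std::printf("quadrant=%d reduced=%.17g\n", n & 3, y[0] + y[1]);
    return 0;
}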
@ -0,0 +1,349 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_ALIGNED_ALLOCATOR_HPP
#define XSIMD_ALIGNED_ALLOCATOR_HPP

#include <algorithm>
#include <cstddef>
#include <utility>
#ifdef _WIN32
#include <malloc.h>
#else
#include <cstdlib>
#endif

#include <cassert>
#include <memory>

#include "../config/xsimd_arch.hpp"

namespace xsimd
{

    /**
     * @class aligned_allocator
     * @brief Allocator for aligned memory
     *
     * The aligned_allocator class template is an allocator that
     * performs memory allocation aligned by the specified value.
     *
     * @tparam T type of objects to allocate.
     * @tparam Align alignment in bytes.
     */
    template <class T, size_t Align = default_arch::alignment()>
    class aligned_allocator
    {
    public:
        using value_type = T;
        using pointer = T*;
        using const_pointer = const T*;
        using reference = T&;
        using const_reference = const T&;
        using size_type = size_t;
        using difference_type = ptrdiff_t;

        static constexpr size_t alignment = Align;

        template <class U>
        struct rebind
        {
            using other = aligned_allocator<U, Align>;
        };

        aligned_allocator() noexcept;
        aligned_allocator(const aligned_allocator& rhs) noexcept;

        template <class U>
        aligned_allocator(const aligned_allocator<U, Align>& rhs) noexcept;

        ~aligned_allocator();

        pointer address(reference) noexcept;
        const_pointer address(const_reference) const noexcept;

        pointer allocate(size_type n, const void* hint = 0);
        void deallocate(pointer p, size_type n);

        size_type max_size() const noexcept;
        size_type size_max() const noexcept;

        template <class U, class... Args>
        void construct(U* p, Args&&... args);

        template <class U>
        void destroy(U* p);
    };

    template <class T1, size_t Align1, class T2, size_t Align2>
    bool operator==(const aligned_allocator<T1, Align1>& lhs,
                    const aligned_allocator<T2, Align2>& rhs) noexcept;

    template <class T1, size_t Align1, class T2, size_t Align2>
    bool operator!=(const aligned_allocator<T1, Align1>& lhs,
                    const aligned_allocator<T2, Align2>& rhs) noexcept;

    void* aligned_malloc(size_t size, size_t alignment);
    void aligned_free(void* ptr);

    template <class T>
    size_t get_alignment_offset(const T* p, size_t size, size_t block_size);

    /************************************
     * aligned_allocator implementation *
     ************************************/

    /**
     * Default constructor.
     */
    template <class T, size_t A>
    inline aligned_allocator<T, A>::aligned_allocator() noexcept
    {
    }

    /**
     * Copy constructor.
     */
    template <class T, size_t A>
    inline aligned_allocator<T, A>::aligned_allocator(const aligned_allocator&) noexcept
    {
    }

    /**
     * Extended copy constructor.
     */
    template <class T, size_t A>
    template <class U>
    inline aligned_allocator<T, A>::aligned_allocator(const aligned_allocator<U, A>&) noexcept
    {
    }

    /**
     * Destructor.
     */
    template <class T, size_t A>
    inline aligned_allocator<T, A>::~aligned_allocator()
    {
    }

    /**
     * Returns the actual address of \c r even in presence of overloaded \c operator&.
     * @param r the object to acquire address of.
     * @return the actual address of \c r.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::address(reference r) noexcept -> pointer
    {
        return &r;
    }

    /**
     * Returns the actual address of \c r even in presence of overloaded \c operator&.
     * @param r the object to acquire address of.
     * @return the actual address of \c r.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::address(const_reference r) const noexcept -> const_pointer
    {
        return &r;
    }

    /**
     * Allocates <tt>n * sizeof(T)</tt> bytes of uninitialized memory, aligned by \c A.
     * The alignment may require some extra memory allocation.
     * @param n the number of objects to allocate storage for.
     * @param hint unused parameter provided for standard compliance.
     * @return a pointer to the first byte of a memory block suitably aligned and sufficient to
     * hold an array of \c n objects of type \c T.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::allocate(size_type n, const void*) -> pointer
    {
        pointer res = reinterpret_cast<pointer>(aligned_malloc(sizeof(T) * n, A));
#if defined(_CPPUNWIND) || defined(__cpp_exceptions)
        if (res == nullptr)
            throw std::bad_alloc();
#endif
        return res;
    }

    /**
     * Deallocates the storage referenced by the pointer p, which must be a pointer obtained by
     * an earlier call to allocate(). The argument \c n must be equal to the first argument of the call
     * to allocate() that originally produced \c p; otherwise, the behavior is undefined.
     * @param p pointer obtained from allocate().
     * @param n number of objects earlier passed to allocate().
     */
    template <class T, size_t A>
    inline void aligned_allocator<T, A>::deallocate(pointer p, size_type)
    {
        aligned_free(p);
    }

    /**
     * Returns the maximum theoretically possible value of \c n, for which the
     * call allocate(n, 0) could succeed.
     * @return the maximum supported allocated size.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::max_size() const noexcept -> size_type
    {
        return size_type(-1) / sizeof(T);
    }

    /**
     * This method is deprecated, use max_size() instead.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::size_max() const noexcept -> size_type
    {
        return size_type(-1) / sizeof(T);
    }

    /**
     * Constructs an object of type \c T in allocated uninitialized memory
     * pointed to by \c p, using placement-new.
     * @param p pointer to allocated uninitialized memory.
     * @param args the constructor arguments to use.
     */
    template <class T, size_t A>
    template <class U, class... Args>
    inline void aligned_allocator<T, A>::construct(U* p, Args&&... args)
    {
        new ((void*)p) U(std::forward<Args>(args)...);
    }

    /**
     * Calls the destructor of the object pointed to by \c p.
     * @param p pointer to the object that is going to be destroyed.
     */
    template <class T, size_t A>
    template <class U>
    inline void aligned_allocator<T, A>::destroy(U* p)
    {
        p->~U();
    }

    /**
     * @defgroup allocator_comparison Comparison operators
     */

    /**
     * @ingroup allocator_comparison
     * Compares two aligned memory allocators for equality. Since allocators
     * are stateless, returns \c true iff <tt>A1 == A2</tt>.
     * @param lhs aligned_allocator to compare.
     * @param rhs aligned_allocator to compare.
     * @return true if the allocators have the same alignment.
     */
    template <class T1, size_t A1, class T2, size_t A2>
    inline bool operator==(const aligned_allocator<T1, A1>& lhs,
                           const aligned_allocator<T2, A2>& rhs) noexcept
    {
        return lhs.alignment == rhs.alignment;
    }

    /**
     * @ingroup allocator_comparison
     * Compares two aligned memory allocators for inequality. Since allocators
     * are stateless, returns \c true iff <tt>A1 != A2</tt>.
     * @param lhs aligned_allocator to compare.
     * @param rhs aligned_allocator to compare.
     * @return true if the allocators have different alignments.
     */
    template <class T1, size_t A1, class T2, size_t A2>
    inline bool operator!=(const aligned_allocator<T1, A1>& lhs,
                           const aligned_allocator<T2, A2>& rhs) noexcept
    {
        return !(lhs == rhs);
    }

    /****************************************
     * aligned malloc / free implementation *
     ****************************************/

    namespace detail
    {
        inline void* xaligned_malloc(size_t size, size_t alignment)
        {
            assert(((alignment & (alignment - 1)) == 0) && "alignment must be a power of two");
            assert((alignment >= sizeof(void*)) && "alignment must be at least the size of a pointer");
            void* res = nullptr;
#ifdef _WIN32
            res = _aligned_malloc(size, alignment);
#else
            if (posix_memalign(&res, alignment, size) != 0)
            {
                res = nullptr;
            }
#endif
            return res;
        }

        inline void xaligned_free(void* ptr)
        {
#ifdef _WIN32
            _aligned_free(ptr);
#else
            free(ptr);
#endif
        }
    }

    inline void* aligned_malloc(size_t size, size_t alignment)
    {
        return detail::xaligned_malloc(size, alignment);
    }

    inline void aligned_free(void* ptr)
    {
        detail::xaligned_free(ptr);
    }

    template <class T>
    inline size_t get_alignment_offset(const T* p, size_t size, size_t block_size)
    {
        // size_t block_size = simd_traits<T>::size;
        if (block_size == 1)
        {
            // The simd_block consists of exactly one scalar, so all
            // elements of the array are "well" aligned.
            return 0;
        }
        else if (size_t(p) & (sizeof(T) - 1))
        {
            // The array is not aligned to the size of a single element,
            // so no element of the array is well aligned.
            return size;
        }
        else
        {
            size_t block_mask = block_size - 1;
            return std::min<size_t>(
                (block_size - ((size_t(p) / sizeof(T)) & block_mask)) & block_mask,
                size);
        }
    }

    template <class T, class A = default_arch>
    using default_allocator = typename std::conditional<A::requires_alignment(),
                                                        aligned_allocator<T, A::alignment()>,
                                                        std::allocator<T>>::type;
}

#endif
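Usage sketch (the 32-byte alignment is an assumption matching AVX; pick the target's value or rely on the default): the allocator drops straight into standard containers, which is what makes aligned SIMD loads from a std::vector safe.

#include <vector>

using avec = std::vector<float, xsimd::aligned_allocator<float, 32>>;

avec v(64, 1.0f); // v.data() is 32-byte aligned
// With 8-float blocks, no peeling is needed before the vectorized loop:
// xsimd::get_alignment_offset(v.data(), v.size(), 8) == 0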
@ -0,0 +1,76 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_ALIGNMENT_HPP
#define XSIMD_ALIGNMENT_HPP

#include "../types/xsimd_utils.hpp"
#include "xsimd_aligned_allocator.hpp"

namespace xsimd
{
    /**
     * @struct aligned_mode
     * @brief tag for load and store of aligned memory.
     */
    struct aligned_mode
    {
    };

    /**
     * @struct unaligned_mode
     * @brief tag for load and store of unaligned memory.
     */
    struct unaligned_mode
    {
    };

    /***********************
     * Allocator alignment *
     ***********************/

    template <class A>
    struct allocator_alignment
    {
        using type = unaligned_mode;
    };

    template <class T>
    struct allocator_alignment<aligned_allocator<T>>
    {
        using type = aligned_mode;
    };

    template <class A>
    using allocator_alignment_t = typename allocator_alignment<A>::type;

    /***********************
     * container alignment *
     ***********************/

    template <class C, class = void>
    struct container_alignment
    {
        using type = unaligned_mode;
    };

    template <class C>
    struct container_alignment<C, detail::void_t<typename C::allocator_type>>
    {
        using type = allocator_alignment_t<typename C::allocator_type>;
    };

    template <class C>
    using container_alignment_t = typename container_alignment<C>::type;

}

#endif
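Sketch of the intended tag dispatch (hypothetical static checks, not part of the import): a container whose allocator is the default-aligned xsimd allocator advertises aligned_mode, and anything else falls back to unaligned_mode.

#include <type_traits>
#include <vector>

static_assert(std::is_same<xsimd::container_alignment_t<
                               std::vector<float, xsimd::aligned_allocator<float>>>,
                           xsimd::aligned_mode>::value,
              "aligned allocator yields the aligned tag");
static_assert(std::is_same<xsimd::container_alignment_t<std::vector<float>>,
                           xsimd::unaligned_mode>::value,
              "the default allocator yields the unaligned tag");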
@ -0,0 +1,32 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#include "xsimd_fma3_sse_register.hpp"
#include "xsimd_fma4_register.hpp"
#include "xsimd_sse2_register.hpp"
#include "xsimd_sse3_register.hpp"
#include "xsimd_sse4_1_register.hpp"
#include "xsimd_sse4_2_register.hpp"

#include "xsimd_avx2_register.hpp"
#include "xsimd_avx_register.hpp"
#include "xsimd_fma3_avx2_register.hpp"
#include "xsimd_fma3_avx_register.hpp"

#include "xsimd_avx512bw_register.hpp"
#include "xsimd_avx512cd_register.hpp"
#include "xsimd_avx512dq_register.hpp"
#include "xsimd_avx512f_register.hpp"

#include "xsimd_neon64_register.hpp"
#include "xsimd_neon_register.hpp"

#include "xsimd_sve_register.hpp"

Diff not shown because of its large size.
@ -0,0 +1,40 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_AVX2_REGISTER_HPP
#define XSIMD_AVX2_REGISTER_HPP

#include "./xsimd_avx_register.hpp"

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * AVX2 instructions
     */
    struct avx2 : avx
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX2; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(2, 2, 0); }
        static constexpr char const* name() noexcept { return "avx2"; }
    };

#if XSIMD_WITH_AVX2
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx2, avx);
    }
#endif
}

#endif
@ -0,0 +1,48 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_AVX512BW_REGISTER_HPP
#define XSIMD_AVX512BW_REGISTER_HPP

#include "./xsimd_avx512dq_register.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX512BW instructions
     */
    struct avx512bw : avx512dq
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512BW; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(3, 4, 0); }
        static constexpr char const* name() noexcept { return "avx512bw"; }
    };

#if XSIMD_WITH_AVX512BW

    namespace types
    {
        template <class T>
        struct get_bool_simd_register<T, avx512bw>
        {
            using type = simd_avx512_bool_register<T>;
        };

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512bw, avx512dq);

    }
#endif
}
#endif
@ -0,0 +1,48 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_AVX512CD_REGISTER_HPP
#define XSIMD_AVX512CD_REGISTER_HPP

#include "./xsimd_avx512f_register.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX512CD instructions
     */
    struct avx512cd : avx512f
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512CD; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(3, 2, 0); }
        static constexpr char const* name() noexcept { return "avx512cd"; }
    };

#if XSIMD_WITH_AVX512CD

    namespace types
    {
        template <class T>
        struct get_bool_simd_register<T, avx512cd>
        {
            using type = simd_avx512_bool_register<T>;
        };

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512cd, avx512f);

    }
#endif
}
#endif
@ -0,0 +1,48 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_AVX512DQ_REGISTER_HPP
#define XSIMD_AVX512DQ_REGISTER_HPP

#include "./xsimd_avx512cd_register.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX512DQ instructions
     */
    struct avx512dq : avx512cd
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512DQ; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(3, 3, 0); }
        static constexpr char const* name() noexcept { return "avx512dq"; }
    };

#if XSIMD_WITH_AVX512DQ

    namespace types
    {
        template <class T>
        struct get_bool_simd_register<T, avx512dq>
        {
            using type = simd_avx512_bool_register<T>;
        };

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512dq, avx512cd);

    }
#endif
}
#endif
@ -0,0 +1,75 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_AVX512F_REGISTER_HPP
#define XSIMD_AVX512F_REGISTER_HPP

#include "./xsimd_generic_arch.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX512F instructions
     */
    struct avx512f : generic
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512F; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(3, 1, 0); }
        static constexpr std::size_t alignment() noexcept { return 64; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr char const* name() noexcept { return "avx512f"; }
    };

#if XSIMD_WITH_AVX512F

    namespace types
    {
        template <class T>
        struct simd_avx512_bool_register
        {
            using register_type = typename std::conditional<
                (sizeof(T) < 4), std::conditional<(sizeof(T) == 1), __mmask64, __mmask32>,
                std::conditional<(sizeof(T) == 4), __mmask16, __mmask8>>::type::type;
            register_type data;
            simd_avx512_bool_register() = default;
            simd_avx512_bool_register(register_type r) { data = r; }
            operator register_type() const noexcept { return data; }
        };
        template <class T>
        struct get_bool_simd_register<T, avx512f>
        {
            using type = simd_avx512_bool_register<T>;
        };

        XSIMD_DECLARE_SIMD_REGISTER(bool, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(signed char, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(char, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(short, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(long int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(float, avx512f, __m512);
        XSIMD_DECLARE_SIMD_REGISTER(double, avx512f, __m512d);

    }
#endif
}

#endif
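Sketch of the mask-width selection above (guarded, since the types only exist when AVX512F is enabled): the nested std::conditional picks one mask bit per lane, so 8-bit lanes of a 512-bit register need __mmask64 while 64-bit lanes need only __mmask8.

#if XSIMD_WITH_AVX512F
#include <type_traits>
static_assert(std::is_same<xsimd::types::simd_avx512_bool_register<char>::register_type,
                           __mmask64>::value,
              "64 x 8-bit lanes need a 64-bit mask");
static_assert(std::is_same<xsimd::types::simd_avx512_bool_register<double>::register_type,
                           __mmask8>::value,
              "8 x 64-bit lanes need an 8-bit mask");
#endif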
@ -0,0 +1,62 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_AVX_REGISTER_HPP
#define XSIMD_AVX_REGISTER_HPP

#include "./xsimd_generic_arch.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX instructions
     */
    struct avx : generic
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(2, 1, 0); }
        static constexpr std::size_t alignment() noexcept { return 32; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr char const* name() noexcept { return "avx"; }
    };
}

#if XSIMD_WITH_AVX

#include <immintrin.h>

namespace xsimd
{
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER(bool, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(signed char, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(char, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(short, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(long int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(float, avx, __m256);
        XSIMD_DECLARE_SIMD_REGISTER(double, avx, __m256d);
    }
}
#endif
#endif

Diff not shown because of its large size.
@ -0,0 +1,147 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_BATCH_CONSTANT_HPP
#define XSIMD_BATCH_CONSTANT_HPP

#include "./xsimd_batch.hpp"
#include "./xsimd_utils.hpp"

namespace xsimd
{
    /**
     * @brief batch of boolean constants
     *
     * Abstract representation of a batch of boolean constants.
     *
     * @tparam batch_type the type of the associated batch values.
     * @tparam Values boolean constants represented by this batch
     **/
    template <class batch_type, bool... Values>
    struct batch_bool_constant
    {
        static constexpr std::size_t size = sizeof...(Values);
        using arch_type = typename batch_type::arch_type;
        using value_type = bool;
        static_assert(sizeof...(Values) == batch_type::size, "consistent batch size");

        operator batch_bool<typename batch_type::value_type, arch_type>() const noexcept { return { Values... }; }

        bool get(size_t i) const noexcept
        {
            return std::array<value_type, size> { { Values... } }[i];
        }

        static constexpr int mask() noexcept
        {
            return mask_helper(0, static_cast<int>(Values)...);
        }

    private:
        static constexpr int mask_helper(int acc) noexcept { return acc; }
        template <class... Tys>
        static constexpr int mask_helper(int acc, int mask, Tys... masks) noexcept
        {
            return mask_helper(acc | mask, (masks << 1)...);
        }
    };

    /**
     * @brief batch of integral constants
     *
     * Abstract representation of a batch of integral constants.
     *
     * @tparam batch_type the type of the associated batch values.
     * @tparam Values constants represented by this batch
     **/
    template <class batch_type, typename batch_type::value_type... Values>
    struct batch_constant
    {
        static constexpr std::size_t size = sizeof...(Values);
        using arch_type = typename batch_type::arch_type;
        using value_type = typename batch_type::value_type;
        static_assert(sizeof...(Values) == batch_type::size, "consistent batch size");

        /**
         * @brief Generate a batch of @p batch_type from this @p batch_constant
         */
        operator batch_type() const noexcept { return { Values... }; }

        /**
         * @brief Get the @p i th element of this @p batch_constant
         */
        constexpr value_type get(size_t i) const noexcept
        {
            return get(i, std::array<value_type, size> { Values... });
        }

    private:
        constexpr value_type get(size_t i, std::array<value_type, size> const& values) const noexcept
        {
            return values[i];
        }
    };

    namespace detail
    {
        template <class batch_type, class G, std::size_t... Is>
        inline constexpr auto make_batch_constant(detail::index_sequence<Is...>) noexcept
            -> batch_constant<batch_type, (typename batch_type::value_type)G::get(Is, sizeof...(Is))...>
        {
            return {};
        }
        template <class batch_type, class G, std::size_t... Is>
        inline constexpr auto make_batch_bool_constant(detail::index_sequence<Is...>) noexcept
            -> batch_bool_constant<batch_type, G::get(Is, sizeof...(Is))...>
        {
            return {};
        }

    } // namespace detail

    /**
     * @brief Build a @c batch_constant out of a generator function
     *
     * @tparam batch_type type of the (non-constant) batch to build
     * @tparam G type used to generate that batch. That type must have a static
     * member @c get that's used to generate the batch constant. Conversely, the
     * generated batch_constant has value `{G::get(0, batch_size), ... , G::get(batch_size - 1, batch_size)}`
     *
     * The following generator produces a batch of `(n - 1, 0, 1, ... n-2)`
     *
     * @code
     * struct Rot
     * {
     *     static constexpr unsigned get(unsigned i, unsigned n)
     *     {
     *         return (i + n - 1) % n;
     *     }
     * };
     * @endcode
     */
    template <class batch_type, class G>
    inline constexpr auto make_batch_constant() noexcept -> decltype(detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>()))
    {
        return detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>());
    }

    template <class batch_type, class G>
    inline constexpr auto make_batch_bool_constant() noexcept
        -> decltype(detail::make_batch_bool_constant<batch_type, G>(
            detail::make_index_sequence<batch_type::size>()))
    {
        return detail::make_batch_bool_constant<batch_type, G>(
            detail::make_index_sequence<batch_type::size>());
    }

} // namespace xsimd

#endif
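Sketch tying the Rot generator from the comment above to a concrete batch (the unsigned AVX2 batch type is an assumption; any enabled arch works the same way):

struct Rot
{
    static constexpr unsigned get(unsigned i, unsigned n) { return (i + n - 1) % n; }
};

#if XSIMD_WITH_AVX2
// An 8-lane compile-time constant { 7, 0, 1, 2, 3, 4, 5, 6 }.
auto rot_idx = xsimd::make_batch_constant<xsimd::batch<unsigned, xsimd::avx2>, Rot>();
#endif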
@ -0,0 +1,46 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_FMA3_AVX2_REGISTER_HPP
#define XSIMD_FMA3_AVX2_REGISTER_HPP

#include "./xsimd_avx2_register.hpp"

namespace xsimd
{
    template <typename arch>
    struct fma3;

    /**
     * @ingroup arch
     *
     * AVX2 + FMA instructions
     */
    template <>
    struct fma3<avx2> : avx2
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_AVX2; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(2, 2, 1); }
        static constexpr char const* name() noexcept { return "fma3+avx2"; }
    };

#if XSIMD_WITH_FMA3_AVX2
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<avx2>, avx2);

    }
#endif

}
#endif
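Compile-time sketch of querying the wrapper (guarded on the build macro; the ordering check assumes generic::version encodes major/minor/patch into a single increasing integer, which the generic arch header, not shown here, is expected to do):

#if XSIMD_WITH_FMA3_AVX2
static_assert(xsimd::fma3<xsimd::avx2>::supported(), "consistent with the build flags");
static_assert(xsimd::fma3<xsimd::avx2>::version() > xsimd::avx2::version(),
              "the fused variant outranks plain avx2 during dispatch");
#endif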
@ -0,0 +1,46 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_FMA3_AVX_REGISTER_HPP
|
||||
#define XSIMD_FMA3_AVX_REGISTER_HPP
|
||||
|
||||
#include "./xsimd_avx_register.hpp"
|
||||
|
||||
namespace xsimd
|
||||
{
|
||||
template <typename arch>
|
||||
struct fma3;
|
||||
|
||||
/**
|
||||
* @ingroup arch
|
||||
*
|
||||
* AVX + FMA instructions
|
||||
*/
|
||||
template <>
|
||||
struct fma3<avx> : avx
|
||||
{
|
||||
static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_AVX; }
|
||||
static constexpr bool available() noexcept { return true; }
|
||||
static constexpr unsigned version() noexcept { return generic::version(2, 1, 1); }
|
||||
static constexpr char const* name() noexcept { return "fma3+avx"; }
|
||||
};
|
||||
|
||||
#if XSIMD_WITH_FMA3_AVX
|
||||
namespace types
|
||||
{
|
||||
|
||||
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<avx>, avx);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
#endif
@@ -0,0 +1,46 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_FMA3_SSE_REGISTER_HPP
#define XSIMD_FMA3_SSE_REGISTER_HPP

#include "./xsimd_sse4_2_register.hpp"

namespace xsimd
{
    template <typename arch>
    struct fma3;

    /**
     * @ingroup arch
     *
     * SSE4.2 + FMA instructions
     */
    template <>
    struct fma3<sse4_2> : sse4_2
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_SSE; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 4, 3); }
        static constexpr char const* name() noexcept { return "fma3+sse4.2"; }
    };

#if XSIMD_WITH_FMA3_SSE
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<sse4_2>, sse4_2);

    }
#endif

}
#endif
@@ -0,0 +1,42 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_FMA4_REGISTER_HPP
#define XSIMD_FMA4_REGISTER_HPP

#include "./xsimd_sse4_2_register.hpp"

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * FMA4 instructions
     */
    struct fma4 : sse4_2
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_FMA4; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 4, 4); }
        static constexpr char const* name() noexcept { return "fma4"; }
    };

#if XSIMD_WITH_FMA4
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma4, sse4_2);

    }
#endif

}
#endif
@@ -0,0 +1,35 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_GENERIC_ARCH_HPP
#define XSIMD_GENERIC_ARCH_HPP

#include "../config/xsimd_config.hpp"

/**
 * @defgroup arch Architecture description
 */
namespace xsimd
{
    struct generic
    {
        static constexpr bool supported() noexcept { return true; }
        static constexpr bool available() noexcept { return true; }
        static constexpr std::size_t alignment() noexcept { return 0; }
        static constexpr bool requires_alignment() noexcept { return false; }
        static constexpr unsigned version() noexcept { return generic::version(0, 0, 0); }

    protected:
        static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch) noexcept { return major * 10000u + minor * 100u + patch; }
    };
}

#endif
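
A quick worked instance of the encoding above (editor's illustration, not in the commit): version(major, minor, patch) collapses a dotted version into one comparable integer, so NEON64 below, at 8.1.0, reports 80100.

// Illustration: 8.1.0 encodes to 8 * 10000 + 1 * 100 + 0 = 80100.
#include "xsimd/xsimd.hpp"

static_assert(xsimd::neon64::version() == 80100u,
              "version() packs (major, minor, patch) positionally");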
@@ -0,0 +1,52 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_NEON64_REGISTER_HPP
#define XSIMD_NEON64_REGISTER_HPP

#include "xsimd_neon_register.hpp"

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * NEON instructions for arm64
     */
    struct neon64 : neon
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_NEON64; }
        static constexpr bool available() noexcept { return true; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr std::size_t alignment() noexcept { return 16; }
        static constexpr unsigned version() noexcept { return generic::version(8, 1, 0); }
        static constexpr char const* name() noexcept { return "arm64+neon"; }
    };

#if XSIMD_WITH_NEON64

    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(neon64, neon);
        XSIMD_DECLARE_SIMD_REGISTER(double, neon64, float64x2_t);

        template <class T>
        struct get_bool_simd_register<T, neon64>
            : detail::neon_bool_simd_register<T, neon64>
        {
        };
    }

#endif

}

#endif
@@ -0,0 +1,155 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_NEON_REGISTER_HPP
#define XSIMD_NEON_REGISTER_HPP

#include "xsimd_generic_arch.hpp"
#include "xsimd_register.hpp"

#if XSIMD_WITH_NEON
#include <arm_neon.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * NEON instructions for arm32
     */
    struct neon : generic
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_NEON; }
        static constexpr bool available() noexcept { return true; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr std::size_t alignment() noexcept { return 16; }
        static constexpr unsigned version() noexcept { return generic::version(7, 0, 0); }
        static constexpr char const* name() noexcept { return "arm32+neon"; }
    };

#if XSIMD_WITH_NEON
    namespace types
    {
        namespace detail
        {
            template <size_t S>
            struct neon_vector_type_impl;

            template <>
            struct neon_vector_type_impl<8>
            {
                using signed_type = int8x16_t;
                using unsigned_type = uint8x16_t;
            };

            template <>
            struct neon_vector_type_impl<16>
            {
                using signed_type = int16x8_t;
                using unsigned_type = uint16x8_t;
            };

            template <>
            struct neon_vector_type_impl<32>
            {
                using signed_type = int32x4_t;
                using unsigned_type = uint32x4_t;
            };

            template <>
            struct neon_vector_type_impl<64>
            {
                using signed_type = int64x2_t;
                using unsigned_type = uint64x2_t;
            };

            template <class T>
            using signed_neon_vector_type = typename neon_vector_type_impl<8 * sizeof(T)>::signed_type;

            template <class T>
            using unsigned_neon_vector_type = typename neon_vector_type_impl<8 * sizeof(T)>::unsigned_type;

            template <class T>
            using neon_vector_type = typename std::conditional<std::is_signed<T>::value,
                                                               signed_neon_vector_type<T>,
                                                               unsigned_neon_vector_type<T>>::type;

            using char_neon_vector_type = typename std::conditional<std::is_signed<char>::value,
                                                                    signed_neon_vector_type<char>,
                                                                    unsigned_neon_vector_type<char>>::type;
        }

        XSIMD_DECLARE_SIMD_REGISTER(signed char, neon, detail::neon_vector_type<signed char>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, neon, detail::neon_vector_type<unsigned char>);
        XSIMD_DECLARE_SIMD_REGISTER(char, neon, detail::char_neon_vector_type);
        XSIMD_DECLARE_SIMD_REGISTER(short, neon, detail::neon_vector_type<short>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, neon, detail::neon_vector_type<unsigned short>);
        XSIMD_DECLARE_SIMD_REGISTER(int, neon, detail::neon_vector_type<int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, neon, detail::neon_vector_type<unsigned int>);
        XSIMD_DECLARE_SIMD_REGISTER(long int, neon, detail::neon_vector_type<long int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, neon, detail::neon_vector_type<unsigned long int>);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, neon, detail::neon_vector_type<long long int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, neon, detail::neon_vector_type<unsigned long long int>);
        XSIMD_DECLARE_SIMD_REGISTER(float, neon, float32x4_t);
        XSIMD_DECLARE_INVALID_SIMD_REGISTER(double, neon);

        namespace detail
        {
            template <size_t S>
            struct get_unsigned_type;

            template <>
            struct get_unsigned_type<1>
            {
                using type = uint8_t;
            };

            template <>
            struct get_unsigned_type<2>
            {
                using type = uint16_t;
            };

            template <>
            struct get_unsigned_type<4>
            {
                using type = uint32_t;
            };

            template <>
            struct get_unsigned_type<8>
            {
                using type = uint64_t;
            };

            template <size_t S>
            using get_unsigned_type_t = typename get_unsigned_type<S>::type;

            template <class T, class A>
            struct neon_bool_simd_register
            {
                using type = simd_register<get_unsigned_type_t<sizeof(T)>, A>;
            };
        }

        template <class T>
        struct get_bool_simd_register<T, neon>
            : detail::neon_bool_simd_register<T, neon>
        {
        };

    }
#endif

}

#endif
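
Editor's sketch, not part of the header: the width/signedness mapping done by detail::neon_vector_type can be spot-checked with static assertions. The checks assume a NEON-enabled build.

// Hypothetical spot checks of the scalar-to-vector mapping above.
#include "xsimd/xsimd.hpp"
#include <cstdint>
#include <type_traits>

#if XSIMD_WITH_NEON
static_assert(std::is_same<xsimd::types::detail::neon_vector_type<int32_t>,
                           int32x4_t>::value,
              "4-byte signed lanes map to int32x4_t");
static_assert(std::is_same<xsimd::types::detail::neon_vector_type<uint8_t>,
                           uint8x16_t>::value,
              "1-byte unsigned lanes map to uint8x16_t");
#endif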
@@ -0,0 +1,94 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_REGISTER_HPP
#define XSIMD_REGISTER_HPP

#include <type_traits>

namespace xsimd
{
    namespace types
    {
        template <class T, class A>
        struct has_simd_register : std::false_type
        {
        };

        template <class T, class Arch>
        struct simd_register
        {
            struct register_type
            {
            };
        };

#define XSIMD_DECLARE_SIMD_REGISTER(SCALAR_TYPE, ISA, VECTOR_TYPE)  \
    template <>                                                     \
    struct simd_register<SCALAR_TYPE, ISA>                          \
    {                                                               \
        using register_type = VECTOR_TYPE;                          \
        register_type data;                                         \
        operator register_type() const noexcept                     \
        {                                                           \
            return data;                                            \
        }                                                           \
    };                                                              \
    template <>                                                     \
    struct has_simd_register<SCALAR_TYPE, ISA> : std::true_type     \
    {                                                               \
    }

#define XSIMD_DECLARE_INVALID_SIMD_REGISTER(SCALAR_TYPE, ISA)       \
    template <>                                                     \
    struct has_simd_register<SCALAR_TYPE, ISA> : std::false_type    \
    {                                                               \
    }

#define XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ISA, ISA_BASE)                          \
    template <class T>                                                            \
    struct simd_register<T, ISA> : simd_register<T, ISA_BASE>                     \
    {                                                                             \
        using register_type = typename simd_register<T, ISA_BASE>::register_type; \
        simd_register(register_type reg) noexcept                                 \
            : simd_register<T, ISA_BASE> { reg }                                  \
        {                                                                         \
        }                                                                         \
        simd_register() = default;                                                \
    };                                                                            \
    template <class T>                                                            \
    struct has_simd_register<T, ISA> : has_simd_register<T, ISA_BASE>             \
    {                                                                             \
    }

        template <class T, class Arch>
        struct get_bool_simd_register
        {
            using type = simd_register<T, Arch>;
        };

        template <class T, class Arch>
        using get_bool_simd_register_t = typename get_bool_simd_register<T, Arch>::type;
    }

    namespace kernel
    {
        // Makes requires_arch equal to A const&, using type_traits functions.
        template <class A>
        using requires_arch = typename std::add_lvalue_reference<typename std::add_const<A>::type>::type;

        template <class T>
        struct convert
        {
        };
    }
}

#endif
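
To make the macros above concrete, here is what a single expansion amounts to, using float on SSE2 as the example pair. This is an editor's sketch; the compile-time checks only hold on an SSE2-enabled build.

// What XSIMD_DECLARE_SIMD_REGISTER(float, sse2, __m128) expands to,
// inside namespace xsimd::types:
//
//     template <>
//     struct simd_register<float, sse2>
//     {
//         using register_type = __m128;
//         register_type data;
//         operator register_type() const noexcept { return data; }
//     };
//     template <>
//     struct has_simd_register<float, sse2> : std::true_type
//     {
//     };
//
// Hypothetical compile-time checks of that wiring:
#include "xsimd/xsimd.hpp"
#include <type_traits>

#if XSIMD_WITH_SSE2
static_assert(xsimd::types::has_simd_register<float, xsimd::sse2>::value,
              "float is vectorizable on SSE2");
static_assert(std::is_same<xsimd::types::simd_register<float, xsimd::sse2>::register_type,
                           __m128>::value,
              "and its storage is a raw __m128");
#endif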
@@ -0,0 +1,61 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_SSE2_REGISTER_HPP
#define XSIMD_SSE2_REGISTER_HPP

#include "./xsimd_generic_arch.hpp"
#include "./xsimd_register.hpp"

#if XSIMD_WITH_SSE2
#include <emmintrin.h>
#include <xmmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSE2 instructions
     */
    struct sse2 : generic
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE2; }
        static constexpr bool available() noexcept { return true; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 2, 0); }
        static constexpr std::size_t alignment() noexcept { return 16; }
        static constexpr char const* name() noexcept { return "sse2"; }
    };

#if XSIMD_WITH_SSE2
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER(bool, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(signed char, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(char, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(short, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(long int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(float, sse2, __m128);
        XSIMD_DECLARE_SIMD_REGISTER(double, sse2, __m128d);
    }
#endif
}

#endif
@@ -0,0 +1,45 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_SSE3_REGISTER_HPP
#define XSIMD_SSE3_REGISTER_HPP

#include "./xsimd_sse2_register.hpp"

#if XSIMD_WITH_SSE3
#include <pmmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSE3 instructions
     */
    struct sse3 : sse2
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE3; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 3, 0); }
        static constexpr char const* name() noexcept { return "sse3"; }
    };

#if XSIMD_WITH_SSE3
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse3, sse2);
    }
#endif
}

#endif
@@ -0,0 +1,44 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_SSE4_1_REGISTER_HPP
#define XSIMD_SSE4_1_REGISTER_HPP

#include "./xsimd_ssse3_register.hpp"

#if XSIMD_WITH_SSE4_1
#include <smmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSE4.1 instructions
     */
    struct sse4_1 : ssse3
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_1; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 4, 1); }
        static constexpr char const* name() noexcept { return "sse4.1"; }
    };

#if XSIMD_WITH_SSE4_1
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse4_1, ssse3);
    }
#endif
}

#endif
@@ -0,0 +1,44 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_SSE4_2_REGISTER_HPP
#define XSIMD_SSE4_2_REGISTER_HPP

#include "./xsimd_sse4_1_register.hpp"

#if XSIMD_WITH_SSE4_2
#include <nmmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSE4.2 instructions
     */
    struct sse4_2 : sse4_1
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_2; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 4, 2); }
        static constexpr char const* name() noexcept { return "sse4.2"; }
    };

#if XSIMD_WITH_SSE4_2
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse4_2, sse4_1);
    }
#endif
}

#endif
@@ -0,0 +1,44 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_SSSE3_REGISTER_HPP
#define XSIMD_SSSE3_REGISTER_HPP

#include "./xsimd_sse3_register.hpp"

#if XSIMD_WITH_SSSE3
#include <tmmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSSE3 instructions
     */
    struct ssse3 : sse3
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSSE3; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 3, 1); }
        static constexpr char const* name() noexcept { return "ssse3"; }
    };

#if XSIMD_WITH_SSSE3
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ssse3, sse3);
    }
#endif
}

#endif
@@ -0,0 +1,155 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 * Copyright (c) Yibo Cai                                                  *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_SVE_REGISTER_HPP
#define XSIMD_SVE_REGISTER_HPP

#include "xsimd_generic_arch.hpp"
#include "xsimd_register.hpp"

#if XSIMD_WITH_SVE
#include <arm_sve.h>
#endif

namespace xsimd
{
    namespace detail
    {
        /**
         * @ingroup arch
         *
         * SVE instructions (fixed vector size) for arm64
         */
        template <size_t Width>
        struct sve : xsimd::generic
        {
            static constexpr bool supported() noexcept { return Width == XSIMD_SVE_BITS; }
            static constexpr bool available() noexcept { return true; }
            static constexpr bool requires_alignment() noexcept { return true; }
            static constexpr std::size_t alignment() noexcept { return 16; }
            static constexpr unsigned version() noexcept { return generic::version(9, 0, 0); }
            static constexpr char const* name() noexcept { return "arm64+sve"; }
        };
    }

#if XSIMD_WITH_SVE

    using sve = detail::sve<__ARM_FEATURE_SVE_BITS>;

    namespace types
    {
        namespace detail
        {
// define fixed size alias per SVE sizeless type
#define SVE_TO_FIXED_SIZE(ty) ty __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)))
            using sve_int8_t = SVE_TO_FIXED_SIZE(svint8_t);
            using sve_uint8_t = SVE_TO_FIXED_SIZE(svuint8_t);
            using sve_int16_t = SVE_TO_FIXED_SIZE(svint16_t);
            using sve_uint16_t = SVE_TO_FIXED_SIZE(svuint16_t);
            using sve_int32_t = SVE_TO_FIXED_SIZE(svint32_t);
            using sve_uint32_t = SVE_TO_FIXED_SIZE(svuint32_t);
            using sve_int64_t = SVE_TO_FIXED_SIZE(svint64_t);
            using sve_uint64_t = SVE_TO_FIXED_SIZE(svuint64_t);
            using sve_float32_t = SVE_TO_FIXED_SIZE(svfloat32_t);
            using sve_float64_t = SVE_TO_FIXED_SIZE(svfloat64_t);
            using sve_bool_t = SVE_TO_FIXED_SIZE(svbool_t);
#undef SVE_TO_FIXED_SIZE

            template <size_t S>
            struct sve_vector_type_impl;

            template <>
            struct sve_vector_type_impl<8>
            {
                using signed_type = sve_int8_t;
                using unsigned_type = sve_uint8_t;
                using floating_point_type = void;
            };

            template <>
            struct sve_vector_type_impl<16>
            {
                using signed_type = sve_int16_t;
                using unsigned_type = sve_uint16_t;
                using floating_point_type = void;
            };

            template <>
            struct sve_vector_type_impl<32>
            {
                using signed_type = sve_int32_t;
                using unsigned_type = sve_uint32_t;
                using floating_point_type = sve_float32_t;
            };

            template <>
            struct sve_vector_type_impl<64>
            {
                using signed_type = sve_int64_t;
                using unsigned_type = sve_uint64_t;
                using floating_point_type = sve_float64_t;
            };

            template <class T>
            using signed_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::signed_type;

            template <class T>
            using unsigned_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::unsigned_type;

            template <class T>
            using floating_point_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::floating_point_type;

            template <class T>
            using signed_int_or_floating_point_sve_vector_type = typename std::conditional<std::is_floating_point<T>::value,
                                                                                           floating_point_sve_vector_type<T>,
                                                                                           signed_int_sve_vector_type<T>>::type;

            template <class T>
            using sve_vector_type = typename std::conditional<std::is_signed<T>::value,
                                                              signed_int_or_floating_point_sve_vector_type<T>,
                                                              unsigned_int_sve_vector_type<T>>::type;
        } // namespace detail

        XSIMD_DECLARE_SIMD_REGISTER(signed char, sve, detail::sve_vector_type<signed char>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, sve, detail::sve_vector_type<unsigned char>);
        XSIMD_DECLARE_SIMD_REGISTER(char, sve, detail::sve_vector_type<char>);
        XSIMD_DECLARE_SIMD_REGISTER(short, sve, detail::sve_vector_type<short>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, sve, detail::sve_vector_type<unsigned short>);
        XSIMD_DECLARE_SIMD_REGISTER(int, sve, detail::sve_vector_type<int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, sve, detail::sve_vector_type<unsigned int>);
        XSIMD_DECLARE_SIMD_REGISTER(long int, sve, detail::sve_vector_type<long int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, sve, detail::sve_vector_type<unsigned long int>);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, sve, detail::sve_vector_type<long long int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, sve, detail::sve_vector_type<unsigned long long int>);
        XSIMD_DECLARE_SIMD_REGISTER(float, sve, detail::sve_vector_type<float>);
        XSIMD_DECLARE_SIMD_REGISTER(double, sve, detail::sve_vector_type<double>);

        namespace detail
        {
            struct sve_bool_simd_register
            {
                using register_type = sve_bool_t;
                register_type data;
                operator register_type() const noexcept { return data; }
            };
        } // namespace detail

        template <class T>
        struct get_bool_simd_register<T, sve>
        {
            using type = detail::sve_bool_simd_register;
        };
    } // namespace types
#endif
} // namespace xsimd

#endif
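
A hedged usage note from the editor: the fixed-size aliases above only exist when the vector length is pinned at build time, which is what defines __ARM_FEATURE_SVE_BITS in the first place.

// Hypothetical build line pinning the SVE width (clang/gcc flag):
//   clang++ -march=armv8-a+sve -msve-vector-bits=256 foo.cpp
// With the width pinned, the fixed-size types have a real sizeof:
#include "xsimd/xsimd.hpp"

#if XSIMD_WITH_SVE
static_assert(sizeof(xsimd::types::detail::sve_int32_t) * 8 == __ARM_FEATURE_SVE_BITS,
              "fixed-size SVE vectors carry the pinned bit width");
#endif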
@@ -0,0 +1,251 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_TRAITS_HPP
#define XSIMD_TRAITS_HPP

#include <type_traits>

#include "xsimd_batch.hpp"

namespace xsimd
{

    /**************************************
     * simd_traits and revert_simd_traits *
     **************************************/

    template <class T, class A = default_arch>
    struct has_simd_register : types::has_simd_register<T, A>
    {
    };

    namespace detail
    {
        template <class T, bool>
        struct simd_traits_impl;

        template <class T>
        struct simd_traits_impl<T, false>
        {
            using type = T;
            using bool_type = bool;
            static constexpr size_t size = 1;
        };

        template <class T>
        constexpr size_t simd_traits_impl<T, false>::size;

        template <class T>
        struct simd_traits_impl<T, true>
        {
            using type = batch<T>;
            using bool_type = typename type::batch_bool_type;
            static constexpr size_t size = type::size;
        };

        template <class T>
        constexpr size_t simd_traits_impl<T, true>::size;

        template <class T, class A>
        struct static_check_supported_config_emitter
        {

            static_assert(A::supported(),
                          "usage of batch type with unsupported architecture");
            static_assert(!A::supported() || xsimd::has_simd_register<T, A>::value,
                          "usage of batch type with unsupported type");
        };

        template <class T, class A>
        struct static_check_supported_config_emitter<std::complex<T>, A> : static_check_supported_config_emitter<T, A>
        {
        };

#ifdef XSIMD_ENABLE_XTL_COMPLEX
        template <class T, class A, bool i3ec>
        struct static_check_supported_config_emitter<xtl::xcomplex<T, T, i3ec>, A> : static_check_supported_config_emitter<T, A>
        {
        };
#endif

        // consistency checker
        template <class T, class A>
        void static_check_supported_config()
        {
            (void)static_check_supported_config_emitter<T, A>();
        }
    }

    template <class T>
    struct simd_traits : detail::simd_traits_impl<T, xsimd::has_simd_register<T>::value>
    {
    };

    template <class T>
    struct simd_traits<std::complex<T>>
        : detail::simd_traits_impl<std::complex<T>, xsimd::has_simd_register<T>::value>
    {
    };

#ifdef XSIMD_ENABLE_XTL_COMPLEX
    template <class T, bool i3ec>
    struct simd_traits<xtl::xcomplex<T, T, i3ec>>
        : detail::simd_traits_impl<std::complex<T>, xsimd::has_simd_register<T>::value>
    {
    };
#endif

    template <class T>
    struct revert_simd_traits
    {
        using type = T;
        static constexpr size_t size = simd_traits<type>::size;
    };

    template <class T>
    constexpr size_t revert_simd_traits<T>::size;

    template <class T>
    struct revert_simd_traits<batch<T>>
    {
        using type = T;
        static constexpr size_t size = batch<T>::size;
    };

    template <class T>
    constexpr size_t revert_simd_traits<batch<T>>::size;

    template <class T>
    using simd_type = typename simd_traits<T>::type;

    template <class T>
    using simd_bool_type = typename simd_traits<T>::bool_type;

    template <class T>
    using revert_simd_type = typename revert_simd_traits<T>::type;

    /********************
     * simd_return_type *
     ********************/

    namespace detail
    {
        template <class T1, class T2>
        struct simd_condition
        {
            static constexpr bool value =
                (std::is_same<T1, T2>::value && !std::is_same<T1, bool>::value)
                || (std::is_same<T1, bool>::value && !std::is_same<T2, bool>::value)
                || std::is_same<T1, float>::value
                || std::is_same<T1, double>::value
                || std::is_same<T1, int8_t>::value
                || std::is_same<T1, uint8_t>::value
                || std::is_same<T1, int16_t>::value
                || std::is_same<T1, uint16_t>::value
                || std::is_same<T1, int32_t>::value
                || std::is_same<T1, uint32_t>::value
                || std::is_same<T1, int64_t>::value
                || std::is_same<T1, uint64_t>::value
                || std::is_same<T1, char>::value
                || detail::is_complex<T1>::value;
        };

        template <class T1, class T2, class A>
        struct simd_return_type_impl
            : std::enable_if<simd_condition<T1, T2>::value, batch<T2, A>>
        {
        };

        template <class T2, class A>
        struct simd_return_type_impl<bool, T2, A>
            : std::enable_if<simd_condition<bool, T2>::value, batch_bool<T2, A>>
        {
        };

        template <class T2, class A>
        struct simd_return_type_impl<bool, std::complex<T2>, A>
            : std::enable_if<simd_condition<bool, T2>::value, batch_bool<T2, A>>
        {
        };

        template <class T1, class T2, class A>
        struct simd_return_type_impl<std::complex<T1>, T2, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

        template <class T1, class T2, class A>
        struct simd_return_type_impl<std::complex<T1>, std::complex<T2>, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

#ifdef XSIMD_ENABLE_XTL_COMPLEX
        template <class T1, class T2, bool I3EC, class A>
        struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, T2, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

        template <class T1, class T2, bool I3EC, class A>
        struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, std::complex<T2>, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

        template <class T1, class T2, bool I3EC, class A>
        struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, xtl::xcomplex<T2, T2, I3EC>, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

        template <class T1, class T2, bool I3EC, class A>
        struct simd_return_type_impl<std::complex<T1>, xtl::xcomplex<T2, T2, I3EC>, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };
#endif
    }

    template <class T1, class T2, class A = default_arch>
    using simd_return_type = typename detail::simd_return_type_impl<T1, T2, A>::type;

    /************
     * is_batch *
     ************/

    template <class V>
    struct is_batch : std::false_type
    {
    };

    template <class T, class A>
    struct is_batch<batch<T, A>> : std::true_type
    {
    };

    /*****************
     * is_batch_bool *
     *****************/

    template <class V>
    struct is_batch_bool : std::false_type
    {
    };

    template <class T, class A>
    struct is_batch_bool<batch_bool<T, A>> : std::true_type
    {
    };

    /********************
     * is_batch_complex *
     ********************/

    template <class V>
    struct is_batch_complex : std::false_type
    {
    };

    template <class T, class A>
    struct is_batch_complex<batch<std::complex<T>, A>> : std::true_type
    {
    };
}

#endif
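
Editor's sketch of how these traits are consumed, assuming a build with at least one supported architecture; only names defined in this header (plus default_arch) are used.

// Hypothetical demo: simd_traits picks a batch when one exists and
// falls back to the scalar type (size == 1) otherwise.
#include "xsimd/xsimd.hpp"

void traits_demo()
{
    using traits = xsimd::simd_traits<float>;
    static_assert(traits::size >= 1, "at least the scalar fallback");
    // bool as the first argument selects the predicate type:
    using cond_t = xsimd::simd_return_type<bool, float>;
    static_assert(xsimd::is_batch_bool<cond_t>::value,
                  "bool inputs map to batch_bool<float>");
}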
@@ -0,0 +1,530 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_UTILS_HPP
#define XSIMD_UTILS_HPP

#include <complex>
#include <cstdint>
#include <cstring>
#include <tuple>
#include <type_traits>

#ifdef XSIMD_ENABLE_XTL_COMPLEX
#include "xtl/xcomplex.hpp"
#endif

namespace xsimd
{

    template <class T, class A>
    class batch;

    template <class T, class A>
    class batch_bool;

    /*********
     * index *
     *********/

    template <size_t I>
    using index = std::integral_constant<size_t, I>;

    /**************
     * as_integer *
     **************/

    template <class T>
    struct as_integer : std::make_signed<T>
    {
    };

    template <>
    struct as_integer<float>
    {
        using type = int32_t;
    };

    template <>
    struct as_integer<double>
    {
        using type = int64_t;
    };

    template <class T, class A>
    struct as_integer<batch<T, A>>
    {
        using type = batch<typename as_integer<T>::type, A>;
    };

    template <class B>
    using as_integer_t = typename as_integer<B>::type;

    /***********************
     * as_unsigned_integer *
     ***********************/

    template <class T>
    struct as_unsigned_integer : std::make_unsigned<T>
    {
    };

    template <>
    struct as_unsigned_integer<float>
    {
        using type = uint32_t;
    };

    template <>
    struct as_unsigned_integer<double>
    {
        using type = uint64_t;
    };

    template <class T, class A>
    struct as_unsigned_integer<batch<T, A>>
    {
        using type = batch<typename as_unsigned_integer<T>::type, A>;
    };

    template <class T>
    using as_unsigned_integer_t = typename as_unsigned_integer<T>::type;

    /*********************
     * as_signed_integer *
     *********************/

    template <class T>
    struct as_signed_integer : std::make_signed<T>
    {
    };

    template <class T>
    using as_signed_integer_t = typename as_signed_integer<T>::type;

    /******************
     * flip_sign_type *
     ******************/

    namespace detail
    {
        template <class T, bool is_signed>
        struct flipped_sign_type_impl : std::make_signed<T>
        {
        };

        template <class T>
        struct flipped_sign_type_impl<T, true> : std::make_unsigned<T>
        {
        };
    }

    template <class T>
    struct flipped_sign_type
        : detail::flipped_sign_type_impl<T, std::is_signed<T>::value>
    {
    };

    template <class T>
    using flipped_sign_type_t = typename flipped_sign_type<T>::type;

    /************
     * as_float *
     ************/

    template <class T>
    struct as_float;

    template <>
    struct as_float<int32_t>
    {
        using type = float;
    };

    template <>
    struct as_float<int64_t>
    {
        using type = double;
    };

    template <class T, class A>
    struct as_float<batch<T, A>>
    {
        using type = batch<typename as_float<T>::type, A>;
    };

    template <class T>
    using as_float_t = typename as_float<T>::type;

    /**************
     * as_logical *
     **************/

    template <class T>
    struct as_logical;

    template <class T, class A>
    struct as_logical<batch<T, A>>
    {
        using type = batch_bool<T, A>;
    };

    template <class T>
    using as_logical_t = typename as_logical<T>::type;

    /************
     * bit_cast *
     ************/

    template <class To, class From>
    inline To bit_cast(From val) noexcept
    {
        static_assert(sizeof(From) == sizeof(To), "casting between compatible layout");
        // FIXME: Some old versions of GCC don't support that trait
        // static_assert(std::is_trivially_copyable<From>::value, "input type is trivially copyable");
        // static_assert(std::is_trivially_copyable<To>::value, "output type is trivially copyable");
        To res;
        std::memcpy(&res, &val, sizeof(val));
        return res;
    }
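
Editor's illustration, not in the header: the classic use of a memcpy-based bit_cast like the one above.

// Hypothetical usage: type-punning a float to its bit pattern without UB.
#include <cstdint>

inline std::uint32_t float_bits(float f) noexcept
{
    return xsimd::bit_cast<std::uint32_t>(f); // 1.0f yields 0x3f800000
}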

    namespace kernel
    {
        namespace detail
        {
            /**************************************
             * enabling / disabling metafunctions *
             **************************************/

            template <class T>
            using enable_integral_t = typename std::enable_if<std::is_integral<T>::value, int>::type;

            template <class T, size_t S>
            using enable_sized_signed_t = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value && sizeof(T) == S, int>::type;

            template <class T, size_t S>
            using enable_sized_unsigned_t = typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value && sizeof(T) == S, int>::type;

            template <class T, size_t S>
            using enable_sized_integral_t = typename std::enable_if<std::is_integral<T>::value && sizeof(T) == S, int>::type;

            template <class T, size_t S>
            using enable_sized_t = typename std::enable_if<sizeof(T) == S, int>::type;

            template <class T, size_t S>
            using enable_max_sized_integral_t = typename std::enable_if<std::is_integral<T>::value && sizeof(T) <= S, int>::type;

            /********************************
             * Matching & mismatching sizes *
             ********************************/

            template <class T, class U, class B = int>
            using sizes_match_t = typename std::enable_if<sizeof(T) == sizeof(U), B>::type;

            template <class T, class U, class B = int>
            using sizes_mismatch_t = typename std::enable_if<sizeof(T) != sizeof(U), B>::type;

            template <class T, class U, class B = int>
            using stride_match_t = typename std::enable_if<!std::is_same<T, U>::value && sizeof(T) == sizeof(U), B>::type;
        } // namespace detail
    } // namespace kernel

    /*****************************************
     * Backport of index_sequence from c++14 *
     *****************************************/

    // TODO: Remove this once we drop C++11 support
    namespace detail
    {
        template <typename T>
        struct identity
        {
            using type = T;
        };

#ifdef __cpp_lib_integer_sequence
        using std::index_sequence;
        using std::integer_sequence;
        using std::make_index_sequence;
        using std::make_integer_sequence;

        using std::index_sequence_for;
#else
        template <typename T, T... Is>
        struct integer_sequence
        {
            using value_type = T;
            static constexpr std::size_t size() noexcept { return sizeof...(Is); }
        };

        template <typename Lhs, typename Rhs>
        struct make_integer_sequence_concat;

        template <typename T, T... Lhs, T... Rhs>
        struct make_integer_sequence_concat<integer_sequence<T, Lhs...>,
                                            integer_sequence<T, Rhs...>>
            : identity<integer_sequence<T, Lhs..., (sizeof...(Lhs) + Rhs)...>>
        {
        };

        template <typename T>
        struct make_integer_sequence_impl;

        template <typename T>
        struct make_integer_sequence_impl<std::integral_constant<T, (T)0>> : identity<integer_sequence<T>>
        {
        };

        template <typename T>
        struct make_integer_sequence_impl<std::integral_constant<T, (T)1>> : identity<integer_sequence<T, 0>>
        {
        };

        template <typename T, T N>
        struct make_integer_sequence_impl<std::integral_constant<T, N>>
            : make_integer_sequence_concat<typename make_integer_sequence_impl<std::integral_constant<T, N / 2>>::type,
                                           typename make_integer_sequence_impl<std::integral_constant<T, N - (N / 2)>>::type>
        {
        };

        template <typename T, T N>
        using make_integer_sequence = typename make_integer_sequence_impl<std::integral_constant<T, N>>::type;

        template <std::size_t... Is>
        using index_sequence = integer_sequence<std::size_t, Is...>;

        template <std::size_t N>
        using make_index_sequence = make_integer_sequence<std::size_t, N>;

        template <typename... Ts>
        using index_sequence_for = make_index_sequence<sizeof...(Ts)>;

#endif

        template <int... Is>
        using int_sequence = integer_sequence<int, Is...>;

        template <int N>
        using make_int_sequence = make_integer_sequence<int, N>;

        template <typename... Ts>
        using int_sequence_for = make_int_sequence<(int)sizeof...(Ts)>;

        // Type-casted index sequence.
        template <class P, size_t... Is>
        inline P indexes_from(index_sequence<Is...>) noexcept
        {
            return { static_cast<typename P::value_type>(Is)... };
        }

        template <class P>
        inline P make_sequence_as_batch() noexcept
        {
            return indexes_from<P>(make_index_sequence<P::size>());
        }
    }
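
Editor's check, not part of the header: the halving-and-concatenating construction above really does yield a contiguous pack, and the assertion below holds on both the std and the backport branches.

// Illustration: make_index_sequence<4> is exactly {0, 1, 2, 3}.
#include <type_traits>

static_assert(std::is_same<xsimd::detail::make_index_sequence<4>,
                           xsimd::detail::index_sequence<0, 1, 2, 3>>::value,
              "divide-and-conquer construction yields 0..N-1");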

    /***********************************
     * Backport of std::get from C++14 *
     ***********************************/

    namespace detail
    {
        template <class T, class... Types, size_t I, size_t... Is>
        inline const T& get_impl(const std::tuple<Types...>& t, std::is_same<T, T>, index_sequence<I, Is...>) noexcept
        {
            return std::get<I>(t);
        }

        template <class T, class U, class... Types, size_t I, size_t... Is>
        inline const T& get_impl(const std::tuple<Types...>& t, std::is_same<T, U>, index_sequence<I, Is...>) noexcept
        {
            using tuple_elem = typename std::tuple_element<I + 1, std::tuple<Types...>>::type;
            return get_impl<T>(t, std::is_same<T, tuple_elem>(), index_sequence<Is...>());
        }

        template <class T, class... Types>
        inline const T& get(const std::tuple<Types...>& t) noexcept
        {
            using tuple_elem = typename std::tuple_element<0, std::tuple<Types...>>::type;
            return get_impl<T>(t, std::is_same<T, tuple_elem>(), make_index_sequence<sizeof...(Types)>());
        }
    }

    /*********************************
     * Backport of void_t from C++17 *
     *********************************/

    namespace detail
    {
        template <class... T>
        struct make_void
        {
            using type = void;
        };

        template <class... T>
        using void_t = typename make_void<T...>::type;
    }

    /**************************************************
     * Equivalent of void_t but with size_t parameter *
     **************************************************/

    namespace detail
    {
        template <std::size_t>
        struct check_size
        {
            using type = void;
        };

        template <std::size_t S>
        using check_size_t = typename check_size<S>::type;
    }

    /*****************************************
     * Supplementary std::array constructors *
     *****************************************/

    namespace detail
    {
        // std::array constructor from scalar value ("broadcast")
        template <typename T, std::size_t... Is>
        inline constexpr std::array<T, sizeof...(Is)>
        array_from_scalar_impl(const T& scalar, index_sequence<Is...>) noexcept
        {
            // You can safely ignore this silly ternary, the "scalar" is all
            // that matters. The rest is just a dirty workaround...
            return std::array<T, sizeof...(Is)> { (Is + 1) ? scalar : T()... };
        }

        template <typename T, std::size_t N>
        inline constexpr std::array<T, N>
        array_from_scalar(const T& scalar) noexcept
        {
            return array_from_scalar_impl(scalar, make_index_sequence<N>());
        }

        // std::array constructor from C-style pointer (handled as an array)
        template <typename T, std::size_t... Is>
        inline constexpr std::array<T, sizeof...(Is)>
        array_from_pointer_impl(const T* c_array, index_sequence<Is...>) noexcept
        {
            return std::array<T, sizeof...(Is)> { c_array[Is]... };
        }

        template <typename T, std::size_t N>
        inline constexpr std::array<T, N>
        array_from_pointer(const T* c_array) noexcept
        {
            return array_from_pointer_impl(c_array, make_index_sequence<N>());
        }
    }

    /************************
     * is_array_initializer *
     ************************/

    namespace detail
    {
        template <bool...>
        struct bool_pack;

        template <bool... bs>
        using all_true = std::is_same<
            bool_pack<bs..., true>, bool_pack<true, bs...>>;

        template <typename T, typename... Args>
        using is_all_convertible = all_true<std::is_convertible<Args, T>::value...>;

        template <typename T, std::size_t N, typename... Args>
        using is_array_initializer = std::enable_if<
            (sizeof...(Args) == N) && is_all_convertible<T, Args...>::value>;

        // Check that a variadic argument pack is a list of N values of type T,
        // as usable for instantiating a value of type std::array<T, N>.
        template <typename T, std::size_t N, typename... Args>
        using is_array_initializer_t = typename is_array_initializer<T, N, Args...>::type;
    }

    /**************
     * is_complex *
     **************/

    // This is used in both xsimd_complex_base.hpp and xsimd_traits.hpp
    // However xsimd_traits.hpp indirectly includes xsimd_complex_base.hpp
    // so we cannot define is_complex in xsimd_traits.hpp. Besides, if
    // no file defining batches is included, we still need this definition
    // in xsimd_traits.hpp, so let's define it here.

    namespace detail
    {
        template <class T>
        struct is_complex : std::false_type
        {
        };

        template <class T>
        struct is_complex<std::complex<T>> : std::true_type
        {
        };

#ifdef XSIMD_ENABLE_XTL_COMPLEX
        template <class T, bool i3ec>
        struct is_complex<xtl::xcomplex<T, T, i3ec>> : std::true_type
        {
        };
#endif
    }

    /*******************
     * real_batch_type *
     *******************/

    template <class B>
    struct real_batch_type
    {
        using type = B;
    };

    template <class T, class A>
    struct real_batch_type<batch<std::complex<T>, A>>
    {
        using type = batch<T, A>;
    };

    template <class B>
    using real_batch_type_t = typename real_batch_type<B>::type;

    /**********************
     * complex_batch_type *
     **********************/

    template <class B>
    struct complex_batch_type
    {
        using real_value_type = typename B::value_type;
        using arch_type = typename B::arch_type;
        using type = batch<std::complex<real_value_type>, arch_type>;
    };

    template <class T, class A>
    struct complex_batch_type<batch<std::complex<T>, A>>
    {
        using type = batch<std::complex<T>, A>;
    };

    template <class B>
    using complex_batch_type_t = typename complex_batch_type<B>::type;
}

#endif
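
Editor's illustration of the by-type tuple accessor defined above: detail::get walks the element types left to right and returns the first exact match.

// Hypothetical use of detail::get<T>: fetch the first int of the tuple.
#include "xsimd/xsimd.hpp"
#include <tuple>

inline int first_int(std::tuple<char, int, double> const& t) noexcept
{
    return xsimd::detail::get<int>(t); // selects the second element
}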
@@ -0,0 +1,68 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_HPP
#define XSIMD_HPP

#if defined(__has_cpp_attribute)
// if this check passes, then the compiler supports feature test macros
#if __has_cpp_attribute(nodiscard) >= 201603L
// if this check passes, then the compiler supports [[nodiscard]] without a message
#define XSIMD_NO_DISCARD [[nodiscard]]
#endif
#endif

#if !defined(XSIMD_NO_DISCARD) && __cplusplus >= 201703L
// this means that the previous tests failed, but we are using C++17 or higher
#define XSIMD_NO_DISCARD [[nodiscard]]
#endif

#if !defined(XSIMD_NO_DISCARD) && (defined(__GNUC__) || defined(__clang__))
// this means that the previous checks failed, but we are using GCC or Clang
#define XSIMD_NO_DISCARD __attribute__((warn_unused_result))
#endif

#if !defined(XSIMD_NO_DISCARD)
// this means that all the previous checks failed, so we fall back to doing nothing
#define XSIMD_NO_DISCARD
#endif

#ifdef __cpp_if_constexpr
// this means that the compiler supports the `if constexpr` construct
#define XSIMD_IF_CONSTEXPR if constexpr
#endif

#if !defined(XSIMD_IF_CONSTEXPR) && __cplusplus >= 201703L
// this means that the previous test failed, but we are using C++17 or higher
#define XSIMD_IF_CONSTEXPR if constexpr
#endif

#if !defined(XSIMD_IF_CONSTEXPR)
// this means that all the previous checks failed, so we fall back to a normal `if`
#define XSIMD_IF_CONSTEXPR if
#endif

#include "config/xsimd_config.hpp"

#include "arch/xsimd_scalar.hpp"
#include "memory/xsimd_aligned_allocator.hpp"

#if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
// no type definitions nor anything apart from the scalar definitions and the aligned allocator
#else
#include "types/xsimd_batch.hpp"
#include "types/xsimd_batch_constant.hpp"
#include "types/xsimd_traits.hpp"

// This include must come last
#include "types/xsimd_api.hpp"
#endif
#endif
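
A last editor's sketch of the portability macros defined in this header; both uses are hypothetical.

// XSIMD_NO_DISCARD warns when a caller drops the result (where supported);
// XSIMD_IF_CONSTEXPR degrades to a plain `if` before C++17.
#include "xsimd/xsimd.hpp"

XSIMD_NO_DISCARD inline int checked_sum(int a, int b) noexcept
{
    return a + b;
}

template <bool UseFastPath>
inline int dispatch() noexcept
{
    XSIMD_IF_CONSTEXPR(UseFastPath) { return 1; }
    return 0;
}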
@@ -0,0 +1,37 @@
schema: 1

bugzilla:
  product: Toolkit
  component: "General"

origin:
  name: xsimd
  description: C++ wrappers for SIMD intrinsics

  url: https://github.com/QuantStack/xsimd

  release: 75b043b8e031f1ada8053fe80d5ba635e2a75588 (2023-01-05T06:45:23Z).
  revision: 75b043b8e031f1ada8053fe80d5ba635e2a75588

  license: BSD-3-Clause

vendoring:
  url: https://github.com/QuantStack/xsimd
  source-hosting: github
  tracking: commit

  exclude:
    - ".*"
    - "*.md"
    - "*.yml"
    - "*.txt"
    - "*.in"
    - "*.sh"
    - benchmark
    - cmake
    - docs
    - examples
    - test

  keep:
    - include/
@@ -2029,6 +2029,7 @@ into source code and to files in the following directories:
#ifdef MOZ_JXL
    <li><code>third_party/jpeg-xl/</code></li>
#endif
    <li><code>third_party/xsimd/</code></li>
  </ul>
  See the individual LICENSE files for copyright owners.</p>