Bug 1801557 - import xsimd to third_party r=glandium

Differential Revision: https://phabricator.services.mozilla.com/D162537
serge-sans-paille 2023-01-16 11:05:19 +00:00
Parent ec72d27e4d
Commit 46a6cbf6ca
69 changed files with 27523 additions and 0 deletions

3
third_party/moz.build vendored
View file

@@ -40,6 +40,9 @@ with Files('rust/**'):
with Files('webkit/**'):
    BUG_COMPONENT = ('Firefox Build System', 'General')
with Files('xsimd/**'):
    BUG_COMPONENT = ('Firefox Build System', 'General')
with Files('prio/**'):
    BUG_COMPONENT = ('Firefox Build System', 'General')

29
third_party/xsimd/LICENSE vendored Normal file
View file

@@ -0,0 +1,29 @@
Copyright (c) 2016, Johan Mabille, Sylvain Corlay, Wolf Vollprecht and Martin Renou
Copyright (c) 2016, QuantStack
Copyright (c) 2018, Serge Guelton
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
* Neither the name of the copyright holder nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View file

@@ -0,0 +1,152 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_GENERIC_ARITHMETIC_HPP
#define XSIMD_GENERIC_ARITHMETIC_HPP
#include <complex>
#include <type_traits>
#include "./xsimd_generic_details.hpp"
namespace xsimd
{
namespace kernel
{
using namespace types;
// bitwise_lshift
template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return detail::apply([](T x, T y) noexcept
{ return x << y; },
self, other);
}
// bitwise_rshift
template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return detail::apply([](T x, T y) noexcept
{ return x >> y; },
self, other);
}
// div
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> div(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return detail::apply([](T x, T y) noexcept -> T
{ return x / y; },
self, other);
}
// fma
template <class A, class T>
inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
{
return x * y + z;
}
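// The complex overloads below expand x * y + z componentwise: the real part
// is x.real()*y.real() - x.imag()*y.imag() + z.real() and the imaginary part
// is x.real()*y.imag() + x.imag()*y.real() + z.imag(), written as nested
// fma/fms calls.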
template <class A, class T>
inline batch<std::complex<T>, A> fma(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
auto res_r = fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real()));
auto res_i = fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag()));
return { res_r, res_i };
}
// fms
template <class A, class T>
inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
{
return x * y - z;
}
template <class A, class T>
inline batch<std::complex<T>, A> fms(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
auto res_r = fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real()));
auto res_i = fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag()));
return { res_r, res_i };
}
// fnma
template <class A, class T>
inline batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
{
return -x * y + z;
}
template <class A, class T>
inline batch<std::complex<T>, A> fnma(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
auto res_r = -fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real()));
auto res_i = -fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag()));
return { res_r, res_i };
}
// fnms
template <class A, class T>
inline batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
{
return -x * y - z;
}
template <class A, class T>
inline batch<std::complex<T>, A> fnms(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
auto res_r = -fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real()));
auto res_i = -fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag()));
return { res_r, res_i };
}
// mul
template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return detail::apply([](T x, T y) noexcept -> T
{ return x * y; },
self, other);
}
// sadd
template <class A>
inline batch<float, A> sadd(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
{
return add(self, other); // no saturated arithmetic on floating point numbers
}
template <class A>
inline batch<double, A> sadd(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
{
return add(self, other); // no saturated arithmetic on floating point numbers
}
// ssub
template <class A>
inline batch<float, A> ssub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
{
return sub(self, other); // no saturated arithmetic on floating point numbers
}
template <class A>
inline batch<double, A> ssub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
{
return sub(self, other); // no saturated arithmetic on floating point numbers
}
}
}
#endif

View file

@@ -0,0 +1,96 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_GENERIC_COMPLEX_HPP
#define XSIMD_GENERIC_COMPLEX_HPP
#include <complex>
#include "./xsimd_generic_details.hpp"
namespace xsimd
{
namespace kernel
{
using namespace types;
// real
template <class A, class T>
inline batch<T, A> real(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return self;
}
template <class A, class T>
inline batch<T, A> real(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
{
return self.real();
}
// imag
template <class A, class T>
inline batch<T, A> imag(batch<T, A> const& /*self*/, requires_arch<generic>) noexcept
{
return batch<T, A>(T(0));
}
template <class A, class T>
inline batch<T, A> imag(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
{
return self.imag();
}
// arg
template <class A, class T>
inline real_batch_type_t<batch<T, A>> arg(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return atan2(imag(self), real(self));
}
// conj
template <class A, class T>
inline complex_batch_type_t<batch<T, A>> conj(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return { real(self), -imag(self) };
}
// norm
template <class A, class T>
inline real_batch_type_t<batch<T, A>> norm(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return { fma(real(self), real(self), imag(self) * imag(self)) };
}
// proj
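// Follows std::proj semantics: lanes with an infinite component map to
// (infinity, ±0) with the sign of the imaginary part preserved; finite lanes
// pass through unchanged.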
template <class A, class T>
inline complex_batch_type_t<batch<T, A>> proj(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = complex_batch_type_t<batch<T, A>>;
using real_batch = typename batch_type::real_batch;
using real_value_type = typename real_batch::value_type;
auto cond = xsimd::isinf(real(self)) || xsimd::isinf(imag(self));
return select(cond,
batch_type(constants::infinity<real_batch>(),
copysign(real_batch(real_value_type(0)), imag(self))),
batch_type(self));
}
template <class A, class T>
inline batch_bool<T, A> isnan(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
{
return batch_bool<T, A>(isnan(self.real()) || isnan(self.imag()));
}
}
}
#endif

View file

@@ -0,0 +1,239 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_GENERIC_DETAILS_HPP
#define XSIMD_GENERIC_DETAILS_HPP
#include <complex>
#include "../../math/xsimd_rem_pio2.hpp"
#include "../../types/xsimd_generic_arch.hpp"
#include "../../types/xsimd_utils.hpp"
#include "../xsimd_constants.hpp"
namespace xsimd
{
// Forward declarations. Should we put them in a separate file?
template <class T, class A>
inline batch<T, A> abs(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch<T, A> abs(batch<std::complex<T>, A> const& self) noexcept;
template <class T, class A>
inline bool any(batch_bool<T, A> const& self) noexcept;
template <class T, class A>
inline batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other) noexcept;
template <class A, class T_out, class T_in>
inline batch<T_out, A> batch_cast(batch<T_in, A> const&, batch<T_out, A> const& out) noexcept;
template <class T, class A>
inline batch<T, A> bitofsign(batch<T, A> const& self) noexcept;
template <class B, class T, class A>
inline B bitwise_cast(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch<T, A> cos(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch<T, A> cosh(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch<T, A> exp(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
template <class T, class A>
inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
template <class T, class A>
inline batch<T, A> frexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
template <class T, class A, uint64_t... Coefs>
inline batch<T, A> horner(const batch<T, A>& self) noexcept;
template <class T, class A>
inline batch<T, A> hypot(const batch<T, A>& self) noexcept;
template <class T, class A>
inline batch_bool<T, A> is_even(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch_bool<T, A> is_flint(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch_bool<T, A> is_odd(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch_bool<T, A> isinf(batch<T, A> const& self) noexcept;
template <class T, class A>
inline typename batch<T, A>::batch_bool_type isnan(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
template <class T, class A>
inline batch<T, A> log(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch<T, A> nearbyint(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch<as_integer_t<T>, A> nearbyint_as_int(const batch<T, A>& x) noexcept;
template <class T, class A>
inline T reduce_add(batch<T, A> const&) noexcept;
template <class T, class A>
inline batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
template <class T, class A>
inline batch<std::complex<T>, A> select(batch_bool<T, A> const&, batch<std::complex<T>, A> const&, batch<std::complex<T>, A> const&) noexcept;
template <class T, class A>
inline batch<T, A> sign(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch<T, A> signnz(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch<T, A> sin(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch<T, A> sinh(batch<T, A> const& self) noexcept;
template <class T, class A>
inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch<T, A> sqrt(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch<T, A> tan(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch<as_float_t<T>, A> to_float(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch<as_integer_t<T>, A> to_int(batch<T, A> const& self) noexcept;
template <class T, class A>
inline batch<T, A> trunc(batch<T, A> const& self) noexcept;
namespace kernel
{
namespace detail
{
template <class F, class A, class T, class... Batches>
inline batch<T, A> apply(F&& func, batch<T, A> const& self, batch<T, A> const& other) noexcept
{
constexpr std::size_t size = batch<T, A>::size;
alignas(A::alignment()) T self_buffer[size];
alignas(A::alignment()) T other_buffer[size];
self.store_aligned(&self_buffer[0]);
other.store_aligned(&other_buffer[0]);
for (std::size_t i = 0; i < size; ++i)
{
self_buffer[i] = func(self_buffer[i], other_buffer[i]);
}
return batch<T, A>::load_aligned(self_buffer);
}
template <class U, class F, class A, class T>
inline batch<U, A> apply_transform(F&& func, batch<T, A> const& self) noexcept
{
static_assert(batch<T, A>::size == batch<U, A>::size,
"Source and destination sizes must match");
constexpr std::size_t src_size = batch<T, A>::size;
constexpr std::size_t dest_size = batch<U, A>::size;
alignas(A::alignment()) T self_buffer[src_size];
alignas(A::alignment()) U other_buffer[dest_size];
self.store_aligned(&self_buffer[0]);
for (std::size_t i = 0; i < src_size; ++i)
{
other_buffer[i] = func(self_buffer[i]);
}
return batch<U, A>::load_aligned(other_buffer);
}
}
namespace detail
{
// Generic conversion handling machinery. Each architecture must define a
// conversion function when such a conversion exists in the form of an
// intrinsic. We then use that information to automatically decide whether
// to use scalar or vector conversion when doing load / store / batch_cast.
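//
// Illustrative sketch only (not part of this header): an architecture
// "some_arch" with a hardware int32_t -> float conversion would expose an
// overload shaped like
//
// template <class A>
// inline batch<float, A> fast_cast(batch<int32_t, A> const&, batch<float, A> const&,
// requires_arch<some_arch>) noexcept;
//
// so that conversion_type<some_arch, int32_t, float> resolves to
// with_fast_conversion; any pair without such an overload falls back to
// with_slow_conversion, i.e. an element-wise copy through a scalar buffer.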
struct with_fast_conversion
{
};
struct with_slow_conversion
{
};
template <class A, class From, class To, class = void>
struct conversion_type_impl
{
using type = with_slow_conversion;
};
using xsimd::detail::void_t;
template <class A, class From, class To>
struct conversion_type_impl<A, From, To,
void_t<decltype(fast_cast(std::declval<const batch<From, A>&>(),
std::declval<const batch<To, A>&>(),
std::declval<const A&>()))>>
{
using type = with_fast_conversion;
};
template <class A, class From, class To>
using conversion_type = typename conversion_type_impl<A, From, To>::type;
}
namespace detail
{
/* origin: boost/simdfunction/horn.hpp*/
/*
* ====================================================
* copyright 2016 NumScale SAS
*
* Distributed under the Boost Software License, Version 1.0.
* (See copy at http://boost.org/LICENSE_1_0.txt)
* ====================================================
*/
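// coef<B, c>() materializes a constant of B's value_type from its raw bit
// pattern c, and horner<B, c0, c1, ...>(x) evaluates the polynomial
// c0 + x*(c1 + x*(...)) over those constants using fused multiply-adds.
// For example, horner<B, a, b, c>(x) expands to
// fma(x, fma(x, coef<B, c>(), coef<B, b>()), coef<B, a>()).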
template <class B, uint64_t c>
inline B coef() noexcept
{
using value_type = typename B::value_type;
return B(bit_cast<value_type>(as_unsigned_integer_t<value_type>(c)));
}
template <class B>
inline B horner(const B&) noexcept
{
return B(typename B::value_type(0.));
}
template <class B, uint64_t c0>
inline B horner(const B&) noexcept
{
return coef<B, c0>();
}
template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
inline B horner(const B& self) noexcept
{
return fma(self, horner<B, c1, args...>(self), coef<B, c0>());
}
/* origin: boost/simdfunction/horn1.hpp*/
/*
* ====================================================
* copyright 2016 NumScale SAS
*
* Distributed under the Boost Software License, Version 1.0.
* (See copy at http://boost.org/LICENSE_1_0.txt)
* ====================================================
*/
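// horner1 is the monic variant of horner: the highest-degree coefficient is
// implicitly 1, so horner1<B, a, b>(x) evaluates x*x + b*x + a.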
template <class B>
inline B horner1(const B&) noexcept
{
return B(1.);
}
template <class B, uint64_t c0>
inline B horner1(const B& x) noexcept
{
return x + detail::coef<B, c0>();
}
template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
inline B horner1(const B& x) noexcept
{
return fma(x, horner1<B, c1, args...>(x), detail::coef<B, c0>());
}
}
}
}
#endif

View file

@@ -0,0 +1,163 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_GENERIC_LOGICAL_HPP
#define XSIMD_GENERIC_LOGICAL_HPP
#include "./xsimd_generic_details.hpp"
namespace xsimd
{
namespace kernel
{
using namespace types;
// from mask
template <class A, class T>
inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<generic>) noexcept
{
alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
// This is inefficient but should never be called. It's just a
// temporary implementation until arm support is added.
for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
buffer[i] = mask & (1ull << i);
return batch_bool<T, A>::load_aligned(buffer);
}
// ge
template <class A, class T>
inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return other <= self;
}
// gt
template <class A, class T>
inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return other < self;
}
// is_even
template <class A, class T>
inline batch_bool<T, A> is_even(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return is_flint(self * T(0.5));
}
// is_flint
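// True for lanes holding a finite integral value ("floating-point integer");
// NaN and infinity lanes yield false because self - self is NaN there.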
template <class A, class T>
inline batch_bool<T, A> is_flint(batch<T, A> const& self, requires_arch<generic>) noexcept
{
auto frac = select(isnan(self - self), constants::nan<batch<T, A>>(), self - trunc(self));
return frac == T(0.);
}
// is_odd
template <class A, class T>
inline batch_bool<T, A> is_odd(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return is_even(self - T(1.));
}
// isinf
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> isinf(batch<T, A> const&, requires_arch<generic>) noexcept
{
return batch_bool<T, A>(false);
}
template <class A>
inline batch_bool<float, A> isinf(batch<float, A> const& self, requires_arch<generic>) noexcept
{
return abs(self) == std::numeric_limits<float>::infinity();
}
template <class A>
inline batch_bool<double, A> isinf(batch<double, A> const& self, requires_arch<generic>) noexcept
{
return abs(self) == std::numeric_limits<double>::infinity();
}
// isfinite
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> isfinite(batch<T, A> const&, requires_arch<generic>) noexcept
{
return batch_bool<T, A>(true);
}
template <class A>
inline batch_bool<float, A> isfinite(batch<float, A> const& self, requires_arch<generic>) noexcept
{
return (self - self) == 0.f;
}
template <class A>
inline batch_bool<double, A> isfinite(batch<double, A> const& self, requires_arch<generic>) noexcept
{
return (self - self) == 0.;
}
// isnan
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> isnan(batch<T, A> const&, requires_arch<generic>) noexcept
{
return batch_bool<T, A>(false);
}
// le
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return (self < other) || (self == other);
}
// neq
template <class A, class T>
inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return !(other == self);
}
// logical_and
template <class A, class T>
inline batch<T, A> logical_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return detail::apply([](T x, T y) noexcept
{ return x && y; },
self, other);
}
// logical_or
template <class A, class T>
inline batch<T, A> logical_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
return detail::apply([](T x, T y) noexcept
{ return x || y; },
self, other);
}
// mask
template <class A, class T>
inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<generic>) noexcept
{
alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
self.store_aligned(buffer);
// This is inefficient but should never be called. It's just a
// temporary implementation until arm support is added.
uint64_t res = 0;
for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
if (buffer[i])
res |= 1ul << i;
return res;
}
}
}
#endif

File diff suppressed because it is too large

View file

@@ -0,0 +1,397 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_GENERIC_MEMORY_HPP
#define XSIMD_GENERIC_MEMORY_HPP
#include <algorithm>
#include <complex>
#include <stdexcept>
#include "../../types/xsimd_batch_constant.hpp"
#include "./xsimd_generic_details.hpp"
namespace xsimd
{
template <class batch_type, typename batch_type::value_type... Values>
struct batch_constant;
namespace kernel
{
using namespace types;
// extract_pair
template <class A, class T>
inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<generic>) noexcept
{
constexpr std::size_t size = batch<T, A>::size;
assert(i < size && "index in bounds");
alignas(A::alignment()) T self_buffer[size];
self.store_aligned(self_buffer);
alignas(A::alignment()) T other_buffer[size];
other.store_aligned(other_buffer);
alignas(A::alignment()) T concat_buffer[size];
for (std::size_t j = 0; j < (size - i); ++j)
{
concat_buffer[j] = other_buffer[i + j];
if (j < i)
{
concat_buffer[size - 1 - j] = self_buffer[i - 1 - j];
}
}
return batch<T, A>::load_aligned(concat_buffer);
}
// gather
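// Scalar fallback: the recursion below reads src[index.get(I)] for each lane
// I and inserts it into the result batch one lane at a time.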
namespace detail
{
template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
inline batch<T, A> gather(U const* src, batch<V, A> const& index,
::xsimd::index<N> I) noexcept
{
return insert(batch<T, A> {}, static_cast<T>(src[index.get(I)]), I);
}
template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
inline batch<T, A>
gather(U const* src, batch<V, A> const& index, ::xsimd::index<N> I) noexcept
{
static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");
const auto test = gather<N - 1, T, A>(src, index, {});
return insert(test, static_cast<T>(src[index.get(I)]), I);
}
} // namespace detail
template <typename T, typename A, typename V>
inline batch<T, A>
gather(batch<T, A> const&, T const* src, batch<V, A> const& index,
kernel::requires_arch<generic>) noexcept
{
static_assert(batch<T, A>::size == batch<V, A>::size,
"Index and destination sizes must match");
return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
}
// Gather with runtime indexes and mismatched strides.
template <typename T, typename A, typename U, typename V>
inline detail::sizes_mismatch_t<T, U, batch<T, A>>
gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
kernel::requires_arch<generic>) noexcept
{
static_assert(batch<T, A>::size == batch<V, A>::size,
"Index and destination sizes must match");
return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
}
// Gather with runtime indexes and matching strides.
template <typename T, typename A, typename U, typename V>
inline detail::stride_match_t<T, U, batch<T, A>>
gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
kernel::requires_arch<generic>) noexcept
{
static_assert(batch<T, A>::size == batch<V, A>::size,
"Index and destination sizes must match");
return batch_cast<T>(kernel::gather(batch<U, A> {}, src, index, A {}));
}
// insert
template <class A, class T, size_t I>
inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept
{
struct index_mask
{
static constexpr bool get(size_t index, size_t /* size*/)
{
return index != I;
}
};
batch<T, A> tmp(val);
return select(make_batch_bool_constant<batch<T, A>, index_mask>(), self, tmp);
}
// get
template <class A, size_t I, class T>
inline T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
{
alignas(A::alignment()) T buffer[batch<T, A>::size];
self.store_aligned(&buffer[0]);
return buffer[I];
}
template <class A, size_t I, class T>
inline T get(batch_bool<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
{
alignas(A::alignment()) T buffer[batch_bool<T, A>::size];
self.store_aligned(&buffer[0]);
return buffer[I];
}
template <class A, size_t I, class T>
inline auto get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
{
alignas(A::alignment()) T buffer[batch<std::complex<T>, A>::size];
self.store_aligned(&buffer[0]);
return buffer[I];
}
template <class A, class T>
inline T get(batch<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
{
alignas(A::alignment()) T buffer[batch<T, A>::size];
self.store_aligned(&buffer[0]);
return buffer[i];
}
template <class A, class T>
inline T get(batch_bool<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
{
alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
self.store_aligned(&buffer[0]);
return buffer[i];
}
template <class A, class T>
inline auto get(batch<std::complex<T>, A> const& self, std::size_t i, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
{
using T2 = typename batch<std::complex<T>, A>::value_type;
alignas(A::alignment()) T2 buffer[batch<std::complex<T>, A>::size];
self.store_aligned(&buffer[0]);
return buffer[i];
}
// load_aligned
namespace detail
{
template <class A, class T_in, class T_out>
inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
{
using batch_type_in = batch<T_in, A>;
using batch_type_out = batch<T_out, A>;
return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A {});
}
template <class A, class T_in, class T_out>
inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_slow_conversion) noexcept
{
static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
using batch_type_out = batch<T_out, A>;
alignas(A::alignment()) T_out buffer[batch_type_out::size];
std::copy(mem, mem + batch_type_out::size, std::begin(buffer));
return batch_type_out::load_aligned(buffer);
}
}
template <class A, class T_in, class T_out>
inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
{
return detail::load_aligned<A>(mem, cvt, A {}, detail::conversion_type<A, T_in, T_out> {});
}
// load_unaligned
namespace detail
{
template <class A, class T_in, class T_out>
inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
{
using batch_type_in = batch<T_in, A>;
using batch_type_out = batch<T_out, A>;
return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A {});
}
template <class A, class T_in, class T_out>
inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>, with_slow_conversion) noexcept
{
static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
return load_aligned<A>(mem, cvt, generic {}, with_slow_conversion {});
}
}
template <class A, class T_in, class T_out>
inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
{
return detail::load_unaligned<A>(mem, cvt, generic {}, detail::conversion_type<A, T_in, T_out> {});
}
namespace detail
{
// Scatter with runtime indexes.
template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
inline void scatter(batch<T, A> const& src, U* dst,
batch<V, A> const& index,
::xsimd::index<N> I) noexcept
{
dst[index.get(I)] = static_cast<U>(src.get(I));
}
template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
inline void
scatter(batch<T, A> const& src, U* dst, batch<V, A> const& index,
::xsimd::index<N> I) noexcept
{
static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");
kernel::detail::scatter<N - 1, T, A, U, V>(
src, dst, index, {});
dst[index.get(I)] = static_cast<U>(src.get(I));
}
} // namespace detail
template <typename A, typename T, typename V>
inline void
scatter(batch<T, A> const& src, T* dst,
batch<V, A> const& index,
kernel::requires_arch<generic>) noexcept
{
static_assert(batch<T, A>::size == batch<V, A>::size,
"Source and index sizes must match");
kernel::detail::scatter<batch<V, A>::size - 1, T, A, T, V>(
src, dst, index, {});
}
template <typename A, typename T, typename U, typename V>
inline detail::sizes_mismatch_t<T, U, void>
scatter(batch<T, A> const& src, U* dst,
batch<V, A> const& index,
kernel::requires_arch<generic>) noexcept
{
static_assert(batch<T, A>::size == batch<V, A>::size,
"Source and index sizes must match");
kernel::detail::scatter<batch<V, A>::size - 1, T, A, U, V>(
src, dst, index, {});
}
template <typename A, typename T, typename U, typename V>
inline detail::stride_match_t<T, U, void>
scatter(batch<T, A> const& src, U* dst,
batch<V, A> const& index,
kernel::requires_arch<generic>) noexcept
{
static_assert(batch<T, A>::size == batch<V, A>::size,
"Source and index sizes must match");
const auto tmp = batch_cast<U>(src);
kernel::scatter<A>(tmp, dst, index, A {});
}
// store
template <class T, class A>
inline void store(batch_bool<T, A> const& self, bool* mem, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
constexpr auto size = batch_bool<T, A>::size;
alignas(A::alignment()) T buffer[size];
kernel::store_aligned<A>(&buffer[0], batch_type(self), A {});
for (std::size_t i = 0; i < size; ++i)
mem[i] = bool(buffer[i]);
}
// store_aligned
template <class A, class T_in, class T_out>
inline void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
{
static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
alignas(A::alignment()) T_in buffer[batch<T_in, A>::size];
store_aligned(&buffer[0], self);
std::copy(std::begin(buffer), std::end(buffer), mem);
}
// store_unaligned
template <class A, class T_in, class T_out>
inline void store_unaligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
{
static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
return store_aligned<A>(mem, self, generic {});
}
// swizzle
template <class A, class T, class ITy, ITy... Vs>
inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<batch<ITy, A>, Vs...> mask, requires_arch<generic>) noexcept
{
return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
}
namespace detail
{
template <class A, class T>
inline batch<std::complex<T>, A> load_complex(batch<T, A> const& /*hi*/, batch<T, A> const& /*lo*/, requires_arch<generic>) noexcept
{
static_assert(std::is_same<T, void>::value, "load_complex not implemented for the required architecture");
}
template <class A, class T>
inline batch<T, A> complex_high(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
{
static_assert(std::is_same<T, void>::value, "complex_high not implemented for the required architecture");
}
template <class A, class T>
inline batch<T, A> complex_low(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
{
static_assert(std::is_same<T, void>::value, "complex_low not implemented for the required architecture");
}
}
// load_complex_aligned
template <class A, class T_out, class T_in>
inline batch<std::complex<T_out>, A> load_complex_aligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
{
using real_batch = batch<T_out, A>;
T_in const* buffer = reinterpret_cast<T_in const*>(mem);
real_batch hi = real_batch::load_aligned(buffer),
lo = real_batch::load_aligned(buffer + real_batch::size);
return detail::load_complex(hi, lo, A {});
}
// load_complex_unaligned
template <class A, class T_out, class T_in>
inline batch<std::complex<T_out>, A> load_complex_unaligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
{
using real_batch = batch<T_out, A>;
T_in const* buffer = reinterpret_cast<T_in const*>(mem);
real_batch hi = real_batch::load_unaligned(buffer),
lo = real_batch::load_unaligned(buffer + real_batch::size);
return detail::load_complex(hi, lo, A {});
}
// store_complex_aligned
template <class A, class T_out, class T_in>
inline void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
{
using real_batch = batch<T_in, A>;
real_batch hi = detail::complex_high(src, A {});
real_batch lo = detail::complex_low(src, A {});
T_out* buffer = reinterpret_cast<T_out*>(dst);
lo.store_aligned(buffer);
hi.store_aligned(buffer + real_batch::size);
}
// store_complex_unaligned
template <class A, class T_out, class T_in>
inline void store_complex_unaligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
{
using real_batch = batch<T_in, A>;
real_batch hi = detail::complex_high(src, A {});
real_batch lo = detail::complex_low(src, A {});
T_out* buffer = reinterpret_cast<T_out*>(dst);
lo.store_unaligned(buffer);
hi.store_unaligned(buffer + real_batch::size);
}
}
}
#endif

View file

@@ -0,0 +1,72 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_GENERIC_ROUNDING_HPP
#define XSIMD_GENERIC_ROUNDING_HPP
#include "./xsimd_generic_details.hpp"
namespace xsimd
{
namespace kernel
{
using namespace types;
// ceil
template <class A, class T>
inline batch<T, A> ceil(batch<T, A> const& self, requires_arch<generic>) noexcept
{
batch<T, A> truncated_self = trunc(self);
return select(truncated_self < self, truncated_self + 1, truncated_self);
}
// floor
template <class A, class T>
inline batch<T, A> floor(batch<T, A> const& self, requires_arch<generic>) noexcept
{
batch<T, A> truncated_self = trunc(self);
return select(truncated_self > self, truncated_self - 1, truncated_self);
}
// round
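// Halfway cases are rounded away from zero, as with std::round; values larger
// than maxflint are already integral and are returned unchanged.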
template <class A, class T>
inline batch<T, A> round(batch<T, A> const& self, requires_arch<generic>) noexcept
{
auto v = abs(self);
auto c = ceil(v);
auto cp = select(c - 0.5 > v, c - 1, c);
return select(v > constants::maxflint<batch<T, A>>(), self, copysign(cp, self));
}
// trunc
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> trunc(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return self;
}
template <class A>
inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<generic>) noexcept
{
return select(abs(self) < constants::maxflint<batch<float, A>>(), to_float(to_int(self)), self);
}
template <class A>
inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<generic>) noexcept
{
return select(abs(self) < constants::maxflint<batch<double, A>>(), to_float(to_int(self)), self);
}
}
}
#endif

View file

@@ -0,0 +1,969 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_GENERIC_TRIGO_HPP
#define XSIMD_GENERIC_TRIGO_HPP
#include "./xsimd_generic_details.hpp"
#include <array>
namespace xsimd
{
namespace kernel
{
/* origin: boost/simd/arch/common/detail/simd/trig_base.hpp */
/*
* ====================================================
* copyright 2016 NumScale SAS
*
* Distributed under the Boost Software License, Version 1.0.
* (See copy at http://boost.org/LICENSE_1_0.txt)
* ====================================================
*/
using namespace types;
// acos
template <class A, class T>
inline batch<T, A> acos(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
batch_type x = abs(self);
auto x_larger_05 = x > batch_type(0.5);
x = select(x_larger_05, sqrt(fma(batch_type(-0.5), x, batch_type(0.5))), self);
x = asin(x);
x = select(x_larger_05, x + x, x);
x = select(self < batch_type(-0.5), constants::pi<batch_type>() - x, x);
return select(x_larger_05, x, constants::pio2<batch_type>() - x);
}
template <class A, class T>
inline batch<std::complex<T>, A> acos(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
using real_batch = typename batch_type::real_batch;
batch_type tmp = asin(z);
return { constants::pio2<real_batch>() - tmp.real(), -tmp.imag() };
}
// acosh
/* origin: boost/simd/arch/common/simd/function/acosh.hpp */
/*
* ====================================================
* copyright 2016 NumScale SAS
*
* Distributed under the Boost Software License, Version 1.0.
* (See copy at http://boost.org/LICENSE_1_0.txt)
* ====================================================
*/
template <class A, class T>
inline batch<T, A> acosh(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
batch_type x = self - batch_type(1.);
auto test = x > constants::oneotwoeps<batch_type>();
batch_type z = select(test, self, x + sqrt(x + x + x * x));
batch_type l1pz = log1p(z);
return select(test, l1pz + constants::log_2<batch_type>(), l1pz);
}
template <class A, class T>
inline batch<std::complex<T>, A> acosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
batch_type w = acos(z);
w = batch_type(-w.imag(), w.real());
return w;
}
// asin
template <class A>
inline batch<float, A> asin(batch<float, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<float, A>;
batch_type x = abs(self);
batch_type sign = bitofsign(self);
auto x_larger_05 = x > batch_type(0.5);
batch_type z = select(x_larger_05, batch_type(0.5) * (batch_type(1.) - x), x * x);
x = select(x_larger_05, sqrt(z), x);
batch_type z1 = detail::horner<batch_type,
0x3e2aaae4,
0x3d9980f6,
0x3d3a3ec7,
0x3cc617e3,
0x3d2cb352>(z);
z1 = fma(z1, z * x, x);
z = select(x_larger_05, constants::pio2<batch_type>() - (z1 + z1), z1);
return z ^ sign;
}
template <class A>
inline batch<double, A> asin(batch<double, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<double, A>;
batch_type x = abs(self);
auto small_cond = x < constants::sqrteps<batch_type>();
batch_type ct1 = batch_type(bit_cast<double>(int64_t(0x3fe4000000000000)));
batch_type zz1 = batch_type(1.) - x;
batch_type vp = zz1 * detail::horner<batch_type, 0x403c896240f3081dull, 0xc03991aaac01ab68ull, 0x401bdff5baf33e6aull, 0xbfe2079259f9290full, 0x3f684fc3988e9f08ull>(zz1) / detail::horner1<batch_type, 0x40756709b0b644beull, 0xc077fe08959063eeull, 0x40626219af6a7f42ull, 0xc035f2a2b6bf5d8cull>(zz1);
zz1 = sqrt(zz1 + zz1);
batch_type z = constants::pio4<batch_type>() - zz1;
zz1 = fms(zz1, vp, constants::pio_2lo<batch_type>());
z = z - zz1;
zz1 = z + constants::pio4<batch_type>();
batch_type zz2 = self * self;
z = zz2 * detail::horner<batch_type, 0xc020656c06ceafd5ull, 0x40339007da779259ull, 0xc0304331de27907bull, 0x4015c74b178a2dd9ull, 0xbfe34341333e5c16ull, 0x3f716b9b0bd48ad3ull>(zz2) / detail::horner1<batch_type, 0xc04898220a3607acull, 0x4061705684ffbf9dull, 0xc06265bb6d3576d7ull, 0x40519fc025fe9054ull, 0xc02d7b590b5e0eabull>(zz2);
zz2 = fma(x, z, x);
return select(x > batch_type(1.), constants::nan<batch_type>(),
select(small_cond, x,
select(x > ct1, zz1, zz2))
^ bitofsign(self));
}
template <class A, class T>
inline batch<std::complex<T>, A> asin(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
using real_batch = typename batch_type::real_batch;
real_batch x = z.real();
real_batch y = z.imag();
batch_type ct(-y, x);
batch_type zz(real_batch(1.) - (x - y) * (x + y), -2 * x * y);
zz = log(ct + sqrt(zz));
batch_type resg(zz.imag(), -zz.real());
return select(y == real_batch(0.),
select(fabs(x) > real_batch(1.),
batch_type(constants::pio2<real_batch>(), real_batch(0.)),
batch_type(asin(x), real_batch(0.))),
resg);
}
// asinh
/* origin: boost/simd/arch/common/simd/function/asinh.hpp */
/*
* ====================================================
* copyright 2016 NumScale SAS
*
* Distributed under the Boost Software License, Version 1.0.
* (See copy at http://boost.org/LICENSE_1_0.txt)
* ====================================================
*/
namespace detail
{
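// For integral batches, average(x1, x2) computes the mean rounded toward
// negative infinity without overflowing the intermediate sum, using
// x1 + x2 == 2 * (x1 & x2) + (x1 ^ x2).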
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A>
average(const batch<T, A>& x1, const batch<T, A>& x2) noexcept
{
return (x1 & x2) + ((x1 ^ x2) >> 1);
}
template <class A, class T>
inline batch<T, A>
averagef(const batch<T, A>& x1, const batch<T, A>& x2) noexcept
{
using batch_type = batch<T, A>;
return fma(x1, batch_type(0.5), x2 * batch_type(0.5));
}
template <class A>
inline batch<float, A> average(batch<float, A> const& x1, batch<float, A> const& x2) noexcept
{
return averagef(x1, x2);
}
template <class A>
inline batch<double, A> average(batch<double, A> const& x1, batch<double, A> const& x2) noexcept
{
return averagef(x1, x2);
}
}
template <class A>
inline batch<float, A> asinh(batch<float, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<float, A>;
batch_type x = abs(self);
auto lthalf = x < batch_type(0.5);
batch_type x2 = x * x;
batch_type bts = bitofsign(self);
batch_type z(0.);
if (any(lthalf))
{
z = detail::horner<batch_type,
0x3f800000,
0xbe2aa9ad,
0x3d9949b1,
0xbd2ee581,
0x3ca4d6e6>(x2)
* x;
if (all(lthalf))
return z ^ bts;
}
batch_type tmp = select(x > constants::oneosqrteps<batch_type>(), x, detail::average(x, hypot(batch_type(1.), x)));
#ifndef XSIMD_NO_NANS
return select(isnan(self), constants::nan<batch_type>(), select(lthalf, z, log(tmp) + constants::log_2<batch_type>()) ^ bts);
#else
return select(lthalf, z, log(tmp) + constants::log_2<batch_type>()) ^ bts;
#endif
}
template <class A>
inline batch<double, A> asinh(batch<double, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<double, A>;
batch_type x = abs(self);
auto test = x > constants::oneosqrteps<batch_type>();
batch_type z = select(test, x - batch_type(1.), x + x * x / (batch_type(1.) + hypot(batch_type(1.), x)));
#ifndef XSIMD_NO_INFINITIES
z = select(x == constants::infinity<batch_type>(), x, z);
#endif
batch_type l1pz = log1p(z);
z = select(test, l1pz + constants::log_2<batch_type>(), l1pz);
return bitofsign(self) ^ z;
}
template <class A, class T>
inline batch<std::complex<T>, A> asinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
batch_type w = asin(batch_type(-z.imag(), z.real()));
w = batch_type(w.imag(), -w.real());
return w;
}
// atan
namespace detail
{
template <class A>
static inline batch<float, A> kernel_atan(const batch<float, A>& x, const batch<float, A>& recx) noexcept
{
using batch_type = batch<float, A>;
const auto flag1 = x < constants::tan3pio8<batch_type>();
const auto flag2 = (x >= batch_type(bit_cast<float>((uint32_t)0x3ed413cd))) && flag1;
batch_type yy = select(flag1, batch_type(0.), constants::pio2<batch_type>());
yy = select(flag2, constants::pio4<batch_type>(), yy);
batch_type xx = select(flag1, x, -recx);
xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx);
const batch_type z = xx * xx;
batch_type z1 = detail::horner<batch_type,
0xbeaaaa2aul,
0x3e4c925ful,
0xbe0e1b85ul,
0x3da4f0d1ul>(z);
z1 = fma(xx, z1 * z, xx);
z1 = select(flag2, z1 + constants::pio_4lo<batch_type>(), z1);
z1 = select(!flag1, z1 + constants::pio_2lo<batch_type>(), z1);
return yy + z1;
}
template <class A>
static inline batch<double, A> kernel_atan(const batch<double, A>& x, const batch<double, A>& recx) noexcept
{
using batch_type = batch<double, A>;
const auto flag1 = x < constants::tan3pio8<batch_type>();
const auto flag2 = (x >= constants::tanpio8<batch_type>()) && flag1;
batch_type yy = select(flag1, batch_type(0.), constants::pio2<batch_type>());
yy = select(flag2, constants::pio4<batch_type>(), yy);
batch_type xx = select(flag1, x, -recx);
xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx);
batch_type z = xx * xx;
z *= detail::horner<batch_type,
0xc0503669fd28ec8eull,
0xc05eb8bf2d05ba25ull,
0xc052c08c36880273ull,
0xc03028545b6b807aull,
0xbfec007fa1f72594ull>(z)
/ detail::horner1<batch_type,
0x4068519efbbd62ecull,
0x407e563f13b049eaull,
0x407b0e18d2e2be3bull,
0x4064a0dd43b8fa25ull,
0x4038dbc45b14603cull>(z);
z = fma(xx, z, xx);
z = select(flag2, z + constants::pio_4lo<batch_type>(), z);
z = z + select(flag1, batch_type(0.), constants::pio_2lo<batch_type>());
return yy + z;
}
}
template <class A, class T>
inline batch<T, A> atan(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
const batch_type absa = abs(self);
const batch_type x = detail::kernel_atan(absa, batch_type(1.) / absa);
return x ^ bitofsign(self);
}
template <class A, class T>
inline batch<std::complex<T>, A> atan(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
using real_batch = typename batch_type::real_batch;
real_batch x = z.real();
real_batch y = z.imag();
real_batch x2 = x * x;
real_batch one(1.);
real_batch a = one - x2 - (y * y);
real_batch w = 0.5 * atan2(2. * x, a);
real_batch num = y + one;
num = x2 + num * num;
real_batch den = y - one;
den = x2 + den * den;
batch_type res = select((x == real_batch(0.)) && (y == real_batch(1.)),
batch_type(real_batch(0.), constants::infinity<real_batch>()),
batch_type(w, 0.25 * log(num / den)));
return res;
}
// atanh
/* origin: boost/simd/arch/common/simd/function/acosh.hpp */
/*
* ====================================================
* copyright 2016 NumScale SAS
*
* Distributed under the Boost Software License, Version 1.0.
* (See copy at http://boost.org/LICENSE_1_0.txt)
* ====================================================
*/
template <class A, class T>
inline batch<T, A> atanh(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
batch_type x = abs(self);
batch_type t = x + x;
batch_type z = batch_type(1.) - x;
auto test = x < batch_type(0.5);
batch_type tmp = select(test, x, t) / z;
return bitofsign(self) ^ (batch_type(0.5) * log1p(select(test, fma(t, tmp, t), tmp)));
}
template <class A, class T>
inline batch<std::complex<T>, A> atanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
batch_type w = atan(batch_type(-z.imag(), z.real()));
w = batch_type(w.imag(), -w.real());
return w;
}
// atan2
template <class A, class T>
inline batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
const batch_type q = abs(self / other);
const batch_type z = detail::kernel_atan(q, batch_type(1.) / q);
return select(other > batch_type(0.), z, constants::pi<batch_type>() - z) * signnz(self);
}
// cos
namespace detail
{
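// quadrant(x) computes x modulo 4; after reduction by pi/2 it selects which
// quadrant rule (sign flip and sin/cos swap) applies.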
template <class T, class A>
inline batch<T, A> quadrant(const batch<T, A>& x) noexcept
{
return x & batch<T, A>(3);
}
template <class A>
inline batch<float, A> quadrant(const batch<float, A>& x) noexcept
{
return to_float(quadrant(to_int(x)));
}
template <class A>
inline batch<double, A> quadrant(const batch<double, A>& x) noexcept
{
using batch_type = batch<double, A>;
batch_type a = x * batch_type(0.25);
return (a - floor(a)) * batch_type(4.);
}
/* origin: boost/simd/arch/common/detail/simd/f_trig_evaluation.hpp */
/*
* ====================================================
* copyright 2016 NumScale SAS
*
* Distributed under the Boost Software License, Version 1.0.
* (See copy at http://boost.org/LICENSE_1_0.txt)
* ====================================================
*/
template <class A>
inline batch<float, A> cos_eval(const batch<float, A>& z) noexcept
{
using batch_type = batch<float, A>;
batch_type y = detail::horner<batch_type,
0x3d2aaaa5,
0xbab60619,
0x37ccf5ce>(z);
return batch_type(1.) + fma(z, batch_type(-0.5), y * z * z);
}
template <class A>
inline batch<float, A> sin_eval(const batch<float, A>& z, const batch<float, A>& x) noexcept
{
using batch_type = batch<float, A>;
batch_type y = detail::horner<batch_type,
0xbe2aaaa2,
0x3c08839d,
0xb94ca1f9>(z);
return fma(y * z, x, x);
}
template <class A>
static inline batch<float, A> base_tancot_eval(const batch<float, A>& z) noexcept
{
using batch_type = batch<float, A>;
batch_type zz = z * z;
batch_type y = detail::horner<batch_type,
0x3eaaaa6f,
0x3e0896dd,
0x3d5ac5c9,
0x3cc821b5,
0x3b4c779c,
0x3c19c53b>(zz);
return fma(y, zz * z, z);
}
template <class A, class BB>
static inline batch<float, A> tan_eval(const batch<float, A>& z, const BB& test) noexcept
{
using batch_type = batch<float, A>;
batch_type y = base_tancot_eval(z);
return select(test, y, -batch_type(1.) / y);
}
template <class A, class BB>
static inline batch<float, A> cot_eval(const batch<float, A>& z, const BB& test) noexcept
{
using batch_type = batch<float, A>;
batch_type y = base_tancot_eval(z);
return select(test, batch_type(1.) / y, -y);
}
/* origin: boost/simd/arch/common/detail/simd/d_trig_evaluation.hpp */
/*
* ====================================================
* copyright 2016 NumScale SAS
*
* Distributed under the Boost Software License, Version 1.0.
* (See copy at http://boost.org/LICENSE_1_0.txt)
* ====================================================
*/
template <class A>
static inline batch<double, A> cos_eval(const batch<double, A>& z) noexcept
{
using batch_type = batch<double, A>;
batch_type y = detail::horner<batch_type,
0x3fe0000000000000ull,
0xbfa5555555555551ull,
0x3f56c16c16c15d47ull,
0xbefa01a019ddbcd9ull,
0x3e927e4f8e06d9a5ull,
0xbe21eea7c1e514d4ull,
0x3da8ff831ad9b219ull>(z);
return batch_type(1.) - y * z;
}
template <class A>
static inline batch<double, A> sin_eval(const batch<double, A>& z, const batch<double, A>& x) noexcept
{
using batch_type = batch<double, A>;
batch_type y = detail::horner<batch_type,
0xbfc5555555555548ull,
0x3f8111111110f7d0ull,
0xbf2a01a019bfdf03ull,
0x3ec71de3567d4896ull,
0xbe5ae5e5a9291691ull,
0x3de5d8fd1fcf0ec1ull>(z);
return fma(y * z, x, x);
}
template <class A>
static inline batch<double, A> base_tancot_eval(const batch<double, A>& z) noexcept
{
using batch_type = batch<double, A>;
batch_type zz = z * z;
batch_type num = detail::horner<batch_type,
0xc1711fead3299176ull,
0x413199eca5fc9dddull,
0xc0c992d8d24f3f38ull>(zz);
batch_type den = detail::horner1<batch_type,
0xc189afe03cbe5a31ull,
0x4177d98fc2ead8efull,
0xc13427bc582abc96ull,
0x40cab8a5eeb36572ull>(zz);
return fma(z, (zz * (num / den)), z);
}
template <class A, class BB>
static inline batch<double, A> tan_eval(const batch<double, A>& z, const BB& test) noexcept
{
using batch_type = batch<double, A>;
batch_type y = base_tancot_eval(z);
return select(test, y, -batch_type(1.) / y);
}
template <class A, class BB>
static inline batch<double, A> cot_eval(const batch<double, A>& z, const BB& test) noexcept
{
using batch_type = batch<double, A>;
batch_type y = base_tancot_eval(z);
return select(test, batch_type(1.) / y, -y);
}
/* origin: boost/simd/arch/common/detail/simd/trig_reduction.hpp */
/*
* ====================================================
* copyright 2016 NumScale SAS
*
* Distributed under the Boost Software License, Version 1.0.
* (See copy at http://boost.org/LICENSE_1_0.txt)
* ====================================================
*/
struct trigo_radian_tag
{
};
struct trigo_pi_tag
{
};
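// trigo_reducer<B, Tag>::reduce(x, xr) writes the argument reduced to roughly
// [-pi/4, pi/4] into xr and returns the quadrant index (0-3) used by sin/cos
// to pick the evaluation and sign. Cheap Cody-Waite style paths handle small
// arguments; very large ones fall back to the scalar __ieee754_rem_pio2 loop.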
template <class B, class Tag = trigo_radian_tag>
struct trigo_reducer
{
static inline B reduce(const B& x, B& xr) noexcept
{
if (all(x <= constants::pio4<B>()))
{
xr = x;
return B(0.);
}
else if (all(x <= constants::pio2<B>()))
{
auto test = x > constants::pio4<B>();
xr = x - constants::pio2_1<B>();
xr -= constants::pio2_2<B>();
xr -= constants::pio2_3<B>();
xr = select(test, xr, x);
return select(test, B(1.), B(0.));
}
else if (all(x <= constants::twentypi<B>()))
{
B xi = nearbyint(x * constants::twoopi<B>());
xr = fnma(xi, constants::pio2_1<B>(), x);
xr -= xi * constants::pio2_2<B>();
xr -= xi * constants::pio2_3<B>();
return quadrant(xi);
}
else if (all(x <= constants::mediumpi<B>()))
{
B fn = nearbyint(x * constants::twoopi<B>());
B r = x - fn * constants::pio2_1<B>();
B w = fn * constants::pio2_1t<B>();
B t = r;
w = fn * constants::pio2_2<B>();
r = t - w;
w = fn * constants::pio2_2t<B>() - ((t - r) - w);
t = r;
w = fn * constants::pio2_3<B>();
r = t - w;
w = fn * constants::pio2_3t<B>() - ((t - r) - w);
xr = r - w;
return quadrant(fn);
}
else
{
static constexpr std::size_t size = B::size;
using value_type = typename B::value_type;
alignas(B) std::array<value_type, size> tmp;
alignas(B) std::array<value_type, size> txr;
alignas(B) std::array<value_type, size> args;
x.store_aligned(args.data());
for (std::size_t i = 0; i < size; ++i)
{
double arg = args[i];
if (arg == std::numeric_limits<value_type>::infinity())
{
tmp[i] = 0.;
txr[i] = std::numeric_limits<value_type>::quiet_NaN();
}
else
{
double y[2];
std::int32_t n = ::xsimd::detail::__ieee754_rem_pio2(arg, y);
tmp[i] = value_type(n & 3);
txr[i] = value_type(y[0]);
}
}
xr = B::load_aligned(&txr[0]);
B res = B::load_aligned(&tmp[0]);
return res;
}
}
};
template <class B>
struct trigo_reducer<B, trigo_pi_tag>
{
static inline B reduce(const B& x, B& xr) noexcept
{
B xi = nearbyint(x * B(2.));
B x2 = x - xi * B(0.5);
xr = x2 * constants::pi<B>();
return quadrant(xi);
}
};
}
template <class A, class T>
inline batch<T, A> cos(batch<T, A> const& self, requires_arch<generic>) noexcept
{
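            // cos(x) = cos(|x|): reduce |x| modulo pi/2, evaluate the sine or cosine
            // polynomial depending on the quadrant parity (swap_bit), and restore the
            // sign of the result from the quadrant via an XOR with the sign mask.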
using batch_type = batch<T, A>;
const batch_type x = abs(self);
batch_type xr = constants::nan<batch_type>();
const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
auto swap_bit = fma(batch_type(-2.), tmp, n);
auto sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
const batch_type z = xr * xr;
const batch_type se = detail::sin_eval(z, xr);
const batch_type ce = detail::cos_eval(z);
const batch_type z1 = select(swap_bit != batch_type(0.), se, ce);
return z1 ^ sign_bit;
}
template <class A, class T>
inline batch<std::complex<T>, A> cos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
return { cos(z.real()) * cosh(z.imag()), -sin(z.real()) * sinh(z.imag()) };
}
// cosh
/* origin: boost/simd/arch/common/simd/function/cosh.hpp */
/*
* ====================================================
* copyright 2016 NumScale SAS
*
* Distributed under the Boost Software License, Version 1.0.
* (See copy at http://boost.org/LICENSE_1_0.txt)
* ====================================================
*/
template <class A, class T>
inline batch<T, A> cosh(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
batch_type x = abs(self);
auto test1 = x > (constants::maxlog<batch_type>() - constants::log_2<batch_type>());
batch_type fac = select(test1, batch_type(0.5), batch_type(1.));
batch_type tmp = exp(x * fac);
batch_type tmp1 = batch_type(0.5) * tmp;
return select(test1, tmp1 * tmp, detail::average(tmp, batch_type(1.) / tmp));
}
template <class A, class T>
inline batch<std::complex<T>, A> cosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
auto x = z.real();
auto y = z.imag();
return { cosh(x) * cos(y), sinh(x) * sin(y) };
}
// sin
namespace detail
{
template <class A, class T, class Tag = trigo_radian_tag>
inline batch<T, A> sin(batch<T, A> const& self, Tag = Tag()) noexcept
{
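                // Same scheme as cos above, with the polynomial selection flipped
                // (quadrant 0 uses the sine polynomial) and the sign of the input
                // folded back in. Tag selects radian (default) or pi-scaled reduction.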
using batch_type = batch<T, A>;
const batch_type x = abs(self);
batch_type xr = constants::nan<batch_type>();
const batch_type n = detail::trigo_reducer<batch_type, Tag>::reduce(x, xr);
auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
auto swap_bit = fma(batch_type(-2.), tmp, n);
auto sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
const batch_type z = xr * xr;
const batch_type se = detail::sin_eval(z, xr);
const batch_type ce = detail::cos_eval(z);
const batch_type z1 = select(swap_bit == batch_type(0.), se, ce);
return z1 ^ sign_bit;
}
}
template <class A, class T>
inline batch<T, A> sin(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return detail::sin(self);
}
template <class A, class T>
inline batch<std::complex<T>, A> sin(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
return { sin(z.real()) * cosh(z.imag()), cos(z.real()) * sinh(z.imag()) };
}
// sincos
template <class A, class T>
inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self, requires_arch<generic>) noexcept
{
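            // Computes sin and cos of the same argument with a single reduction: both
            // share the reduced argument xr and the quadrant n; only the polynomial
            // selection and the sign handling differ between the two results.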
using batch_type = batch<T, A>;
const batch_type x = abs(self);
batch_type xr = constants::nan<batch_type>();
const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
auto swap_bit = fma(batch_type(-2.), tmp, n);
const batch_type z = xr * xr;
const batch_type se = detail::sin_eval(z, xr);
const batch_type ce = detail::cos_eval(z);
auto sin_sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
const batch_type sin_z1 = select(swap_bit == batch_type(0.), se, ce);
auto cos_sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
const batch_type cos_z1 = select(swap_bit != batch_type(0.), se, ce);
return std::make_pair(sin_z1 ^ sin_sign_bit, cos_z1 ^ cos_sign_bit);
}
template <class A, class T>
inline std::pair<batch<std::complex<T>, A>, batch<std::complex<T>, A>>
sincos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
using real_batch = typename batch_type::real_batch;
real_batch rcos = cos(z.real());
real_batch rsin = sin(z.real());
real_batch icosh = cosh(z.imag());
real_batch isinh = sinh(z.imag());
return std::make_pair(batch_type(rsin * icosh, rcos * isinh), batch_type(rcos * icosh, -rsin * isinh));
}
// sinh
namespace detail
{
/* origin: boost/simd/arch/common/detail/generic/sinh_kernel.hpp */
/*
* ====================================================
* copyright 2016 NumScale SAS
*
* Distributed under the Boost Software License, Version 1.0.
* (See copy at http://boost.org/LICENSE_1_0.txt)
* ====================================================
*/
template <class A>
inline batch<float, A> sinh_kernel(batch<float, A> const& self) noexcept
{
using batch_type = batch<float, A>;
batch_type sqr_self = self * self;
return detail::horner<batch_type,
0x3f800000, // 1.0f
0x3e2aaacc, // 1.66667160211E-1f
0x3c087bbe, // 8.33028376239E-3f
0x39559e2f // 2.03721912945E-4f
>(sqr_self)
* self;
}
template <class A>
inline batch<double, A> sinh_kernel(batch<double, A> const& self) noexcept
{
using batch_type = batch<double, A>;
batch_type sqrself = self * self;
return fma(self, (detail::horner<batch_type,
0xc115782bdbf6ab05ull, // -3.51754964808151394800E5
0xc0c694b8c71d6182ull, // -1.15614435765005216044E4,
0xc064773a398ff4feull, // -1.63725857525983828727E2,
0xbfe9435fe8bb3cd6ull // -7.89474443963537015605E-1
>(sqrself)
/ detail::horner1<batch_type,
0xc1401a20e4f90044ull, // -2.11052978884890840399E6
0x40e1a7ba7ed72245ull, // 3.61578279834431989373E4,
0xc0715b6096e96484ull // -2.77711081420602794433E2,
>(sqrself))
* sqrself,
self);
}
}
/* origin: boost/simd/arch/common/simd/function/sinh.hpp */
/*
* ====================================================
* copyright 2016 NumScale SAS
*
* Distributed under the Boost Software License, Version 1.0.
* (See copy at http://boost.org/LICENSE_1_0.txt)
* ====================================================
*/
template <class A, class T>
inline batch<T, A> sinh(batch<T, A> const& a, requires_arch<generic>) noexcept
{
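            // sinh(a) = sign(a) * sinh(|a|): the polynomial kernel covers |a| < 1,
            // larger inputs use (exp(x) - 1/exp(x)) / 2. Near the overflow threshold
            // (test1) exp is evaluated at x/2 and the result squared, dropping the
            // negligible 1/exp(x) term.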
using batch_type = batch<T, A>;
batch_type half(0.5);
batch_type x = abs(a);
auto lt1 = x < batch_type(1.);
batch_type bts = bitofsign(a);
batch_type z(0.);
if (any(lt1))
{
z = detail::sinh_kernel(x);
if (all(lt1))
return z ^ bts;
}
auto test1 = x > (constants::maxlog<batch_type>() - constants::log_2<batch_type>());
batch_type fac = select(test1, half, batch_type(1.));
batch_type tmp = exp(x * fac);
batch_type tmp1 = half * tmp;
batch_type r = select(test1, tmp1 * tmp, tmp1 - half / tmp);
return select(lt1, z, r) ^ bts;
}
template <class A, class T>
inline batch<std::complex<T>, A> sinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
auto x = z.real();
auto y = z.imag();
return { sinh(x) * cos(y), cosh(x) * sin(y) };
}
// tan
template <class A, class T>
inline batch<T, A> tan(batch<T, A> const& self, requires_arch<generic>) noexcept
{
using batch_type = batch<T, A>;
const batch_type x = abs(self);
batch_type xr = constants::nan<batch_type>();
const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
auto swap_bit = fma(batch_type(-2.), tmp, n);
auto test = (swap_bit == batch_type(0.));
const batch_type y = detail::tan_eval(xr, test);
return y ^ bitofsign(self);
}
template <class A, class T>
inline batch<std::complex<T>, A> tan(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
using batch_type = batch<std::complex<T>, A>;
using real_batch = typename batch_type::real_batch;
real_batch d = cos(2 * z.real()) + cosh(2 * z.imag());
batch_type winf(constants::infinity<real_batch>(), constants::infinity<real_batch>());
real_batch wreal = sin(2 * z.real()) / d;
real_batch wimag = sinh(2 * z.imag());
batch_type wres = select(isinf(wimag), batch_type(wreal, real_batch(1.)), batch_type(wreal, wimag / d));
return select(d == real_batch(0.), winf, wres);
}
// tanh
namespace detail
{
/* origin: boost/simd/arch/common/detail/generic/tanh_kernel.hpp */
/*
* ====================================================
* copyright 2016 NumScale SAS
*
* Distributed under the Boost Software License, Version 1.0.
* (See copy at http://boost.org/LICENSE_1_0.txt)
* ====================================================
*/
template <class B>
struct tanh_kernel;
template <class A>
struct tanh_kernel<batch<float, A>>
{
using batch_type = batch<float, A>;
static inline batch_type tanh(const batch_type& x) noexcept
{
batch_type sqrx = x * x;
return fma(detail::horner<batch_type,
0xbeaaaa99, // -3.33332819422E-1F
0x3e088393, // +1.33314422036E-1F
0xbd5c1e2d, // -5.37397155531E-2F
0x3ca9134e, // +2.06390887954E-2F
0xbbbaf0ea // -5.70498872745E-3F
>(sqrx)
* sqrx,
x, x);
}
static inline batch_type cotanh(const batch_type& x) noexcept
{
return batch_type(1.) / tanh(x);
}
};
template <class A>
struct tanh_kernel<batch<double, A>>
{
using batch_type = batch<double, A>;
static inline batch_type tanh(const batch_type& x) noexcept
{
batch_type sqrx = x * x;
return fma(sqrx * p(sqrx) / q(sqrx), x, x);
}
static inline batch_type cotanh(const batch_type& x) noexcept
{
batch_type sqrx = x * x;
batch_type qval = q(sqrx);
return qval / (x * fma(p(sqrx), sqrx, qval));
}
static inline batch_type p(const batch_type& x) noexcept
{
return detail::horner<batch_type,
0xc0993ac030580563, // -1.61468768441708447952E3
0xc058d26a0e26682d, // -9.92877231001918586564E1,
0xbfeedc5baafd6f4b // -9.64399179425052238628E-1
>(x);
}
static inline batch_type q(const batch_type& x) noexcept
{
return detail::horner1<batch_type,
0x40b2ec102442040c, // 4.84406305325125486048E3
0x40a176fa0e5535fa, // 2.23548839060100448583E3,
0x405c33f28a581B86 // 1.12811678491632931402E2,
>(x);
}
};
}
/* origin: boost/simd/arch/common/simd/function/tanh.hpp */
/*
* ====================================================
* copyright 2016 NumScale SAS
*
* Distributed under the Boost Software License, Version 1.0.
* (See copy at http://boost.org/LICENSE_1_0.txt)
* ====================================================
*/
template <class A, class T>
inline batch<T, A> tanh(batch<T, A> const& self, requires_arch<generic>) noexcept
{
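            // tanh(x) = sign(x) * tanh(|x|): small inputs (|x| < 5/8) use the polynomial
            // kernel, larger ones the identity tanh(|x|) = 1 - 2 / (1 + exp(2|x|)).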
using batch_type = batch<T, A>;
batch_type one(1.);
batch_type x = abs(self);
auto test = x < (batch_type(5.) / batch_type(8.));
batch_type bts = bitofsign(self);
batch_type z = one;
if (any(test))
{
z = detail::tanh_kernel<batch_type>::tanh(x);
if (all(test))
return z ^ bts;
}
batch_type r = fma(batch_type(-2.), one / (one + exp(x + x)), one);
return select(test, z, r) ^ bts;
}
template <class A, class T>
inline batch<std::complex<T>, A> tanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
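            // Uses the identity tanh(x + iy) = (sinh(2x) + i sin(2y)) / (cosh(2x) + cos(2y)).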
using real_batch = typename batch<std::complex<T>, A>::real_batch;
auto x = z.real();
auto y = z.imag();
real_batch two(2);
auto d = cosh(two * x) + cos(two * y);
return { sinh(two * x) / d, sin(two * y) / d };
}
}
}
#endif

1657
third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp vendored Normal file

Diff not shown because of its large size. Load diff

940
third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp vendored Normal file
View file

@@ -0,0 +1,940 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_AVX2_HPP
#define XSIMD_AVX2_HPP
#include <complex>
#include <type_traits>
#include "../types/xsimd_avx2_register.hpp"
namespace xsimd
{
namespace kernel
{
using namespace types;
// abs
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx2>) noexcept
{
if (std::is_signed<T>::value)
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm256_abs_epi8(self);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_abs_epi16(self);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm256_abs_epi32(self);
}
else
{
return abs(self, avx {});
}
}
return self;
}
// add
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm256_add_epi8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_add_epi16(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm256_add_epi32(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
return _mm256_add_epi64(self, other);
}
else
{
return add(self, other, avx {});
}
}
// bitwise_and
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
return _mm256_and_si256(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
{
return _mm256_and_si256(self, other);
}
// bitwise_andnot
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
return _mm256_andnot_si256(other, self);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
{
return _mm256_andnot_si256(other, self);
}
// bitwise_not
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx2>) noexcept
{
return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
{
return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
}
// bitwise_lshift
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_slli_epi16(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm256_slli_epi32(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
return _mm256_slli_epi64(self, other);
}
else
{
return bitwise_lshift(self, other, avx {});
}
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm256_sllv_epi32(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
return _mm256_sllv_epi64(self, other);
}
else
{
return bitwise_lshift(self, other, avx {});
}
}
// bitwise_or
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
return _mm256_or_si256(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
{
return _mm256_or_si256(self, other);
}
// bitwise_rshift
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
{
if (std::is_signed<T>::value)
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
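                    // AVX2 has no 8-bit arithmetic shift: shift as 16-bit lanes, then mask
                    // off the bits dragged in from the neighbouring byte (sign_mask) and
                    // re-insert sign bits only for the bytes that are actually negative.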
__m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF);
__m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self);
__m256i res = _mm256_srai_epi16(self, other);
return _mm256_or_si256(
detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
{ return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
sign_mask, cmp_is_negative),
_mm256_andnot_si256(sign_mask, res));
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_srai_epi16(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm256_srai_epi32(self, other);
}
else
{
return bitwise_rshift(self, other, avx {});
}
}
else
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_srli_epi16(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm256_srli_epi32(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
return _mm256_srli_epi64(self, other);
}
else
{
return bitwise_rshift(self, other, avx {});
}
}
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
if (std::is_signed<T>::value)
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm256_srav_epi32(self, other);
}
else
{
return bitwise_rshift(self, other, avx {});
}
}
else
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm256_srlv_epi32(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
return _mm256_srlv_epi64(self, other);
}
else
{
return bitwise_rshift(self, other, avx {});
}
}
}
// bitwise_xor
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
return _mm256_xor_si256(self, other);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
{
return _mm256_xor_si256(self, other);
}
// complex_low
template <class A>
inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
{
__m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0));
__m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0));
return _mm256_blend_pd(tmp0, tmp1, 10);
}
// complex_high
template <class A>
inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
{
__m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2));
__m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0));
return _mm256_blend_pd(tmp0, tmp1, 10);
}
// fast_cast
namespace detail
{
template <class A>
inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<avx2>) noexcept
{
// see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
__m256i msk_lo = _mm256_set1_epi32(0xFFFF);
__m256 cnst65536f = _mm256_set1_ps(65536.0f);
__m256i v_lo = _mm256_and_si256(v, msk_lo); /* extract the 16 lowest significant bits of self */
__m256i v_hi = _mm256_srli_epi32(v, 16); /* 16 most significant bits of v */
__m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding */
__m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */
v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */
return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
}
template <class A>
inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
{
// from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
// adapted to avx
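                // Each 32-bit half is OR-ed into the mantissa of a large power-of-two
                // double (2^84 for the high half, 2^52 for the low half); subtracting the
                // combined offset then reconstructs the exact value without a native
                // 64-bit integer to double conversion.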
__m256i xH = _mm256_srli_epi64(x, 32);
xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84
__m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000,
0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
__m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
}
template <class A>
inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
{
// from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
// adapted to avx
__m256i xH = _mm256_srai_epi32(x, 16);
xH = _mm256_and_si256(xH, _mm256_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
xH = _mm256_add_epi64(xH, _mm256_castpd_si256(_mm256_set1_pd(442721857769029238784.))); // 3*2^67
__m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000,
0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
__m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
__m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
}
}
// eq
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm256_cmpeq_epi8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_cmpeq_epi16(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm256_cmpeq_epi32(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
return _mm256_cmpeq_epi64(self, other);
}
else
{
return eq(self, other, avx {});
}
}
// gather
template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
kernel::requires_arch<avx2>) noexcept
{
// scatter for this one is AVX512F+AVX512VL
return _mm256_i32gather_epi32(reinterpret_cast<const int*>(src), index, sizeof(T));
}
template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
kernel::requires_arch<avx2>) noexcept
{
// scatter for this one is AVX512F+AVX512VL
return _mm256_i64gather_epi64(reinterpret_cast<const long long int*>(src), index, sizeof(T));
}
template <class A, class U,
detail::enable_sized_integral_t<U, 4> = 0>
inline batch<float, A> gather(batch<float, A> const&, float const* src,
batch<U, A> const& index,
kernel::requires_arch<avx2>) noexcept
{
// scatter for this one is AVX512F+AVX512VL
return _mm256_i32gather_ps(src, index, sizeof(float));
}
template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
inline batch<double, A> gather(batch<double, A> const&, double const* src,
batch<U, A> const& index,
requires_arch<avx2>) noexcept
{
// scatter for this one is AVX512F+AVX512VL
return _mm256_i64gather_pd(src, index, sizeof(double));
}
// gather: handmade conversions
template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
inline batch<float, A> gather(batch<float, A> const&, double const* src,
batch<V, A> const& index,
requires_arch<avx2>) noexcept
{
const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
return detail::merge_sse(_mm256_cvtpd_ps(low.data), _mm256_cvtpd_ps(high.data));
}
template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
inline batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
batch<V, A> const& index,
requires_arch<avx2>) noexcept
{
const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
return detail::merge_sse(_mm256_cvtpd_epi32(low.data), _mm256_cvtpd_epi32(high.data));
}
// lt
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
if (std::is_signed<T>::value)
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm256_cmpgt_epi8(other, self);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_cmpgt_epi16(other, self);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm256_cmpgt_epi32(other, self);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
return _mm256_cmpgt_epi64(other, self);
}
else
{
return lt(self, other, avx {});
}
}
else
{
return lt(self, other, avx {});
}
}
// load_complex
template <class A>
inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx2>) noexcept
{
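            // hi and lo hold interleaved (real, imag) pairs; the shuffles gather the even
            // and odd elements within each 128-bit lane and _mm256_permute4x64_pd restores
            // the cross-lane order, yielding separate real and imaginary batches.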
using batch_type = batch<float, A>;
batch_type real = _mm256_castpd_ps(
_mm256_permute4x64_pd(
_mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))),
_MM_SHUFFLE(3, 1, 2, 0)));
batch_type imag = _mm256_castpd_ps(
_mm256_permute4x64_pd(
_mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))),
_MM_SHUFFLE(3, 1, 2, 0)));
return { real, imag };
}
template <class A>
inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx2>) noexcept
{
using batch_type = batch<double, A>;
batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
return { real, imag };
}
// mask
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
uint64_t mask8 = 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4) | (detail::mask_lut(mask8 >> 16) << 8) | (detail::mask_lut(mask8 >> 24) << 12);
}
else
{
return mask(self, avx {});
}
}
// max
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
if (std::is_signed<T>::value)
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm256_max_epi8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_max_epi16(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm256_max_epi32(self, other);
}
else
{
return max(self, other, avx {});
}
}
else
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm256_max_epu8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_max_epu16(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm256_max_epu32(self, other);
}
else
{
return max(self, other, avx {});
}
}
}
// min
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
if (std::is_signed<T>::value)
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm256_min_epi8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_min_epi16(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm256_min_epi32(self, other);
}
else
{
return min(self, other, avx {});
}
}
else
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm256_min_epu8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_min_epu16(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm256_min_epu32(self, other);
}
else
{
return min(self, other, avx {});
}
}
}
// mul
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_mullo_epi16(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm256_mullo_epi32(self, other);
}
else
{
return mul(self, other, avx {});
}
}
// reduce_add
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
__m256i tmp1 = _mm256_hadd_epi32(self, self);
__m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1);
__m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
__m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3);
return _mm_cvtsi128_si32(tmp4);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
__m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E);
__m256i tmp2 = _mm256_add_epi64(self, tmp1);
__m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
__m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3);
#if defined(__x86_64__)
return _mm_cvtsi128_si64(res);
#else
__m128i m;
_mm_storel_epi64(&m, res);
int64_t i;
std::memcpy(&i, &m, sizeof(i));
return i;
#endif
}
else
{
return reduce_add(self, avx {});
}
}
// sadd
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
if (std::is_signed<T>::value)
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm256_adds_epi8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_adds_epi16(self, other);
}
else
{
return sadd(self, other, avx {});
}
}
else
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm256_adds_epu8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_adds_epu16(self, other);
}
else
{
return sadd(self, other, avx {});
}
}
}
// select
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm256_blendv_epi8(false_br, true_br, cond);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_blendv_epi8(false_br, true_br, cond);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm256_blendv_epi8(false_br, true_br, cond);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
return _mm256_blendv_epi8(false_br, true_br, cond);
}
else
{
return select(cond, true_br, false_br, avx {});
}
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
{
constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
// FIXME: for some reason mask here is not considered as an immediate,
// but it's okay for _mm256_blend_epi32
// case 2: return _mm256_blend_epi16(false_br, true_br, mask);
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm256_blend_epi32(false_br, true_br, mask);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
constexpr int imask = detail::interleave(mask);
return _mm256_blend_epi32(false_br, true_br, imask);
}
else
{
return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
}
}
// slide_left
template <size_t N, class A, class T>
inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx2>) noexcept
{
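            // _mm256_bslli_epi128 shifts each 128-bit lane independently, so the bytes that
            // cross the lane boundary are recovered with bsrli + permute2x128 and OR-ed back
            // in; zero, whole-register and whole-lane shifts are special-cased first.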
constexpr unsigned BitCount = N * 8;
if (BitCount == 0)
{
return x;
}
if (BitCount >= 256)
{
return batch<T, A>(T(0));
}
if (BitCount > 128)
{
constexpr unsigned M = (BitCount - 128) / 8;
auto y = _mm256_bslli_epi128(x, M);
return _mm256_permute2x128_si256(y, y, 0x28);
}
if (BitCount == 128)
{
return _mm256_permute2x128_si256(x, x, 0x28);
}
// shifting by [0, 128[ bits
constexpr unsigned M = BitCount / 8;
auto y = _mm256_bslli_epi128(x, M);
auto z = _mm256_bsrli_epi128(x, 16 - M);
auto w = _mm256_permute2x128_si256(z, z, 0x28);
return _mm256_or_si256(y, w);
}
// slide_right
template <size_t N, class A, class T>
inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx2>) noexcept
{
constexpr unsigned BitCount = N * 8;
if (BitCount == 0)
{
return x;
}
if (BitCount >= 256)
{
return batch<T, A>(T(0));
}
if (BitCount > 128)
{
constexpr unsigned M = (BitCount - 128) / 8;
auto y = _mm256_bsrli_epi128(x, M);
return _mm256_permute2x128_si256(y, y, 0x81);
}
if (BitCount == 128)
{
return _mm256_permute2x128_si256(x, x, 0x81);
}
// shifting by [0, 128[ bits
constexpr unsigned M = BitCount / 8;
auto y = _mm256_bsrli_epi128(x, M);
auto z = _mm256_bslli_epi128(x, 16 - M);
auto w = _mm256_permute2x128_si256(z, z, 0x81);
return _mm256_or_si256(y, w);
}
// ssub
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
if (std::is_signed<T>::value)
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm256_subs_epi8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_subs_epi16(self, other);
}
else
{
return ssub(self, other, avx {});
}
}
else
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm256_subs_epu8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_subs_epu16(self, other);
}
else
{
return ssub(self, other, avx {});
}
}
}
// sub
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm256_sub_epi8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm256_sub_epi16(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm256_sub_epi32(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
return _mm256_sub_epi64(self, other);
}
else
{
return sub(self, other, avx {});
}
}
// swizzle
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
{
return _mm256_permutevar8x32_ps(self, (batch<uint32_t, A>)mask);
}
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
{
constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
return _mm256_permute4x64_pd(self, mask);
}
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
{
constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
return _mm256_permute4x64_epi64(self, mask);
}
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
{
return bitwise_cast<batch<int64_t, A>>(swizzle(bitwise_cast<batch<uint64_t, A>>(self), mask, avx2 {}));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
{
return _mm256_permutevar8x32_epi32(self, (batch<uint32_t, A>)mask);
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
{
return bitwise_cast<batch<int32_t, A>>(swizzle(bitwise_cast<batch<uint32_t, A>>(self), mask, avx2 {}));
}
// zip_hi
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
auto lo = _mm256_unpacklo_epi8(self, other);
auto hi = _mm256_unpackhi_epi8(self, other);
return _mm256_permute2f128_si256(lo, hi, 0x31);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
auto lo = _mm256_unpacklo_epi16(self, other);
auto hi = _mm256_unpackhi_epi16(self, other);
return _mm256_permute2f128_si256(lo, hi, 0x31);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
auto lo = _mm256_unpacklo_epi32(self, other);
auto hi = _mm256_unpackhi_epi32(self, other);
return _mm256_permute2f128_si256(lo, hi, 0x31);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
auto lo = _mm256_unpacklo_epi64(self, other);
auto hi = _mm256_unpackhi_epi64(self, other);
return _mm256_permute2f128_si256(lo, hi, 0x31);
}
else
{
assert(false && "unsupported arch/op combination");
return {};
}
}
// zip_lo
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
auto lo = _mm256_unpacklo_epi8(self, other);
auto hi = _mm256_unpackhi_epi8(self, other);
return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
auto lo = _mm256_unpacklo_epi16(self, other);
auto hi = _mm256_unpackhi_epi16(self, other);
return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
auto lo = _mm256_unpacklo_epi32(self, other);
auto hi = _mm256_unpackhi_epi32(self, other);
return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
auto lo = _mm256_unpacklo_epi64(self, other);
auto hi = _mm256_unpackhi_epi64(self, other);
return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
}
else
{
assert(false && "unsupported arch/op combination");
return {};
}
}
}
}
#endif

627
third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp vendored Normal file
View file

@@ -0,0 +1,627 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_AVX512BW_HPP
#define XSIMD_AVX512BW_HPP
#include <array>
#include <type_traits>
#include "../types/xsimd_avx512bw_register.hpp"
namespace xsimd
{
namespace kernel
{
using namespace types;
namespace detail
{
template <class A, class T, int Cmp>
inline batch_bool<T, A> compare_int_avx512bw(batch<T, A> const& self, batch<T, A> const& other) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
if (std::is_signed<T>::value)
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return (register_type)_mm512_cmp_epi8_mask(self, other, Cmp);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return (register_type)_mm512_cmp_epi16_mask(self, other, Cmp);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp);
}
}
else
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return (register_type)_mm512_cmp_epu8_mask(self, other, Cmp);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return (register_type)_mm512_cmp_epu16_mask(self, other, Cmp);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp);
}
}
}
}
// abs
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx512bw>) noexcept
{
if (std::is_unsigned<T>::value)
{
return self;
}
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm512_abs_epi8(self);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_abs_epi16(self);
}
else
{
return abs(self, avx512dq {});
}
}
// add
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm512_add_epi8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_add_epi16(self, other);
}
else
{
return add(self, other, avx512dq {});
}
}
// bitwise_lshift
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
{
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_sllv_epi16(self, _mm512_set1_epi16(other));
#else
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_slli_epi16(self, other);
#endif
}
else
{
return bitwise_lshift(self, other, avx512dq {});
}
}
// bitwise_rshift
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
{
if (std::is_signed<T>::value)
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
__m512i sign_mask = _mm512_set1_epi16((0xFF00 >> other) & 0x00FF);
__m512i zeros = _mm512_setzero_si512();
__mmask64 cmp_is_negative_mask = _mm512_cmpgt_epi8_mask(zeros, self);
__m512i cmp_sign_mask = _mm512_mask_blend_epi8(cmp_is_negative_mask, zeros, sign_mask);
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
__m512i res = _mm512_srav_epi16(self, _mm512_set1_epi16(other));
#else
__m512i res = _mm512_srai_epi16(self, other);
#endif
return _mm512_or_si512(cmp_sign_mask, _mm512_andnot_si512(sign_mask, res));
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_srav_epi16(self, _mm512_set1_epi16(other));
#else
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_srai_epi16(self, other);
#endif
}
else
{
return bitwise_rshift(self, other, avx512dq {});
}
}
else
{
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_srlv_epi16(self, _mm512_set1_epi16(other));
#else
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_srli_epi16(self, other);
#endif
}
else
{
return bitwise_rshift(self, other, avx512dq {});
}
}
}
// eq
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_EQ>(self, other);
}
// ge
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GE>(self, other);
}
// gt
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GT>(self, other);
}
// le
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LE>(self, other);
}
// lt
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LT>(self, other);
}
// max
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
if (std::is_signed<T>::value)
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm512_max_epi8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_max_epi16(self, other);
}
else
{
return max(self, other, avx512dq {});
}
}
else
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm512_max_epu8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_max_epu16(self, other);
}
else
{
return max(self, other, avx512dq {});
}
}
}
// min
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
if (std::is_signed<T>::value)
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm512_min_epi8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_min_epi16(self, other);
}
else
{
return min(self, other, avx512dq {});
}
}
else
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm512_min_epu8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_min_epu16(self, other);
}
else
{
return min(self, other, avx512dq {});
}
}
}
// mul
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
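                // There is no 8-bit multiply: the even bytes are multiplied through
                // 16-bit lanes (keeping only the low byte of each product) and the odd
                // bytes through shifted 16-bit lanes, then the two halves are OR-ed together.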
__m512i upper = _mm512_and_si512(_mm512_mullo_epi16(self, other), _mm512_srli_epi16(_mm512_set1_epi16(-1), 8));
__m512i lower = _mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(self, 8), _mm512_srli_epi16(other, 8)), 8);
return _mm512_or_si512(upper, lower);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_mullo_epi16(self, other);
}
else
{
return mul(self, other, avx512dq {});
}
}
// neq
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_NE>(self, other);
}
// sadd
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
if (std::is_signed<T>::value)
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm512_adds_epi8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_adds_epi16(self, other);
}
else
{
return sadd(self, other, avx512dq {});
}
}
else
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm512_adds_epu8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_adds_epu16(self, other);
}
else
{
return sadd(self, other, avx512dq {});
}
}
}
// select
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512bw>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm512_mask_blend_epi8(cond, false_br.data, true_br.data);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_mask_blend_epi16(cond, false_br.data, true_br.data);
}
else
{
return select(cond, true_br, false_br, avx512dq {});
}
}
// slide_left
namespace detail
{
template <size_t... Is>
constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_hi(::xsimd::detail::index_sequence<Is...>)
{
return { (Is == 0 ? 8 : Is - 1)... };
}
template <size_t N, size_t... Is>
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_pattern(::xsimd::detail::index_sequence<Is...>)
{
return { (Is >= N ? Is - N : 0)... };
}
template <size_t N, size_t... Is>
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_mask(::xsimd::detail::index_sequence<Is...>)
{
return { (Is >= N ? 0xFFFF : 0x0000)... };
}
}
template <size_t N, class A, class T>
inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
{
constexpr unsigned BitCount = N * 8;
if (BitCount == 0)
{
return x;
}
if (BitCount >= 512)
{
return batch<T, A>(T(0));
}
batch<T, A> xx;
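            // _mm512_permutexvar_epi16 only moves whole 16-bit words, so an odd byte count
            // is handled first by shifting every qword one byte and pulling the carried byte
            // from the neighbouring qword; the remaining even amount is slid below.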
if (N & 1)
{
alignas(A::alignment()) uint64_t buffer[8];
_mm512_store_epi64(&buffer[0], x);
for (int i = 7; i > 0; --i)
buffer[i] = (buffer[i] << 8) | (buffer[i - 1] >> 56);
buffer[0] = buffer[0] << 8;
xx = _mm512_load_epi64(&buffer[0]);
alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_hi(::xsimd::detail::make_index_sequence<512 / 64>());
__m512i xl = _mm512_slli_epi64(x, 8);
__m512i xr = _mm512_srli_epi64(x, 56);
xr = _mm512_permutex2var_epi64(xr, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
xx = _mm512_or_si512(xr, xl);
if (N == 1)
return xx;
}
else
{
xx = x;
}
alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
alignas(A::alignment()) auto slide_mask = detail::make_slide_left_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
}
// slide_right
namespace detail
{
template <size_t... Is>
constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_low(::xsimd::detail::index_sequence<Is...>)
{
return { (Is + 1)... };
}
template <size_t N, size_t... Is>
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_pattern(::xsimd::detail::index_sequence<Is...>)
{
return { (Is < (32 - N) ? Is + N : 0)... };
}
template <size_t N, size_t... Is>
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_mask(::xsimd::detail::index_sequence<Is...>)
{
return { (Is < 32 - N ? 0xFFFF : 0x0000)... };
}
}
template <size_t N, class A, class T>
inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
{
constexpr unsigned BitCount = N * 8;
if (BitCount == 0)
{
return x;
}
if (BitCount >= 512)
{
return batch<T, A>(T(0));
}
batch<T, A> xx;
if (N & 1)
{
alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_low(::xsimd::detail::make_index_sequence<512 / 64>());
__m512i xr = _mm512_srli_epi64(x, 8);
__m512i xl = _mm512_slli_epi64(x, 56);
xl = _mm512_permutex2var_epi64(xl, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
xx = _mm512_or_si512(xr, xl);
if (N == 1)
return xx;
}
else
{
xx = x;
}
alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
alignas(A::alignment()) auto slide_mask = detail::make_slide_right_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
}
// ssub
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
if (std::is_signed<T>::value)
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm512_subs_epi8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_subs_epi16(self, other);
}
else
{
return ssub(self, other, avx512dq {});
}
}
else
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm512_subs_epu8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_subs_epu16(self, other);
}
else
{
return ssub(self, other, avx512dq {});
}
}
}
// sub
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm512_sub_epi8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm512_sub_epi16(self, other);
}
else
{
return sub(self, other, avx512dq {});
}
}
// swizzle
template <class A, uint16_t... Vs>
inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
{
return _mm512_permutexvar_epi16((batch<uint16_t, A>)mask, self);
}
template <class A, uint16_t... Vs>
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
{
return bitwise_cast<batch<int16_t, A>>(swizzle(bitwise_cast<batch<uint16_t, A>>(self), mask, avx512bw {}));
}
template <class A, uint8_t... Vs>
inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
{
return _mm512_shuffle_epi8(self, (batch<uint8_t, A>)mask);
}
template <class A, uint8_t... Vs>
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
{
return bitwise_cast<batch<int8_t, A>>(swizzle(bitwise_cast<batch<uint8_t, A>>(self), mask, avx512bw {}));
}
// zip_hi
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
__m512i lo, hi;
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
lo = _mm512_unpacklo_epi8(self, other);
hi = _mm512_unpackhi_epi8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
lo = _mm512_unpacklo_epi16(self, other);
hi = _mm512_unpackhi_epi16(self, other);
}
else
{
return zip_hi(self, other, avx512f {});
}
return _mm512_inserti32x4(
_mm512_inserti32x4(
_mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0),
_mm512_extracti32x4_epi32(lo, 3),
2),
_mm512_extracti32x4_epi32(hi, 2),
1);
}
// zip_lo
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
{
__m512i lo, hi;
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
lo = _mm512_unpacklo_epi8(self, other);
hi = _mm512_unpackhi_epi8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
lo = _mm512_unpacklo_epi16(self, other);
hi = _mm512_unpackhi_epi16(self, other);
}
else
{
return zip_lo(self, other, avx512f {});
}
return _mm512_inserti32x4(
_mm512_inserti32x4(
_mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1),
_mm512_extracti32x4_epi32(hi, 1),
3),
_mm512_extracti32x4_epi32(lo, 1),
2);
}
}
}
#endif

28
third_party/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp vendored Normal file
View file

@@ -0,0 +1,28 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_AVX512CD_HPP
#define XSIMD_AVX512CD_HPP
#include "../types/xsimd_avx512cd_register.hpp"
namespace xsimd
{
namespace kernel
{
        // Nothing here yet.
}
}
#endif

212
third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp vendored Normal file
View file

@@ -0,0 +1,212 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_AVX512_DQ_HPP
#define XSIMD_AVX512_DQ_HPP
#include "../types/xsimd_avx512dq_register.hpp"
namespace xsimd
{
namespace kernel
{
using namespace types;
// bitwise_and
template <class A>
inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_and_ps(self, other);
}
template <class A>
inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_and_pd(self, other);
}
// bitwise_andnot
template <class A>
inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_andnot_ps(other, self);
}
template <class A>
inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_andnot_pd(other, self);
}
// bitwise_not
template <class A>
inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
}
template <class A>
inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512f>) noexcept
{
return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1)));
}
// bitwise_or
template <class A>
inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_or_ps(self, other);
}
template <class A>
inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_or_pd(self, other);
}
template <class A, class T>
inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512dq>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
return register_type(self.data | other.data);
}
// bitwise_xor
template <class A>
inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_xor_ps(self, other);
}
template <class A>
inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_xor_pd(self, other);
}
// haddp
template <class A>
inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512dq>) noexcept
{
// The following folds over the vector once:
// tmp1 = [a0..8, b0..8]
// tmp2 = [a8..f, b8..f]
#define XSIMD_AVX512_HADDP_STEP1(I, a, b) \
batch<float, avx512f> res##I; \
{ \
auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
res##I = _mm512_add_ps(tmp1, tmp2); \
}
XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]);
XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]);
XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]);
XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]);
XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]);
XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]);
XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]);
#undef XSIMD_AVX512_HADDP_STEP1
            // The following folds the vector again and shuffles so that hadd_ps produces the correct result
// tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3)
// tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4)
// tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ...
#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \
batch<float, avx2> halfx##I; \
{ \
auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \
auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \
\
auto resx1 = _mm512_add_ps(tmp1, tmp2); \
\
auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
\
auto resx2 = _mm512_add_ps(tmp3, tmp4); \
\
auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \
auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \
\
auto resx3 = _mm512_add_ps(tmp5, tmp6); \
\
halfx##I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \
_mm512_extractf32x8_ps(resx3, 1)); \
}
XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3);
XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7);
#undef XSIMD_AVX512_HADDP_STEP2
auto concat = _mm512_castps256_ps512(halfx0);
concat = _mm512_insertf32x8(concat, halfx1, 1);
return concat;
}
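        // Editor's note (not part of upstream xsimd): a minimal sketch of what haddp
        // computes, assuming `rows` points to 16 batches of 16 floats each:
        //   batch<float, A> sums = haddp(rows, avx512dq {});
        //   // element i of sums == horizontal sum of rows[i]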
// ldexp
template <class A>
inline batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512dq>) noexcept
{
return _mm512_scalef_pd(self, _mm512_cvtepi64_pd(other));
}
// mul
template <class A>
inline batch<uint64_t, A> mul(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_mullo_epi64(self, other);
}
template <class A>
inline batch<int64_t, A> mul(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_mullo_epi64(self, other);
}
// nearbyint_as_int
template <class A>
inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
requires_arch<avx512dq>) noexcept
{
return _mm512_cvtpd_epi64(self);
}
// reduce_add
template <class A>
inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
{
__m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
__m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
__m256 res1 = _mm256_add_ps(tmp1, tmp2);
return reduce_add(batch<float, avx2>(res1), avx2 {});
}
// convert
namespace detail
{
template <class A>
inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx512dq>) noexcept
{
                return _mm512_cvtepi64_pd(x);
}
template <class A>
inline batch<int64_t, A> fast_cast(batch<double, A> const& self, batch<int64_t, A> const&, requires_arch<avx512dq>) noexcept
{
return _mm512_cvttpd_epi64(self);
}
}
}
}
#endif

1989
third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp vendored Normal file

File diff suppressed because it is too large. Load diff.

384
third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp vendored Normal file
View file

@@ -0,0 +1,384 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_NUMERICAL_CONSTANT_HPP
#define XSIMD_NUMERICAL_CONSTANT_HPP
#include <limits>
#include "../types/xsimd_utils.hpp"
namespace xsimd
{
namespace constants
{
#define XSIMD_DEFINE_CONSTANT(NAME, SINGLE, DOUBLE) \
template <class T> \
inline T NAME() noexcept \
{ \
return T(NAME<typename T::value_type>()); \
} \
template <> \
inline float NAME<float>() noexcept \
{ \
return SINGLE; \
} \
template <> \
inline double NAME<double>() noexcept \
{ \
return DOUBLE; \
}
#define XSIMD_DEFINE_CONSTANT_HEX(NAME, SINGLE, DOUBLE) \
template <class T> \
inline T NAME() noexcept \
{ \
return T(NAME<typename T::value_type>()); \
} \
template <> \
inline float NAME<float>() noexcept \
{ \
return bit_cast<float>((uint32_t)SINGLE); \
} \
template <> \
inline double NAME<double>() noexcept \
{ \
return bit_cast<double>((uint64_t)DOUBLE); \
}
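        // Editor's note (not part of upstream xsimd): the _HEX variant stores an exact
        // bit pattern instead of a decimal literal. For instance, pi below is defined
        // from 0x40490fdb, whose bit_cast<float> is 3.14159274f, so
        //   constants::pi<float>()           // the scalar value
        //   constants::pi<batch<float, A>>() // broadcasts it to a whole batch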
XSIMD_DEFINE_CONSTANT(infinity, (std::numeric_limits<float>::infinity()), (std::numeric_limits<double>::infinity()))
XSIMD_DEFINE_CONSTANT(invlog_2, 1.442695040888963407359924681001892137426645954152986f, 1.442695040888963407359924681001892137426645954152986)
XSIMD_DEFINE_CONSTANT_HEX(invlog_2hi, 0x3fb8b000, 0x3ff7154765200000)
XSIMD_DEFINE_CONSTANT_HEX(invlog_2lo, 0xb9389ad4, 0x3de705fc2eefa200)
XSIMD_DEFINE_CONSTANT(invlog10_2, 3.32192809488736234787031942949f, 3.32192809488736234787031942949)
XSIMD_DEFINE_CONSTANT_HEX(invpi, 0x3ea2f983, 0x3fd45f306dc9c883)
XSIMD_DEFINE_CONSTANT(log_2, 0.6931471805599453094172321214581765680755001343602553f, 0.6931471805599453094172321214581765680755001343602553)
XSIMD_DEFINE_CONSTANT_HEX(log_2hi, 0x3f318000, 0x3fe62e42fee00000)
XSIMD_DEFINE_CONSTANT_HEX(log_2lo, 0xb95e8083, 0x3dea39ef35793c76)
XSIMD_DEFINE_CONSTANT_HEX(log10_2hi, 0x3e9a0000, 0x3fd3440000000000)
XSIMD_DEFINE_CONSTANT_HEX(log10_2lo, 0x39826a14, 0x3ed3509f79fef312)
XSIMD_DEFINE_CONSTANT_HEX(logeps, 0xc17f1402, 0xc04205966f2b4f12)
XSIMD_DEFINE_CONSTANT_HEX(logpi, 0x3f928682, 0x3ff250d048e7a1bd)
XSIMD_DEFINE_CONSTANT_HEX(logsqrt2pi, 0x3f6b3f8e, 0x3fed67f1c864beb5)
XSIMD_DEFINE_CONSTANT(maxflint, 16777216.0f, 9007199254740992.0)
XSIMD_DEFINE_CONSTANT(maxlog, 88.3762626647949f, 709.78271289338400)
XSIMD_DEFINE_CONSTANT(maxlog2, 127.0f, 1023.)
XSIMD_DEFINE_CONSTANT(maxlog10, 38.23080825805664f, 308.2547155599167)
XSIMD_DEFINE_CONSTANT_HEX(mediumpi, 0x43490fdb, 0x412921fb54442d18)
XSIMD_DEFINE_CONSTANT(minlog, -88.3762626647949f, -708.3964185322641)
XSIMD_DEFINE_CONSTANT(minlog2, -127.0f, -1023.)
XSIMD_DEFINE_CONSTANT(minlog10, -37.89999771118164f, -308.2547155599167)
XSIMD_DEFINE_CONSTANT(minusinfinity, (-infinity<float>()), (-infinity<double>()))
XSIMD_DEFINE_CONSTANT(minuszero, -0.0f, -0.0)
XSIMD_DEFINE_CONSTANT_HEX(nan, 0xffffffff, 0xffffffffffffffff)
XSIMD_DEFINE_CONSTANT_HEX(oneosqrteps, 0x453504f3, 0x4190000000000000)
XSIMD_DEFINE_CONSTANT_HEX(oneotwoeps, 0x4a800000, 0x4320000000000000)
XSIMD_DEFINE_CONSTANT_HEX(pi, 0x40490fdb, 0x400921fb54442d18)
XSIMD_DEFINE_CONSTANT_HEX(pio_2lo, 0xb33bbd2e, 0x3c91a62633145c07)
XSIMD_DEFINE_CONSTANT_HEX(pio_4lo, 0xb2bbbd2e, 0x3c81a62633145c07)
XSIMD_DEFINE_CONSTANT_HEX(pio2, 0x3fc90fdb, 0x3ff921fb54442d18)
XSIMD_DEFINE_CONSTANT_HEX(pio2_1, 0x3fc90f80, 0x3ff921fb54400000)
XSIMD_DEFINE_CONSTANT_HEX(pio2_1t, 0x37354443, 0x3dd0b4611a626331)
XSIMD_DEFINE_CONSTANT_HEX(pio2_2, 0x37354400, 0x3dd0b4611a600000)
XSIMD_DEFINE_CONSTANT_HEX(pio2_2t, 0x2e85a308, 0x3ba3198a2e037073)
XSIMD_DEFINE_CONSTANT_HEX(pio2_3, 0x2e85a300, 0x3ba3198a2e000000)
XSIMD_DEFINE_CONSTANT_HEX(pio2_3t, 0x248d3132, 0x397b839a252049c1)
XSIMD_DEFINE_CONSTANT_HEX(pio4, 0x3f490fdb, 0x3fe921fb54442d18)
XSIMD_DEFINE_CONSTANT_HEX(signmask, 0x80000000, 0x8000000000000000)
XSIMD_DEFINE_CONSTANT(smallestposval, std::numeric_limits<float>::min(), std::numeric_limits<double>::min())
XSIMD_DEFINE_CONSTANT_HEX(sqrt_2pi, 0x40206c99, 0x40040d931ff62704)
XSIMD_DEFINE_CONSTANT_HEX(sqrteps, 0x39b504f3, 0x3e50000000000000)
XSIMD_DEFINE_CONSTANT_HEX(tanpio8, 0x3ed413cd, 0x3fda827999fcef31)
XSIMD_DEFINE_CONSTANT_HEX(tan3pio8, 0x401a827a, 0x4003504f333f9de6)
XSIMD_DEFINE_CONSTANT_HEX(twentypi, 0x427b53d1, 0x404f6a7a2955385e)
XSIMD_DEFINE_CONSTANT_HEX(twoopi, 0x3f22f983, 0x3fe45f306dc9c883)
XSIMD_DEFINE_CONSTANT(twotonmb, 8388608.0f, 4503599627370496.0)
XSIMD_DEFINE_CONSTANT_HEX(twotonmbo3, 0x3ba14518, 0x3ed428a2f98d7286)
#undef XSIMD_DEFINE_CONSTANT
#undef XSIMD_DEFINE_CONSTANT_HEX
template <class T>
constexpr T allbits() noexcept;
template <class T>
constexpr as_integer_t<T> mask1frexp() noexcept;
template <class T>
constexpr as_integer_t<T> mask2frexp() noexcept;
template <class T>
constexpr as_integer_t<T> maxexponent() noexcept;
template <class T>
constexpr as_integer_t<T> maxexponentm1() noexcept;
template <class T>
constexpr int32_t nmb() noexcept;
template <class T>
constexpr T zero() noexcept;
template <class T>
constexpr T minvalue() noexcept;
template <class T>
constexpr T maxvalue() noexcept;
/**************************
* allbits implementation *
**************************/
namespace detail
{
template <class T, bool = std::is_integral<T>::value>
struct allbits_impl
{
static constexpr T get_value() noexcept
{
return T(~0);
}
};
template <class T>
struct allbits_impl<T, false>
{
static constexpr T get_value() noexcept
{
return nan<T>();
}
};
}
template <class T>
inline constexpr T allbits() noexcept
{
return T(detail::allbits_impl<typename T::value_type>::get_value());
}
/*****************************
* mask1frexp implementation *
*****************************/
template <class T>
inline constexpr as_integer_t<T> mask1frexp() noexcept
{
return as_integer_t<T>(mask1frexp<typename T::value_type>());
}
template <>
inline constexpr int32_t mask1frexp<float>() noexcept
{
return 0x7f800000;
}
template <>
inline constexpr int64_t mask1frexp<double>() noexcept
{
return 0x7ff0000000000000;
}
/*****************************
* mask2frexp implementation *
*****************************/
template <class T>
inline constexpr as_integer_t<T> mask2frexp() noexcept
{
return as_integer_t<T>(mask2frexp<typename T::value_type>());
}
template <>
inline constexpr int32_t mask2frexp<float>() noexcept
{
return 0x3f000000;
}
template <>
inline constexpr int64_t mask2frexp<double>() noexcept
{
return 0x3fe0000000000000;
}
/******************************
* maxexponent implementation *
******************************/
template <class T>
inline constexpr as_integer_t<T> maxexponent() noexcept
{
return as_integer_t<T>(maxexponent<typename T::value_type>());
}
template <>
inline constexpr int32_t maxexponent<float>() noexcept
{
return 127;
}
template <>
inline constexpr int64_t maxexponent<double>() noexcept
{
return 1023;
}
        /********************************
         * maxexponentm1 implementation *
         ********************************/
template <class T>
inline constexpr as_integer_t<T> maxexponentm1() noexcept
{
return as_integer_t<T>(maxexponentm1<typename T::value_type>());
}
template <>
inline constexpr int32_t maxexponentm1<float>() noexcept
{
return 126;
}
template <>
inline constexpr int64_t maxexponentm1<double>() noexcept
{
return 1022;
}
/**********************
* nmb implementation *
**********************/
template <class T>
inline constexpr int32_t nmb() noexcept
{
return nmb<typename T::value_type>();
}
template <>
inline constexpr int32_t nmb<float>() noexcept
{
return 23;
}
template <>
inline constexpr int32_t nmb<double>() noexcept
{
return 52;
}
/***********************
* zero implementation *
***********************/
template <class T>
inline constexpr T zero() noexcept
{
return T(typename T::value_type(0));
}
/***************************
* minvalue implementation *
***************************/
namespace detail
{
template <class T>
struct minvalue_impl
{
static constexpr T get_value() noexcept
{
return std::numeric_limits<typename T::value_type>::min();
}
};
template <class T>
struct minvalue_common
{
static constexpr T get_value() noexcept
{
return std::numeric_limits<T>::min();
}
};
template <>
struct minvalue_impl<int8_t> : minvalue_common<int8_t>
{
};
template <>
struct minvalue_impl<uint8_t> : minvalue_common<uint8_t>
{
};
template <>
struct minvalue_impl<int16_t> : minvalue_common<int16_t>
{
};
template <>
struct minvalue_impl<uint16_t> : minvalue_common<uint16_t>
{
};
template <>
struct minvalue_impl<int32_t> : minvalue_common<int32_t>
{
};
template <>
struct minvalue_impl<uint32_t> : minvalue_common<uint32_t>
{
};
template <>
struct minvalue_impl<int64_t> : minvalue_common<int64_t>
{
};
template <>
struct minvalue_impl<uint64_t> : minvalue_common<uint64_t>
{
};
template <>
struct minvalue_impl<float>
{
static float get_value() noexcept
{
return bit_cast<float>((uint32_t)0xff7fffff);
}
};
template <>
struct minvalue_impl<double>
{
static double get_value() noexcept
{
return bit_cast<double>((uint64_t)0xffefffffffffffff);
}
};
}
template <class T>
inline constexpr T minvalue() noexcept
{
return T(detail::minvalue_impl<typename T::value_type>::get_value());
}
/***************************
* maxvalue implementation *
***************************/
template <class T>
inline constexpr T maxvalue() noexcept
{
return T(std::numeric_limits<typename T::value_type>::max());
}
}
}
#endif

80
third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp vendored Normal file
View file

@@ -0,0 +1,80 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_FMA3_AVX_HPP
#define XSIMD_FMA3_AVX_HPP
#include "../types/xsimd_fma3_avx_register.hpp"
namespace xsimd
{
namespace kernel
{
using namespace types;
// fnma
template <class A>
inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
{
return _mm256_fnmadd_ps(x, y, z);
}
template <class A>
inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
{
return _mm256_fnmadd_pd(x, y, z);
}
// fnms
template <class A>
inline batch<float, A> fnms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
{
return _mm256_fnmsub_ps(x, y, z);
}
template <class A>
inline batch<double, A> fnms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
{
return _mm256_fnmsub_pd(x, y, z);
}
// fma
template <class A>
inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
{
return _mm256_fmadd_ps(x, y, z);
}
template <class A>
inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
{
return _mm256_fmadd_pd(x, y, z);
}
// fms
template <class A>
inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
{
return _mm256_fmsub_ps(x, y, z);
}
template <class A>
inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
{
return _mm256_fmsub_pd(x, y, z);
}
}
}
#endif

46
third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp vendored Normal file
View file

@@ -0,0 +1,46 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_FMA3_AVX2_HPP
#define XSIMD_FMA3_AVX2_HPP
#include "../types/xsimd_fma3_avx2_register.hpp"
// Allow inclusion of xsimd_fma3_avx.hpp
#ifdef XSIMD_FMA3_AVX_HPP
#undef XSIMD_FMA3_AVX_HPP
#define XSIMD_FORCE_FMA3_AVX_HPP
#endif
// Disallow inclusion of ./xsimd_fma3_avx_register.hpp
#ifndef XSIMD_FMA3_AVX_REGISTER_HPP
#define XSIMD_FMA3_AVX_REGISTER_HPP
#define XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
#endif
// Include ./xsimd_fma3_avx.hpp but s/avx/avx2
#define avx avx2
#include "./xsimd_fma3_avx.hpp"
#undef avx
#undef XSIMD_FMA3_AVX_HPP
// Carefully restore guards
#ifdef XSIMD_FORCE_FMA3_AVX_HPP
#define XSIMD_FMA3_AVX_HPP
#undef XSIMD_FORCE_FMA3_AVX_HPP
#endif
#ifdef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
#undef XSIMD_FMA3_AVX_REGISTER_HPP
#undef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
#endif
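// Editor's note (not part of upstream xsimd): the net effect of the guard juggling
// above is that every kernel in xsimd_fma3_avx.hpp (fma, fms, fnma, fnms) is compiled
// a second time with requires_arch<fma3<avx2>> instead of requires_arch<fma3<avx>>,
// without duplicating the source.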
#endif

79
third_party/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp vendored Normal file
View file

@@ -0,0 +1,79 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_FMA3_SSE_HPP
#define XSIMD_FMA3_SSE_HPP
#include "../types/xsimd_fma3_sse_register.hpp"
namespace xsimd
{
namespace kernel
{
using namespace types;
// fnma
template <class A>
inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
{
return _mm_fnmadd_ps(x, y, z);
}
template <class A>
inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
{
return _mm_fnmadd_pd(x, y, z);
}
// fnms
template <class A>
inline batch<float, A> fnms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
{
return _mm_fnmsub_ps(x, y, z);
}
template <class A>
inline batch<double, A> fnms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
{
return _mm_fnmsub_pd(x, y, z);
}
// fma
template <class A>
inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
{
return _mm_fmadd_ps(x, y, z);
}
template <class A>
inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
{
return _mm_fmadd_pd(x, y, z);
}
// fms
template <class A>
inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
{
return _mm_fmsub_ps(x, y, z);
}
template <class A>
inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
{
return _mm_fmsub_pd(x, y, z);
}
}
}
#endif

79
third_party/xsimd/include/xsimd/arch/xsimd_fma4.hpp vendored Normal file
View file

@@ -0,0 +1,79 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_FMA4_HPP
#define XSIMD_FMA4_HPP
#include "../types/xsimd_fma4_register.hpp"
namespace xsimd
{
namespace kernel
{
using namespace types;
// fnma
template <class A>
inline batch<float, A> fnma(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
{
return _mm_nmacc_ps(x, y, z);
}
template <class A>
inline batch<double, A> fnma(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
{
return _mm_nmacc_pd(x, y, z);
}
// fnms
template <class A>
inline batch<float, A> fnms(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
{
return _mm_nmsub_ps(x, y, z);
}
template <class A>
inline batch<double, A> fnms(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
{
return _mm_nmsub_pd(x, y, z);
}
// fma
template <class A>
inline batch<float, A> fma(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
{
return _mm_macc_ps(x, y, z);
}
template <class A>
inline batch<double, A> fma(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
{
return _mm_macc_pd(x, y, z);
}
// fms
template <class A>
inline batch<float, A> fms(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
{
return _mm_msub_ps(x, y, z);
}
template <class A>
inline batch<double, A> fms(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
{
return _mm_msub_pd(x, y, z);
}
}
}
#endif

23
third_party/xsimd/include/xsimd/arch/xsimd_generic.hpp vendored Normal file
View file

@@ -0,0 +1,23 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_GENERIC_HPP
#define XSIMD_GENERIC_HPP
#include "./generic/xsimd_generic_arithmetic.hpp"
#include "./generic/xsimd_generic_complex.hpp"
#include "./generic/xsimd_generic_logical.hpp"
#include "./generic/xsimd_generic_math.hpp"
#include "./generic/xsimd_generic_memory.hpp"
#include "./generic/xsimd_generic_rounding.hpp"
#include "./generic/xsimd_generic_trigo.hpp"
#endif

38
third_party/xsimd/include/xsimd/arch/xsimd_generic_fwd.hpp vendored Normal file
View file

@@ -0,0 +1,38 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_GENERIC_FWD_HPP
#define XSIMD_GENERIC_FWD_HPP
#include "../types/xsimd_batch_constant.hpp"
#include <type_traits>
namespace xsimd
{
namespace kernel
{
// forward declaration
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> abs(batch<T, A> const& self, requires_arch<generic>) noexcept;
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
template <class A, class T>
inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
}
}
#endif

86
third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp vendored Normal file
View file

@@ -0,0 +1,86 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_ISA_HPP
#define XSIMD_ISA_HPP
#include "../config/xsimd_arch.hpp"
#include "./xsimd_generic_fwd.hpp"
#if XSIMD_WITH_SSE2
#include "./xsimd_sse2.hpp"
#endif
#if XSIMD_WITH_SSE3
#include "./xsimd_sse3.hpp"
#endif
#if XSIMD_WITH_SSSE3
#include "./xsimd_ssse3.hpp"
#endif
#if XSIMD_WITH_SSE4_1
#include "./xsimd_sse4_1.hpp"
#endif
#if XSIMD_WITH_SSE4_2
#include "./xsimd_sse4_2.hpp"
#endif
#if XSIMD_WITH_FMA3_SSE
#include "./xsimd_fma3_sse.hpp"
#endif
#if XSIMD_WITH_FMA4
#include "./xsimd_fma4.hpp"
#endif
#if XSIMD_WITH_AVX
#include "./xsimd_avx.hpp"
#endif
#if XSIMD_WITH_FMA3_AVX
#include "./xsimd_fma3_avx.hpp"
#endif
#if XSIMD_WITH_AVX2
#include "./xsimd_avx2.hpp"
#endif
#if XSIMD_WITH_FMA3_AVX2
#include "./xsimd_fma3_avx2.hpp"
#endif
#if XSIMD_WITH_AVX512F
#include "./xsimd_avx512f.hpp"
#endif
#if XSIMD_WITH_AVX512BW
#include "./xsimd_avx512bw.hpp"
#endif
#if XSIMD_WITH_NEON
#include "./xsimd_neon.hpp"
#endif
#if XSIMD_WITH_NEON64
#include "./xsimd_neon64.hpp"
#endif
#if XSIMD_WITH_SVE
#include "./xsimd_sve.hpp"
#endif
// Must come last to have access to all conversion specializations.
#include "./xsimd_generic.hpp"
#endif

2615
third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp vendored Normal file

File diff suppressed because it is too large. Load diff.

1314
third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp vendored Normal file

File diff suppressed because it is too large. Load diff.

1024
third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp vendored Normal file

File diff suppressed because it is too large. Load diff.

1695
third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp vendored Normal file

File diff suppressed because it is too large. Load diff.

64
third_party/xsimd/include/xsimd/arch/xsimd_sse3.hpp vendored Normal file
View file

@@ -0,0 +1,64 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_SSE3_HPP
#define XSIMD_SSE3_HPP
#include "../types/xsimd_sse3_register.hpp"
#include <type_traits>
namespace xsimd
{
namespace kernel
{
using namespace types;
// haddp
template <class A>
inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse3>) noexcept
{
return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]),
_mm_hadd_ps(row[2], row[3]));
}
template <class A>
inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse3>) noexcept
{
return _mm_hadd_pd(row[0], row[1]);
}
// load_unaligned
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse3>) noexcept
{
return _mm_lddqu_si128((__m128i const*)mem);
}
// reduce_add
template <class A>
inline float reduce_add(batch<float, A> const& self, requires_arch<sse3>) noexcept
{
__m128 tmp0 = _mm_hadd_ps(self, self);
__m128 tmp1 = _mm_hadd_ps(tmp0, tmp0);
return _mm_cvtss_f32(tmp1);
}
template <class A>
inline double reduce_add(batch<double, A> const& self, requires_arch<sse3>) noexcept
{
__m128d tmp0 = _mm_hadd_pd(self, self);
return _mm_cvtsd_f64(tmp0);
}
}
}
#endif

350
third_party/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp vendored Normal file
View file

@@ -0,0 +1,350 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_SSE4_1_HPP
#define XSIMD_SSE4_1_HPP
#include <type_traits>
#include "../types/xsimd_sse4_1_register.hpp"
namespace xsimd
{
namespace kernel
{
using namespace types;
// any
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline bool any(batch<T, A> const& self, requires_arch<sse4_1>) noexcept
{
return !_mm_testz_si128(self, self);
}
// ceil
template <class A>
inline batch<float, A> ceil(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
{
return _mm_ceil_ps(self);
}
template <class A>
inline batch<double, A> ceil(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
{
return _mm_ceil_pd(self);
}
// fast_cast
namespace detail
{
template <class A>
inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
{
// from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
__m128i xH = _mm_srai_epi32(x, 16);
xH = _mm_blend_epi16(xH, _mm_setzero_si128(), 0x33);
xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0x88); // 2^52
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
return _mm_add_pd(f, _mm_castsi128_pd(xL));
}
template <class A>
inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
{
// from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
__m128i xH = _mm_srli_epi64(x, 32);
xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
__m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
__m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
return _mm_add_pd(f, _mm_castsi128_pd(xL));
}
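            // Editor's note (not part of upstream xsimd): a sketch of the magic-number
            // trick used above. Integer bits blended into the mantissa of a double whose
            // exponent field encodes a fixed power of two land there verbatim, so
            // subtracting that power of two afterwards recovers them exactly as a double.
            // The unsigned variant splits each value at bit 32 and embeds the halves
            // against 2^52 and 2^84; the signed variant splits at bit 48 and biases the
            // sign-extended top 16 bits against 3*2^67, which is why a single combined
            // constant (magic_high + 2^52) is subtracted before the halves are re-added.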
template <class A>
inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse4_1>) noexcept
{
return _mm_castps_si128(
_mm_blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(self)),
_mm_castsi128_ps(_mm_xor_si128(
_mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
_mm_set1_epi32(1u << 31))),
_mm_cmpge_ps(self, _mm_set1_ps(1u << 31))));
}
}
// eq
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
return _mm_cmpeq_epi64(self, other);
}
else
{
return eq(self, other, ssse3 {});
}
}
// floor
template <class A>
inline batch<float, A> floor(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
{
return _mm_floor_ps(self);
}
template <class A>
inline batch<double, A> floor(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
{
return _mm_floor_pd(self);
}
// insert
template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse4_1>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm_insert_epi8(self, val, I);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm_insert_epi32(self, val, I);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
#if (!defined(_MSC_VER) && __x86_64__) || (_MSC_VER > 1900 && defined(_M_X64))
return _mm_insert_epi64(self, val, I);
#else
uint32_t lo, hi;
memcpy(&lo, (reinterpret_cast<uint32_t*>(&val)), sizeof(lo));
memcpy(&hi, (reinterpret_cast<uint32_t*>(&val)) + 1, sizeof(hi));
return _mm_insert_epi32(_mm_insert_epi32(self, lo, 2 * I), hi, 2 * I + 1);
#endif
}
else
{
return insert(self, val, pos, ssse3 {});
}
}
// max
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
{
if (std::is_signed<T>::value)
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm_max_epi8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm_max_epi16(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm_max_epi32(self, other);
}
else
{
return max(self, other, ssse3 {});
}
}
else
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm_max_epu8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm_max_epu16(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm_max_epu32(self, other);
}
else
{
return max(self, other, ssse3 {});
}
}
}
// min
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
{
if (std::is_signed<T>::value)
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm_min_epi8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm_min_epi16(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm_min_epi32(self, other);
}
else
{
return min(self, other, ssse3 {});
}
}
else
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm_min_epu8(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm_min_epu16(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm_min_epu32(self, other);
}
else
{
return min(self, other, ssse3 {});
}
}
}
// mul
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm_or_si128(
_mm_and_si128(_mm_mullo_epi16(self, other), _mm_srli_epi16(_mm_cmpeq_epi8(self, self), 8)),
_mm_slli_epi16(_mm_mullo_epi16(_mm_srli_epi16(self, 8), _mm_srli_epi16(other, 8)), 8));
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm_mullo_epi16(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm_mullo_epi32(self, other);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
return _mm_add_epi64(
_mm_mul_epu32(self, other),
_mm_slli_epi64(
_mm_add_epi64(
_mm_mul_epu32(other, _mm_shuffle_epi32(self, _MM_SHUFFLE(2, 3, 0, 1))),
_mm_mul_epu32(self, _mm_shuffle_epi32(other, _MM_SHUFFLE(2, 3, 0, 1)))),
32));
}
else
{
assert(false && "unsupported arch/op combination");
return {};
}
}
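        // Editor's note (not part of upstream xsimd): the 64-bit branch above is the
        // schoolbook decomposition. With a = a_hi * 2^32 + a_lo and b = b_hi * 2^32 + b_lo,
        // modulo 2^64:
        //   a * b == a_lo * b_lo + ((a_lo * b_hi + a_hi * b_lo) << 32)
        // _mm_mul_epu32 provides the exact 32x32->64 products, and the a_hi * b_hi term
        // is dropped because it only contributes above bit 63.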
// nearbyint
template <class A>
inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
{
return _mm_round_ps(self, _MM_FROUND_TO_NEAREST_INT);
}
template <class A>
inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
{
return _mm_round_pd(self, _MM_FROUND_TO_NEAREST_INT);
}
// select
namespace detail
{
template <class T>
inline constexpr T interleave(T const& cond) noexcept
{
return (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 49) & 0x5555) | (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 48) & 0xAAAA);
}
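            // Editor's note (not part of upstream xsimd): interleave duplicates every bit
            // of the per-element mask into a pair of adjacent bits, turning it into a
            // per-16-bit-word mask for _mm_blend_epi16. For four 32-bit elements,
            // interleave(0b0101) == 0b00110011; applying it twice expands a two-element
            // 64-bit mask to all eight 16-bit lanes.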
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
{
return _mm_blendv_epi8(false_br, true_br, cond);
}
template <class A>
inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
{
return _mm_blendv_ps(false_br, true_br, cond);
}
template <class A>
inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
{
return _mm_blendv_pd(false_br, true_br, cond);
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
{
constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm_blend_epi16(false_br, true_br, mask);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
constexpr int imask = detail::interleave(mask);
return _mm_blend_epi16(false_br, true_br, imask);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
constexpr int imask = detail::interleave(mask);
constexpr int imask2 = detail::interleave(imask);
return _mm_blend_epi16(false_br, true_br, imask2);
}
else
{
return select(batch_bool_constant<batch<T, A>, Values...>(), true_br, false_br, ssse3 {});
}
}
template <class A, bool... Values>
inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
{
constexpr int mask = batch_bool_constant<batch<float, A>, Values...>::mask();
return _mm_blend_ps(false_br, true_br, mask);
}
template <class A, bool... Values>
inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
{
constexpr int mask = batch_bool_constant<batch<double, A>, Values...>::mask();
return _mm_blend_pd(false_br, true_br, mask);
}
// trunc
template <class A>
inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
{
return _mm_round_ps(self, _MM_FROUND_TO_ZERO);
}
template <class A>
inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
{
return _mm_round_pd(self, _MM_FROUND_TO_ZERO);
}
}
}
#endif

44
third_party/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp vendored Normal file
View file

@@ -0,0 +1,44 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_SSE4_2_HPP
#define XSIMD_SSE4_2_HPP
#include <limits>
#include "../types/xsimd_sse4_2_register.hpp"
namespace xsimd
{
namespace kernel
{
using namespace types;
// lt
template <class A>
inline batch_bool<int64_t, A> lt(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<sse4_2>) noexcept
{
return _mm_cmpgt_epi64(other, self);
}
template <class A>
inline batch_bool<uint64_t, A> lt(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<sse4_2>) noexcept
{
auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
return _mm_cmpgt_epi64(xother, xself);
}
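        // Editor's note (not part of upstream xsimd): SSE4.2 only has a signed 64-bit
        // compare, so both operands are XOR'd with the sign bit (2^63); this maps the
        // unsigned order onto the signed one, i.e. x <_u y iff (x ^ 2^63) <_s (y ^ 2^63).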
}
}
#endif

142
third_party/xsimd/include/xsimd/arch/xsimd_ssse3.hpp vendored Normal file
View file

@@ -0,0 +1,142 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_SSSE3_HPP
#define XSIMD_SSSE3_HPP
#include <cstddef>
#include <type_traits>
#include "../types/xsimd_ssse3_register.hpp"
#include "../types/xsimd_utils.hpp"
namespace xsimd
{
namespace kernel
{
using namespace types;
// abs
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
inline batch<T, A> abs(batch<T, A> const& self, requires_arch<ssse3>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
{
return _mm_abs_epi8(self);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm_abs_epi16(self);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
return _mm_abs_epi32(self);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
return _mm_abs_epi64(self);
}
else
{
assert(false && "unsupported arch/op combination");
return {};
}
}
// extract_pair
namespace detail
{
template <class T, class A>
inline batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& other, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
{
return other;
}
template <class T, class A, std::size_t I, std::size_t... Is>
inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, ::xsimd::detail::index_sequence<I, Is...>) noexcept
{
if (i == I)
{
return _mm_alignr_epi8(self, other, sizeof(T) * I);
}
else
return extract_pair(self, other, i, ::xsimd::detail::index_sequence<Is...>());
}
}
template <class A, class T, class _ = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<ssse3>) noexcept
{
constexpr std::size_t size = batch<T, A>::size;
assert(0 <= i && i < size && "index in bounds");
return detail::extract_pair(self, other, i, ::xsimd::detail::make_index_sequence<size>());
}
// reduce_add
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline T reduce_add(batch<T, A> const& self, requires_arch<ssse3>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
__m128i tmp1 = _mm_hadd_epi16(self, self);
__m128i tmp2 = _mm_hadd_epi16(tmp1, tmp1);
__m128i tmp3 = _mm_hadd_epi16(tmp2, tmp2);
return _mm_cvtsi128_si32(tmp3) & 0xFFFF;
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
__m128i tmp1 = _mm_hadd_epi32(self, self);
__m128i tmp2 = _mm_hadd_epi32(tmp1, tmp1);
return _mm_cvtsi128_si32(tmp2);
}
else
{
return reduce_add(self, sse3 {});
}
}
// swizzle
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<ssse3>) noexcept
{
constexpr batch_constant<batch<uint8_t, A>, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1,
2 * V4, 2 * V4 + 1, 2 * V5, 2 * V5 + 1, 2 * V6, 2 * V6 + 1, 2 * V7, 2 * V7 + 1>
mask8;
return _mm_shuffle_epi8(self, (batch<uint8_t, A>)mask8);
}
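        // Editor's note (not part of upstream xsimd): each 16-bit index V is expanded to
        // the byte pair (2*V, 2*V + 1) so that _mm_shuffle_epi8, which only moves bytes,
        // reproduces the 16-bit element selection. For example, the index pattern
        // <3, 2, 1, 0, 7, 6, 5, 4> becomes the byte mask
        // <6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9>.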
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept
{
return bitwise_cast<batch<int16_t, A>>(swizzle(bitwise_cast<batch<uint16_t, A>>(self), mask, ssse3 {}));
}
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
{
return _mm_shuffle_epi8(self, (batch<uint8_t, A>)mask);
}
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
{
return bitwise_cast<batch<int8_t, A>>(swizzle(bitwise_cast<batch<uint8_t, A>>(self), mask, ssse3 {}));
}
}
}
#endif

1126
third_party/xsimd/include/xsimd/arch/xsimd_sve.hpp vendored Normal file

File diff suppressed because it is too large. Load diff.

249
third_party/xsimd/include/xsimd/config/xsimd_arch.hpp vendored Normal file
View file

@@ -0,0 +1,249 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_ARCH_HPP
#define XSIMD_ARCH_HPP
#include <initializer_list>
#include <type_traits>
#include <utility>
#include "../types/xsimd_all_registers.hpp"
#include "./xsimd_config.hpp"
#include "./xsimd_cpuid.hpp"
namespace xsimd
{
namespace detail
{
// Checks whether T appears in Tys.
template <class T, class... Tys>
struct contains;
template <class T>
struct contains<T> : std::false_type
{
};
template <class T, class Ty, class... Tys>
struct contains<T, Ty, Tys...>
: std::conditional<std::is_same<Ty, T>::value, std::true_type,
contains<T, Tys...>>::type
{
};
template <class... Archs>
struct is_sorted;
template <>
struct is_sorted<> : std::true_type
{
};
template <class Arch>
struct is_sorted<Arch> : std::true_type
{
};
template <class A0, class A1, class... Archs>
struct is_sorted<A0, A1, Archs...>
: std::conditional<(A0::version() >= A1::version()), is_sorted<Archs...>,
std::false_type>::type
{
};
template <typename T>
inline constexpr T max_of(T value) noexcept
{
return value;
}
template <typename T, typename... Ts>
inline constexpr T max_of(T head0, T head1, Ts... tail) noexcept
{
return max_of((head0 > head1 ? head0 : head1), tail...);
}
} // namespace detail
// An arch_list is a list of architectures, sorted by version number.
template <class... Archs>
struct arch_list
{
#ifndef NDEBUG
static_assert(detail::is_sorted<Archs...>::value,
"architecture list must be sorted by version");
#endif
template <class Arch>
using add = arch_list<Archs..., Arch>;
template <class... OtherArchs>
using extend = arch_list<Archs..., OtherArchs...>;
template <class Arch>
static constexpr bool contains() noexcept
{
return detail::contains<Arch, Archs...>::value;
}
template <class F>
static void for_each(F&& f) noexcept
{
(void)std::initializer_list<bool> { (f(Archs {}), true)... };
}
static constexpr std::size_t alignment() noexcept
{
// all alignments are a power of two
return detail::max_of(Archs::alignment()..., static_cast<size_t>(0));
}
};
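    // Editor's note (not part of upstream xsimd): a minimal, illustrative use of
    // arch_list as a type-level container:
    //   using my_archs = xsimd::arch_list<xsimd::avx2, xsimd::sse2>;
    //   static_assert(my_archs::contains<xsimd::sse2>(), "");
    //   constexpr std::size_t align = my_archs::alignment(); // 32, from avx2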
struct unavailable
{
static constexpr bool supported() noexcept { return false; }
static constexpr bool available() noexcept { return false; }
static constexpr unsigned version() noexcept { return 0; }
static constexpr std::size_t alignment() noexcept { return 0; }
static constexpr bool requires_alignment() noexcept { return false; }
static constexpr char const* name() noexcept { return "<none>"; }
};
namespace detail
{
        // Pick the best architecture in arch_list L, which is the first
        // one because architectures are sorted by decreasing version.
template <class L>
struct best;
template <>
struct best<arch_list<>>
{
using type = unavailable;
};
template <class Arch, class... Archs>
struct best<arch_list<Arch, Archs...>>
{
using type = Arch;
};
        // Filter the arch_list Archs, keeping only the supported architectures
        // and adding them to L.
template <class L, class... Archs>
struct supported_helper;
template <class L>
struct supported_helper<L, arch_list<>>
{
using type = L;
};
template <class L, class Arch, class... Archs>
struct supported_helper<L, arch_list<Arch, Archs...>>
: supported_helper<
typename std::conditional<Arch::supported(),
typename L::template add<Arch>, L>::type,
arch_list<Archs...>>
{
};
template <class... Archs>
struct supported : supported_helper<arch_list<>, Archs...>
{
};
// Joins all arch_list Archs in a single arch_list.
template <class... Archs>
struct join;
template <class Arch>
struct join<Arch>
{
using type = Arch;
};
template <class Arch, class... Archs, class... Args>
struct join<Arch, arch_list<Archs...>, Args...>
: join<typename Arch::template extend<Archs...>, Args...>
{
};
} // namespace detail
struct unsupported
{
};
using all_x86_architectures = arch_list<avx512bw, avx512dq, avx512cd, avx512f, fma3<avx2>, avx2, fma3<avx>, avx, fma4, fma3<sse4_2>, sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>;
using all_sve_architectures = arch_list<detail::sve<512>, detail::sve<256>, detail::sve<128>>;
using all_arm_architectures = typename detail::join<all_sve_architectures, arch_list<neon64, neon>>::type;
using all_architectures = typename detail::join<all_arm_architectures, all_x86_architectures>::type;
using supported_architectures = typename detail::supported<all_architectures>::type;
using x86_arch = typename detail::best<typename detail::supported<all_x86_architectures>::type>::type;
using arm_arch = typename detail::best<typename detail::supported<all_arm_architectures>::type>::type;
// using default_arch = typename detail::best<typename detail::supported<arch_list</*arm_arch,*/ x86_arch>>::type>::type;
using default_arch = typename std::conditional<std::is_same<x86_arch, unavailable>::value,
arm_arch,
x86_arch>::type;
namespace detail
{
template <class F, class ArchList>
class dispatcher
{
const unsigned best_arch;
F functor;
template <class Arch, class... Tys>
auto walk_archs(arch_list<Arch>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
{
assert(Arch::available() && "At least one arch must be supported during dispatch");
return functor(Arch {}, std::forward<Tys>(args)...);
}
template <class Arch, class ArchNext, class... Archs, class... Tys>
auto walk_archs(arch_list<Arch, ArchNext, Archs...>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
{
if (Arch::version() <= best_arch)
return functor(Arch {}, std::forward<Tys>(args)...);
else
return walk_archs(arch_list<ArchNext, Archs...> {}, std::forward<Tys>(args)...);
}
public:
dispatcher(F f) noexcept
: best_arch(available_architectures().best)
, functor(f)
{
}
template <class... Tys>
auto operator()(Tys&&... args) noexcept -> decltype(functor(default_arch {}, std::forward<Tys>(args)...))
{
return walk_archs(ArchList {}, std::forward<Tys>(args)...);
}
};
}
// Generic function dispatch, à la ifunc
template <class ArchList = supported_architectures, class F>
inline detail::dispatcher<F, ArchList> dispatch(F&& f) noexcept
{
return { std::forward<F>(f) };
}
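    // Editor's note (not part of upstream xsimd): a minimal usage sketch; the functor
    // name is illustrative, the only requirement is an operator() taking the
    // architecture tag as its first argument.
    //   struct sum_functor
    //   {
    //       template <class Arch>
    //       float operator()(Arch, float const* data, std::size_t size) const;
    //   };
    //   auto best_sum = xsimd::dispatch(sum_functor {});
    //   float s = best_sum(data, n); // calls the best architecture available at runtime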
} // namespace xsimd
#endif

341
third_party/xsimd/include/xsimd/config/xsimd_config.hpp vendored Normal file
View file

@@ -0,0 +1,341 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_CONFIG_HPP
#define XSIMD_CONFIG_HPP
#define XSIMD_VERSION_MAJOR 10
#define XSIMD_VERSION_MINOR 0
#define XSIMD_VERSION_PATCH 0
/**
* high level free functions
*
* @defgroup xsimd_config_macro Instruction Set Detection
*/
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if SSE2 is available at compile-time, to 0 otherwise.
*/
#ifdef __SSE2__
#define XSIMD_WITH_SSE2 1
#else
#define XSIMD_WITH_SSE2 0
#endif
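// Editor's note (not part of upstream xsimd): the XSIMD_WITH_* macros are always
// defined, to 1 or 0, so user code should test them with #if rather than #ifdef, e.g.
//   #if XSIMD_WITH_AVX2
//   // AVX2-specific code path
//   #endif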
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if SSE3 is available at compile-time, to 0 otherwise.
*/
#ifdef __SSE3__
#define XSIMD_WITH_SSE3 1
#else
#define XSIMD_WITH_SSE3 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if SSSE3 is available at compile-time, to 0 otherwise.
*/
#ifdef __SSSE3__
#define XSIMD_WITH_SSSE3 1
#else
#define XSIMD_WITH_SSSE3 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if SSE4.1 is available at compile-time, to 0 otherwise.
*/
#ifdef __SSE4_1__
#define XSIMD_WITH_SSE4_1 1
#else
#define XSIMD_WITH_SSE4_1 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if SSE4.2 is available at compile-time, to 0 otherwise.
*/
#ifdef __SSE4_2__
#define XSIMD_WITH_SSE4_2 1
#else
#define XSIMD_WITH_SSE4_2 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if AVX is available at compile-time, to 0 otherwise.
*/
#ifdef __AVX__
#define XSIMD_WITH_AVX 1
#else
#define XSIMD_WITH_AVX 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if AVX2 is available at compile-time, to 0 otherwise.
*/
#ifdef __AVX2__
#define XSIMD_WITH_AVX2 1
#else
#define XSIMD_WITH_AVX2 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if FMA3 for SSE is available at compile-time, to 0 otherwise.
*/
#ifdef __FMA__
#if defined(__SSE__)
#ifndef XSIMD_WITH_FMA3_SSE // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_SSE 1
#endif
#else
#if XSIMD_WITH_FMA3_SSE
#error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags"
#endif
#define XSIMD_WITH_FMA3_SSE 0
#endif
#else
#if XSIMD_WITH_FMA3_SSE
#error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags"
#endif
#define XSIMD_WITH_FMA3_SSE 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if FMA3 for AVX is available at compile-time, to 0 otherwise.
*/
#ifdef __FMA__
#if defined(__AVX__)
#ifndef XSIMD_WITH_FMA3_AVX // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_AVX 1
#endif
#else
#if XSIMD_WITH_FMA3_AVX
#error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags"
#endif
#define XSIMD_WITH_FMA3_AVX 0
#endif
#if defined(__AVX2__)
#ifndef XSIMD_WITH_FMA3_AVX2 // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_AVX2 1
#endif
#else
#if XSIMD_WITH_FMA3_AVX2
#error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags"
#endif
#define XSIMD_WITH_FMA3_AVX2 0
#endif
#else
#if XSIMD_WITH_FMA3_AVX
#error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags"
#endif
#if XSIMD_WITH_FMA3_AVX2
#error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags"
#endif
#define XSIMD_WITH_FMA3_AVX 0
#define XSIMD_WITH_FMA3_AVX2 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if FMA4 is available at compile-time, to 0 otherwise.
*/
#ifdef __FMA4__
#define XSIMD_WITH_FMA4 1
#else
#define XSIMD_WITH_FMA4 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if AVX512F is available at compile-time, to 0 otherwise.
*/
#ifdef __AVX512F__
// AVX512 instructions are supported starting with gcc 6
// see https://www.gnu.org/software/gcc/gcc-6/changes.html
// check clang first, newer clang always defines __GNUC__ = 4
#if defined(__clang__) && __clang_major__ >= 6
#define XSIMD_WITH_AVX512F 1
#elif defined(__GNUC__) && __GNUC__ < 6
#define XSIMD_WITH_AVX512F 0
#else
#define XSIMD_WITH_AVX512F 1
#if __GNUC__ == 6
#define XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY 1
#endif
#endif
#else
#define XSIMD_WITH_AVX512F 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if AVX512CD is available at compile-time, to 0 otherwise.
*/
#ifdef __AVX512CD__
// Avoids repeating the GCC workaround over and over
#define XSIMD_WITH_AVX512CD XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512CD 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if AVX512DQ is available at compile-time, to 0 otherwise.
*/
#ifdef __AVX512DQ__
#define XSIMD_WITH_AVX512DQ XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512DQ 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if AVX512BW is available at compile-time, to 0 otherwise.
*/
#ifdef __AVX512BW__
#define XSIMD_WITH_AVX512BW XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512BW 0
#endif
#ifdef __ARM_NEON
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if NEON is available at compile-time, to 0 otherwise.
*/
#if __ARM_ARCH >= 7
#define XSIMD_WITH_NEON 1
#else
#define XSIMD_WITH_NEON 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if NEON64 is available at compile-time, to 0 otherwise.
*/
#ifdef __aarch64__
#define XSIMD_WITH_NEON64 1
#else
#define XSIMD_WITH_NEON64 0
#endif
#else
#define XSIMD_WITH_NEON 0
#define XSIMD_WITH_NEON64 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if SVE is available and bit width is pre-set at compile-time, to 0 otherwise.
*/
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
#define XSIMD_WITH_SVE 1
#define XSIMD_SVE_BITS __ARM_FEATURE_SVE_BITS
#else
#define XSIMD_WITH_SVE 0
#define XSIMD_SVE_BITS 0
#endif
// Workaround for MSVC compiler
#ifdef _MSC_VER
#if XSIMD_WITH_AVX512F
#undef XSIMD_WITH_AVX2
#define XSIMD_WITH_AVX2 1
#endif
#if XSIMD_WITH_AVX2
#undef XSIMD_WITH_AVX
#define XSIMD_WITH_AVX 1
#undef XSIMD_WITH_FMA3_AVX
#define XSIMD_WITH_FMA3_AVX 1
#undef XSIMD_WITH_FMA3_AVX2
#define XSIMD_WITH_FMA3_AVX2 1
#endif
#if XSIMD_WITH_AVX
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif
#if !defined(__clang__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif
#if XSIMD_WITH_SSE4_2
#undef XSIMD_WITH_SSE4_1
#define XSIMD_WITH_SSE4_1 1
#endif
#if XSIMD_WITH_SSE4_1
#undef XSIMD_WITH_SSSE3
#define XSIMD_WITH_SSSE3 1
#endif
#if XSIMD_WITH_SSSE3
#undef XSIMD_WITH_SSE3
#define XSIMD_WITH_SSE3 1
#endif
#if XSIMD_WITH_SSE3 || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#undef XSIMD_WITH_SSE2
#define XSIMD_WITH_SSE2 1
#endif
#endif
#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE
#define XSIMD_NO_SUPPORTED_ARCHITECTURE
#endif
#endif
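
Since every XSIMD_WITH_* macro above is always defined (to 0 or 1), client code should test them with #if rather than #ifdef. A minimal sketch, not part of the imported sources:

#include "xsimd/xsimd.hpp"

#if XSIMD_WITH_AVX2
// AVX2-specific path, e.g. xsimd::batch<float, xsimd::avx2> (8 float lanes)
#elif XSIMD_WITH_SSE2
// SSE2 fallback, e.g. xsimd::batch<float, xsimd::sse2> (4 float lanes)
#else
// scalar fallback (see XSIMD_NO_SUPPORTED_ARCHITECTURE above)
#endif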

341
third_party/xsimd/include/xsimd/config/xsimd_config.hpp.orig vendored Normal file
View file

@@ -0,0 +1,341 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_CONFIG_HPP
#define XSIMD_CONFIG_HPP
#define XSIMD_VERSION_MAJOR 10
#define XSIMD_VERSION_MINOR 0
#define XSIMD_VERSION_PATCH 0
/**
* high level free functions
*
* @defgroup xsimd_config_macro Instruction Set Detection
*/
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if SSE2 is available at compile-time, to 0 otherwise.
*/
#ifdef __SSE2__
#define XSIMD_WITH_SSE2 1
#else
#define XSIMD_WITH_SSE2 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if SSE3 is available at compile-time, to 0 otherwise.
*/
#ifdef __SSE3__
#define XSIMD_WITH_SSE3 1
#else
#define XSIMD_WITH_SSE3 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if SSSE3 is available at compile-time, to 0 otherwise.
*/
#ifdef __SSSE3__
#define XSIMD_WITH_SSSE3 1
#else
#define XSIMD_WITH_SSSE3 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if SSE4.1 is available at compile-time, to 0 otherwise.
*/
#ifdef __SSE4_1__
#define XSIMD_WITH_SSE4_1 1
#else
#define XSIMD_WITH_SSE4_1 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if SSE4.2 is available at compile-time, to 0 otherwise.
*/
#ifdef __SSE4_2__
#define XSIMD_WITH_SSE4_2 1
#else
#define XSIMD_WITH_SSE4_2 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if AVX is available at compile-time, to 0 otherwise.
*/
#ifdef __AVX__
#define XSIMD_WITH_AVX 1
#else
#define XSIMD_WITH_AVX 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if AVX2 is available at compile-time, to 0 otherwise.
*/
#ifdef __AVX2__
#define XSIMD_WITH_AVX2 1
#else
#define XSIMD_WITH_AVX2 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if FMA3 for SSE is available at compile-time, to 0 otherwise.
*/
#ifdef __FMA__
#if defined(__SSE__) && !defined(__AVX__)
#ifndef XSIMD_WITH_FMA3_SSE // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_SSE 1
#endif
#else
#if XSIMD_WITH_FMA3_SSE
#error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags"
#endif
#define XSIMD_WITH_FMA3_SSE 0
#endif
#else
#if XSIMD_WITH_FMA3_SSE
#error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags"
#endif
#define XSIMD_WITH_FMA3_SSE 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if FMA3 for AVX is available at compile-time, to 0 otherwise.
*/
#ifdef __FMA__
#if defined(__AVX__)
#ifndef XSIMD_WITH_FMA3_AVX // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_AVX 1
#endif
#else
#if XSIMD_WITH_FMA3_AVX
#error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags"
#endif
#define XSIMD_WITH_FMA3_AVX 0
#endif
#if defined(__AVX2__)
#ifndef XSIMD_WITH_FMA3_AVX2 // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_AVX2 1
#endif
#else
#if XSIMD_WITH_FMA3_AVX2
#error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags"
#endif
#define XSIMD_WITH_FMA3_AVX2 0
#endif
#else
#if XSIMD_WITH_FMA3_AVX
#error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags"
#endif
#if XSIMD_WITH_FMA3_AVX2
#error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags"
#endif
#define XSIMD_WITH_FMA3_AVX 0
#define XSIMD_WITH_FMA3_AVX2 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if FMA4 is available at compile-time, to 0 otherwise.
*/
#ifdef __FMA4__
#define XSIMD_WITH_FMA4 1
#else
#define XSIMD_WITH_FMA4 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if AVX512F is available at compile-time, to 0 otherwise.
*/
#ifdef __AVX512F__
// AVX512 instructions are supported starting with gcc 6
// see https://www.gnu.org/software/gcc/gcc-6/changes.html
// check clang first, newer clang always defines __GNUC__ = 4
#if defined(__clang__) && __clang_major__ >= 6
#define XSIMD_WITH_AVX512F 1
#elif defined(__GNUC__) && __GNUC__ < 6
#define XSIMD_WITH_AVX512F 0
#else
#define XSIMD_WITH_AVX512F 1
#if __GNUC__ == 6
#define XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY 1
#endif
#endif
#else
#define XSIMD_WITH_AVX512F 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if AVX512CD is available at compile-time, to 0 otherwise.
*/
#ifdef __AVX512CD__
// Avoids repeating the GCC workaround over and over
#define XSIMD_WITH_AVX512CD XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512CD 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if AVX512DQ is available at compile-time, to 0 otherwise.
*/
#ifdef __AVX512DQ__
#define XSIMD_WITH_AVX512DQ XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512DQ 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if AVX512BW is available at compile-time, to 0 otherwise.
*/
#ifdef __AVX512BW__
#define XSIMD_WITH_AVX512BW XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512BW 0
#endif
#ifdef __ARM_NEON
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if NEON is available at compile-time, to 0 otherwise.
*/
#if __ARM_ARCH >= 7
#define XSIMD_WITH_NEON 1
#else
#define XSIMD_WITH_NEON 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if NEON64 is available at compile-time, to 0 otherwise.
*/
#ifdef __aarch64__
#define XSIMD_WITH_NEON64 1
#else
#define XSIMD_WITH_NEON64 0
#endif
#else
#define XSIMD_WITH_NEON 0
#define XSIMD_WITH_NEON64 0
#endif
/**
* @ingroup xsimd_config_macro
*
* Set to 1 if SVE is available and bit width is pre-set at compile-time, to 0 otherwise.
*/
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
#define XSIMD_WITH_SVE 1
#define XSIMD_SVE_BITS __ARM_FEATURE_SVE_BITS
#else
#define XSIMD_WITH_SVE 0
#define XSIMD_SVE_BITS 0
#endif
// Workaround for MSVC compiler
#ifdef _MSC_VER
#if XSIMD_WITH_AVX512F
#undef XSIMD_WITH_AVX2
#define XSIMD_WITH_AVX2 1
#endif
#if XSIMD_WITH_AVX2
#undef XSIMD_WITH_AVX
#define XSIMD_WITH_AVX 1
#undef XSIMD_WITH_FMA3_AVX
#define XSIMD_WITH_FMA3_AVX 1
#undef XSIMD_WITH_FMA3_AVX2
#define XSIMD_WITH_FMA3_AVX2 1
#endif
#if XSIMD_WITH_AVX
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif
#if !defined(__clang__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif
#if XSIMD_WITH_SSE4_2
#undef XSIMD_WITH_SSE4_1
#define XSIMD_WITH_SSE4_1 1
#endif
#if XSIMD_WITH_SSE4_1
#undef XSIMD_WITH_SSSE3
#define XSIMD_WITH_SSSE3 1
#endif
#if XSIMD_WITH_SSSE3
#undef XSIMD_WITH_SSE3
#define XSIMD_WITH_SSE3 1
#endif
#if XSIMD_WITH_SSE3 || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#undef XSIMD_WITH_SSE2
#define XSIMD_WITH_SSE2 1
#endif
#endif
#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE
#define XSIMD_NO_SUPPORTED_ARCHITECTURE
#endif
#endif

181
third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp vendored Normal file
View file

@@ -0,0 +1,181 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_CPUID_HPP
#define XSIMD_CPUID_HPP
#include <algorithm>
#include <cstring>
#if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM))
#include <asm/hwcap.h>
#include <sys/auxv.h>
#endif
#if defined(_MSC_VER)
// Contains the definition of __cpuidex
#include <intrin.h>
#endif
#include "../types/xsimd_all_registers.hpp"
namespace xsimd
{
namespace detail
{
struct supported_arch
{
unsigned sse2 : 1;
unsigned sse3 : 1;
unsigned ssse3 : 1;
unsigned sse4_1 : 1;
unsigned sse4_2 : 1;
unsigned sse4a : 1;
unsigned fma3_sse : 1;
unsigned fma4 : 1;
unsigned xop : 1;
unsigned avx : 1;
unsigned fma3_avx : 1;
unsigned avx2 : 1;
unsigned fma3_avx2 : 1;
unsigned avx512f : 1;
unsigned avx512cd : 1;
unsigned avx512dq : 1;
unsigned avx512bw : 1;
unsigned neon : 1;
unsigned neon64 : 1;
// version number of the best arch available
unsigned best;
supported_arch() noexcept
{
memset(this, 0, sizeof(supported_arch));
#if defined(__aarch64__) || defined(_M_ARM64)
neon = 1;
neon64 = 1;
best = neon64::version();
#elif defined(__ARM_NEON) || defined(_M_ARM)
#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
neon = bool(getauxval(AT_HWCAP) & HWCAP_NEON);
#else
// that's very conservative :-/
neon = 0;
#endif
neon64 = 0;
best = neon::version() * neon;
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)
auto get_cpuid = [](int reg[4], int func_id) noexcept
{
#if defined(_MSC_VER)
__cpuidex(reg, func_id, 0);
#elif defined(__INTEL_COMPILER)
__cpuid(reg, func_id);
#elif defined(__GNUC__) || defined(__clang__)
#if defined(__i386__) && defined(__PIC__)
// %ebx may be the PIC register
__asm__("xchg{l}\t{%%}ebx, %1\n\t"
"cpuid\n\t"
"xchg{l}\t{%%}ebx, %1\n\t"
: "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]),
"=d"(reg[3])
: "a"(func_id), "c"(0));
#else
__asm__("cpuid\n\t"
: "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]),
"=d"(reg[3])
: "a"(func_id), "c"(0));
#endif
#else
#error "Unsupported configuration"
#endif
};
int regs[4];
get_cpuid(regs, 0x1);
sse2 = regs[3] >> 26 & 1;
best = std::max(best, sse2::version() * sse2);
sse3 = regs[2] >> 0 & 1;
best = std::max(best, sse3::version() * sse3);
ssse3 = regs[2] >> 9 & 1;
best = std::max(best, ssse3::version() * ssse3);
sse4_1 = regs[2] >> 19 & 1;
best = std::max(best, sse4_1::version() * sse4_1);
sse4_2 = regs[2] >> 20 & 1;
best = std::max(best, sse4_2::version() * sse4_2);
fma3_sse = regs[2] >> 12 & 1;
if (sse4_2)
best = std::max(best, fma3<xsimd::sse4_2>::version() * fma3_sse);
get_cpuid(regs, 0x80000001);
fma4 = regs[2] >> 16 & 1;
best = std::max(best, fma4::version() * fma4);
// sse4a = regs[2] >> 6 & 1;
// best = std::max(best, XSIMD_X86_AMD_SSE4A_VERSION * sse4a);
// xop = regs[2] >> 11 & 1;
// best = std::max(best, XSIMD_X86_AMD_XOP_VERSION * xop);
avx = regs[2] >> 28 & 1;
best = std::max(best, avx::version() * avx);
fma3_avx = avx && fma3_sse;
best = std::max(best, fma3<xsimd::avx>::version() * fma3_avx);
get_cpuid(regs, 0x7);
avx2 = regs[1] >> 5 & 1;
best = std::max(best, avx2::version() * avx2);
fma3_avx2 = avx2 && fma3_sse;
best = std::max(best, fma3<xsimd::avx2>::version() * fma3_avx2);
avx512f = regs[1] >> 16 & 1;
best = std::max(best, avx512f::version() * avx512f);
avx512cd = regs[1] >> 28 & 1;
best = std::max(best, avx512cd::version() * avx512cd * avx512f);
avx512dq = regs[1] >> 17 & 1;
best = std::max(best, avx512dq::version() * avx512dq * avx512cd * avx512f);
avx512bw = regs[1] >> 30 & 1;
best = std::max(best, avx512bw::version() * avx512bw * avx512dq * avx512cd * avx512f);
#endif
}
};
}
inline detail::supported_arch available_architectures() noexcept
{
static detail::supported_arch supported;
return supported;
}
}
#endif
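
available_architectures() performs the detection once (the result lives in a function-local static) and exposes one bit per instruction set plus the version number of the best architecture found. A minimal query sketch, not part of the imported sources:

#include "xsimd/xsimd.hpp"
#include <cstdio>

int main()
{
    auto caps = xsimd::available_architectures();
    std::printf("sse2=%u avx2=%u avx512f=%u neon=%u best=%u\n",
                unsigned(caps.sse2), unsigned(caps.avx2),
                unsigned(caps.avx512f), unsigned(caps.neon), caps.best);
}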

719
third_party/xsimd/include/xsimd/math/xsimd_rem_pio2.hpp vendored Normal file
View file

@@ -0,0 +1,719 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#include <cmath>
#include <cstdint>
#include <cstring>
namespace xsimd
{
namespace detail
{
/* origin: boost/simd/arch/common/scalar/function/rem_pio2.hpp */
/*
* ====================================================
* copyright 2016 NumScale SAS
*
* Distributed under the Boost Software License, Version 1.0.
* (See copy at http://boost.org/LICENSE_1_0.txt)
* ====================================================
*/
#if defined(_MSC_VER)
#define ONCE0 \
__pragma(warning(push)) \
__pragma(warning(disable : 4127)) while (0) \
__pragma(warning(pop)) /**/
#else
#define ONCE0 while (0)
#endif
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunPro, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
#if defined(__GNUC__) && defined(__BYTE_ORDER__)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define XSIMD_LITTLE_ENDIAN
#endif
#elif defined(_WIN32)
// We can safely assume that Windows is always little endian
#define XSIMD_LITTLE_ENDIAN
#elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
#define XSIMD_LITTLE_ENDIAN
#endif
#ifdef XSIMD_LITTLE_ENDIAN
#define LOW_WORD_IDX 0
#define HIGH_WORD_IDX sizeof(std::uint32_t)
#else
#define LOW_WORD_IDX sizeof(std::uint32_t)
#define HIGH_WORD_IDX 0
#endif
#define GET_HIGH_WORD(i, d) \
do \
{ \
double f = (d); \
std::memcpy(&(i), reinterpret_cast<char*>(&f) + HIGH_WORD_IDX, \
sizeof(std::uint32_t)); \
} \
ONCE0 \
/**/
#define GET_LOW_WORD(i, d) \
do \
{ \
double f = (d); \
std::memcpy(&(i), reinterpret_cast<char*>(&f) + LOW_WORD_IDX, \
sizeof(std::uint32_t)); \
} \
ONCE0 \
/**/
#define SET_HIGH_WORD(d, v) \
do \
{ \
double f = (d); \
std::uint32_t value = (v); \
std::memcpy(reinterpret_cast<char*>(&f) + HIGH_WORD_IDX, \
&value, sizeof(std::uint32_t)); \
(d) = f; \
} \
ONCE0 \
/**/
#define SET_LOW_WORD(d, v) \
do \
{ \
double f = (d); \
std::uint32_t value = (v); \
std::memcpy(reinterpret_cast<char*>(&f) + LOW_WORD_IDX, \
&value, sizeof(std::uint32_t)); \
(d) = f; \
} \
ONCE0 \
/**/
/*
* __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
* double x[],y[]; int e0,nx,prec; int ipio2[];
*
         * __kernel_rem_pio2 returns the last three digits of N with
* y = x - N*pi/2
* so that |y| < pi/2.
*
* The method is to compute the integer (mod 8) and fraction parts of
* (2/pi)*x without doing the full multiplication. In general we
* skip the part of the product that are known to be a huge integer (
* more accurately, = 0 mod 8 ). Thus the number of operations are
* independent of the exponent of the input.
*
* (2/pi) is represented by an array of 24-bit integers in ipio2[].
*
* Input parameters:
* x[] The input value (must be positive) is broken into nx
* pieces of 24-bit integers in double precision format.
* x[i] will be the i-th 24 bit of x. The scaled exponent
* of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
* match x's up to 24 bits.
         *   matches x up to 24 bits).
* Example of breaking a double positive z into x[0]+x[1]+x[2]:
* e0 = ilogb(z)-23
* z = scalbn(z,-e0)
* for i = 0,1,2
* x[i] = floor(z)
* z = (z-x[i])*2**24
*
*
         *   y[]   output result in an array of double precision numbers.
* The dimension of y[] is:
* 24-bit precision 1
* 53-bit precision 2
* 64-bit precision 2
* 113-bit precision 3
* The actual value is the sum of them. Thus for 113-bit
         *   precision, one may have to do something like:
*
* long double t,w,r_head, r_tail;
* t = (long double)y[2] + (long double)y[1];
* w = (long double)y[0];
* r_head = t+w;
* r_tail = w - (r_head - t);
*
* e0 The exponent of x[0]
*
* nx dimension of x[]
*
* prec an integer indicating the precision:
* 0 24 bits (single)
* 1 53 bits (double)
* 2 64 bits (extended)
* 3 113 bits (quad)
*
* ipio2[]
* integer array, contains the (24*i)-th to (24*i+23)-th
* bit of 2/pi after binary point. The corresponding
* floating value is
*
* ipio2[i] * 2^(-24(i+1)).
*
* External function:
* double scalbn(), floor();
*
*
* Here is the description of some local variables:
*
* jk jk+1 is the initial number of terms of ipio2[] needed
* in the computation. The recommended value is 2,3,4,
         *          6 for single, double, extended, and quad.
*
* jz local integer variable indicating the number of
* terms of ipio2[] used.
*
* jx nx - 1
*
* jv index for pointing to the suitable ipio2[] for the
* computation. In general, we want
* ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
* is an integer. Thus
* e0-3-24*jv >= 0 or (e0-3)/24 >= jv
* Hence jv = max(0,(e0-3)/24).
*
* jp jp+1 is the number of terms in PIo2[] needed, jp = jk.
*
* q[] double array with integral value, representing the
* 24-bits chunk of the product of x and 2/pi.
*
* q0 the corresponding exponent of q[0]. Note that the
* exponent for q[i] would be q0-24*i.
*
* PIo2[] double precision array, obtained by cutting pi/2
* into 24 bits chunks.
*
* f[] ipio2[] in floating point
*
* iq[] integer array by breaking up q[] in 24-bits chunk.
*
* fq[] final product of x*(2/pi) in fq[0],..,fq[jk]
*
* ih integer. If >0 it indicates q[] is >= 0.5, hence
* it also indicates the *sign* of the result.
*
*/
inline int32_t __kernel_rem_pio2(double* x, double* y, int32_t e0, int32_t nx, int32_t prec, const int32_t* ipio2) noexcept
{
static const int32_t init_jk[] = { 2, 3, 4, 6 }; /* initial value for jk */
static const double PIo2[] = {
1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
};
static const double
zero
= 0.0,
one = 1.0,
two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
twon24 = 5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */
int32_t jz, jx, jv, jp, jk, carry, n, iq[20], i, j, k, m, q0, ih;
double z, fw, f[20], fq[20], q[20];
/* initialize jk*/
jk = init_jk[prec];
jp = jk;
/* determine jx,jv,q0, note that 3>q0 */
jx = nx - 1;
jv = (e0 - 3) / 24;
if (jv < 0)
jv = 0;
q0 = e0 - 24 * (jv + 1);
/* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
j = jv - jx;
m = jx + jk;
for (i = 0; i <= m; i++, j++)
f[i] = (j < 0) ? zero : (double)ipio2[j];
/* compute q[0],q[1],...q[jk] */
for (i = 0; i <= jk; i++)
{
for (j = 0, fw = 0.0; j <= jx; j++)
fw += x[j] * f[jx + i - j];
q[i] = fw;
}
jz = jk;
recompute:
/* distill q[] into iq[] reversingly */
for (i = 0, j = jz, z = q[jz]; j > 0; i++, j--)
{
fw = (double)((int32_t)(twon24 * z));
iq[i] = (int)(z - two24 * fw);
z = q[j - 1] + fw;
}
/* compute n */
z = std::scalbn(z, q0); /* actual value of z */
z -= 8.0 * std::floor(z * 0.125); /* trim off integer >= 8 */
n = (int32_t)z;
z -= (double)n;
ih = 0;
if (q0 > 0)
{ /* need iq[jz-1] to determine n */
i = (iq[jz - 1] >> (24 - q0));
n += i;
iq[jz - 1] -= i << (24 - q0);
ih = iq[jz - 1] >> (23 - q0);
}
else if (q0 == 0)
ih = iq[jz - 1] >> 23;
else if (z >= 0.5)
ih = 2;
if (ih > 0)
{ /* q > 0.5 */
n += 1;
carry = 0;
for (i = 0; i < jz; i++)
{ /* compute 1-q */
j = iq[i];
if (carry == 0)
{
if (j != 0)
{
carry = 1;
iq[i] = 0x1000000 - j;
}
}
else
iq[i] = 0xffffff - j;
}
if (q0 > 0)
{ /* rare case: chance is 1 in 12 */
switch (q0)
{
case 1:
iq[jz - 1] &= 0x7fffff;
break;
case 2:
iq[jz - 1] &= 0x3fffff;
break;
}
}
if (ih == 2)
{
z = one - z;
if (carry != 0)
z -= std::scalbn(one, q0);
}
}
/* check if recomputation is needed */
if (z == zero)
{
j = 0;
for (i = jz - 1; i >= jk; i--)
j |= iq[i];
if (j == 0)
{ /* need recomputation */
for (k = 1; iq[jk - k] == 0; k++)
; /* k = no. of terms needed */
for (i = jz + 1; i <= jz + k; i++)
{ /* add q[jz+1] to q[jz+k] */
f[jx + i] = (double)ipio2[jv + i];
for (j = 0, fw = 0.0; j <= jx; j++)
fw += x[j] * f[jx + i - j];
q[i] = fw;
}
jz += k;
goto recompute;
}
}
/* chop off zero terms */
if (z == 0.0)
{
jz -= 1;
q0 -= 24;
while (iq[jz] == 0)
{
jz--;
q0 -= 24;
}
}
else
{ /* break z into 24-bit if necessary */
z = std::scalbn(z, -q0);
if (z >= two24)
{
fw = (double)((int32_t)(twon24 * z));
iq[jz] = (int32_t)(z - two24 * fw);
jz += 1;
q0 += 24;
iq[jz] = (int32_t)fw;
}
else
iq[jz] = (int32_t)z;
}
/* convert integer "bit" chunk to floating-point value */
fw = scalbn(one, q0);
for (i = jz; i >= 0; i--)
{
q[i] = fw * (double)iq[i];
fw *= twon24;
}
/* compute PIo2[0,...,jp]*q[jz,...,0] */
for (i = jz; i >= 0; i--)
{
for (fw = 0.0, k = 0; k <= jp && k <= jz - i; k++)
fw += PIo2[k] * q[i + k];
fq[jz - i] = fw;
}
/* compress fq[] into y[] */
switch (prec)
{
case 0:
fw = 0.0;
for (i = jz; i >= 0; i--)
fw += fq[i];
y[0] = (ih == 0) ? fw : -fw;
break;
case 1:
case 2:
fw = 0.0;
for (i = jz; i >= 0; i--)
fw += fq[i];
y[0] = (ih == 0) ? fw : -fw;
fw = fq[0] - fw;
for (i = 1; i <= jz; i++)
fw += fq[i];
y[1] = (ih == 0) ? fw : -fw;
break;
case 3: /* painful */
for (i = jz; i > 0; i--)
{
fw = fq[i - 1] + fq[i];
fq[i] += fq[i - 1] - fw;
fq[i - 1] = fw;
}
for (i = jz; i > 1; i--)
{
fw = fq[i - 1] + fq[i];
fq[i] += fq[i - 1] - fw;
fq[i - 1] = fw;
}
for (fw = 0.0, i = jz; i >= 2; i--)
fw += fq[i];
if (ih == 0)
{
y[0] = fq[0];
y[1] = fq[1];
y[2] = fw;
}
else
{
y[0] = -fq[0];
y[1] = -fq[1];
y[2] = -fw;
}
}
return n & 7;
}
inline std::int32_t __ieee754_rem_pio2(double x, double* y) noexcept
{
static const std::int32_t two_over_pi[] = {
0xA2F983,
0x6E4E44,
0x1529FC,
0x2757D1,
0xF534DD,
0xC0DB62,
0x95993C,
0x439041,
0xFE5163,
0xABDEBB,
0xC561B7,
0x246E3A,
0x424DD2,
0xE00649,
0x2EEA09,
0xD1921C,
0xFE1DEB,
0x1CB129,
0xA73EE8,
0x8235F5,
0x2EBB44,
0x84E99C,
0x7026B4,
0x5F7E41,
0x3991D6,
0x398353,
0x39F49C,
0x845F8B,
0xBDF928,
0x3B1FF8,
0x97FFDE,
0x05980F,
0xEF2F11,
0x8B5A0A,
0x6D1F6D,
0x367ECF,
0x27CB09,
0xB74F46,
0x3F669E,
0x5FEA2D,
0x7527BA,
0xC7EBE5,
0xF17B3D,
0x0739F7,
0x8A5292,
0xEA6BFB,
0x5FB11F,
0x8D5D08,
0x560330,
0x46FC7B,
0x6BABF0,
0xCFBC20,
0x9AF436,
0x1DA9E3,
0x91615E,
0xE61B08,
0x659985,
0x5F14A0,
0x68408D,
0xFFD880,
0x4D7327,
0x310606,
0x1556CA,
0x73A8C9,
0x60E27B,
0xC08C6B,
};
static const std::int32_t npio2_hw[] = {
0x3FF921FB,
0x400921FB,
0x4012D97C,
0x401921FB,
0x401F6A7A,
0x4022D97C,
0x4025FDBB,
0x402921FB,
0x402C463A,
0x402F6A7A,
0x4031475C,
0x4032D97C,
0x40346B9C,
0x4035FDBB,
0x40378FDB,
0x403921FB,
0x403AB41B,
0x403C463A,
0x403DD85A,
0x403F6A7A,
0x40407E4C,
0x4041475C,
0x4042106C,
0x4042D97C,
0x4043A28C,
0x40446B9C,
0x404534AC,
0x4045FDBB,
0x4046C6CB,
0x40478FDB,
0x404858EB,
0x404921FB,
};
/*
* invpio2: 53 bits of 2/pi
* pio2_1: first 33 bit of pi/2
* pio2_1t: pi/2 - pio2_1
* pio2_2: second 33 bit of pi/2
* pio2_2t: pi/2 - (pio2_1+pio2_2)
* pio2_3: third 33 bit of pi/2
* pio2_3t: pi/2 - (pio2_1+pio2_2+pio2_3)
*/
static const double
zero
= 0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
half = 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
pio2_1 = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
pio2_1t = 6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
pio2_2 = 6.07710050630396597660e-11, /* 0x3DD0B461, 0x1A600000 */
pio2_2t = 2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 */
pio2_3 = 2.02226624871116645580e-21, /* 0x3BA3198A, 0x2E000000 */
pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */
double z = 0., w, t, r, fn;
double tx[3];
std::int32_t e0, i, j, nx, n, ix, hx;
std::uint32_t low;
GET_HIGH_WORD(hx, x); /* high word of x */
ix = hx & 0x7fffffff;
if (ix <= 0x3fe921fb) /* |x| ~<= pi/4 , no need for reduction */
{
y[0] = x;
y[1] = 0;
return 0;
}
if (ix < 0x4002d97c)
{ /* |x| < 3pi/4, special case with n=+-1 */
if (hx > 0)
{
z = x - pio2_1;
if (ix != 0x3ff921fb)
{ /* 33+53 bit pi is good enough */
y[0] = z - pio2_1t;
y[1] = (z - y[0]) - pio2_1t;
}
else
{ /* near pi/2, use 33+33+53 bit pi */
z -= pio2_2;
y[0] = z - pio2_2t;
y[1] = (z - y[0]) - pio2_2t;
}
return 1;
}
else
{ /* negative x */
z = x + pio2_1;
if (ix != 0x3ff921fb)
{ /* 33+53 bit pi is good enough */
y[0] = z + pio2_1t;
y[1] = (z - y[0]) + pio2_1t;
}
else
{ /* near pi/2, use 33+33+53 bit pi */
z += pio2_2;
y[0] = z + pio2_2t;
y[1] = (z - y[0]) + pio2_2t;
}
return -1;
}
}
if (ix <= 0x413921fb)
            { /* |x| ~<= 2^19*(pi/2), medium size */
t = std::fabs(x);
n = (std::int32_t)(t * invpio2 + half);
fn = (double)n;
r = t - fn * pio2_1;
w = fn * pio2_1t; /* 1st round good to 85 bit */
if ((n < 32) && (n > 0) && (ix != npio2_hw[n - 1]))
{
y[0] = r - w; /* quick check no cancellation */
}
else
{
std::uint32_t high;
j = ix >> 20;
y[0] = r - w;
GET_HIGH_WORD(high, y[0]);
i = j - static_cast<int32_t>((high >> 20) & 0x7ff);
if (i > 16)
{ /* 2nd iteration needed, good to 118 */
t = r;
w = fn * pio2_2;
r = t - w;
w = fn * pio2_2t - ((t - r) - w);
y[0] = r - w;
GET_HIGH_WORD(high, y[0]);
i = j - static_cast<int32_t>((high >> 20) & 0x7ff);
if (i > 49)
                        { /* 3rd iteration needed, 151 bits acc */
t = r; /* will cover all possible cases */
w = fn * pio2_3;
r = t - w;
w = fn * pio2_3t - ((t - r) - w);
y[0] = r - w;
}
}
}
y[1] = (r - y[0]) - w;
if (hx < 0)
{
y[0] = -y[0];
y[1] = -y[1];
return -n;
}
else
return n;
}
/*
* all other (large) arguments
*/
if (ix >= 0x7ff00000)
{ /* x is inf or NaN */
y[0] = y[1] = x - x;
return 0;
}
/* set z = scalbn(|x|,ilogb(x)-23) */
GET_LOW_WORD(low, x);
SET_LOW_WORD(z, low);
e0 = (ix >> 20) - 1046; /* e0 = ilogb(z)-23; */
SET_HIGH_WORD(z, static_cast<uint32_t>(ix - (e0 << 20)));
for (i = 0; i < 2; i++)
{
tx[i] = (double)((std::int32_t)(z));
z = (z - tx[i]) * two24;
}
tx[2] = z;
nx = 3;
while (tx[nx - 1] == zero)
nx--; /* skip zero term */
n = __kernel_rem_pio2(tx, y, e0, nx, 2, two_over_pi);
if (hx < 0)
{
y[0] = -y[0];
y[1] = -y[1];
return -n;
}
return n;
}
}
#undef XSIMD_LITTLE_ENDIAN
#undef SET_LOW_WORD
#undef SET_HIGH_WORD
#undef GET_LOW_WORD
#undef GET_HIGH_WORD
#undef HIGH_WORD_IDX
#undef LOW_WORD_IDX
#undef ONCE0
}
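
The reduction contract is x == n*(pi/2) + y[0] + y[1] (y[1] being a correction term), and the returned n is what n & 3 turns into a quadrant. The sketch below, not part of the imported sources, calls the internal helper directly purely for illustration; real callers go through the xsimd trigonometric kernels.

// Illustrative sketch only; __ieee754_rem_pio2 is an internal detail helper.
// Assumes the header above has been included (it has no include guard and is
// normally pulled in by the xsimd math kernels).
#include <cstdint>
#include <cstdio>

int main()
{
    double y[2];
    double x = 1.0e6;
    std::int32_t n = xsimd::detail::__ieee754_rem_pio2(x, y);
    // sin(x) equals +/- sin or cos of (y[0] + y[1]) depending on n & 3
    std::printf("quadrant=%d reduced=%.17g\n", int(n & 3), y[0] + y[1]);
}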

View file

@@ -0,0 +1,349 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_ALIGNED_ALLOCATOR_HPP
#define XSIMD_ALIGNED_ALLOCATOR_HPP
#include <algorithm>
#include <cstddef>
#include <utility>
#ifdef _WIN32
#include <malloc.h>
#else
#include <cstdlib>
#endif
#include <cassert>
#include <memory>
#include "../config/xsimd_arch.hpp"
namespace xsimd
{
/**
* @class aligned_allocator
* @brief Allocator for aligned memory
*
* The aligned_allocator class template is an allocator that
* performs memory allocation aligned by the specified value.
*
* @tparam T type of objects to allocate.
* @tparam Align alignment in bytes.
*/
template <class T, size_t Align = default_arch::alignment()>
class aligned_allocator
{
public:
using value_type = T;
using pointer = T*;
using const_pointer = const T*;
using reference = T&;
using const_reference = const T&;
using size_type = size_t;
using difference_type = ptrdiff_t;
static constexpr size_t alignment = Align;
template <class U>
struct rebind
{
using other = aligned_allocator<U, Align>;
};
aligned_allocator() noexcept;
aligned_allocator(const aligned_allocator& rhs) noexcept;
template <class U>
aligned_allocator(const aligned_allocator<U, Align>& rhs) noexcept;
~aligned_allocator();
pointer address(reference) noexcept;
const_pointer address(const_reference) const noexcept;
pointer allocate(size_type n, const void* hint = 0);
void deallocate(pointer p, size_type n);
size_type max_size() const noexcept;
size_type size_max() const noexcept;
template <class U, class... Args>
void construct(U* p, Args&&... args);
template <class U>
void destroy(U* p);
};
template <class T1, size_t Align1, class T2, size_t Align2>
bool operator==(const aligned_allocator<T1, Align1>& lhs,
const aligned_allocator<T2, Align2>& rhs) noexcept;
template <class T1, size_t Align1, class T2, size_t Align2>
bool operator!=(const aligned_allocator<T1, Align1>& lhs,
const aligned_allocator<T2, Align2>& rhs) noexcept;
void* aligned_malloc(size_t size, size_t alignment);
void aligned_free(void* ptr);
template <class T>
size_t get_alignment_offset(const T* p, size_t size, size_t block_size);
/************************************
* aligned_allocator implementation *
************************************/
/**
* Default constructor.
*/
template <class T, size_t A>
inline aligned_allocator<T, A>::aligned_allocator() noexcept
{
}
/**
* Copy constructor.
*/
template <class T, size_t A>
inline aligned_allocator<T, A>::aligned_allocator(const aligned_allocator&) noexcept
{
}
/**
* Extended copy constructor.
*/
template <class T, size_t A>
template <class U>
inline aligned_allocator<T, A>::aligned_allocator(const aligned_allocator<U, A>&) noexcept
{
}
/**
* Destructor.
*/
template <class T, size_t A>
inline aligned_allocator<T, A>::~aligned_allocator()
{
}
/**
* Returns the actual address of \c r even in presence of overloaded \c operator&.
* @param r the object to acquire address of.
* @return the actual address of \c r.
*/
template <class T, size_t A>
inline auto
aligned_allocator<T, A>::address(reference r) noexcept -> pointer
{
return &r;
}
/**
* Returns the actual address of \c r even in presence of overloaded \c operator&.
* @param r the object to acquire address of.
* @return the actual address of \c r.
*/
template <class T, size_t A>
inline auto
aligned_allocator<T, A>::address(const_reference r) const noexcept -> const_pointer
{
return &r;
}
/**
* Allocates <tt>n * sizeof(T)</tt> bytes of uninitialized memory, aligned by \c A.
* The alignment may require some extra memory allocation.
* @param n the number of objects to allocate storage for.
* @param hint unused parameter provided for standard compliance.
* @return a pointer to the first byte of a memory block suitably aligned and sufficient to
* hold an array of \c n objects of type \c T.
*/
template <class T, size_t A>
inline auto
aligned_allocator<T, A>::allocate(size_type n, const void*) -> pointer
{
pointer res = reinterpret_cast<pointer>(aligned_malloc(sizeof(T) * n, A));
#if defined(_CPPUNWIND) || defined(__cpp_exceptions)
if (res == nullptr)
throw std::bad_alloc();
#endif
return res;
}
/**
* Deallocates the storage referenced by the pointer p, which must be a pointer obtained by
* an earlier call to allocate(). The argument \c n must be equal to the first argument of the call
* to allocate() that originally produced \c p; otherwise, the behavior is undefined.
* @param p pointer obtained from allocate().
* @param n number of objects earlier passed to allocate().
*/
template <class T, size_t A>
inline void aligned_allocator<T, A>::deallocate(pointer p, size_type)
{
aligned_free(p);
}
/**
* Returns the maximum theoretically possible value of \c n, for which the
* call allocate(n, 0) could succeed.
* @return the maximum supported allocated size.
*/
template <class T, size_t A>
inline auto
aligned_allocator<T, A>::max_size() const noexcept -> size_type
{
return size_type(-1) / sizeof(T);
}
/**
* This method is deprecated, use max_size() instead
*/
template <class T, size_t A>
inline auto
aligned_allocator<T, A>::size_max() const noexcept -> size_type
{
return size_type(-1) / sizeof(T);
}
/**
* Constructs an object of type \c T in allocated uninitialized memory
* pointed to by \c p, using placement-new.
* @param p pointer to allocated uninitialized memory.
* @param args the constructor arguments to use.
*/
template <class T, size_t A>
template <class U, class... Args>
inline void aligned_allocator<T, A>::construct(U* p, Args&&... args)
{
new ((void*)p) U(std::forward<Args>(args)...);
}
/**
* Calls the destructor of the object pointed to by \c p.
* @param p pointer to the object that is going to be destroyed.
*/
template <class T, size_t A>
template <class U>
inline void aligned_allocator<T, A>::destroy(U* p)
{
p->~U();
}
/**
* @defgroup allocator_comparison Comparison operators
*/
/**
* @ingroup allocator_comparison
     * Compares two aligned memory allocators for equality. Since allocators
* are stateless, return \c true iff <tt>A1 == A2</tt>.
* @param lhs aligned_allocator to compare.
* @param rhs aligned_allocator to compare.
* @return true if the allocators have the same alignment.
*/
template <class T1, size_t A1, class T2, size_t A2>
inline bool operator==(const aligned_allocator<T1, A1>& lhs,
const aligned_allocator<T2, A2>& rhs) noexcept
{
return lhs.alignment == rhs.alignment;
}
/**
* @ingroup allocator_comparison
     * Compares two aligned memory allocators for inequality. Since allocators
* are stateless, return \c true iff <tt>A1 != A2</tt>.
* @param lhs aligned_allocator to compare.
* @param rhs aligned_allocator to compare.
* @return true if the allocators have different alignments.
*/
template <class T1, size_t A1, class T2, size_t A2>
inline bool operator!=(const aligned_allocator<T1, A1>& lhs,
const aligned_allocator<T2, A2>& rhs) noexcept
{
return !(lhs == rhs);
}
/****************************************
* aligned malloc / free implementation *
****************************************/
namespace detail
{
inline void* xaligned_malloc(size_t size, size_t alignment)
{
assert(((alignment & (alignment - 1)) == 0) && "alignment must be a power of two");
assert((alignment >= sizeof(void*)) && "alignment must be at least the size of a pointer");
void* res = nullptr;
#ifdef _WIN32
res = _aligned_malloc(size, alignment);
#else
if (posix_memalign(&res, alignment, size) != 0)
{
res = nullptr;
}
#endif
return res;
}
inline void xaligned_free(void* ptr)
{
#ifdef _WIN32
_aligned_free(ptr);
#else
free(ptr);
#endif
}
}
inline void* aligned_malloc(size_t size, size_t alignment)
{
return detail::xaligned_malloc(size, alignment);
}
inline void aligned_free(void* ptr)
{
detail::xaligned_free(ptr);
}
template <class T>
inline size_t get_alignment_offset(const T* p, size_t size, size_t block_size)
{
// size_t block_size = simd_traits<T>::size;
if (block_size == 1)
{
// The simd_block consists of exactly one scalar so that all
// elements of the array
// are "well" aligned.
return 0;
}
else if (size_t(p) & (sizeof(T) - 1))
{
// The array is not aligned to the size of a single element, so that
// no element
// of the array is well aligned
return size;
}
else
{
size_t block_mask = block_size - 1;
return std::min<size_t>(
(block_size - ((size_t(p) / sizeof(T)) & block_mask)) & block_mask,
size);
}
}
template <class T, class A = default_arch>
using default_allocator = typename std::conditional<A::requires_alignment(),
aligned_allocator<T, A::alignment()>,
std::allocator<T>>::type;
}
#endif
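
A minimal usage sketch, not part of the imported sources: giving a standard container the aligned allocator so that batches can use the aligned load/store path (default_allocator, defined just above, degrades to std::allocator when the architecture does not require alignment). It assumes the container size is a multiple of the batch size.

#include "xsimd/xsimd.hpp"
#include <cstddef>
#include <vector>

// Align defaults to default_arch::alignment(), so data() is suitably aligned.
using aligned_vector = std::vector<float, xsimd::aligned_allocator<float>>;

void fill_ones(aligned_vector& v)
{
    using batch = xsimd::batch<float>;
    // assumes v.size() % batch::size == 0
    for (std::size_t i = 0; i < v.size(); i += batch::size)
        batch(1.0f).store_aligned(v.data() + i);
}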

76
third_party/xsimd/include/xsimd/memory/xsimd_alignment.hpp поставляемый Normal file
Просмотреть файл

@ -0,0 +1,76 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_ALIGNMENT_HPP
#define XSIMD_ALIGNMENT_HPP
#include "../types/xsimd_utils.hpp"
#include "xsimd_aligned_allocator.hpp"
namespace xsimd
{
/**
* @struct aligned_mode
* @brief tag for load and store of aligned memory.
*/
struct aligned_mode
{
};
/**
* @struct unaligned_mode
* @brief tag for load and store of unaligned memory.
*/
struct unaligned_mode
{
};
/***********************
* Allocator alignment *
***********************/
template <class A>
struct allocator_alignment
{
using type = unaligned_mode;
};
template <class T>
struct allocator_alignment<aligned_allocator<T>>
{
using type = aligned_mode;
};
template <class A>
using allocator_alignment_t = typename allocator_alignment<A>::type;
/***********************
* container alignment *
***********************/
template <class C, class = void>
struct container_alignment
{
using type = unaligned_mode;
};
template <class C>
struct container_alignment<C, detail::void_t<typename C::allocator_type>>
{
using type = allocator_alignment_t<typename C::allocator_type>;
};
template <class C>
using container_alignment_t = typename container_alignment<C>::type;
}
#endif
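
A sketch, not part of the imported sources, of how these tags are typically consumed: container_alignment_t yields aligned_mode for containers whose allocator is xsimd::aligned_allocator and unaligned_mode otherwise, and plain tag dispatch then selects the matching load. It assumes a contiguous container.

#include "xsimd/xsimd.hpp"

template <class T>
xsimd::batch<T> load_front(const T* ptr, xsimd::aligned_mode)
{
    return xsimd::batch<T>::load_aligned(ptr);
}

template <class T>
xsimd::batch<T> load_front(const T* ptr, xsimd::unaligned_mode)
{
    return xsimd::batch<T>::load_unaligned(ptr);
}

template <class Container>
auto load_front(const Container& c)
    -> xsimd::batch<typename Container::value_type>
{
    // aligned_mode for std::vector<T, xsimd::aligned_allocator<T>>,
    // unaligned_mode for anything else
    using mode = xsimd::container_alignment_t<Container>;
    return load_front(c.data(), mode {});
}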

32
third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp vendored Normal file
View file

@@ -0,0 +1,32 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#include "xsimd_fma3_sse_register.hpp"
#include "xsimd_fma4_register.hpp"
#include "xsimd_sse2_register.hpp"
#include "xsimd_sse3_register.hpp"
#include "xsimd_sse4_1_register.hpp"
#include "xsimd_sse4_2_register.hpp"
#include "xsimd_avx2_register.hpp"
#include "xsimd_avx_register.hpp"
#include "xsimd_fma3_avx2_register.hpp"
#include "xsimd_fma3_avx_register.hpp"
#include "xsimd_avx512bw_register.hpp"
#include "xsimd_avx512cd_register.hpp"
#include "xsimd_avx512dq_register.hpp"
#include "xsimd_avx512f_register.hpp"
#include "xsimd_neon64_register.hpp"
#include "xsimd_neon_register.hpp"
#include "xsimd_sve_register.hpp"

2309
third_party/xsimd/include/xsimd/types/xsimd_api.hpp vendored Normal file

The diff is not shown because of its large size.

40
third_party/xsimd/include/xsimd/types/xsimd_avx2_register.hpp vendored Normal file
View file

@@ -0,0 +1,40 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_AVX2_REGISTER_HPP
#define XSIMD_AVX2_REGISTER_HPP
#include "./xsimd_avx_register.hpp"
namespace xsimd
{
/**
* @ingroup arch
*
* AVX2 instructions
*/
struct avx2 : avx
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX2; }
static constexpr bool available() noexcept { return true; }
static constexpr unsigned version() noexcept { return generic::version(2, 2, 0); }
static constexpr char const* name() noexcept { return "avx2"; }
};
#if XSIMD_WITH_AVX2
namespace types
{
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx2, avx);
}
#endif
}
#endif
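
Every architecture tag exposes the same static interface (supported, available, version, alignment, name); this is what arch_list, the cpuid bookkeeping, and the dispatcher rely on. A small query sketch, not part of the imported sources:

#include "xsimd/xsimd.hpp"
#include <cstdio>

int main()
{
    std::printf("%s: compiled-in=%d version=%u alignment=%zu\n",
                xsimd::avx2::name(), int(xsimd::avx2::supported()),
                xsimd::avx2::version(), xsimd::avx2::alignment());
}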

View file

@@ -0,0 +1,48 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_AVX512BW_REGISTER_HPP
#define XSIMD_AVX512BW_REGISTER_HPP
#include "./xsimd_avx512dq_register.hpp"
namespace xsimd
{
/**
* @ingroup arch
*
* AVX512BW instructions
*/
struct avx512bw : avx512dq
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512BW; }
static constexpr bool available() noexcept { return true; }
static constexpr unsigned version() noexcept { return generic::version(3, 4, 0); }
static constexpr char const* name() noexcept { return "avx512bw"; }
};
#if XSIMD_WITH_AVX512BW
namespace types
{
template <class T>
struct get_bool_simd_register<T, avx512bw>
{
using type = simd_avx512_bool_register<T>;
};
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512bw, avx512dq);
}
#endif
}
#endif

View file

@@ -0,0 +1,48 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_AVX512CD_REGISTER_HPP
#define XSIMD_AVX512CD_REGISTER_HPP
#include "./xsimd_avx512f_register.hpp"
namespace xsimd
{
/**
* @ingroup arch
*
     * AVX512CD instructions
*/
struct avx512cd : avx512f
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512CD; }
static constexpr bool available() noexcept { return true; }
static constexpr unsigned version() noexcept { return generic::version(3, 2, 0); }
static constexpr char const* name() noexcept { return "avx512cd"; }
};
#if XSIMD_WITH_AVX512CD
namespace types
{
template <class T>
struct get_bool_simd_register<T, avx512cd>
{
using type = simd_avx512_bool_register<T>;
};
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512cd, avx512f);
}
#endif
}
#endif

View file

@@ -0,0 +1,48 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_AVX512DQ_REGISTER_HPP
#define XSIMD_AVX512DQ_REGISTER_HPP
#include "./xsimd_avx512cd_register.hpp"
namespace xsimd
{
/**
* @ingroup arch
*
* AVX512DQ instructions
*/
struct avx512dq : avx512cd
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512DQ; }
static constexpr bool available() noexcept { return true; }
static constexpr unsigned version() noexcept { return generic::version(3, 3, 0); }
static constexpr char const* name() noexcept { return "avx512dq"; }
};
#if XSIMD_WITH_AVX512DQ
namespace types
{
template <class T>
struct get_bool_simd_register<T, avx512dq>
{
using type = simd_avx512_bool_register<T>;
};
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512dq, avx512cd);
}
#endif
}
#endif

View file

@@ -0,0 +1,75 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_AVX512F_REGISTER_HPP
#define XSIMD_AVX512F_REGISTER_HPP
#include "./xsimd_generic_arch.hpp"
namespace xsimd
{
/**
* @ingroup arch
*
* AVX512F instructions
*/
struct avx512f : generic
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512F; }
static constexpr bool available() noexcept { return true; }
static constexpr unsigned version() noexcept { return generic::version(3, 1, 0); }
static constexpr std::size_t alignment() noexcept { return 64; }
static constexpr bool requires_alignment() noexcept { return true; }
static constexpr char const* name() noexcept { return "avx512f"; }
};
#if XSIMD_WITH_AVX512F
namespace types
{
template <class T>
struct simd_avx512_bool_register
{
using register_type = typename std::conditional<
(sizeof(T) < 4), std::conditional<(sizeof(T) == 1), __mmask64, __mmask32>,
std::conditional<(sizeof(T) == 4), __mmask16, __mmask8>>::type::type;
register_type data;
simd_avx512_bool_register() = default;
simd_avx512_bool_register(register_type r) { data = r; }
operator register_type() const noexcept { return data; }
};
template <class T>
struct get_bool_simd_register<T, avx512f>
{
using type = simd_avx512_bool_register<T>;
};
XSIMD_DECLARE_SIMD_REGISTER(bool, avx512f, __m512i);
XSIMD_DECLARE_SIMD_REGISTER(signed char, avx512f, __m512i);
XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx512f, __m512i);
XSIMD_DECLARE_SIMD_REGISTER(char, avx512f, __m512i);
XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx512f, __m512i);
XSIMD_DECLARE_SIMD_REGISTER(short, avx512f, __m512i);
XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx512f, __m512i);
XSIMD_DECLARE_SIMD_REGISTER(int, avx512f, __m512i);
XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx512f, __m512i);
XSIMD_DECLARE_SIMD_REGISTER(long int, avx512f, __m512i);
XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx512f, __m512i);
XSIMD_DECLARE_SIMD_REGISTER(long long int, avx512f, __m512i);
XSIMD_DECLARE_SIMD_REGISTER(float, avx512f, __m512);
XSIMD_DECLARE_SIMD_REGISTER(double, avx512f, __m512d);
}
#endif
}
#endif

62
third_party/xsimd/include/xsimd/types/xsimd_avx_register.hpp vendored Normal file
View file

@@ -0,0 +1,62 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_AVX_REGISTER_HPP
#define XSIMD_AVX_REGISTER_HPP
#include "./xsimd_generic_arch.hpp"
namespace xsimd
{
/**
* @ingroup arch
*
* AVX instructions
*/
struct avx : generic
{
static constexpr bool supported() noexcept { return XSIMD_WITH_AVX; }
static constexpr bool available() noexcept { return true; }
static constexpr unsigned version() noexcept { return generic::version(2, 1, 0); }
static constexpr std::size_t alignment() noexcept { return 32; }
static constexpr bool requires_alignment() noexcept { return true; }
static constexpr char const* name() noexcept { return "avx"; }
};
}
#if XSIMD_WITH_AVX
#include <immintrin.h>
namespace xsimd
{
namespace types
{
XSIMD_DECLARE_SIMD_REGISTER(bool, avx, __m256i);
XSIMD_DECLARE_SIMD_REGISTER(signed char, avx, __m256i);
XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx, __m256i);
XSIMD_DECLARE_SIMD_REGISTER(char, avx, __m256i);
XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx, __m256i);
XSIMD_DECLARE_SIMD_REGISTER(short, avx, __m256i);
XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx, __m256i);
XSIMD_DECLARE_SIMD_REGISTER(int, avx, __m256i);
XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx, __m256i);
XSIMD_DECLARE_SIMD_REGISTER(long int, avx, __m256i);
XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx, __m256i);
XSIMD_DECLARE_SIMD_REGISTER(long long int, avx, __m256i);
XSIMD_DECLARE_SIMD_REGISTER(float, avx, __m256);
XSIMD_DECLARE_SIMD_REGISTER(double, avx, __m256d);
}
}
#endif
#endif

1491
third_party/xsimd/include/xsimd/types/xsimd_batch.hpp vendored Normal file

The diff is not shown because of its large size.

147
third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp vendored Normal file
View file

@@ -0,0 +1,147 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_BATCH_CONSTANT_HPP
#define XSIMD_BATCH_CONSTANT_HPP
#include "./xsimd_batch.hpp"
#include "./xsimd_utils.hpp"
namespace xsimd
{
/**
* @brief batch of boolean constant
*
* Abstract representation of a batch of boolean constants.
*
* @tparam batch_type the type of the associated batch values.
* @tparam Values boolean constant represented by this batch
**/
template <class batch_type, bool... Values>
struct batch_bool_constant
{
static constexpr std::size_t size = sizeof...(Values);
using arch_type = typename batch_type::arch_type;
using value_type = bool;
static_assert(sizeof...(Values) == batch_type::size, "consistent batch size");
operator batch_bool<typename batch_type::value_type, arch_type>() const noexcept { return { Values... }; }
bool get(size_t i) const noexcept
{
return std::array<value_type, size> { { Values... } }[i];
}
static constexpr int mask() noexcept
{
return mask_helper(0, static_cast<int>(Values)...);
}
private:
static constexpr int mask_helper(int acc) noexcept { return acc; }
template <class... Tys>
static constexpr int mask_helper(int acc, int mask, Tys... masks) noexcept
{
return mask_helper(acc | mask, (masks << 1)...);
}
};
/**
* @brief batch of integral constants
*
* Abstract representation of a batch of integral constants.
*
* @tparam batch_type the type of the associated batch values.
* @tparam Values constants represented by this batch
**/
template <class batch_type, typename batch_type::value_type... Values>
struct batch_constant
{
static constexpr std::size_t size = sizeof...(Values);
using arch_type = typename batch_type::arch_type;
using value_type = typename batch_type::value_type;
static_assert(sizeof...(Values) == batch_type::size, "consistent batch size");
/**
* @brief Generate a batch of @p batch_type from this @p batch_constant
*/
operator batch_type() const noexcept { return { Values... }; }
/**
* @brief Get the @p i th element of this @p batch_constant
*/
constexpr value_type get(size_t i) const noexcept
{
return get(i, std::array<value_type, size> { Values... });
}
private:
constexpr value_type get(size_t i, std::array<value_type, size> const& values) const noexcept
{
return values[i];
}
};
namespace detail
{
template <class batch_type, class G, std::size_t... Is>
inline constexpr auto make_batch_constant(detail::index_sequence<Is...>) noexcept
-> batch_constant<batch_type, (typename batch_type::value_type)G::get(Is, sizeof...(Is))...>
{
return {};
}
template <class batch_type, class G, std::size_t... Is>
inline constexpr auto make_batch_bool_constant(detail::index_sequence<Is...>) noexcept
-> batch_bool_constant<batch_type, G::get(Is, sizeof...(Is))...>
{
return {};
}
} // namespace detail
/**
* @brief Build a @c batch_constant out of a generator function
*
* @tparam batch_type type of the (non-constant) batch to build
* @tparam G type used to generate that batch. That type must have a static
 * member @c get that's used to generate the batch constant. Concretely, the
* generated batch_constant has value `{G::get(0, batch_size), ... , G::get(batch_size - 1, batch_size)}`
*
* The following generator produces a batch of `(n - 1, 0, 1, ... n-2)`
*
* @code
* struct Rot
* {
* static constexpr unsigned get(unsigned i, unsigned n)
* {
* return (i + n - 1) % n;
* }
* };
* @endcode
*/
template <class batch_type, class G>
inline constexpr auto make_batch_constant() noexcept -> decltype(detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>()))
{
return detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>());
}
template <class batch_type, class G>
inline constexpr auto make_batch_bool_constant() noexcept
-> decltype(detail::make_batch_bool_constant<batch_type, G>(
detail::make_index_sequence<batch_type::size>()))
{
return detail::make_batch_bool_constant<batch_type, G>(
detail::make_index_sequence<batch_type::size>());
}
} // namespace xsimd
#endif
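
A minimal usage sketch (not part of the vendored sources, and assuming an SSE2 target where batch<uint32_t> has four lanes): the Rot generator from the documentation block above yields the compile-time rotation constant {3, 0, 1, 2}.

#include <cstdint>
#include <xsimd/xsimd.hpp>

// Generator: lane i of an n-lane batch receives (i + n - 1) % n.
struct Rot
{
    static constexpr unsigned get(unsigned i, unsigned n)
    {
        return (i + n - 1) % n;
    }
};

int main()
{
    using batch_type = xsimd::batch<uint32_t, xsimd::sse2>;
    // For a 4-lane batch this is the constant {3, 0, 1, 2}.
    constexpr auto idx = xsimd::make_batch_constant<batch_type, Rot>();
    batch_type materialized = idx; // the conversion operator builds a runtime batch
    (void)materialized;
    return 0;
}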


@ -0,0 +1,46 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_FMA3_AVX2_REGISTER_HPP
#define XSIMD_FMA3_AVX2_REGISTER_HPP
#include "./xsimd_avx2_register.hpp"
namespace xsimd
{
template <typename arch>
struct fma3;
/**
* @ingroup arch
*
* AVX2 + FMA instructions
*/
template <>
struct fma3<avx2> : avx2
{
static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_AVX2; }
static constexpr bool available() noexcept { return true; }
static constexpr unsigned version() noexcept { return generic::version(2, 2, 1); }
static constexpr char const* name() noexcept { return "fma3+avx2"; }
};
#if XSIMD_WITH_FMA3_AVX2
namespace types
{
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<avx2>, avx2);
}
#endif
}
#endif


@ -0,0 +1,46 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_FMA3_AVX_REGISTER_HPP
#define XSIMD_FMA3_AVX_REGISTER_HPP
#include "./xsimd_avx_register.hpp"
namespace xsimd
{
template <typename arch>
struct fma3;
/**
* @ingroup arch
*
* AVX + FMA instructions
*/
template <>
struct fma3<avx> : avx
{
static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_AVX; }
static constexpr bool available() noexcept { return true; }
static constexpr unsigned version() noexcept { return generic::version(2, 1, 1); }
static constexpr char const* name() noexcept { return "fma3+avx"; }
};
#if XSIMD_WITH_FMA3_AVX
namespace types
{
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<avx>, avx);
}
#endif
}
#endif


@ -0,0 +1,46 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_FMA3_SSE_REGISTER_HPP
#define XSIMD_FMA3_SSE_REGISTER_HPP
#include "./xsimd_sse4_2_register.hpp"
namespace xsimd
{
template <typename arch>
struct fma3;
/**
* @ingroup arch
*
* SSE4.2 + FMA instructions
*/
template <>
struct fma3<sse4_2> : sse4_2
{
static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_SSE; }
static constexpr bool available() noexcept { return true; }
static constexpr unsigned version() noexcept { return generic::version(1, 4, 3); }
static constexpr char const* name() noexcept { return "fma3+sse4.2"; }
};
#if XSIMD_WITH_FMA3_SSE
namespace types
{
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<sse4_2>, sse4_2);
}
#endif
}
#endif
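
An illustrative sketch (not part of the imported files): the fma3<...> wrappers defined above keep the same static interface as the plain architecture tags, so capability reporting can treat them uniformly.

#include <cstdio>
#include <xsimd/xsimd.hpp>

int main()
{
    // name() is available unconditionally; supported() reflects the
    // XSIMD_WITH_* macros selected at compile time.
    std::printf("%s: %s\n", xsimd::fma3<xsimd::avx2>::name(),
                xsimd::fma3<xsimd::avx2>::supported() ? "compiled in" : "not compiled in");
    std::printf("%s: %s\n", xsimd::fma3<xsimd::sse4_2>::name(),
                xsimd::fma3<xsimd::sse4_2>::supported() ? "compiled in" : "not compiled in");
    return 0;
}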

42
third_party/xsimd/include/xsimd/types/xsimd_fma4_register.hpp vendored Normal file

@ -0,0 +1,42 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_FMA4_REGISTER_HPP
#define XSIMD_FMA4_REGISTER_HPP
#include "./xsimd_sse4_2_register.hpp"
namespace xsimd
{
/**
* @ingroup arch
*
* FMA4 instructions
*/
struct fma4 : sse4_2
{
static constexpr bool supported() noexcept { return XSIMD_WITH_FMA4; }
static constexpr bool available() noexcept { return true; }
static constexpr unsigned version() noexcept { return generic::version(1, 4, 4); }
static constexpr char const* name() noexcept { return "fma4"; }
};
#if XSIMD_WITH_FMA4
namespace types
{
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma4, sse4_2);
}
#endif
}
#endif

35
third_party/xsimd/include/xsimd/types/xsimd_generic_arch.hpp vendored Normal file

@ -0,0 +1,35 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_GENERIC_ARCH_HPP
#define XSIMD_GENERIC_ARCH_HPP
#include "../config/xsimd_config.hpp"
/**
* @defgroup arch Architecture description
* */
namespace xsimd
{
struct generic
{
static constexpr bool supported() noexcept { return true; }
static constexpr bool available() noexcept { return true; }
static constexpr std::size_t alignment() noexcept { return 0; }
static constexpr bool requires_alignment() noexcept { return false; }
static constexpr unsigned version() noexcept { return generic::version(0, 0, 0); }
protected:
static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch) noexcept { return major * 10000u + minor * 100u + patch; }
};
}
#endif
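
The version() helper above packs the components as major * 10000 + minor * 100 + patch; a couple of spot checks (illustrative only, not part of the vendored code) against values declared in the register headers below:

#include <xsimd/xsimd.hpp>

// sse2 is declared as version 1.2.0 and neon as 7.0.0.
static_assert(xsimd::sse2::version() == 10200, "1.2.0 encodes as 10200");
static_assert(xsimd::neon::version() == 70000, "7.0.0 encodes as 70000");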

52
third_party/xsimd/include/xsimd/types/xsimd_neon64_register.hpp vendored Normal file

@ -0,0 +1,52 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_NEON64_REGISTER_HPP
#define XSIMD_NEON64_REGISTER_HPP
#include "xsimd_neon_register.hpp"
namespace xsimd
{
/**
* @ingroup arch
*
* NEON instructions for arm64
*/
struct neon64 : neon
{
static constexpr bool supported() noexcept { return XSIMD_WITH_NEON64; }
static constexpr bool available() noexcept { return true; }
static constexpr bool requires_alignment() noexcept { return true; }
static constexpr std::size_t alignment() noexcept { return 16; }
static constexpr unsigned version() noexcept { return generic::version(8, 1, 0); }
static constexpr char const* name() noexcept { return "arm64+neon"; }
};
#if XSIMD_WITH_NEON64
namespace types
{
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(neon64, neon);
XSIMD_DECLARE_SIMD_REGISTER(double, neon64, float64x2_t);
template <class T>
struct get_bool_simd_register<T, neon64>
: detail::neon_bool_simd_register<T, neon64>
{
};
}
#endif
}
#endif

155
third_party/xsimd/include/xsimd/types/xsimd_neon_register.hpp vendored Normal file

@ -0,0 +1,155 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_NEON_REGISTER_HPP
#define XSIMD_NEON_REGISTER_HPP
#include "xsimd_generic_arch.hpp"
#include "xsimd_register.hpp"
#if XSIMD_WITH_NEON
#include <arm_neon.h>
#endif
namespace xsimd
{
/**
* @ingroup arch
*
* NEON instructions for arm32
*/
struct neon : generic
{
static constexpr bool supported() noexcept { return XSIMD_WITH_NEON; }
static constexpr bool available() noexcept { return true; }
static constexpr bool requires_alignment() noexcept { return true; }
static constexpr std::size_t alignment() noexcept { return 16; }
static constexpr unsigned version() noexcept { return generic::version(7, 0, 0); }
static constexpr char const* name() noexcept { return "arm32+neon"; }
};
#if XSIMD_WITH_NEON
namespace types
{
namespace detail
{
template <size_t S>
struct neon_vector_type_impl;
template <>
struct neon_vector_type_impl<8>
{
using signed_type = int8x16_t;
using unsigned_type = uint8x16_t;
};
template <>
struct neon_vector_type_impl<16>
{
using signed_type = int16x8_t;
using unsigned_type = uint16x8_t;
};
template <>
struct neon_vector_type_impl<32>
{
using signed_type = int32x4_t;
using unsigned_type = uint32x4_t;
};
template <>
struct neon_vector_type_impl<64>
{
using signed_type = int64x2_t;
using unsigned_type = uint64x2_t;
};
template <class T>
using signed_neon_vector_type = typename neon_vector_type_impl<8 * sizeof(T)>::signed_type;
template <class T>
using unsigned_neon_vector_type = typename neon_vector_type_impl<8 * sizeof(T)>::unsigned_type;
template <class T>
using neon_vector_type = typename std::conditional<std::is_signed<T>::value,
signed_neon_vector_type<T>,
unsigned_neon_vector_type<T>>::type;
using char_neon_vector_type = typename std::conditional<std::is_signed<char>::value,
signed_neon_vector_type<char>,
unsigned_neon_vector_type<char>>::type;
}
XSIMD_DECLARE_SIMD_REGISTER(signed char, neon, detail::neon_vector_type<signed char>);
XSIMD_DECLARE_SIMD_REGISTER(unsigned char, neon, detail::neon_vector_type<unsigned char>);
XSIMD_DECLARE_SIMD_REGISTER(char, neon, detail::char_neon_vector_type);
XSIMD_DECLARE_SIMD_REGISTER(short, neon, detail::neon_vector_type<short>);
XSIMD_DECLARE_SIMD_REGISTER(unsigned short, neon, detail::neon_vector_type<unsigned short>);
XSIMD_DECLARE_SIMD_REGISTER(int, neon, detail::neon_vector_type<int>);
XSIMD_DECLARE_SIMD_REGISTER(unsigned int, neon, detail::neon_vector_type<unsigned int>);
XSIMD_DECLARE_SIMD_REGISTER(long int, neon, detail::neon_vector_type<long int>);
XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, neon, detail::neon_vector_type<unsigned long int>);
XSIMD_DECLARE_SIMD_REGISTER(long long int, neon, detail::neon_vector_type<long long int>);
XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, neon, detail::neon_vector_type<unsigned long long int>);
XSIMD_DECLARE_SIMD_REGISTER(float, neon, float32x4_t);
XSIMD_DECLARE_INVALID_SIMD_REGISTER(double, neon);
namespace detail
{
template <size_t S>
struct get_unsigned_type;
template <>
struct get_unsigned_type<1>
{
using type = uint8_t;
};
template <>
struct get_unsigned_type<2>
{
using type = uint16_t;
};
template <>
struct get_unsigned_type<4>
{
using type = uint32_t;
};
template <>
struct get_unsigned_type<8>
{
using type = uint64_t;
};
template <size_t S>
using get_unsigned_type_t = typename get_unsigned_type<S>::type;
template <class T, class A>
struct neon_bool_simd_register
{
using type = simd_register<get_unsigned_type_t<sizeof(T)>, A>;
};
}
template <class T>
struct get_bool_simd_register<T, neon>
: detail::neon_bool_simd_register<T, neon>
{
};
}
#endif
}
#endif
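
A compile-time spot check of the width/signedness mapping above (illustrative only, and only meaningful in a build where XSIMD_WITH_NEON is 1):

#include <cstdint>
#include <type_traits>
#include <xsimd/xsimd.hpp>

#if XSIMD_WITH_NEON
// 32-bit signed elements map to int32x4_t, 16-bit unsigned ones to uint16x8_t.
static_assert(std::is_same<xsimd::types::detail::neon_vector_type<int32_t>, int32x4_t>::value, "");
static_assert(std::is_same<xsimd::types::detail::neon_vector_type<uint16_t>, uint16x8_t>::value, "");
#endif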

94
third_party/xsimd/include/xsimd/types/xsimd_register.hpp vendored Normal file

@ -0,0 +1,94 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_REGISTER_HPP
#define XSIMD_REGISTER_HPP
#include <type_traits>
namespace xsimd
{
namespace types
{
template <class T, class A>
struct has_simd_register : std::false_type
{
};
template <class T, class Arch>
struct simd_register
{
struct register_type
{
};
};
#define XSIMD_DECLARE_SIMD_REGISTER(SCALAR_TYPE, ISA, VECTOR_TYPE) \
template <> \
struct simd_register<SCALAR_TYPE, ISA> \
{ \
using register_type = VECTOR_TYPE; \
register_type data; \
operator register_type() const noexcept \
{ \
return data; \
} \
}; \
template <> \
struct has_simd_register<SCALAR_TYPE, ISA> : std::true_type \
{ \
}
#define XSIMD_DECLARE_INVALID_SIMD_REGISTER(SCALAR_TYPE, ISA) \
template <> \
struct has_simd_register<SCALAR_TYPE, ISA> : std::false_type \
{ \
}
#define XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ISA, ISA_BASE) \
template <class T> \
struct simd_register<T, ISA> : simd_register<T, ISA_BASE> \
{ \
using register_type = typename simd_register<T, ISA_BASE>::register_type; \
simd_register(register_type reg) noexcept \
: simd_register<T, ISA_BASE> { reg } \
{ \
} \
simd_register() = default; \
}; \
template <class T> \
struct has_simd_register<T, ISA> : has_simd_register<T, ISA_BASE> \
{ \
}
template <class T, class Arch>
struct get_bool_simd_register
{
using type = simd_register<T, Arch>;
};
template <class T, class Arch>
using get_bool_simd_register_t = typename get_bool_simd_register<T, Arch>::type;
}
namespace kernel
{
template <class A>
// makes requires_arch equal to A const&, using type_traits functions
using requires_arch = typename std::add_lvalue_reference<typename std::add_const<A>::type>::type;
template <class T>
struct convert
{
};
}
}
#endif
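
A rough illustration (assuming an SSE2-enabled build) of what the macros above provide once a register has been declared for a scalar/architecture pair elsewhere in the library:

#include <xsimd/xsimd.hpp>

#if XSIMD_WITH_SSE2
// The float/sse2 pair is declared in xsimd_sse2_register.hpp, so the trait
// holds and the wrapper converts to the raw __m128.
static_assert(xsimd::types::has_simd_register<float, xsimd::sse2>::value, "");

inline __m128 raw(xsimd::types::simd_register<float, xsimd::sse2> r)
{
    return r; // the conversion operator returns the stored register
}
#endif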

61
third_party/xsimd/include/xsimd/types/xsimd_sse2_register.hpp vendored Normal file

@ -0,0 +1,61 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_SSE2_REGISTER_HPP
#define XSIMD_SSE2_REGISTER_HPP
#include "./xsimd_generic_arch.hpp"
#include "./xsimd_register.hpp"
#if XSIMD_WITH_SSE2
#include <emmintrin.h>
#include <xmmintrin.h>
#endif
namespace xsimd
{
/**
* @ingroup arch
*
* SSE2 instructions
*/
struct sse2 : generic
{
static constexpr bool supported() noexcept { return XSIMD_WITH_SSE2; }
static constexpr bool available() noexcept { return true; }
static constexpr bool requires_alignment() noexcept { return true; }
static constexpr unsigned version() noexcept { return generic::version(1, 2, 0); }
static constexpr std::size_t alignment() noexcept { return 16; }
static constexpr char const* name() noexcept { return "sse2"; }
};
#if XSIMD_WITH_SSE2
namespace types
{
XSIMD_DECLARE_SIMD_REGISTER(bool, sse2, __m128i);
XSIMD_DECLARE_SIMD_REGISTER(signed char, sse2, __m128i);
XSIMD_DECLARE_SIMD_REGISTER(unsigned char, sse2, __m128i);
XSIMD_DECLARE_SIMD_REGISTER(char, sse2, __m128i);
XSIMD_DECLARE_SIMD_REGISTER(unsigned short, sse2, __m128i);
XSIMD_DECLARE_SIMD_REGISTER(short, sse2, __m128i);
XSIMD_DECLARE_SIMD_REGISTER(unsigned int, sse2, __m128i);
XSIMD_DECLARE_SIMD_REGISTER(int, sse2, __m128i);
XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, sse2, __m128i);
XSIMD_DECLARE_SIMD_REGISTER(long int, sse2, __m128i);
XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, sse2, __m128i);
XSIMD_DECLARE_SIMD_REGISTER(long long int, sse2, __m128i);
XSIMD_DECLARE_SIMD_REGISTER(float, sse2, __m128);
XSIMD_DECLARE_SIMD_REGISTER(double, sse2, __m128d);
}
#endif
}
#endif

45
third_party/xsimd/include/xsimd/types/xsimd_sse3_register.hpp vendored Normal file

@ -0,0 +1,45 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_SSE3_REGISTER_HPP
#define XSIMD_SSE3_REGISTER_HPP
#include "./xsimd_sse2_register.hpp"
#if XSIMD_WITH_SSE3
#include <pmmintrin.h>
#endif
namespace xsimd
{
/**
* @ingroup arch
*
* SSE3 instructions
*/
struct sse3 : sse2
{
static constexpr bool supported() noexcept { return XSIMD_WITH_SSE3; }
static constexpr bool available() noexcept { return true; }
static constexpr unsigned version() noexcept { return generic::version(1, 3, 0); }
static constexpr char const* name() noexcept { return "sse3"; }
};
#if XSIMD_WITH_SSE3
namespace types
{
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse3, sse2);
}
#endif
}
#endif

44
third_party/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp vendored Normal file

@ -0,0 +1,44 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_SSE4_1_REGISTER_HPP
#define XSIMD_SSE4_1_REGISTER_HPP
#include "./xsimd_ssse3_register.hpp"
#if XSIMD_WITH_SSE4_1
#include <smmintrin.h>
#endif
namespace xsimd
{
/**
* @ingroup arch
*
* SSE4.1 instructions
*/
struct sse4_1 : ssse3
{
static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_1; }
static constexpr bool available() noexcept { return true; }
static constexpr unsigned version() noexcept { return generic::version(1, 4, 1); }
static constexpr char const* name() noexcept { return "sse4.1"; }
};
#if XSIMD_WITH_SSE4_1
namespace types
{
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse4_1, ssse3);
}
#endif
}
#endif

44
third_party/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp vendored Normal file

@ -0,0 +1,44 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_SSE4_2_REGISTER_HPP
#define XSIMD_SSE4_2_REGISTER_HPP
#include "./xsimd_sse4_1_register.hpp"
#if XSIMD_WITH_SSE4_2
#include <nmmintrin.h>
#endif
namespace xsimd
{
/**
* @ingroup arch
*
* SSE4.2 instructions
*/
struct sse4_2 : sse4_1
{
static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_2; }
static constexpr bool available() noexcept { return true; }
static constexpr unsigned version() noexcept { return generic::version(1, 4, 2); }
static constexpr char const* name() noexcept { return "sse4.2"; }
};
#if XSIMD_WITH_SSE4_2
namespace types
{
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse4_2, sse4_1);
}
#endif
}
#endif

44
third_party/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp vendored Normal file

@ -0,0 +1,44 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_SSSE3_REGISTER_HPP
#define XSIMD_SSSE3_REGISTER_HPP
#include "./xsimd_sse3_register.hpp"
#if XSIMD_WITH_SSSE3
#include <tmmintrin.h>
#endif
namespace xsimd
{
/**
* @ingroup arch
*
* SSSE3 instructions
*/
struct ssse3 : sse3
{
static constexpr bool supported() noexcept { return XSIMD_WITH_SSSE3; }
static constexpr bool available() noexcept { return true; }
static constexpr unsigned version() noexcept { return generic::version(1, 3, 1); }
static constexpr char const* name() noexcept { return "ssse3"; }
};
#if XSIMD_WITH_SSSE3
namespace types
{
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ssse3, sse3);
}
#endif
}
#endif

155
third_party/xsimd/include/xsimd/types/xsimd_sve_register.hpp vendored Normal file

@ -0,0 +1,155 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* Copyright (c) Yibo Cai *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_SVE_REGISTER_HPP
#define XSIMD_SVE_REGISTER_HPP
#include "xsimd_generic_arch.hpp"
#include "xsimd_register.hpp"
#if XSIMD_WITH_SVE
#include <arm_sve.h>
#endif
namespace xsimd
{
namespace detail
{
/**
* @ingroup arch
*
* SVE instructions (fixed vector size) for arm64
*/
template <size_t Width>
struct sve : xsimd::generic
{
static constexpr bool supported() noexcept { return Width == XSIMD_SVE_BITS; }
static constexpr bool available() noexcept { return true; }
static constexpr bool requires_alignment() noexcept { return true; }
static constexpr std::size_t alignment() noexcept { return 16; }
static constexpr unsigned version() noexcept { return generic::version(9, 0, 0); }
static constexpr char const* name() noexcept { return "arm64+sve"; }
};
}
#if XSIMD_WITH_SVE
using sve = detail::sve<__ARM_FEATURE_SVE_BITS>;
namespace types
{
namespace detail
{
// define fixed size alias per SVE sizeless type
#define SVE_TO_FIXED_SIZE(ty) ty __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)))
using sve_int8_t = SVE_TO_FIXED_SIZE(svint8_t);
using sve_uint8_t = SVE_TO_FIXED_SIZE(svuint8_t);
using sve_int16_t = SVE_TO_FIXED_SIZE(svint16_t);
using sve_uint16_t = SVE_TO_FIXED_SIZE(svuint16_t);
using sve_int32_t = SVE_TO_FIXED_SIZE(svint32_t);
using sve_uint32_t = SVE_TO_FIXED_SIZE(svuint32_t);
using sve_int64_t = SVE_TO_FIXED_SIZE(svint64_t);
using sve_uint64_t = SVE_TO_FIXED_SIZE(svuint64_t);
using sve_float32_t = SVE_TO_FIXED_SIZE(svfloat32_t);
using sve_float64_t = SVE_TO_FIXED_SIZE(svfloat64_t);
using sve_bool_t = SVE_TO_FIXED_SIZE(svbool_t);
#undef SVE_TO_FIXED_SIZE
template <size_t S>
struct sve_vector_type_impl;
template <>
struct sve_vector_type_impl<8>
{
using signed_type = sve_int8_t;
using unsigned_type = sve_uint8_t;
using floating_point_type = void;
};
template <>
struct sve_vector_type_impl<16>
{
using signed_type = sve_int16_t;
using unsigned_type = sve_uint16_t;
using floating_point_type = void;
};
template <>
struct sve_vector_type_impl<32>
{
using signed_type = sve_int32_t;
using unsigned_type = sve_uint32_t;
using floating_point_type = sve_float32_t;
};
template <>
struct sve_vector_type_impl<64>
{
using signed_type = sve_int64_t;
using unsigned_type = sve_uint64_t;
using floating_point_type = sve_float64_t;
};
template <class T>
using signed_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::signed_type;
template <class T>
using unsigned_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::unsigned_type;
template <class T>
using floating_point_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::floating_point_type;
template <class T>
using signed_int_or_floating_point_sve_vector_type = typename std::conditional<std::is_floating_point<T>::value,
floating_point_sve_vector_type<T>,
signed_int_sve_vector_type<T>>::type;
template <class T>
using sve_vector_type = typename std::conditional<std::is_signed<T>::value,
signed_int_or_floating_point_sve_vector_type<T>,
unsigned_int_sve_vector_type<T>>::type;
} // namespace detail
XSIMD_DECLARE_SIMD_REGISTER(signed char, sve, detail::sve_vector_type<signed char>);
XSIMD_DECLARE_SIMD_REGISTER(unsigned char, sve, detail::sve_vector_type<unsigned char>);
XSIMD_DECLARE_SIMD_REGISTER(char, sve, detail::sve_vector_type<char>);
XSIMD_DECLARE_SIMD_REGISTER(short, sve, detail::sve_vector_type<short>);
XSIMD_DECLARE_SIMD_REGISTER(unsigned short, sve, detail::sve_vector_type<unsigned short>);
XSIMD_DECLARE_SIMD_REGISTER(int, sve, detail::sve_vector_type<int>);
XSIMD_DECLARE_SIMD_REGISTER(unsigned int, sve, detail::sve_vector_type<unsigned int>);
XSIMD_DECLARE_SIMD_REGISTER(long int, sve, detail::sve_vector_type<long int>);
XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, sve, detail::sve_vector_type<unsigned long int>);
XSIMD_DECLARE_SIMD_REGISTER(long long int, sve, detail::sve_vector_type<long long int>);
XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, sve, detail::sve_vector_type<unsigned long long int>);
XSIMD_DECLARE_SIMD_REGISTER(float, sve, detail::sve_vector_type<float>);
XSIMD_DECLARE_SIMD_REGISTER(double, sve, detail::sve_vector_type<double>);
namespace detail
{
struct sve_bool_simd_register
{
using register_type = sve_bool_t;
register_type data;
operator register_type() const noexcept { return data; }
};
} // namespace detail
template <class T>
struct get_bool_simd_register<T, sve>
{
using type = detail::sve_bool_simd_register;
};
} // namespace types
#endif
} // namespace xsimd
#endif

251
third_party/xsimd/include/xsimd/types/xsimd_traits.hpp vendored Normal file

@ -0,0 +1,251 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_TRAITS_HPP
#define XSIMD_TRAITS_HPP
#include <type_traits>
#include "xsimd_batch.hpp"
namespace xsimd
{
/**************************************
* simd_traits and revert_simd_traits *
**************************************/
template <class T, class A = default_arch>
struct has_simd_register : types::has_simd_register<T, A>
{
};
namespace detail
{
template <class T, bool>
struct simd_traits_impl;
template <class T>
struct simd_traits_impl<T, false>
{
using type = T;
using bool_type = bool;
static constexpr size_t size = 1;
};
template <class T>
constexpr size_t simd_traits_impl<T, false>::size;
template <class T>
struct simd_traits_impl<T, true>
{
using type = batch<T>;
using bool_type = typename type::batch_bool_type;
static constexpr size_t size = type::size;
};
template <class T>
constexpr size_t simd_traits_impl<T, true>::size;
template <class T, class A>
struct static_check_supported_config_emitter
{
static_assert(A::supported(),
"usage of batch type with unsupported architecture");
static_assert(!A::supported() || xsimd::has_simd_register<T, A>::value,
"usage of batch type with unsupported type");
};
template <class T, class A>
struct static_check_supported_config_emitter<std::complex<T>, A> : static_check_supported_config_emitter<T, A>
{
};
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class T, class A, bool i3ec>
struct static_check_supported_config_emitter<xtl::xcomplex<T, T, i3ec>, A> : static_check_supported_config_emitter<T, A>
{
};
#endif
// consistency checker
template <class T, class A>
void static_check_supported_config()
{
(void)static_check_supported_config_emitter<T, A>();
}
}
template <class T>
struct simd_traits : detail::simd_traits_impl<T, xsimd::has_simd_register<T>::value>
{
};
template <class T>
struct simd_traits<std::complex<T>>
: detail::simd_traits_impl<std::complex<T>, xsimd::has_simd_register<T>::value>
{
};
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class T, bool i3ec>
struct simd_traits<xtl::xcomplex<T, T, i3ec>>
: detail::simd_traits_impl<std::complex<T>, xsimd::has_simd_register<T>::value>
{
};
#endif
template <class T>
struct revert_simd_traits
{
using type = T;
static constexpr size_t size = simd_traits<type>::size;
};
template <class T>
constexpr size_t revert_simd_traits<T>::size;
template <class T>
struct revert_simd_traits<batch<T>>
{
using type = T;
static constexpr size_t size = batch<T>::size;
};
template <class T>
constexpr size_t revert_simd_traits<batch<T>>::size;
template <class T>
using simd_type = typename simd_traits<T>::type;
template <class T>
using simd_bool_type = typename simd_traits<T>::bool_type;
template <class T>
using revert_simd_type = typename revert_simd_traits<T>::type;
/********************
* simd_return_type *
********************/
namespace detail
{
template <class T1, class T2>
struct simd_condition
{
static constexpr bool value = (std::is_same<T1, T2>::value && !std::is_same<T1, bool>::value) || (std::is_same<T1, bool>::value && !std::is_same<T2, bool>::value) || std::is_same<T1, float>::value || std::is_same<T1, double>::value || std::is_same<T1, int8_t>::value || std::is_same<T1, uint8_t>::value || std::is_same<T1, int16_t>::value || std::is_same<T1, uint16_t>::value || std::is_same<T1, int32_t>::value || std::is_same<T1, uint32_t>::value || std::is_same<T1, int64_t>::value || std::is_same<T1, uint64_t>::value || std::is_same<T1, char>::value || detail::is_complex<T1>::value;
};
template <class T1, class T2, class A>
struct simd_return_type_impl
: std::enable_if<simd_condition<T1, T2>::value, batch<T2, A>>
{
};
template <class T2, class A>
struct simd_return_type_impl<bool, T2, A>
: std::enable_if<simd_condition<bool, T2>::value, batch_bool<T2, A>>
{
};
template <class T2, class A>
struct simd_return_type_impl<bool, std::complex<T2>, A>
: std::enable_if<simd_condition<bool, T2>::value, batch_bool<T2, A>>
{
};
template <class T1, class T2, class A>
struct simd_return_type_impl<std::complex<T1>, T2, A>
: std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
{
};
template <class T1, class T2, class A>
struct simd_return_type_impl<std::complex<T1>, std::complex<T2>, A>
: std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
{
};
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class T1, class T2, bool I3EC, class A>
struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, T2, A>
: std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
{
};
template <class T1, class T2, bool I3EC, class A>
struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, std::complex<T2>, A>
: std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
{
};
template <class T1, class T2, bool I3EC, class A>
struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, xtl::xcomplex<T2, T2, I3EC>, A>
: std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
{
};
template <class T1, class T2, bool I3EC, class A>
struct simd_return_type_impl<std::complex<T1>, xtl::xcomplex<T2, T2, I3EC>, A>
: std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
{
};
#endif
}
template <class T1, class T2, class A = default_arch>
using simd_return_type = typename detail::simd_return_type_impl<T1, T2, A>::type;
/************
* is_batch *
************/
template <class V>
struct is_batch : std::false_type
{
};
template <class T, class A>
struct is_batch<batch<T, A>> : std::true_type
{
};
/*****************
* is_batch_bool *
*****************/
template <class V>
struct is_batch_bool : std::false_type
{
};
template <class T, class A>
struct is_batch_bool<batch_bool<T, A>> : std::true_type
{
};
/********************
* is_batch_complex *
********************/
template <class V>
struct is_batch_complex : std::false_type
{
};
template <class T, class A>
struct is_batch_complex<batch<std::complex<T>, A>> : std::true_type
{
};
}
#endif
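
An illustrative check of the mapping above (assuming a build where an x86 or arm64 architecture is enabled, so double has a SIMD register): simd_traits picks the batch type for vectorizable scalars, while a type with no declared register keeps size 1.

#include <type_traits>
#include <xsimd/xsimd.hpp>

#ifndef XSIMD_NO_SUPPORTED_ARCHITECTURE
static_assert(std::is_same<xsimd::simd_type<double>, xsimd::batch<double>>::value,
              "double vectorizes to batch<double>");
static_assert(xsimd::simd_traits<long double>::size == 1,
              "no register is declared for long double, so it stays scalar");
#endif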

530
third_party/xsimd/include/xsimd/types/xsimd_utils.hpp vendored Normal file

@ -0,0 +1,530 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_UTILS_HPP
#define XSIMD_UTILS_HPP
#include <complex>
#include <cstdint>
#include <cstring>
#include <tuple>
#include <type_traits>
#ifdef XSIMD_ENABLE_XTL_COMPLEX
#include "xtl/xcomplex.hpp"
#endif
namespace xsimd
{
template <class T, class A>
class batch;
template <class T, class A>
class batch_bool;
/**************
* index *
**************/
template <size_t I>
using index = std::integral_constant<size_t, I>;
/**************
* as_integer *
**************/
template <class T>
struct as_integer : std::make_signed<T>
{
};
template <>
struct as_integer<float>
{
using type = int32_t;
};
template <>
struct as_integer<double>
{
using type = int64_t;
};
template <class T, class A>
struct as_integer<batch<T, A>>
{
using type = batch<typename as_integer<T>::type, A>;
};
template <class B>
using as_integer_t = typename as_integer<B>::type;
/***********************
* as_unsigned_integer *
***********************/
template <class T>
struct as_unsigned_integer : std::make_unsigned<T>
{
};
template <>
struct as_unsigned_integer<float>
{
using type = uint32_t;
};
template <>
struct as_unsigned_integer<double>
{
using type = uint64_t;
};
template <class T, class A>
struct as_unsigned_integer<batch<T, A>>
{
using type = batch<typename as_unsigned_integer<T>::type, A>;
};
template <class T>
using as_unsigned_integer_t = typename as_unsigned_integer<T>::type;
/*********************
* as_signed_integer *
*********************/
template <class T>
struct as_signed_integer : std::make_signed<T>
{
};
template <class T>
using as_signed_integer_t = typename as_signed_integer<T>::type;
/******************
* flip_sign_type *
******************/
namespace detail
{
template <class T, bool is_signed>
struct flipped_sign_type_impl : std::make_signed<T>
{
};
template <class T>
struct flipped_sign_type_impl<T, true> : std::make_unsigned<T>
{
};
}
template <class T>
struct flipped_sign_type
: detail::flipped_sign_type_impl<T, std::is_signed<T>::value>
{
};
template <class T>
using flipped_sign_type_t = typename flipped_sign_type<T>::type;
/***********
* as_float *
************/
template <class T>
struct as_float;
template <>
struct as_float<int32_t>
{
using type = float;
};
template <>
struct as_float<int64_t>
{
using type = double;
};
template <class T, class A>
struct as_float<batch<T, A>>
{
using type = batch<typename as_float<T>::type, A>;
};
template <class T>
using as_float_t = typename as_float<T>::type;
/**************
* as_logical *
**************/
template <class T>
struct as_logical;
template <class T, class A>
struct as_logical<batch<T, A>>
{
using type = batch_bool<T, A>;
};
template <class T>
using as_logical_t = typename as_logical<T>::type;
/********************
* bit_cast *
********************/
template <class To, class From>
inline To bit_cast(From val) noexcept
{
static_assert(sizeof(From) == sizeof(To), "casting between compatible layout");
// FIXME: Some old version of GCC don't support that trait
// static_assert(std::is_trivially_copyable<From>::value, "input type is trivially copyable");
// static_assert(std::is_trivially_copyable<To>::value, "output type is trivially copyable");
To res;
std::memcpy(&res, &val, sizeof(val));
return res;
}
namespace kernel
{
namespace detail
{
/**************************************
* enabling / disabling metafunctions *
**************************************/
template <class T>
using enable_integral_t = typename std::enable_if<std::is_integral<T>::value, int>::type;
template <class T, size_t S>
using enable_sized_signed_t = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value && sizeof(T) == S, int>::type;
template <class T, size_t S>
using enable_sized_unsigned_t = typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value && sizeof(T) == S, int>::type;
template <class T, size_t S>
using enable_sized_integral_t = typename std::enable_if<std::is_integral<T>::value && sizeof(T) == S, int>::type;
template <class T, size_t S>
using enable_sized_t = typename std::enable_if<sizeof(T) == S, int>::type;
template <class T, size_t S>
using enable_max_sized_integral_t = typename std::enable_if<std::is_integral<T>::value && sizeof(T) <= S, int>::type;
/********************************
* Matching & mismatching sizes *
********************************/
template <class T, class U, class B = int>
using sizes_match_t = typename std::enable_if<sizeof(T) == sizeof(U), B>::type;
template <class T, class U, class B = int>
using sizes_mismatch_t = typename std::enable_if<sizeof(T) != sizeof(U), B>::type;
template <class T, class U, class B = int>
using stride_match_t = typename std::enable_if<!std::is_same<T, U>::value && sizeof(T) == sizeof(U), B>::type;
} // namespace detail
} // namespace kernel
/*****************************************
* Backport of index_sequence from c++14 *
*****************************************/
// TODO: Remove this once we drop C++11 support
namespace detail
{
template <typename T>
struct identity
{
using type = T;
};
#ifdef __cpp_lib_integer_sequence
using std::index_sequence;
using std::integer_sequence;
using std::make_index_sequence;
using std::make_integer_sequence;
using std::index_sequence_for;
#else
template <typename T, T... Is>
struct integer_sequence
{
using value_type = T;
static constexpr std::size_t size() noexcept { return sizeof...(Is); }
};
template <typename Lhs, typename Rhs>
struct make_integer_sequence_concat;
template <typename T, T... Lhs, T... Rhs>
struct make_integer_sequence_concat<integer_sequence<T, Lhs...>,
integer_sequence<T, Rhs...>>
: identity<integer_sequence<T, Lhs..., (sizeof...(Lhs) + Rhs)...>>
{
};
template <typename T>
struct make_integer_sequence_impl;
template <typename T>
struct make_integer_sequence_impl<std::integral_constant<T, (T)0>> : identity<integer_sequence<T>>
{
};
template <typename T>
struct make_integer_sequence_impl<std::integral_constant<T, (T)1>> : identity<integer_sequence<T, 0>>
{
};
template <typename T, T N>
struct make_integer_sequence_impl<std::integral_constant<T, N>>
: make_integer_sequence_concat<typename make_integer_sequence_impl<std::integral_constant<T, N / 2>>::type,
typename make_integer_sequence_impl<std::integral_constant<T, N - (N / 2)>>::type>
{
};
template <typename T, T N>
using make_integer_sequence = typename make_integer_sequence_impl<std::integral_constant<T, N>>::type;
template <std::size_t... Is>
using index_sequence = integer_sequence<std::size_t, Is...>;
template <std::size_t N>
using make_index_sequence = make_integer_sequence<std::size_t, N>;
template <typename... Ts>
using index_sequence_for = make_index_sequence<sizeof...(Ts)>;
#endif
template <int... Is>
using int_sequence = integer_sequence<int, Is...>;
template <int N>
using make_int_sequence = make_integer_sequence<int, N>;
template <typename... Ts>
using int_sequence_for = make_int_sequence<(int)sizeof...(Ts)>;
// Type-casted index sequence.
template <class P, size_t... Is>
inline P indexes_from(index_sequence<Is...>) noexcept
{
return { static_cast<typename P::value_type>(Is)... };
}
template <class P>
inline P make_sequence_as_batch() noexcept
{
return indexes_from<P>(make_index_sequence<P::size>());
}
}
/***********************************
* Backport of std::get from C++14 *
***********************************/
namespace detail
{
template <class T, class... Types, size_t I, size_t... Is>
inline const T& get_impl(const std::tuple<Types...>& t, std::is_same<T, T>, index_sequence<I, Is...>) noexcept
{
return std::get<I>(t);
}
template <class T, class U, class... Types, size_t I, size_t... Is>
inline const T& get_impl(const std::tuple<Types...>& t, std::is_same<T, U>, index_sequence<I, Is...>) noexcept
{
using tuple_elem = typename std::tuple_element<I + 1, std::tuple<Types...>>::type;
return get_impl<T>(t, std::is_same<T, tuple_elem>(), index_sequence<Is...>());
}
template <class T, class... Types>
inline const T& get(const std::tuple<Types...>& t) noexcept
{
using tuple_elem = typename std::tuple_element<0, std::tuple<Types...>>::type;
return get_impl<T>(t, std::is_same<T, tuple_elem>(), make_index_sequence<sizeof...(Types)>());
}
}
/*********************************
* Backport of void_t from C++17 *
*********************************/
namespace detail
{
template <class... T>
struct make_void
{
using type = void;
};
template <class... T>
using void_t = typename make_void<T...>::type;
}
/**************************************************
* Equivalent of void_t but with size_t parameter *
**************************************************/
namespace detail
{
template <std::size_t>
struct check_size
{
using type = void;
};
template <std::size_t S>
using check_size_t = typename check_size<S>::type;
}
/*****************************************
* Supplementary std::array constructors *
*****************************************/
namespace detail
{
// std::array constructor from scalar value ("broadcast")
template <typename T, std::size_t... Is>
inline constexpr std::array<T, sizeof...(Is)>
array_from_scalar_impl(const T& scalar, index_sequence<Is...>) noexcept
{
// You can safely ignore this silly ternary, the "scalar" is all
// that matters. The rest is just a dirty workaround...
return std::array<T, sizeof...(Is)> { (Is + 1) ? scalar : T()... };
}
template <typename T, std::size_t N>
inline constexpr std::array<T, N>
array_from_scalar(const T& scalar) noexcept
{
return array_from_scalar_impl(scalar, make_index_sequence<N>());
}
// std::array constructor from C-style pointer (handled as an array)
template <typename T, std::size_t... Is>
inline constexpr std::array<T, sizeof...(Is)>
array_from_pointer_impl(const T* c_array, index_sequence<Is...>) noexcept
{
return std::array<T, sizeof...(Is)> { c_array[Is]... };
}
template <typename T, std::size_t N>
inline constexpr std::array<T, N>
array_from_pointer(const T* c_array) noexcept
{
return array_from_pointer_impl(c_array, make_index_sequence<N>());
}
}
/************************
* is_array_initializer *
************************/
namespace detail
{
template <bool...>
struct bool_pack;
template <bool... bs>
using all_true = std::is_same<
bool_pack<bs..., true>, bool_pack<true, bs...>>;
template <typename T, typename... Args>
using is_all_convertible = all_true<std::is_convertible<Args, T>::value...>;
template <typename T, std::size_t N, typename... Args>
using is_array_initializer = std::enable_if<
(sizeof...(Args) == N) && is_all_convertible<T, Args...>::value>;
// Check that a variadic argument pack is a list of N values of type T,
// as usable for instantiating a value of type std::array<T, N>.
template <typename T, std::size_t N, typename... Args>
using is_array_initializer_t = typename is_array_initializer<T, N, Args...>::type;
}
/**************
* is_complex *
**************/
// This is used in both xsimd_complex_base.hpp and xsimd_traits.hpp
// However xsimd_traits.hpp indirectly includes xsimd_complex_base.hpp
// so we cannot define is_complex in xsimd_traits.hpp. Besides, if
// no file defining batches is included, we still need this definition
// in xsimd_traits.hpp, so let's define it here.
namespace detail
{
template <class T>
struct is_complex : std::false_type
{
};
template <class T>
struct is_complex<std::complex<T>> : std::true_type
{
};
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class T, bool i3ec>
struct is_complex<xtl::xcomplex<T, T, i3ec>> : std::true_type
{
};
#endif
}
/*******************
* real_batch_type *
*******************/
template <class B>
struct real_batch_type
{
using type = B;
};
template <class T, class A>
struct real_batch_type<batch<std::complex<T>, A>>
{
using type = batch<T, A>;
};
template <class B>
using real_batch_type_t = typename real_batch_type<B>::type;
/**********************
* complex_batch_type *
**********************/
template <class B>
struct complex_batch_type
{
using real_value_type = typename B::value_type;
using arch_type = typename B::arch_type;
using type = batch<std::complex<real_value_type>, arch_type>;
};
template <class T, class A>
struct complex_batch_type<batch<std::complex<T>, A>>
{
using type = batch<std::complex<T>, A>;
};
template <class B>
using complex_batch_type_t = typename complex_batch_type<B>::type;
}
#endif
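
A small usage sketch of the bit_cast helper defined above (not part of the vendored sources): reinterpret a float's bit pattern as a uint32_t without running into aliasing issues.

#include <cstdint>
#include <xsimd/xsimd.hpp>

int main()
{
    // 1.0f has the IEEE-754 single-precision encoding 0x3f800000.
    uint32_t bits = xsimd::bit_cast<uint32_t>(1.0f);
    return bits == 0x3f800000u ? 0 : 1;
}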

68
third_party/xsimd/include/xsimd/xsimd.hpp vendored Normal file

@ -0,0 +1,68 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_HPP
#define XSIMD_HPP
#if defined(__has_cpp_attribute)
// if this check passes, then the compiler supports feature test macros
#if __has_cpp_attribute(nodiscard) >= 201603L
// if this check passes, then the compiler supports [[nodiscard]] without a message
#define XSIMD_NO_DISCARD [[nodiscard]]
#endif
#endif
#if !defined(XSIMD_NO_DISCARD) && __cplusplus >= 201703L
// this means that the previous tests failed, but we are using C++17 or higher
#define XSIMD_NO_DISCARD [[nodiscard]]
#endif
#if !defined(XSIMD_NO_DISCARD) && (defined(__GNUC__) || defined(__clang__))
// this means that the previous checks failed, but we are using GCC or Clang
#define XSIMD_NO_DISCARD __attribute__((warn_unused_result))
#endif
#if !defined(XSIMD_NO_DISCARD)
// this means that all the previous checks failed, so we fallback to doing nothing
#define XSIMD_NO_DISCARD
#endif
#ifdef __cpp_if_constexpr
// this means that the compiler supports the `if constexpr` construct
#define XSIMD_IF_CONSTEXPR if constexpr
#endif
#if !defined(XSIMD_IF_CONSTEXPR) && __cplusplus >= 201703L
// this means that the previous test failed, but we are using C++17 or higher
#define XSIMD_IF_CONSTEXPR if constexpr
#endif
#if !defined(XSIMD_IF_CONSTEXPR)
// this means that all the previous checks failed, so we fallback to a normal `if`
#define XSIMD_IF_CONSTEXPR if
#endif
#include "config/xsimd_config.hpp"
#include "arch/xsimd_scalar.hpp"
#include "memory/xsimd_aligned_allocator.hpp"
#if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
// no batch type definitions or anything apart from the scalar definitions and the aligned allocator
#else
#include "types/xsimd_batch.hpp"
#include "types/xsimd_batch_constant.hpp"
#include "types/xsimd_traits.hpp"
// This include must come last
#include "types/xsimd_api.hpp"
#endif
#endif
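
A hedged illustration of the two portability macros defined above: XSIMD_NO_DISCARD degrades from [[nodiscard]] to the GCC/Clang attribute to nothing, and XSIMD_IF_CONSTEXPR falls back to a plain if before C++17.

#include <xsimd/xsimd.hpp>

// Ignoring the result warns on compilers that support either attribute form.
XSIMD_NO_DISCARD int checked_sum(int a, int b)
{
    XSIMD_IF_CONSTEXPR (sizeof(int) >= 4)
    {
        return a + b;
    }
    return static_cast<int>(static_cast<long long>(a) + b);
}

int main()
{
    return checked_sum(1, 2) == 3 ? 0 : 1;
}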

37
third_party/xsimd/moz.yaml vendored Normal file

@ -0,0 +1,37 @@
schema: 1
bugzilla:
product: Toolkit
component: "General"
origin:
name: xsimd
description: C++ wrappers for SIMD intrinsics
url: https://github.com/QuantStack/xsimd
release: 75b043b8e031f1ada8053fe80d5ba635e2a75588 (2023-01-05T06:45:23Z).
revision: 75b043b8e031f1ada8053fe80d5ba635e2a75588
license: BSD-3-Clause
vendoring:
url: https://github.com/QuantStack/xsimd
source-hosting: github
tracking: commit
exclude:
- ".*"
- "*.md"
- "*.yml"
- "*.txt"
- "*.in"
- "*.sh"
- benchmark
- cmake
- docs
- examples
- test
keep:
- include/


@ -2029,6 +2029,7 @@ into source code and to files in the following directories:
#ifdef MOZ_JXL
<li><code>third_party/jpeg-xl/</code></li>
#endif
<li><code>third_party/xsimd/</code></li>
</ul>
See the individual LICENSE files for copyright owners.</p>