Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1801557 - import xsimd to third_party r=glandium
Differential Revision: https://phabricator.services.mozilla.com/D162537
This commit is contained in:
Parent
ec72d27e4d
Commit
46a6cbf6ca
@@ -40,6 +40,9 @@ with Files('rust/**'):
with Files('webkit/**'):
    BUG_COMPONENT = ('Firefox Build System', 'General')

with Files('xsimd/**'):
    BUG_COMPONENT = ('Firefox Build System', 'General')

with Files('prio/**'):
    BUG_COMPONENT = ('Firefox Build System', 'General')
@@ -0,0 +1,29 @@
Copyright (c) 2016, Johan Mabille, Sylvain Corlay, Wolf Vollprecht and Martin Renou
Copyright (c) 2016, QuantStack
Copyright (c) 2018, Serge Guelton
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
152 third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp vendored Normal file
@@ -0,0 +1,152 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_ARITHMETIC_HPP
#define XSIMD_GENERIC_ARITHMETIC_HPP

#include <complex>
#include <type_traits>

#include "./xsimd_generic_details.hpp"

namespace xsimd
{

    namespace kernel
    {

        using namespace types;

        // bitwise_lshift
        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept
                                 { return x << y; },
                                 self, other);
        }

        // bitwise_rshift
        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept
                                 { return x >> y; },
                                 self, other);
        }

        // div
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> div(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept -> T
                                 { return x / y; },
                                 self, other);
        }

        // fma
        template <class A, class T>
        inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
        {
            return x * y + z;
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> fma(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            auto res_r = fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real()));
            auto res_i = fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag()));
            return { res_r, res_i };
        }

        // fms
        template <class A, class T>
        inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
        {
            return x * y - z;
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> fms(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            auto res_r = fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real()));
            auto res_i = fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag()));
            return { res_r, res_i };
        }

        // fnma
        template <class A, class T>
        inline batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
        {
            return -x * y + z;
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> fnma(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            auto res_r = -fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real()));
            auto res_i = -fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag()));
            return { res_r, res_i };
        }

        // fnms
        template <class A, class T>
        inline batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
        {
            return -x * y - z;
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> fnms(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            auto res_r = -fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real()));
            auto res_i = -fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag()));
            return { res_r, res_i };
        }

        // mul
        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept -> T
                                 { return x * y; },
                                 self, other);
        }

        // sadd
        template <class A>
        inline batch<float, A> sadd(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
        {
            return add(self, other); // no saturated arithmetic on floating point numbers
        }
        template <class A>
        inline batch<double, A> sadd(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
        {
            return add(self, other); // no saturated arithmetic on floating point numbers
        }

        // ssub
        template <class A>
        inline batch<float, A> ssub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
        {
            return sub(self, other); // no saturated arithmetic on floating point numbers
        }
        template <class A>
        inline batch<double, A> ssub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
        {
            return sub(self, other); // no saturated arithmetic on floating point numbers
        }

    }

}

#endif
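
For reference, the complex fma/fms overloads above just expand the complex product: (a+bi)(c+di) + (e+fi) = (ac - bd + e) + (ad + bc + f)i, with the fms/fma nesting folding the additions into fused operations. A minimal scalar check of that identity (an editor's illustration, not part of the imported sources):

#include <cassert>
#include <cmath>
#include <complex>

int main()
{
    std::complex<double> x(1.5, -2.0), y(0.5, 3.0), z(-1.0, 4.0);
    // res_r = fms(xr, yr, fms(xi, yi, zr)) = xr*yr - (xi*yi - zr)
    double res_r = x.real() * y.real() - (x.imag() * y.imag() - z.real());
    // res_i = fma(xr, yi, fma(xi, yr, zi)) = xr*yi + (xi*yr + zi)
    double res_i = x.real() * y.imag() + (x.imag() * y.real() + z.imag());
    std::complex<double> expected = x * y + z;
    assert(std::abs(expected.real() - res_r) < 1e-12);
    assert(std::abs(expected.imag() - res_i) < 1e-12);
    return 0;
}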
@@ -0,0 +1,96 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_COMPLEX_HPP
#define XSIMD_GENERIC_COMPLEX_HPP

#include <complex>

#include "./xsimd_generic_details.hpp"

namespace xsimd
{

    namespace kernel
    {

        using namespace types;

        // real
        template <class A, class T>
        inline batch<T, A> real(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return self;
        }

        template <class A, class T>
        inline batch<T, A> real(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
        {
            return self.real();
        }

        // imag
        template <class A, class T>
        inline batch<T, A> imag(batch<T, A> const& /*self*/, requires_arch<generic>) noexcept
        {
            return batch<T, A>(T(0));
        }

        template <class A, class T>
        inline batch<T, A> imag(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
        {
            return self.imag();
        }

        // arg
        template <class A, class T>
        inline real_batch_type_t<batch<T, A>> arg(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return atan2(imag(self), real(self));
        }

        // conj
        template <class A, class T>
        inline complex_batch_type_t<batch<T, A>> conj(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return { real(self), -imag(self) };
        }

        // norm
        template <class A, class T>
        inline real_batch_type_t<batch<T, A>> norm(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return { fma(real(self), real(self), imag(self) * imag(self)) };
        }

        // proj
        template <class A, class T>
        inline complex_batch_type_t<batch<T, A>> proj(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = complex_batch_type_t<batch<T, A>>;
            using real_batch = typename batch_type::real_batch;
            using real_value_type = typename real_batch::value_type;
            auto cond = xsimd::isinf(real(self)) || xsimd::isinf(imag(self));
            return select(cond,
                          batch_type(constants::infinity<real_batch>(),
                                     copysign(real_batch(real_value_type(0)), imag(self))),
                          batch_type(self));
        }

        template <class A, class T>
        inline batch_bool<T, A> isnan(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
        {
            return batch_bool<T, A>(isnan(self.real()) || isnan(self.imag()));
        }
    }
}

#endif
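
Note that norm above is the squared magnitude (the analogue of std::norm), not the modulus: for a real batch it reduces to self * self, and for a complex batch fma(re, re, im * im) yields re² + im². A short scalar sketch of the distinction (an editor's illustration, not part of the imported sources):

#include <cassert>
#include <complex>

int main()
{
    std::complex<double> z(3.0, 4.0);
    assert(std::norm(z) == 25.0); // re*re + im*im, what xsimd's norm computes
    assert(std::abs(z) == 5.0);   // the modulus, a different function
    return 0;
}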
@@ -0,0 +1,239 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_DETAILS_HPP
#define XSIMD_GENERIC_DETAILS_HPP

#include <complex>

#include "../../math/xsimd_rem_pio2.hpp"
#include "../../types/xsimd_generic_arch.hpp"
#include "../../types/xsimd_utils.hpp"
#include "../xsimd_constants.hpp"

namespace xsimd
{
    // Forward declarations. Should we put them in a separate file?
    template <class T, class A>
    inline batch<T, A> abs(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> abs(batch<std::complex<T>, A> const& self) noexcept;
    template <class T, class A>
    inline bool any(batch_bool<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other) noexcept;
    template <class A, class T_out, class T_in>
    inline batch<T_out, A> batch_cast(batch<T_in, A> const&, batch<T_out, A> const& out) noexcept;
    template <class T, class A>
    inline batch<T, A> bitofsign(batch<T, A> const& self) noexcept;
    template <class B, class T, class A>
    inline B bitwise_cast(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> cos(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> cosh(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> exp(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
    template <class T, class A>
    inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
    template <class T, class A>
    inline batch<T, A> frexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
    template <class T, class A, uint64_t... Coefs>
    inline batch<T, A> horner(const batch<T, A>& self) noexcept;
    template <class T, class A>
    inline batch<T, A> hypot(const batch<T, A>& self) noexcept;
    template <class T, class A>
    inline batch_bool<T, A> is_even(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch_bool<T, A> is_flint(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch_bool<T, A> is_odd(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch_bool<T, A> isinf(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline typename batch<T, A>::batch_bool_type isnan(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
    template <class T, class A>
    inline batch<T, A> log(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> nearbyint(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<as_integer_t<T>, A> nearbyint_as_int(const batch<T, A>& x) noexcept;
    template <class T, class A>
    inline T reduce_add(batch<T, A> const&) noexcept;
    template <class T, class A>
    inline batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
    template <class T, class A>
    inline batch<std::complex<T>, A> select(batch_bool<T, A> const&, batch<std::complex<T>, A> const&, batch<std::complex<T>, A> const&) noexcept;
    template <class T, class A>
    inline batch<T, A> sign(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> signnz(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> sin(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> sinh(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> sqrt(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> tan(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<as_float_t<T>, A> to_float(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<as_integer_t<T>, A> to_int(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> trunc(batch<T, A> const& self) noexcept;

    namespace kernel
    {

        namespace detail
        {
            template <class F, class A, class T, class... Batches>
            inline batch<T, A> apply(F&& func, batch<T, A> const& self, batch<T, A> const& other) noexcept
            {
                constexpr std::size_t size = batch<T, A>::size;
                alignas(A::alignment()) T self_buffer[size];
                alignas(A::alignment()) T other_buffer[size];
                self.store_aligned(&self_buffer[0]);
                other.store_aligned(&other_buffer[0]);
                for (std::size_t i = 0; i < size; ++i)
                {
                    self_buffer[i] = func(self_buffer[i], other_buffer[i]);
                }
                return batch<T, A>::load_aligned(self_buffer);
            }

            template <class U, class F, class A, class T>
            inline batch<U, A> apply_transform(F&& func, batch<T, A> const& self) noexcept
            {
                static_assert(batch<T, A>::size == batch<U, A>::size,
                              "Source and destination sizes must match");
                constexpr std::size_t src_size = batch<T, A>::size;
                constexpr std::size_t dest_size = batch<U, A>::size;
                alignas(A::alignment()) T self_buffer[src_size];
                alignas(A::alignment()) U other_buffer[dest_size];
                self.store_aligned(&self_buffer[0]);
                for (std::size_t i = 0; i < src_size; ++i)
                {
                    other_buffer[i] = func(self_buffer[i]);
                }
                return batch<U, A>::load_aligned(other_buffer);
            }
        }

        namespace detail
        {
            // Generic conversion handling machinery. Each architecture must define
            // a conversion function when such a conversion exists in the form of an
            // intrinsic. Then we use that information to automatically decide whether
            // to use scalar or vector conversion when doing load / store / batch_cast.
            struct with_fast_conversion
            {
            };
            struct with_slow_conversion
            {
            };

            template <class A, class From, class To, class = void>
            struct conversion_type_impl
            {
                using type = with_slow_conversion;
            };

            using xsimd::detail::void_t;

            template <class A, class From, class To>
            struct conversion_type_impl<A, From, To,
                                        void_t<decltype(fast_cast(std::declval<const batch<From, A>&>(),
                                                                  std::declval<const batch<To, A>&>(),
                                                                  std::declval<const A&>()))>>
            {
                using type = with_fast_conversion;
            };

            template <class A, class From, class To>
            using conversion_type = typename conversion_type_impl<A, From, To>::type;
        }
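
        // Note (editor): a minimal illustration of the detection above, assuming
        // some architecture `Arch` provides an overload
        //   fast_cast(batch<int32_t, Arch> const&, batch<float, Arch> const&, Arch)
        // in this namespace. Then
        //   conversion_type<Arch, int32_t, float>  ->  with_fast_conversion
        // because void_t<decltype(fast_cast(...))> is well-formed and the partial
        // specialization is selected; for a pair with no such overload the primary
        // template yields with_slow_conversion, and the element-by-element copy
        // path in load_aligned / load_unaligned (xsimd_generic_memory.hpp) is
        // taken instead.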

        namespace detail
        {
            /* origin: boost/simdfunction/horn.hpp*/
            /*
             * ====================================================
             * copyright 2016 NumScale SAS
             *
             * Distributed under the Boost Software License, Version 1.0.
             * (See copy at http://boost.org/LICENSE_1_0.txt)
             * ====================================================
             */
            template <class B, uint64_t c>
            inline B coef() noexcept
            {
                using value_type = typename B::value_type;
                return B(bit_cast<value_type>(as_unsigned_integer_t<value_type>(c)));
            }
            template <class B>
            inline B horner(const B&) noexcept
            {
                return B(typename B::value_type(0.));
            }

            template <class B, uint64_t c0>
            inline B horner(const B&) noexcept
            {
                return coef<B, c0>();
            }

            template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
            inline B horner(const B& self) noexcept
            {
                return fma(self, horner<B, c1, args...>(self), coef<B, c0>());
            }
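
            // Note (editor): these overloads implement Horner's scheme. With the
            // template arguments c0..cn taken as bit patterns of the coefficients
            // (decoded by coef<B, c>()), horner<B, c0, ..., cn>(x) evaluates
            //   p(x) = c0 + x*(c1 + x*(... + x*cn)) = c0 + c1*x + ... + cn*x^n
            // using one fma per coefficient. horner1 below is the variant whose
            // highest-degree coefficient is implicitly 1.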

            /* origin: boost/simdfunction/horn1.hpp*/
            /*
             * ====================================================
             * copyright 2016 NumScale SAS
             *
             * Distributed under the Boost Software License, Version 1.0.
             * (See copy at http://boost.org/LICENSE_1_0.txt)
             * ====================================================
             */
            template <class B>
            inline B horner1(const B&) noexcept
            {
                return B(1.);
            }

            template <class B, uint64_t c0>
            inline B horner1(const B& x) noexcept
            {
                return x + detail::coef<B, c0>();
            }

            template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
            inline B horner1(const B& x) noexcept
            {
                return fma(x, horner1<B, c1, args...>(x), detail::coef<B, c0>());
            }
        }

    }

}

#endif
@@ -0,0 +1,163 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_LOGICAL_HPP
#define XSIMD_GENERIC_LOGICAL_HPP

#include "./xsimd_generic_details.hpp"

namespace xsimd
{

    namespace kernel
    {

        using namespace types;

        // from mask
        template <class A, class T>
        inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
            // This is inefficient but should never be called. It's just a
            // temporary implementation until arm support is added.
            for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
                buffer[i] = mask & (1ull << i);
            return batch_bool<T, A>::load_aligned(buffer);
        }

        // ge
        template <class A, class T>
        inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return other <= self;
        }

        // gt
        template <class A, class T>
        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return other < self;
        }

        // is_even
        template <class A, class T>
        inline batch_bool<T, A> is_even(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return is_flint(self * T(0.5));
        }

        // is_flint
        template <class A, class T>
        inline batch_bool<T, A> is_flint(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            auto frac = select(isnan(self - self), constants::nan<batch<T, A>>(), self - trunc(self));
            return frac == T(0.);
        }

        // is_odd
        template <class A, class T>
        inline batch_bool<T, A> is_odd(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return is_even(self - T(1.));
        }

        // isinf
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> isinf(batch<T, A> const&, requires_arch<generic>) noexcept
        {
            return batch_bool<T, A>(false);
        }
        template <class A>
        inline batch_bool<float, A> isinf(batch<float, A> const& self, requires_arch<generic>) noexcept
        {
            return abs(self) == std::numeric_limits<float>::infinity();
        }
        template <class A>
        inline batch_bool<double, A> isinf(batch<double, A> const& self, requires_arch<generic>) noexcept
        {
            return abs(self) == std::numeric_limits<double>::infinity();
        }

        // isfinite
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> isfinite(batch<T, A> const&, requires_arch<generic>) noexcept
        {
            return batch_bool<T, A>(true);
        }
        template <class A>
        inline batch_bool<float, A> isfinite(batch<float, A> const& self, requires_arch<generic>) noexcept
        {
            return (self - self) == 0.f;
        }
        template <class A>
        inline batch_bool<double, A> isfinite(batch<double, A> const& self, requires_arch<generic>) noexcept
        {
            return (self - self) == 0.;
        }

        // isnan
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> isnan(batch<T, A> const&, requires_arch<generic>) noexcept
        {
            return batch_bool<T, A>(false);
        }

        // le
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return (self < other) || (self == other);
        }

        // neq
        template <class A, class T>
        inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return !(other == self);
        }

        // logical_and
        template <class A, class T>
        inline batch<T, A> logical_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept
                                 { return x && y; },
                                 self, other);
        }

        // logical_or
        template <class A, class T>
        inline batch<T, A> logical_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept
                                 { return x || y; },
                                 self, other);
        }

        // mask
        template <class A, class T>
        inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
            self.store_aligned(buffer);
            // This is inefficient but should never be called. It's just a
            // temporary implementation until arm support is added.
            uint64_t res = 0;
            for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
                if (buffer[i])
                    res |= 1ul << i;
            return res;
        }
    }
}

#endif
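
The two bitmask helpers above, from_mask and mask, are inverses of each other: bit i of the integer corresponds to lane i of the predicate. A minimal sketch of the round trip through the public batch_bool API, assuming it dispatches to these generic kernels on the host architecture (an editor's illustration, not part of the imported sources):

#include <cassert>
#include <xsimd/xsimd.hpp>

int main()
{
    using batch_bool = xsimd::batch_bool<float>;
    // Lane i is set iff bit i of the mask is set (float batches have >= 4 lanes).
    batch_bool b = batch_bool::from_mask(0b0101);
    // mask() packs the lanes back into an integer, recovering the input bits.
    assert((b.mask() & 0b1111) == 0b0101);
    return 0;
}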
The diff for one file is not shown because of its large size.
@@ -0,0 +1,397 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_MEMORY_HPP
#define XSIMD_GENERIC_MEMORY_HPP

#include <algorithm>
#include <complex>
#include <stdexcept>

#include "../../types/xsimd_batch_constant.hpp"
#include "./xsimd_generic_details.hpp"

namespace xsimd
{
    template <class batch_type, typename batch_type::value_type... Values>
    struct batch_constant;

    namespace kernel
    {

        using namespace types;

        // extract_pair
        template <class A, class T>
        inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<generic>) noexcept
        {
            constexpr std::size_t size = batch<T, A>::size;
            assert(i < size && "index in bounds");

            alignas(A::alignment()) T self_buffer[size];
            self.store_aligned(self_buffer);

            alignas(A::alignment()) T other_buffer[size];
            other.store_aligned(other_buffer);

            alignas(A::alignment()) T concat_buffer[size];

            for (std::size_t j = 0; j < (size - i); ++j)
            {
                concat_buffer[j] = other_buffer[i + j];
                if (j < i)
                {
                    concat_buffer[size - 1 - j] = self_buffer[i - 1 - j];
                }
            }
            return batch<T, A>::load_aligned(concat_buffer);
        }
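
        // Note (editor): worked example of extract_pair, assuming size == 4,
        // self = [s0, s1, s2, s3], other = [o0, o1, o2, o3], i == 1:
        //   concat_buffer = [o1, o2, o3, s0]
        // i.e. the result is the concatenation (other, self) shifted left
        // by i lanes.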

        // gather
        namespace detail
        {
            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
            inline batch<T, A> gather(U const* src, batch<V, A> const& index,
                                      ::xsimd::index<N> I) noexcept
            {
                return insert(batch<T, A> {}, static_cast<T>(src[index.get(I)]), I);
            }

            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
            inline batch<T, A>
            gather(U const* src, batch<V, A> const& index, ::xsimd::index<N> I) noexcept
            {
                static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");

                const auto test = gather<N - 1, T, A>(src, index, {});
                return insert(test, static_cast<T>(src[index.get(I)]), I);
            }
        } // namespace detail

        template <typename T, typename A, typename V>
        inline batch<T, A>
        gather(batch<T, A> const&, T const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
        }

        // Gather with runtime indexes and mismatched strides.
        template <typename T, typename A, typename U, typename V>
        inline detail::sizes_mismatch_t<T, U, batch<T, A>>
        gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
        }

        // Gather with runtime indexes and matching strides.
        template <typename T, typename A, typename U, typename V>
        inline detail::stride_match_t<T, U, batch<T, A>>
        gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return batch_cast<T>(kernel::gather(batch<U, A> {}, src, index, A {}));
        }
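
        // Note (editor): the detail::gather recursion above unrolls at compile
        // time, one insert per lane. For a hypothetical 4-lane batch with
        //   double src[] = { 10., 20., 30., 40. };  index = [3, 0, 2, 1]
        // the generic gather produces [src[3], src[0], src[2], src[1]]
        // = [40., 10., 30., 20.]. The scatter kernels below are the inverse
        // operation, writing lane I of src to dst[index.get(I)].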

        // insert
        template <class A, class T, size_t I>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept
        {
            struct index_mask
            {
                static constexpr bool get(size_t index, size_t /* size*/)
                {
                    return index != I;
                }
            };
            batch<T, A> tmp(val);
            return select(make_batch_bool_constant<batch<T, A>, index_mask>(), self, tmp);
        }

        // get
        template <class A, size_t I, class T>
        inline T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, size_t I, class T>
        inline T get(batch_bool<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch_bool<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, size_t I, class T>
        inline auto get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
        {
            alignas(A::alignment()) T buffer[batch<std::complex<T>, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, class T>
        inline T get(batch<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        template <class A, class T>
        inline T get(batch_bool<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        template <class A, class T>
        inline auto get(batch<std::complex<T>, A> const& self, std::size_t i, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
        {
            using T2 = typename batch<std::complex<T>, A>::value_type;
            alignas(A::alignment()) T2 buffer[batch<std::complex<T>, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        // load_aligned
        namespace detail
        {
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
            {
                using batch_type_in = batch<T_in, A>;
                using batch_type_out = batch<T_out, A>;
                return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A {});
            }
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_slow_conversion) noexcept
            {
                static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
                using batch_type_out = batch<T_out, A>;
                alignas(A::alignment()) T_out buffer[batch_type_out::size];
                std::copy(mem, mem + batch_type_out::size, std::begin(buffer));
                return batch_type_out::load_aligned(buffer);
            }
        }
        template <class A, class T_in, class T_out>
        inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
        {
            return detail::load_aligned<A>(mem, cvt, A {}, detail::conversion_type<A, T_in, T_out> {});
        }

        // load_unaligned
        namespace detail
        {
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
            {
                using batch_type_in = batch<T_in, A>;
                using batch_type_out = batch<T_out, A>;
                return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A {});
            }

            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>, with_slow_conversion) noexcept
            {
                static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
                return load_aligned<A>(mem, cvt, generic {}, with_slow_conversion {});
            }
        }
        template <class A, class T_in, class T_out>
        inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
        {
            return detail::load_unaligned<A>(mem, cvt, generic {}, detail::conversion_type<A, T_in, T_out> {});
        }

        namespace detail
        {
            // Scatter with runtime indexes.
            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
            inline void scatter(batch<T, A> const& src, U* dst,
                                batch<V, A> const& index,
                                ::xsimd::index<N> I) noexcept
            {
                dst[index.get(I)] = static_cast<U>(src.get(I));
            }

            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
            inline void
            scatter(batch<T, A> const& src, U* dst, batch<V, A> const& index,
                    ::xsimd::index<N> I) noexcept
            {
                static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");

                kernel::detail::scatter<N - 1, T, A, U, V>(
                    src, dst, index, {});
                dst[index.get(I)] = static_cast<U>(src.get(I));
            }
        } // namespace detail

        template <typename A, typename T, typename V>
        inline void
        scatter(batch<T, A> const& src, T* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            kernel::detail::scatter<batch<V, A>::size - 1, T, A, T, V>(
                src, dst, index, {});
        }

        template <typename A, typename T, typename U, typename V>
        inline detail::sizes_mismatch_t<T, U, void>
        scatter(batch<T, A> const& src, U* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            kernel::detail::scatter<batch<V, A>::size - 1, T, A, U, V>(
                src, dst, index, {});
        }

        template <typename A, typename T, typename U, typename V>
        inline detail::stride_match_t<T, U, void>
        scatter(batch<T, A> const& src, U* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            const auto tmp = batch_cast<U>(src);
            kernel::scatter<A>(tmp, dst, index, A {});
        }

        // store
        template <class T, class A>
        inline void store(batch_bool<T, A> const& self, bool* mem, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            constexpr auto size = batch_bool<T, A>::size;
            alignas(A::alignment()) T buffer[size];
            kernel::store_aligned<A>(&buffer[0], batch_type(self), A {});
            for (std::size_t i = 0; i < size; ++i)
                mem[i] = bool(buffer[i]);
        }

        // store_aligned
        template <class A, class T_in, class T_out>
        inline void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
        {
            static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
            alignas(A::alignment()) T_in buffer[batch<T_in, A>::size];
            store_aligned(&buffer[0], self);
            std::copy(std::begin(buffer), std::end(buffer), mem);
        }

        // store_unaligned
        template <class A, class T_in, class T_out>
        inline void store_unaligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
        {
            static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
            return store_aligned<A>(mem, self, generic {});
        }

        // swizzle
        template <class A, class T, class ITy, ITy... Vs>
        inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<batch<ITy, A>, Vs...> mask, requires_arch<generic>) noexcept
        {
            return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
        }

        namespace detail
        {
            template <class A, class T>
            inline batch<std::complex<T>, A> load_complex(batch<T, A> const& /*hi*/, batch<T, A> const& /*lo*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "load_complex not implemented for the required architecture");
            }

            template <class A, class T>
            inline batch<T, A> complex_high(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "complex_high not implemented for the required architecture");
            }

            template <class A, class T>
            inline batch<T, A> complex_low(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "complex_low not implemented for the required architecture");
            }
        }

        // load_complex_aligned
        template <class A, class T_out, class T_in>
        inline batch<std::complex<T_out>, A> load_complex_aligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_out, A>;
            T_in const* buffer = reinterpret_cast<T_in const*>(mem);
            real_batch hi = real_batch::load_aligned(buffer),
                       lo = real_batch::load_aligned(buffer + real_batch::size);
            return detail::load_complex(hi, lo, A {});
        }

        // load_complex_unaligned
        template <class A, class T_out, class T_in>
        inline batch<std::complex<T_out>, A> load_complex_unaligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_out, A>;
            T_in const* buffer = reinterpret_cast<T_in const*>(mem);
            real_batch hi = real_batch::load_unaligned(buffer),
                       lo = real_batch::load_unaligned(buffer + real_batch::size);
            return detail::load_complex(hi, lo, A {});
        }

        // store_complex_aligned
        template <class A, class T_out, class T_in>
        inline void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_in, A>;
            real_batch hi = detail::complex_high(src, A {});
            real_batch lo = detail::complex_low(src, A {});
            T_out* buffer = reinterpret_cast<T_out*>(dst);
            lo.store_aligned(buffer);
            hi.store_aligned(buffer + real_batch::size);
        }

        // store_complex_unaligned
        template <class A, class T_out, class T_in>
        inline void store_complex_unaligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_in, A>;
            real_batch hi = detail::complex_high(src, A {});
            real_batch lo = detail::complex_low(src, A {});
            T_out* buffer = reinterpret_cast<T_out*>(dst);
            lo.store_unaligned(buffer);
            hi.store_unaligned(buffer + real_batch::size);
        }

    }

}

#endif
@@ -0,0 +1,72 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_ROUNDING_HPP
#define XSIMD_GENERIC_ROUNDING_HPP

#include "./xsimd_generic_details.hpp"

namespace xsimd
{

    namespace kernel
    {

        using namespace types;

        // ceil
        template <class A, class T>
        inline batch<T, A> ceil(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            batch<T, A> truncated_self = trunc(self);
            return select(truncated_self < self, truncated_self + 1, truncated_self);
        }

        // floor
        template <class A, class T>
        inline batch<T, A> floor(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            batch<T, A> truncated_self = trunc(self);
            return select(truncated_self > self, truncated_self - 1, truncated_self);
        }

        // round
        template <class A, class T>
        inline batch<T, A> round(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            auto v = abs(self);
            auto c = ceil(v);
            auto cp = select(c - 0.5 > v, c - 1, c);
            return select(v > constants::maxflint<batch<T, A>>(), self, copysign(cp, self));
        }
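
        // Note (editor): round implements round-half-away-from-zero on |self|,
        // then restores the sign. Worked values: v = 2.3 -> c = 3, c - 0.5 > v,
        // so cp = 2; v = 2.5 -> c = 3, c - 0.5 == v (not >), so cp = 3. Inputs
        // beyond maxflint are already exact integers and are passed through.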

        // trunc
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> trunc(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return self;
        }
        template <class A>
        inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<generic>) noexcept
        {
            return select(abs(self) < constants::maxflint<batch<float, A>>(), to_float(to_int(self)), self);
        }
        template <class A>
        inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<generic>) noexcept
        {
            return select(abs(self) < constants::maxflint<batch<double, A>>(), to_float(to_int(self)), self);
        }

    }

}

#endif
@@ -0,0 +1,969 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_TRIGO_HPP
#define XSIMD_GENERIC_TRIGO_HPP

#include "./xsimd_generic_details.hpp"

#include <array>

namespace xsimd
{

    namespace kernel
    {
        /* origin: boost/simd/arch/common/detail/simd/trig_base.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */

        using namespace types;

        // acos
        template <class A, class T>
        inline batch<T, A> acos(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            batch_type x = abs(self);
            auto x_larger_05 = x > batch_type(0.5);
            x = select(x_larger_05, sqrt(fma(batch_type(-0.5), x, batch_type(0.5))), self);
            x = asin(x);
            x = select(x_larger_05, x + x, x);
            x = select(self < batch_type(-0.5), constants::pi<batch_type>() - x, x);
            return select(x_larger_05, x, constants::pio2<batch_type>() - x);
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> acos(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            using real_batch = typename batch_type::real_batch;
            batch_type tmp = asin(z);
            return { constants::pio2<real_batch>() - tmp.real(), -tmp.imag() };
        }

        // acosh
        /* origin: boost/simd/arch/common/simd/function/acosh.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */
        template <class A, class T>
        inline batch<T, A> acosh(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            batch_type x = self - batch_type(1.);
            auto test = x > constants::oneotwoeps<batch_type>();
            batch_type z = select(test, self, x + sqrt(x + x + x * x));
            batch_type l1pz = log1p(z);
            return select(test, l1pz + constants::log_2<batch_type>(), l1pz);
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> acosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            batch_type w = acos(z);
            w = batch_type(-w.imag(), w.real());
            return w;
        }

        // asin
        template <class A>
        inline batch<float, A> asin(batch<float, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<float, A>;
            batch_type x = abs(self);
            batch_type sign = bitofsign(self);
            auto x_larger_05 = x > batch_type(0.5);
            batch_type z = select(x_larger_05, batch_type(0.5) * (batch_type(1.) - x), x * x);
            x = select(x_larger_05, sqrt(z), x);
            batch_type z1 = detail::horner<batch_type,
                                           0x3e2aaae4,
                                           0x3d9980f6,
                                           0x3d3a3ec7,
                                           0x3cc617e3,
                                           0x3d2cb352>(z);
            z1 = fma(z1, z * x, x);
            z = select(x_larger_05, constants::pio2<batch_type>() - (z1 + z1), z1);
            return z ^ sign;
        }
        template <class A>
        inline batch<double, A> asin(batch<double, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<double, A>;
            batch_type x = abs(self);
            auto small_cond = x < constants::sqrteps<batch_type>();
            batch_type ct1 = batch_type(bit_cast<double>(int64_t(0x3fe4000000000000)));
            batch_type zz1 = batch_type(1.) - x;
            batch_type vp = zz1 * detail::horner<batch_type, 0x403c896240f3081dull, 0xc03991aaac01ab68ull, 0x401bdff5baf33e6aull, 0xbfe2079259f9290full, 0x3f684fc3988e9f08ull>(zz1) / detail::horner1<batch_type, 0x40756709b0b644beull, 0xc077fe08959063eeull, 0x40626219af6a7f42ull, 0xc035f2a2b6bf5d8cull>(zz1);
            zz1 = sqrt(zz1 + zz1);
            batch_type z = constants::pio4<batch_type>() - zz1;
            zz1 = fms(zz1, vp, constants::pio_2lo<batch_type>());
            z = z - zz1;
            zz1 = z + constants::pio4<batch_type>();
            batch_type zz2 = self * self;
            z = zz2 * detail::horner<batch_type, 0xc020656c06ceafd5ull, 0x40339007da779259ull, 0xc0304331de27907bull, 0x4015c74b178a2dd9ull, 0xbfe34341333e5c16ull, 0x3f716b9b0bd48ad3ull>(zz2) / detail::horner1<batch_type, 0xc04898220a3607acull, 0x4061705684ffbf9dull, 0xc06265bb6d3576d7ull, 0x40519fc025fe9054ull, 0xc02d7b590b5e0eabull>(zz2);
            zz2 = fma(x, z, x);
            return select(x > batch_type(1.), constants::nan<batch_type>(),
                          select(small_cond, x,
                                 select(x > ct1, zz1, zz2))
                              ^ bitofsign(self));
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> asin(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            using real_batch = typename batch_type::real_batch;
            real_batch x = z.real();
            real_batch y = z.imag();

            batch_type ct(-y, x);
            batch_type zz(real_batch(1.) - (x - y) * (x + y), -2 * x * y);
            zz = log(ct + sqrt(zz));
            batch_type resg(zz.imag(), -zz.real());

            return select(y == real_batch(0.),
                          select(fabs(x) > real_batch(1.),
                                 batch_type(constants::pio2<real_batch>(), real_batch(0.)),
                                 batch_type(asin(x), real_batch(0.))),
                          resg);
        }

        // asinh
        /* origin: boost/simd/arch/common/simd/function/asinh.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */
        namespace detail
        {
            template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
            inline batch<T, A>
            average(const batch<T, A>& x1, const batch<T, A>& x2) noexcept
            {
                return (x1 & x2) + ((x1 ^ x2) >> 1);
            }
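
            // Note (editor): this is the classic overflow-free integer average:
            // (x1 & x2) keeps the bits both operands share, and (x1 ^ x2) >> 1
            // adds half of the differing bits, so the intermediate value never
            // exceeds the type's range. E.g. x1 = 6 (0b110), x2 = 3 (0b011):
            // 2 + (0b101 >> 1) = 2 + 2 = 4, which is floor((6 + 3) / 2).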
|
||||
|
||||
template <class A, class T>
|
||||
inline batch<T, A>
|
||||
averagef(const batch<T, A>& x1, const batch<T, A>& x2) noexcept
|
||||
{
|
||||
using batch_type = batch<T, A>;
|
||||
return fma(x1, batch_type(0.5), x2 * batch_type(0.5));
|
||||
}
|
||||
template <class A>
|
||||
inline batch<float, A> average(batch<float, A> const& x1, batch<float, A> const& x2) noexcept
|
||||
{
|
||||
return averagef(x1, x2);
|
||||
}
|
||||
template <class A>
|
||||
inline batch<double, A> average(batch<double, A> const& x1, batch<double, A> const& x2) noexcept
|
||||
{
|
||||
return averagef(x1, x2);
|
||||
}
|
||||
}
|
||||
template <class A>
|
||||
inline batch<float, A> asinh(batch<float, A> const& self, requires_arch<generic>) noexcept
|
||||
{
|
||||
using batch_type = batch<float, A>;
|
||||
batch_type x = abs(self);
|
||||
auto lthalf = x < batch_type(0.5);
|
||||
batch_type x2 = x * x;
|
||||
batch_type bts = bitofsign(self);
|
||||
batch_type z(0.);
|
||||
if (any(lthalf))
|
||||
{
|
||||
z = detail::horner<batch_type,
|
||||
0x3f800000,
|
||||
0xbe2aa9ad,
|
||||
0x3d9949b1,
|
||||
0xbd2ee581,
|
||||
0x3ca4d6e6>(x2)
|
||||
* x;
|
||||
if (all(lthalf))
|
||||
return z ^ bts;
|
||||
}
|
||||
batch_type tmp = select(x > constants::oneosqrteps<batch_type>(), x, detail::average(x, hypot(batch_type(1.), x)));
|
||||
#ifndef XSIMD_NO_NANS
|
||||
return select(isnan(self), constants::nan<batch_type>(), select(lthalf, z, log(tmp) + constants::log_2<batch_type>()) ^ bts);
|
||||
#else
|
||||
return select(lthalf, z, log(tmp) + constants::log_2<batch_type>()) ^ bts;
|
||||
#endif
|
||||
}
|
||||
template <class A>
|
||||
inline batch<double, A> asinh(batch<double, A> const& self, requires_arch<generic>) noexcept
|
||||
{
|
||||
using batch_type = batch<double, A>;
|
||||
batch_type x = abs(self);
|
||||
auto test = x > constants::oneosqrteps<batch_type>();
|
||||
batch_type z = select(test, x - batch_type(1.), x + x * x / (batch_type(1.) + hypot(batch_type(1.), x)));
|
||||
#ifndef XSIMD_NO_INFINITIES
|
||||
z = select(x == constants::infinity<batch_type>(), x, z);
|
||||
#endif
|
||||
batch_type l1pz = log1p(z);
|
||||
z = select(test, l1pz + constants::log_2<batch_type>(), l1pz);
|
||||
return bitofsign(self) ^ z;
|
||||
}
|
||||
template <class A, class T>
|
||||
inline batch<std::complex<T>, A> asinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
|
||||
{
|
||||
using batch_type = batch<std::complex<T>, A>;
|
||||
batch_type w = asin(batch_type(-z.imag(), z.real()));
|
||||
w = batch_type(w.imag(), -w.real());
|
||||
return w;
|
||||
}
|
||||
|
||||
        // atan
        namespace detail
        {
            template <class A>
            static inline batch<float, A> kernel_atan(const batch<float, A>& x, const batch<float, A>& recx) noexcept
            {
                using batch_type = batch<float, A>;
                const auto flag1 = x < constants::tan3pio8<batch_type>();
                const auto flag2 = (x >= batch_type(bit_cast<float>((uint32_t)0x3ed413cd))) && flag1;
                batch_type yy = select(flag1, batch_type(0.), constants::pio2<batch_type>());
                yy = select(flag2, constants::pio4<batch_type>(), yy);
                batch_type xx = select(flag1, x, -recx);
                xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx);
                const batch_type z = xx * xx;
                batch_type z1 = detail::horner<batch_type,
                                               0xbeaaaa2aul,
                                               0x3e4c925ful,
                                               0xbe0e1b85ul,
                                               0x3da4f0d1ul>(z);
                z1 = fma(xx, z1 * z, xx);
                z1 = select(flag2, z1 + constants::pio_4lo<batch_type>(), z1);
                z1 = select(!flag1, z1 + constants::pio_2lo<batch_type>(), z1);
                return yy + z1;
            }
            template <class A>
            static inline batch<double, A> kernel_atan(const batch<double, A>& x, const batch<double, A>& recx) noexcept
            {
                using batch_type = batch<double, A>;
                const auto flag1 = x < constants::tan3pio8<batch_type>();
                const auto flag2 = (x >= constants::tanpio8<batch_type>()) && flag1;
                batch_type yy = select(flag1, batch_type(0.), constants::pio2<batch_type>());
                yy = select(flag2, constants::pio4<batch_type>(), yy);
                batch_type xx = select(flag1, x, -recx);
                xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx);
                batch_type z = xx * xx;
                z *= detail::horner<batch_type,
                                    0xc0503669fd28ec8eull,
                                    0xc05eb8bf2d05ba25ull,
                                    0xc052c08c36880273ull,
                                    0xc03028545b6b807aull,
                                    0xbfec007fa1f72594ull>(z)
                    / detail::horner1<batch_type,
                                      0x4068519efbbd62ecull,
                                      0x407e563f13b049eaull,
                                      0x407b0e18d2e2be3bull,
                                      0x4064a0dd43b8fa25ull,
                                      0x4038dbc45b14603cull>(z);
                z = fma(xx, z, xx);
                z = select(flag2, z + constants::pio_4lo<batch_type>(), z);
                z = z + select(flag1, batch_type(0.), constants::pio_2lo<batch_type>());
                return yy + z;
            }
        }
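
        // Note (editorial, assumption): kernel_atan reduces its argument with
        // the classic identities
        //     atan(x) = pi/2 - atan(1/x)                  for x >= tan(3*pi/8)
        //     atan(x) = pi/4 + atan((x - 1) / (x + 1))    for tan(pi/8) <= x < tan(3*pi/8)
        // so the polynomial only ever sees |xx| <= tan(pi/8); pio_4lo/pio_2lo
        // add the low-order halves of the split pi constants back in.
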
        template <class A, class T>
        inline batch<T, A> atan(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            const batch_type absa = abs(self);
            const batch_type x = detail::kernel_atan(absa, batch_type(1.) / absa);
            return x ^ bitofsign(self);
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> atan(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            using real_batch = typename batch_type::real_batch;
            real_batch x = z.real();
            real_batch y = z.imag();
            real_batch x2 = x * x;
            real_batch one(1.);
            real_batch a = one - x2 - (y * y);
            real_batch w = 0.5 * atan2(2. * x, a);
            real_batch num = y + one;
            num = x2 + num * num;
            real_batch den = y - one;
            den = x2 + den * den;
            batch_type res = select((x == real_batch(0.)) && (y == real_batch(1.)),
                                    batch_type(real_batch(0.), constants::infinity<real_batch>()),
                                    batch_type(w, 0.25 * log(num / den)));
            return res;
        }

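        // Note (editorial, assumption): the complex branch evaluates
        //     atan(z) = 0.5 * atan2(2x, 1 - x^2 - y^2)
        //               + i * 0.25 * log((x^2 + (y + 1)^2) / (x^2 + (y - 1)^2))
        // and pins the pole at z = i to (0, +inf) explicitly.
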
        // atanh
        /* origin: boost/simd/arch/common/simd/function/atanh.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */
        template <class A, class T>
        inline batch<T, A> atanh(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            batch_type x = abs(self);
            batch_type t = x + x;
            batch_type z = batch_type(1.) - x;
            auto test = x < batch_type(0.5);
            batch_type tmp = select(test, x, t) / z;
            return bitofsign(self) ^ (batch_type(0.5) * log1p(select(test, fma(t, tmp, t), tmp)));
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> atanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            batch_type w = atan(batch_type(-z.imag(), z.real()));
            w = batch_type(w.imag(), -w.real());
            return w;
        }

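        // Note (editorial, assumption): atanh(x) = 0.5 * log((1 + x) / (1 - x))
        // is rewritten as 0.5 * log1p(2x / (1 - x)); for x < 0.5 the argument is
        // expanded as fma(t, x / (1 - x), t) with t = 2x to preserve precision
        // near zero. The complex overload reuses atan via atanh(z) = -i * atan(i * z).
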
        // atan2
        template <class A, class T>
        inline batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            const batch_type q = abs(self / other);
            const batch_type z = detail::kernel_atan(q, batch_type(1.) / q);
            return select(other > batch_type(0.), z, constants::pi<batch_type>() - z) * signnz(self);
        }

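        // Note (editorial, assumption): atan2 maps the unsigned kernel result
        // back to the full circle: z when x > 0, pi - z otherwise, and the sign
        // of y (via signnz, which never returns zero) selects the half-plane.
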
        // cos
        namespace detail
        {
            template <class T, class A>
            inline batch<T, A> quadrant(const batch<T, A>& x) noexcept
            {
                return x & batch<T, A>(3);
            }

            template <class A>
            inline batch<float, A> quadrant(const batch<float, A>& x) noexcept
            {
                return to_float(quadrant(to_int(x)));
            }

            template <class A>
            inline batch<double, A> quadrant(const batch<double, A>& x) noexcept
            {
                using batch_type = batch<double, A>;
                batch_type a = x * batch_type(0.25);
                return (a - floor(a)) * batch_type(4.);
            }
            /* origin: boost/simd/arch/common/detail/simd/f_trig_evaluation.hpp */
            /*
             * ====================================================
             * copyright 2016 NumScale SAS
             *
             * Distributed under the Boost Software License, Version 1.0.
             * (See copy at http://boost.org/LICENSE_1_0.txt)
             * ====================================================
             */

            template <class A>
            inline batch<float, A> cos_eval(const batch<float, A>& z) noexcept
            {
                using batch_type = batch<float, A>;
                batch_type y = detail::horner<batch_type,
                                              0x3d2aaaa5,
                                              0xbab60619,
                                              0x37ccf5ce>(z);
                return batch_type(1.) + fma(z, batch_type(-0.5), y * z * z);
            }

            template <class A>
            inline batch<float, A> sin_eval(const batch<float, A>& z, const batch<float, A>& x) noexcept
            {
                using batch_type = batch<float, A>;
                batch_type y = detail::horner<batch_type,
                                              0xbe2aaaa2,
                                              0x3c08839d,
                                              0xb94ca1f9>(z);
                return fma(y * z, x, x);
            }

            template <class A>
            static inline batch<float, A> base_tancot_eval(const batch<float, A>& z) noexcept
            {
                using batch_type = batch<float, A>;
                batch_type zz = z * z;
                batch_type y = detail::horner<batch_type,
                                              0x3eaaaa6f,
                                              0x3e0896dd,
                                              0x3d5ac5c9,
                                              0x3cc821b5,
                                              0x3b4c779c,
                                              0x3c19c53b>(zz);
                return fma(y, zz * z, z);
            }

            template <class A, class BB>
            static inline batch<float, A> tan_eval(const batch<float, A>& z, const BB& test) noexcept
            {
                using batch_type = batch<float, A>;
                batch_type y = base_tancot_eval(z);
                return select(test, y, -batch_type(1.) / y);
            }

            template <class A, class BB>
            static inline batch<float, A> cot_eval(const batch<float, A>& z, const BB& test) noexcept
            {
                using batch_type = batch<float, A>;
                batch_type y = base_tancot_eval(z);
                return select(test, batch_type(1.) / y, -y);
            }

            /* origin: boost/simd/arch/common/detail/simd/d_trig_evaluation.hpp */
            /*
             * ====================================================
             * copyright 2016 NumScale SAS
             *
             * Distributed under the Boost Software License, Version 1.0.
             * (See copy at http://boost.org/LICENSE_1_0.txt)
             * ====================================================
             */
            template <class A>
            static inline batch<double, A> cos_eval(const batch<double, A>& z) noexcept
            {
                using batch_type = batch<double, A>;
                batch_type y = detail::horner<batch_type,
                                              0x3fe0000000000000ull,
                                              0xbfa5555555555551ull,
                                              0x3f56c16c16c15d47ull,
                                              0xbefa01a019ddbcd9ull,
                                              0x3e927e4f8e06d9a5ull,
                                              0xbe21eea7c1e514d4ull,
                                              0x3da8ff831ad9b219ull>(z);
                return batch_type(1.) - y * z;
            }

            template <class A>
            static inline batch<double, A> sin_eval(const batch<double, A>& z, const batch<double, A>& x) noexcept
            {
                using batch_type = batch<double, A>;
                batch_type y = detail::horner<batch_type,
                                              0xbfc5555555555548ull,
                                              0x3f8111111110f7d0ull,
                                              0xbf2a01a019bfdf03ull,
                                              0x3ec71de3567d4896ull,
                                              0xbe5ae5e5a9291691ull,
                                              0x3de5d8fd1fcf0ec1ull>(z);
                return fma(y * z, x, x);
            }

            template <class A>
            static inline batch<double, A> base_tancot_eval(const batch<double, A>& z) noexcept
            {
                using batch_type = batch<double, A>;
                batch_type zz = z * z;
                batch_type num = detail::horner<batch_type,
                                                0xc1711fead3299176ull,
                                                0x413199eca5fc9dddull,
                                                0xc0c992d8d24f3f38ull>(zz);
                batch_type den = detail::horner1<batch_type,
                                                 0xc189afe03cbe5a31ull,
                                                 0x4177d98fc2ead8efull,
                                                 0xc13427bc582abc96ull,
                                                 0x40cab8a5eeb36572ull>(zz);
                return fma(z, (zz * (num / den)), z);
            }

            template <class A, class BB>
            static inline batch<double, A> tan_eval(const batch<double, A>& z, const BB& test) noexcept
            {
                using batch_type = batch<double, A>;
                batch_type y = base_tancot_eval(z);
                return select(test, y, -batch_type(1.) / y);
            }

            template <class A, class BB>
            static inline batch<double, A> cot_eval(const batch<double, A>& z, const BB& test) noexcept
            {
                using batch_type = batch<double, A>;
                batch_type y = base_tancot_eval(z);
                return select(test, batch_type(1.) / y, -y);
            }
            /* origin: boost/simd/arch/common/detail/simd/trig_reduction.hpp */
            /*
             * ====================================================
             * copyright 2016 NumScale SAS
             *
             * Distributed under the Boost Software License, Version 1.0.
             * (See copy at http://boost.org/LICENSE_1_0.txt)
             * ====================================================
             */

            struct trigo_radian_tag
            {
            };
            struct trigo_pi_tag
            {
            };

            template <class B, class Tag = trigo_radian_tag>
            struct trigo_reducer
            {
                static inline B reduce(const B& x, B& xr) noexcept
                {
                    if (all(x <= constants::pio4<B>()))
                    {
                        xr = x;
                        return B(0.);
                    }
                    else if (all(x <= constants::pio2<B>()))
                    {
                        auto test = x > constants::pio4<B>();
                        xr = x - constants::pio2_1<B>();
                        xr -= constants::pio2_2<B>();
                        xr -= constants::pio2_3<B>();
                        xr = select(test, xr, x);
                        return select(test, B(1.), B(0.));
                    }
                    else if (all(x <= constants::twentypi<B>()))
                    {
                        B xi = nearbyint(x * constants::twoopi<B>());
                        xr = fnma(xi, constants::pio2_1<B>(), x);
                        xr -= xi * constants::pio2_2<B>();
                        xr -= xi * constants::pio2_3<B>();
                        return quadrant(xi);
                    }
                    else if (all(x <= constants::mediumpi<B>()))
                    {
                        B fn = nearbyint(x * constants::twoopi<B>());
                        B r = x - fn * constants::pio2_1<B>();
                        B w = fn * constants::pio2_1t<B>();
                        B t = r;
                        w = fn * constants::pio2_2<B>();
                        r = t - w;
                        w = fn * constants::pio2_2t<B>() - ((t - r) - w);
                        t = r;
                        w = fn * constants::pio2_3<B>();
                        r = t - w;
                        w = fn * constants::pio2_3t<B>() - ((t - r) - w);
                        xr = r - w;
                        return quadrant(fn);
                    }
                    else
                    {
                        static constexpr std::size_t size = B::size;
                        using value_type = typename B::value_type;
                        alignas(B) std::array<value_type, size> tmp;
                        alignas(B) std::array<value_type, size> txr;
                        alignas(B) std::array<value_type, size> args;
                        x.store_aligned(args.data());

                        for (std::size_t i = 0; i < size; ++i)
                        {
                            double arg = args[i];
                            if (arg == std::numeric_limits<value_type>::infinity())
                            {
                                tmp[i] = 0.;
                                txr[i] = std::numeric_limits<value_type>::quiet_NaN();
                            }
                            else
                            {
                                double y[2];
                                std::int32_t n = ::xsimd::detail::__ieee754_rem_pio2(arg, y);
                                tmp[i] = value_type(n & 3);
                                txr[i] = value_type(y[0]);
                            }
                        }
                        xr = B::load_aligned(&txr[0]);
                        B res = B::load_aligned(&tmp[0]);
                        return res;
                    }
                }
            };

            template <class B>
            struct trigo_reducer<B, trigo_pi_tag>
            {
                static inline B reduce(const B& x, B& xr) noexcept
                {
                    B xi = nearbyint(x * B(2.));
                    B x2 = x - xi * B(0.5);
                    xr = x2 * constants::pi<B>();
                    return quadrant(xi);
                }
            };

        }
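
        // Note (editorial, assumption): trigo_reducer brings x into
        // xr in [-pi/4, pi/4] and returns the quadrant n = round(x * 2/pi) mod 4,
        // picking progressively heavier machinery (split-constant subtraction,
        // then a scalar __ieee754_rem_pio2 loop) as |x| grows. The callers then
        // combine sin_eval/cos_eval according to n; e.g. for cos:
        //     n = 0 -> cos(xr),  n = 1 -> -sin(xr),
        //     n = 2 -> -cos(xr), n = 3 -> sin(xr)
        // which is what the swap_bit/sign_bit arithmetic below encodes.
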
        template <class A, class T>
        inline batch<T, A> cos(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            const batch_type x = abs(self);
            batch_type xr = constants::nan<batch_type>();
            const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
            auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
            auto swap_bit = fma(batch_type(-2.), tmp, n);
            auto sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
            const batch_type z = xr * xr;
            const batch_type se = detail::sin_eval(z, xr);
            const batch_type ce = detail::cos_eval(z);
            const batch_type z1 = select(swap_bit != batch_type(0.), se, ce);
            return z1 ^ sign_bit;
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> cos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            return { cos(z.real()) * cosh(z.imag()), -sin(z.real()) * sinh(z.imag()) };
        }

        // cosh

        /* origin: boost/simd/arch/common/simd/function/cosh.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */

        template <class A, class T>
        inline batch<T, A> cosh(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            batch_type x = abs(self);
            auto test1 = x > (constants::maxlog<batch_type>() - constants::log_2<batch_type>());
            batch_type fac = select(test1, batch_type(0.5), batch_type(1.));
            batch_type tmp = exp(x * fac);
            batch_type tmp1 = batch_type(0.5) * tmp;
            return select(test1, tmp1 * tmp, detail::average(tmp, batch_type(1.) / tmp));
        }
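
        // Note (editorial, assumption): cosh(x) = (e^x + e^-x) / 2 is taken as
        // average(e^x, 1 / e^x). When x approaches maxlog, e^x would overflow,
        // so the test1 path evaluates e^(x/2) and forms
        // 0.5 * e^(x/2) * e^(x/2) = 0.5 * e^x without the intermediate overflow.
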
        template <class A, class T>
        inline batch<std::complex<T>, A> cosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            auto x = z.real();
            auto y = z.imag();
            return { cosh(x) * cos(y), sinh(x) * sin(y) };
        }

        // sin
        namespace detail
        {
            template <class A, class T, class Tag = trigo_radian_tag>
            inline batch<T, A> sin(batch<T, A> const& self, Tag = Tag()) noexcept
            {
                using batch_type = batch<T, A>;
                const batch_type x = abs(self);
                batch_type xr = constants::nan<batch_type>();
                const batch_type n = detail::trigo_reducer<batch_type, Tag>::reduce(x, xr);
                auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
                auto swap_bit = fma(batch_type(-2.), tmp, n);
                auto sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
                const batch_type z = xr * xr;
                const batch_type se = detail::sin_eval(z, xr);
                const batch_type ce = detail::cos_eval(z);
                const batch_type z1 = select(swap_bit == batch_type(0.), se, ce);
                return z1 ^ sign_bit;
            }
        }

        template <class A, class T>
        inline batch<T, A> sin(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return detail::sin(self);
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> sin(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            return { sin(z.real()) * cosh(z.imag()), cos(z.real()) * sinh(z.imag()) };
        }

        // sincos
        template <class A, class T>
        inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            const batch_type x = abs(self);
            batch_type xr = constants::nan<batch_type>();
            const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
            auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
            auto swap_bit = fma(batch_type(-2.), tmp, n);
            const batch_type z = xr * xr;
            const batch_type se = detail::sin_eval(z, xr);
            const batch_type ce = detail::cos_eval(z);
            auto sin_sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
            const batch_type sin_z1 = select(swap_bit == batch_type(0.), se, ce);
            auto cos_sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
            const batch_type cos_z1 = select(swap_bit != batch_type(0.), se, ce);
            return std::make_pair(sin_z1 ^ sin_sign_bit, cos_z1 ^ cos_sign_bit);
        }

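        // Note (editorial, assumption): sincos pays for one range reduction and
        // one pair of polynomial evaluations, then derives both results purely
        // by swapping se/ce and flipping sign bits, making it cheaper than
        // calling sin and cos separately.
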
        template <class A, class T>
        inline std::pair<batch<std::complex<T>, A>, batch<std::complex<T>, A>>
        sincos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            using real_batch = typename batch_type::real_batch;
            real_batch rcos = cos(z.real());
            real_batch rsin = sin(z.real());
            real_batch icosh = cosh(z.imag());
            real_batch isinh = sinh(z.imag());
            return std::make_pair(batch_type(rsin * icosh, rcos * isinh), batch_type(rcos * icosh, -rsin * isinh));
        }

        // sinh
        namespace detail
        {
            /* origin: boost/simd/arch/common/detail/generic/sinh_kernel.hpp */
            /*
             * ====================================================
             * copyright 2016 NumScale SAS
             *
             * Distributed under the Boost Software License, Version 1.0.
             * (See copy at http://boost.org/LICENSE_1_0.txt)
             * ====================================================
             */
            template <class A>
            inline batch<float, A> sinh_kernel(batch<float, A> const& self) noexcept
            {
                using batch_type = batch<float, A>;
                batch_type sqr_self = self * self;
                return detail::horner<batch_type,
                                      0x3f800000, // 1.0f
                                      0x3e2aaacc, // 1.66667160211E-1f
                                      0x3c087bbe, // 8.33028376239E-3f
                                      0x39559e2f // 2.03721912945E-4f
                                      >(sqr_self)
                    * self;
            }

            template <class A>
            inline batch<double, A> sinh_kernel(batch<double, A> const& self) noexcept
            {
                using batch_type = batch<double, A>;
                batch_type sqrself = self * self;
                return fma(self, (detail::horner<batch_type,
                                                 0xc115782bdbf6ab05ull, // -3.51754964808151394800E5
                                                 0xc0c694b8c71d6182ull, // -1.15614435765005216044E4,
                                                 0xc064773a398ff4feull, // -1.63725857525983828727E2,
                                                 0xbfe9435fe8bb3cd6ull // -7.89474443963537015605E-1
                                                 >(sqrself)
                                  / detail::horner1<batch_type,
                                                    0xc1401a20e4f90044ull, // -2.11052978884890840399E6
                                                    0x40e1a7ba7ed72245ull, // 3.61578279834431989373E4,
                                                    0xc0715b6096e96484ull // -2.77711081420602794433E2,
                                                    >(sqrself))
                               * sqrself,
                           self);
            }
        }
        /* origin: boost/simd/arch/common/simd/function/sinh.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */
        template <class A, class T>
        inline batch<T, A> sinh(batch<T, A> const& a, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            batch_type half(0.5);
            batch_type x = abs(a);
            auto lt1 = x < batch_type(1.);
            batch_type bts = bitofsign(a);
            batch_type z(0.);
            if (any(lt1))
            {
                z = detail::sinh_kernel(x);
                if (all(lt1))
                    return z ^ bts;
            }
            auto test1 = x > (constants::maxlog<batch_type>() - constants::log_2<batch_type>());
            batch_type fac = select(test1, half, batch_type(1.));
            batch_type tmp = exp(x * fac);
            batch_type tmp1 = half * tmp;
            batch_type r = select(test1, tmp1 * tmp, tmp1 - half / tmp);
            return select(lt1, z, r) ^ bts;
        }
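
        // Note (editorial, assumption): for |x| >= 1, sinh(x) = (e^x - e^-x) / 2
        // is formed as tmp1 - half / tmp with tmp = e^x; near maxlog the same
        // e^(x/2) squaring trick as in cosh avoids overflow, and the |x| < 1
        // polynomial kernel sidesteps the cancellation of e^x - e^-x near zero.
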
        template <class A, class T>
        inline batch<std::complex<T>, A> sinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            auto x = z.real();
            auto y = z.imag();
            return { sinh(x) * cos(y), cosh(x) * sin(y) };
        }

        // tan
        template <class A, class T>
        inline batch<T, A> tan(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            const batch_type x = abs(self);
            batch_type xr = constants::nan<batch_type>();
            const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
            auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
            auto swap_bit = fma(batch_type(-2.), tmp, n);
            auto test = (swap_bit == batch_type(0.));
            const batch_type y = detail::tan_eval(xr, test);
            return y ^ bitofsign(self);
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> tan(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            using real_batch = typename batch_type::real_batch;
            real_batch d = cos(2 * z.real()) + cosh(2 * z.imag());
            batch_type winf(constants::infinity<real_batch>(), constants::infinity<real_batch>());
            real_batch wreal = sin(2 * z.real()) / d;
            real_batch wimag = sinh(2 * z.imag());
            batch_type wres = select(isinf(wimag), batch_type(wreal, real_batch(1.)), batch_type(wreal, wimag / d));
            return select(d == real_batch(0.), winf, wres);
        }

        // tanh
        namespace detail
        {
            /* origin: boost/simd/arch/common/detail/generic/tanh_kernel.hpp */
            /*
             * ====================================================
             * copyright 2016 NumScale SAS
             *
             * Distributed under the Boost Software License, Version 1.0.
             * (See copy at http://boost.org/LICENSE_1_0.txt)
             * ====================================================
             */
            template <class B>
            struct tanh_kernel;

            template <class A>
            struct tanh_kernel<batch<float, A>>
            {
                using batch_type = batch<float, A>;
                static inline batch_type tanh(const batch_type& x) noexcept
                {
                    batch_type sqrx = x * x;
                    return fma(detail::horner<batch_type,
                                              0xbeaaaa99, // -3.33332819422E-1F
                                              0x3e088393, // +1.33314422036E-1F
                                              0xbd5c1e2d, // -5.37397155531E-2F
                                              0x3ca9134e, // +2.06390887954E-2F
                                              0xbbbaf0ea // -5.70498872745E-3F
                                              >(sqrx)
                                   * sqrx,
                               x, x);
                }

                static inline batch_type cotanh(const batch_type& x) noexcept
                {
                    return batch_type(1.) / tanh(x);
                }
            };

            template <class A>
            struct tanh_kernel<batch<double, A>>
            {
                using batch_type = batch<double, A>;
                static inline batch_type tanh(const batch_type& x) noexcept
                {
                    batch_type sqrx = x * x;
                    return fma(sqrx * p(sqrx) / q(sqrx), x, x);
                }

                static inline batch_type cotanh(const batch_type& x) noexcept
                {
                    batch_type sqrx = x * x;
                    batch_type qval = q(sqrx);
                    return qval / (x * fma(p(sqrx), sqrx, qval));
                }

                static inline batch_type p(const batch_type& x) noexcept
                {
                    return detail::horner<batch_type,
                                          0xc0993ac030580563, // -1.61468768441708447952E3
                                          0xc058d26a0e26682d, // -9.92877231001918586564E1,
                                          0xbfeedc5baafd6f4b // -9.64399179425052238628E-1
                                          >(x);
                }

                static inline batch_type q(const batch_type& x) noexcept
                {
                    return detail::horner1<batch_type,
                                           0x40b2ec102442040c, // 4.84406305325125486048E3
                                           0x40a176fa0e5535fa, // 2.23548839060100448583E3,
                                           0x405c33f28a581B86 // 1.12811678491632931402E2,
                                           >(x);
                }
            };

        }
        /* origin: boost/simd/arch/common/simd/function/tanh.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */
        template <class A, class T>
        inline batch<T, A> tanh(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            batch_type one(1.);
            batch_type x = abs(self);
            auto test = x < (batch_type(5.) / batch_type(8.));
            batch_type bts = bitofsign(self);
            batch_type z = one;
            if (any(test))
            {
                z = detail::tanh_kernel<batch_type>::tanh(x);
                if (all(test))
                    return z ^ bts;
            }
            batch_type r = fma(batch_type(-2.), one / (one + exp(x + x)), one);
            return select(test, z, r) ^ bts;
        }
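
        // Note (editorial, assumption): the large-|x| branch uses
        //     tanh(x) = 1 - 2 / (1 + e^(2x))
        // expressed as a single fma, while |x| < 5/8 goes through the rational
        // kernel to dodge the cancellation of that expression near zero.
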
        template <class A, class T>
        inline batch<std::complex<T>, A> tanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using real_batch = typename batch<std::complex<T>, A>::real_batch;
            auto x = z.real();
            auto y = z.imag();
            real_batch two(2);
            auto d = cosh(two * x) + cos(two * y);
            return { sinh(two * x) / d, sin(two * y) / d };
        }

    }

}

#endif
(Diff for one file not shown because of its size.)
@@ -0,0 +1,940 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ****************************************************************************/

#ifndef XSIMD_AVX2_HPP
#define XSIMD_AVX2_HPP

#include <complex>
#include <type_traits>

#include "../types/xsimd_avx2_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // abs
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_abs_epi8(self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_abs_epi16(self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_abs_epi32(self);
                }
                else
                {
                    return abs(self, avx {});
                }
            }
            return self;
        }
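
        // Note (editorial, assumption): throughout this file XSIMD_IF_CONSTEXPR
        // dispatches on sizeof(T) at compile time, so exactly one intrinsic
        // survives per instantiation; element widths AVX2 has no instruction
        // for fall back to the avx {} implementation. For instance, the
        // int32_t instantiation above compiles down to a single
        // _mm256_abs_epi32.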

        // add
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_add_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_add_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_add_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_add_epi64(self, other);
            }
            else
            {
                return add(self, other, avx {});
            }
        }

        // bitwise_and
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_and_si256(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_and_si256(self, other);
        }

        // bitwise_andnot
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_andnot_si256(other, self);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_andnot_si256(other, self);
        }
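
        // Note (editorial): _mm256_andnot_si256(a, b) computes (~a) & b, so the
        // operands are passed as (other, self) to obtain self & ~other, which is
        // the semantics xsimd gives bitwise_andnot.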

        // bitwise_not
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx2>) noexcept
        {
            return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
        {
            return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
        }

        // bitwise_lshift
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_slli_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_slli_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_slli_epi64(self, other);
            }
            else
            {
                return bitwise_lshift(self, other, avx {});
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_sllv_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_sllv_epi64(self, other);
            }
            else
            {
                return bitwise_lshift(self, other, avx {});
            }
        }

        // bitwise_or
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_or_si256(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_or_si256(self, other);
        }

        // bitwise_rshift
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF);
                    __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self);
                    __m256i res = _mm256_srai_epi16(self, other);
                    return _mm256_or_si256(
                        detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
                                           { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
                                           sign_mask, cmp_is_negative),
                        _mm256_andnot_si256(sign_mask, res));
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_srai_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_srai_epi32(self, other);
                }
                else
                {
                    return bitwise_rshift(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_srli_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_srli_epi32(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                {
                    return _mm256_srli_epi64(self, other);
                }
                else
                {
                    return bitwise_rshift(self, other, avx {});
                }
            }
        }

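        // Note (editorial, assumption): AVX2 has no 8-bit arithmetic shift, so
        // the sizeof(T) == 1 path shifts 16-bit lanes and then repairs the low
        // byte of each pair: sign_mask marks the bit positions an epi8 shift
        // would have filled with the sign, and those bits are OR-ed in only for
        // bytes that compare negative.
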
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_srav_epi32(self, other);
                }
                else
                {
                    return bitwise_rshift(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_srlv_epi32(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                {
                    return _mm256_srlv_epi64(self, other);
                }
                else
                {
                    return bitwise_rshift(self, other, avx {});
                }
            }
        }

        // bitwise_xor
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_xor_si256(self, other);
        }
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
        {
            return _mm256_xor_si256(self, other);
        }

        // complex_low
        template <class A>
        inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
        {
            __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0));
            __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0));
            return _mm256_blend_pd(tmp0, tmp1, 10);
        }

        // complex_high
        template <class A>
        inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
        {
            __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2));
            __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0));
            return _mm256_blend_pd(tmp0, tmp1, 10);
        }

        // fast_cast
        namespace detail
        {

            template <class A>
            inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<avx2>) noexcept
            {
                // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
                __m256i msk_lo = _mm256_set1_epi32(0xFFFF);
                __m256 cnst65536f = _mm256_set1_ps(65536.0f);

                __m256i v_lo = _mm256_and_si256(v, msk_lo); /* extract the 16 lowest significant bits of self */
                __m256i v_hi = _mm256_srli_epi32(v, 16); /* 16 most significant bits of v */
                __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding */
                __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */
                v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */
                return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
            }
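
            // Note (editorial, assumption): each 16-bit half converts to float
            // exactly (values < 2^16 fit in a 24-bit mantissa), and the scaling
            // by 65536 is a power of two, so at most one rounding happens, in
            // the final add that recombines hi * 65536 + lo.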

            template <class A>
            inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
            {
                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
                // adapted to avx
                __m256i xH = _mm256_srli_epi64(x, 32);
                xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84
                __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000,
                                                 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
                __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
                __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
                return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
            }
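
            // Note (editorial, assumption): this is the standard bit trick. The
            // high 32 bits are planted in the mantissa of a double whose
            // exponent encodes 2^84 and the low 32 bits in one encoding 2^52, so
            //     (xH_as_double - (2^84 + 2^52)) + xL_as_double
            // reassembles the 64-bit value; the compensating constant removes
            // both implicit leading-one offsets in a single subtraction.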

            template <class A>
            inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
            {
                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
                // adapted to avx
                __m256i xH = _mm256_srai_epi32(x, 16);
                xH = _mm256_and_si256(xH, _mm256_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
                xH = _mm256_add_epi64(xH, _mm256_castpd_si256(_mm256_set1_pd(442721857769029238784.))); // 3*2^67
                __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000,
                                                 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
                __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
                __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
                return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
            }
        }

        // eq
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_cmpeq_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_cmpeq_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_cmpeq_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_cmpeq_epi64(self, other);
            }
            else
            {
                return eq(self, other, avx {});
            }
        }

        // gather
        template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
        inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
                                  kernel::requires_arch<avx2>) noexcept
        {
            // scatter for this one is AVX512F+AVX512VL
            return _mm256_i32gather_epi32(reinterpret_cast<const int*>(src), index, sizeof(T));
        }

        template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
        inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
                                  kernel::requires_arch<avx2>) noexcept
        {
            // scatter for this one is AVX512F+AVX512VL
            return _mm256_i64gather_epi64(reinterpret_cast<const long long int*>(src), index, sizeof(T));
        }

        template <class A, class U,
                  detail::enable_sized_integral_t<U, 4> = 0>
        inline batch<float, A> gather(batch<float, A> const&, float const* src,
                                      batch<U, A> const& index,
                                      kernel::requires_arch<avx2>) noexcept
        {
            // scatter for this one is AVX512F+AVX512VL
            return _mm256_i32gather_ps(src, index, sizeof(float));
        }

        template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
        inline batch<double, A> gather(batch<double, A> const&, double const* src,
                                       batch<U, A> const& index,
                                       requires_arch<avx2>) noexcept
        {
            // scatter for this one is AVX512F+AVX512VL
            return _mm256_i64gather_pd(src, index, sizeof(double));
        }

        // gather: handmade conversions
        template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
        inline batch<float, A> gather(batch<float, A> const&, double const* src,
                                      batch<V, A> const& index,
                                      requires_arch<avx2>) noexcept
        {
            const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
            const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
            return detail::merge_sse(_mm256_cvtpd_ps(low.data), _mm256_cvtpd_ps(high.data));
        }

        template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
        inline batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
                                        batch<V, A> const& index,
                                        requires_arch<avx2>) noexcept
        {
            const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
            const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
            return detail::merge_sse(_mm256_cvtpd_epi32(low.data), _mm256_cvtpd_epi32(high.data));
        }
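
        // Note (editorial, assumption): the last argument of the
        // _mm256_i32gather_* / _mm256_i64gather_* intrinsics is a byte scale
        // applied to every index, so passing sizeof(T) lets the index batch
        // hold element indices rather than byte offsets. The "handmade
        // conversions" gather doubles in two 128-bit halves and narrow the
        // results, since no gather-with-conversion instruction exists.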

        // lt
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_cmpgt_epi8(other, self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_cmpgt_epi16(other, self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_cmpgt_epi32(other, self);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                {
                    return _mm256_cmpgt_epi64(other, self);
                }
                else
                {
                    return lt(self, other, avx {});
                }
            }
            else
            {
                return lt(self, other, avx {});
            }
        }

        // load_complex
        template <class A>
        inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx2>) noexcept
        {
            using batch_type = batch<float, A>;
            batch_type real = _mm256_castpd_ps(
                _mm256_permute4x64_pd(
                    _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))),
                    _MM_SHUFFLE(3, 1, 2, 0)));
            batch_type imag = _mm256_castpd_ps(
                _mm256_permute4x64_pd(
                    _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))),
                    _MM_SHUFFLE(3, 1, 2, 0)));
            return { real, imag };
        }
        template <class A>
        inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx2>) noexcept
        {
            using batch_type = batch<double, A>;
            batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
            batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
            return { real, imag };
        }
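
        // Note (editorial, assumption): load_complex deinterleaves (re, im)
        // pairs: the shuffles collect even-indexed lanes into `real` and
        // odd-indexed lanes into `imag`, and _mm256_permute4x64_pd repairs the
        // cross-128-bit-lane ordering that the in-lane shuffles leave behind.
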
        // mask
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                uint64_t mask8 = 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
                return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4) | (detail::mask_lut(mask8 >> 16) << 8) | (detail::mask_lut(mask8 >> 24) << 12);
            }
            else
            {
                return mask(self, avx {});
            }
        }

        // max
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_max_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_max_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_max_epi32(self, other);
                }
                else
                {
                    return max(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_max_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_max_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_max_epu32(self, other);
                }
                else
                {
                    return max(self, other, avx {});
                }
            }
        }

        // min
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_min_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_min_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_min_epi32(self, other);
                }
                else
                {
                    return min(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_min_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_min_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm256_min_epu32(self, other);
                }
                else
                {
                    return min(self, other, avx {});
                }
            }
        }

        // mul
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_mullo_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_mullo_epi32(self, other);
            }
            else
            {
                return mul(self, other, avx {});
            }
        }
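
        // Note (editorial): AVX2 provides _mm256_mullo_epi16/epi32 but no 8-bit
        // or 64-bit low multiply (the 64-bit one arrives with AVX512DQ), so
        // those widths fall through to the avx {} generic implementation.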

        // reduce_add
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                __m256i tmp1 = _mm256_hadd_epi32(self, self);
                __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1);
                __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
                __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3);
                return _mm_cvtsi128_si32(tmp4);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                __m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E);
                __m256i tmp2 = _mm256_add_epi64(self, tmp1);
                __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
                __m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3);
#if defined(__x86_64__)
                return _mm_cvtsi128_si64(res);
#else
                __m128i m;
                _mm_storel_epi64(&m, res);
                int64_t i;
                std::memcpy(&i, &m, sizeof(i));
                return i;
#endif
            }
            else
            {
                return reduce_add(self, avx {});
            }
        }

        // sadd
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_adds_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_adds_epi16(self, other);
                }
                else
                {
                    return sadd(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_adds_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_adds_epu16(self, other);
                }
                else
                {
                    return sadd(self, other, avx {});
                }
            }
        }

        // select
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_blendv_epi8(false_br, true_br, cond);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_blendv_epi8(false_br, true_br, cond);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_blendv_epi8(false_br, true_br, cond);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_blendv_epi8(false_br, true_br, cond);
            }
            else
            {
                return select(cond, true_br, false_br, avx {});
            }
        }
        template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
        {
            constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
            // FIXME: for some reason mask here is not considered as an immediate,
            // but it's okay for _mm256_blend_epi32
            // case 2: return _mm256_blend_epi16(false_br, true_br, mask);
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_blend_epi32(false_br, true_br, mask);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                constexpr int imask = detail::interleave(mask);
                return _mm256_blend_epi32(false_br, true_br, imask);
            }
            else
            {
                return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
            }
        }

        // slide_left
        template <size_t N, class A, class T>
        inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx2>) noexcept
        {
            constexpr unsigned BitCount = N * 8;
            if (BitCount == 0)
            {
                return x;
            }
            if (BitCount >= 256)
            {
                return batch<T, A>(T(0));
            }
            if (BitCount > 128)
            {
                constexpr unsigned M = (BitCount - 128) / 8;
                auto y = _mm256_bslli_epi128(x, M);
                return _mm256_permute2x128_si256(y, y, 0x28);
            }
            if (BitCount == 128)
            {
                return _mm256_permute2x128_si256(x, x, 0x28);
            }
            // shifting by [0, 128[ bits
            constexpr unsigned M = BitCount / 8;
            auto y = _mm256_bslli_epi128(x, M);
            auto z = _mm256_bsrli_epi128(x, 16 - M);
            auto w = _mm256_permute2x128_si256(z, z, 0x28);
            return _mm256_or_si256(y, w);
        }

        // slide_right
        template <size_t N, class A, class T>
        inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx2>) noexcept
        {
            constexpr unsigned BitCount = N * 8;
            if (BitCount == 0)
            {
                return x;
            }
            if (BitCount >= 256)
            {
                return batch<T, A>(T(0));
            }
            if (BitCount > 128)
            {
                constexpr unsigned M = (BitCount - 128) / 8;
                auto y = _mm256_bsrli_epi128(x, M);
                return _mm256_permute2x128_si256(y, y, 0x81);
            }
            if (BitCount == 128)
            {
                return _mm256_permute2x128_si256(x, x, 0x81);
            }
            // shifting by [0, 128[ bits
            constexpr unsigned M = BitCount / 8;
            auto y = _mm256_bsrli_epi128(x, M);
            auto z = _mm256_bslli_epi128(x, 16 - M);
            auto w = _mm256_permute2x128_si256(z, z, 0x81);
            return _mm256_or_si256(y, w);
        }
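
        // Note (editorial, assumption): _mm256_bslli_epi128/_mm256_bsrli_epi128
        // shift each 128-bit lane independently, so the [0, 128) case also
        // computes the bytes that cross the lane boundary (z) and moves them
        // into the neighbouring lane with _mm256_permute2x128_si256 before
        // OR-ing them back in.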

        // ssub
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_subs_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_subs_epi16(self, other);
                }
                else
                {
                    return ssub(self, other, avx {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm256_subs_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm256_subs_epu16(self, other);
                }
                else
                {
                    return ssub(self, other, avx {});
                }
            }
        }

        // sub
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_sub_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_sub_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_sub_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_sub_epi64(self, other);
            }
            else
            {
                return sub(self, other, avx {});
            }
        }

        // swizzle
        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
        inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
        {
            return _mm256_permutevar8x32_ps(self, (batch<uint32_t, A>)mask);
        }

        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
        inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
        {
            constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
            return _mm256_permute4x64_pd(self, mask);
        }

        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
        inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
        {
            constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
            return _mm256_permute4x64_epi64(self, mask);
        }
        template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
        inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
        {
            return bitwise_cast<batch<int64_t, A>>(swizzle(bitwise_cast<batch<uint64_t, A>>(self), mask, avx2 {}));
        }
        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
        inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
        {
            return _mm256_permutevar8x32_epi32(self, (batch<uint32_t, A>)mask);
        }
        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
        inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
        {
            return bitwise_cast<batch<int32_t, A>>(swizzle(bitwise_cast<batch<uint32_t, A>>(self), mask, avx2 {}));
        }
|
||||
// zip_hi
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
auto lo = _mm256_unpacklo_epi8(self, other);
|
||||
auto hi = _mm256_unpackhi_epi8(self, other);
|
||||
return _mm256_permute2f128_si256(lo, hi, 0x31);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
auto lo = _mm256_unpacklo_epi16(self, other);
|
||||
auto hi = _mm256_unpackhi_epi16(self, other);
|
||||
return _mm256_permute2f128_si256(lo, hi, 0x31);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
|
||||
{
|
||||
auto lo = _mm256_unpacklo_epi32(self, other);
|
||||
auto hi = _mm256_unpackhi_epi32(self, other);
|
||||
return _mm256_permute2f128_si256(lo, hi, 0x31);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
|
||||
{
|
||||
auto lo = _mm256_unpacklo_epi64(self, other);
|
||||
auto hi = _mm256_unpackhi_epi64(self, other);
|
||||
return _mm256_permute2f128_si256(lo, hi, 0x31);
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(false && "unsupported arch/op combination");
|
||||
return {};
|
||||
}
|
||||
}
|
||||
|
||||
// zip_lo
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
auto lo = _mm256_unpacklo_epi8(self, other);
|
||||
auto hi = _mm256_unpackhi_epi8(self, other);
|
||||
return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
auto lo = _mm256_unpacklo_epi16(self, other);
|
||||
auto hi = _mm256_unpackhi_epi16(self, other);
|
||||
return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
|
||||
{
|
||||
auto lo = _mm256_unpacklo_epi32(self, other);
|
||||
auto hi = _mm256_unpackhi_epi32(self, other);
|
||||
return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
|
||||
{
|
||||
auto lo = _mm256_unpacklo_epi64(self, other);
|
||||
auto hi = _mm256_unpackhi_epi64(self, other);
|
||||
return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(false && "unsupported arch/op combination");
|
||||
return {};
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
|
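The AVX2 slide kernels above must stitch the two 128-bit lanes together because _mm256_bslli_epi128 / _mm256_bsrli_epi128 shift each lane independently. A minimal sketch of the resulting whole-register semantics, assuming the public xsimd::slide_right entry point and the aligned load/store API:

#include <cstdint>
#include <cstdio>
#include "xsimd/xsimd.hpp"

int main()
{
    // 32 byte lanes: slide_right<4> drops the four lowest bytes, shifts the
    // rest down across the lane boundary, and zero-fills the top.
    using b8 = xsimd::batch<uint8_t, xsimd::avx2>;
    alignas(32) uint8_t in[32];
    alignas(32) uint8_t out[32];
    for (int i = 0; i < 32; ++i)
        in[i] = static_cast<uint8_t>(i + 1);
    auto x = b8::load_aligned(in);
    xsimd::slide_right<4>(x).store_aligned(out);
    std::printf("%u %u\n", unsigned(out[0]), unsigned(out[31])); // expected: 5 0
}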
@@ -0,0 +1,627 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_AVX512BW_HPP
|
||||
#define XSIMD_AVX512BW_HPP
|
||||
|
||||
#include <array>
|
||||
#include <type_traits>
|
||||
|
||||
#include "../types/xsimd_avx512bw_register.hpp"
|
||||
|
||||
namespace xsimd
|
||||
{
|
||||
|
||||
namespace kernel
|
||||
{
|
||||
using namespace types;
|
||||
|
||||
namespace detail
|
||||
{
|
||||
template <class A, class T, int Cmp>
|
||||
inline batch_bool<T, A> compare_int_avx512bw(batch<T, A> const& self, batch<T, A> const& other) noexcept
|
||||
{
|
||||
using register_type = typename batch_bool<T, A>::register_type;
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return (register_type)_mm512_cmp_epi8_mask(self, other, Cmp);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return (register_type)_mm512_cmp_epi16_mask(self, other, Cmp);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
|
||||
{
|
||||
return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
|
||||
{
|
||||
return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return (register_type)_mm512_cmp_epu8_mask(self, other, Cmp);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return (register_type)_mm512_cmp_epu16_mask(self, other, Cmp);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
|
||||
{
|
||||
return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
|
||||
{
|
||||
return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
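// Note: on avx512, batch_bool<T, A>::register_type is a k-mask register
// (e.g. __mmask64 for 8-bit lanes), so the comparisons above yield one bit
// per lane rather than a full-width vector. A sketch of consuming such a
// mask, assuming the public batch_bool::mask() accessor:
//   uint64_t bits = eq(a, b, avx512bw {}).mask(); // bit i set <=> lanes equal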
|
||||
// abs
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_unsigned<T>::value)
|
||||
{
|
||||
return self;
|
||||
}
|
||||
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_abs_epi8(self);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_abs_epi16(self);
|
||||
}
|
||||
else
|
||||
{
|
||||
return abs(self, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// add
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_add_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_add_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return add(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// bitwise_lshift
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_sllv_epi16(self, _mm512_set1_epi16(other));
|
||||
#else
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_slli_epi16(self, other);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
return bitwise_lshift(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// bitwise_rshift
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
__m512i sign_mask = _mm512_set1_epi16((0xFF00 >> other) & 0x00FF);
|
||||
__m512i zeros = _mm512_setzero_si512();
|
||||
__mmask64 cmp_is_negative_mask = _mm512_cmpgt_epi8_mask(zeros, self);
|
||||
__m512i cmp_sign_mask = _mm512_mask_blend_epi8(cmp_is_negative_mask, zeros, sign_mask);
|
||||
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
|
||||
__m512i res = _mm512_srav_epi16(self, _mm512_set1_epi16(other));
|
||||
#else
|
||||
__m512i res = _mm512_srai_epi16(self, other);
|
||||
#endif
|
||||
return _mm512_or_si512(cmp_sign_mask, _mm512_andnot_si512(sign_mask, res));
|
||||
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_srav_epi16(self, _mm512_set1_epi16(other));
|
||||
#else
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_srai_epi16(self, other);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
return bitwise_rshift(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_srlv_epi16(self, _mm512_set1_epi16(other));
|
||||
#else
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_srli_epi16(self, other);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
return bitwise_rshift(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
}
|
||||
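// The signed 8-bit branch above emulates the missing _mm512_srai_epi8: it
// shifts 16-bit units arithmetically, masks out the bits that bled in from
// the neighbouring byte, and re-injects sign bits through a byte-granular
// blend. Scalar reference for what each lane computes (illustration only):
//   int8_t sra8(int8_t v, int32_t s) { return static_cast<int8_t>(v >> s); }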
|
||||
// eq
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_EQ>(self, other);
|
||||
}
|
||||
|
||||
// ge
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GE>(self, other);
|
||||
}
|
||||
|
||||
// gt
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GT>(self, other);
|
||||
}
|
||||
|
||||
// le
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LE>(self, other);
|
||||
}
|
||||
|
||||
// lt
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LT>(self, other);
|
||||
}
|
||||
|
||||
// max
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_max_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_max_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return max(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_max_epu8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_max_epu16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return max(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// min
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_min_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_min_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return min(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_min_epu8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_min_epu16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return min(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// mul
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
__m512i upper = _mm512_and_si512(_mm512_mullo_epi16(self, other), _mm512_srli_epi16(_mm512_set1_epi16(-1), 8));
|
||||
__m512i lower = _mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(self, 8), _mm512_srli_epi16(other, 8)), 8);
|
||||
return _mm512_or_si512(upper, lower);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_mullo_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return mul(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// neq
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_NE>(self, other);
|
||||
}
|
||||
|
||||
// sadd
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_adds_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_adds_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return sadd(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_adds_epu8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_adds_epu16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return sadd(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// select
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_mask_blend_epi8(cond, false_br.data, true_br.data);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_mask_blend_epi16(cond, false_br.data, true_br.data);
|
||||
}
|
||||
else
|
||||
{
|
||||
return select(cond, true_br, false_br, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// slide_left
|
||||
namespace detail
|
||||
{
|
||||
template <size_t... Is>
|
||||
constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_hi(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is == 0 ? 8 : Is - 1)... };
|
||||
}
|
||||
|
||||
template <size_t N, size_t... Is>
|
||||
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_pattern(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is >= N ? Is - N : 0)... };
|
||||
}
|
||||
template <size_t N, size_t... Is>
|
||||
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_mask(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is >= N ? 0xFFFF : 0x0000)... };
|
||||
}
|
||||
}
|
||||
|
||||
template <size_t N, class A, class T>
|
||||
inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
constexpr unsigned BitCount = N * 8;
|
||||
if (BitCount == 0)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
if (BitCount >= 512)
|
||||
{
|
||||
return batch<T, A>(T(0));
|
||||
}
|
||||
batch<T, A> xx;
|
||||
if (N & 1)
|
||||
{
|
||||
alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_hi(::xsimd::detail::make_index_sequence<512 / 64>());
|
||||
__m512i xl = _mm512_slli_epi64(x, 8);
|
||||
__m512i xr = _mm512_srli_epi64(x, 56);
|
||||
xr = _mm512_permutex2var_epi64(xr, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
|
||||
xx = _mm512_or_si512(xr, xl);
|
||||
if (N == 1)
|
||||
return xx;
|
||||
}
|
||||
else
|
||||
{
|
||||
xx = x;
|
||||
}
|
||||
alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
|
||||
alignas(A::alignment()) auto slide_mask = detail::make_slide_left_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
|
||||
return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
|
||||
}
|
||||
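// Shape of the algorithm above (illustration): an odd byte offset is first
// handled as a one-byte slide built from 64-bit shifts stitched across
// lanes with a permute, after which the remaining even offset reduces to a
// 16-bit lane permute plus zero mask. Scalar reference for slide_left<N>:
//   void slide_left_ref(const uint8_t (&in)[64], uint8_t (&out)[64], size_t n)
//   {
//       for (size_t i = 0; i < 64; ++i)
//           out[i] = (i >= n) ? in[i - n] : uint8_t(0);
//   }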
|
||||
// slide_right
|
||||
namespace detail
|
||||
{
|
||||
template <size_t... Is>
|
||||
constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_low(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is + 1)... };
|
||||
}
|
||||
|
||||
template <size_t N, size_t... Is>
|
||||
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_pattern(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is < (32 - N) ? Is + N : 0)... };
|
||||
}
|
||||
template <size_t N, size_t... Is>
|
||||
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_mask(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is < 32 - N ? 0xFFFF : 0x0000)... };
|
||||
}
|
||||
}
|
||||
template <size_t N, class A, class T>
|
||||
inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
constexpr unsigned BitCount = N * 8;
|
||||
if (BitCount == 0)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
if (BitCount >= 512)
|
||||
{
|
||||
return batch<T, A>(T(0));
|
||||
}
|
||||
batch<T, A> xx;
|
||||
if (N & 1)
|
||||
{
|
||||
alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_low(::xsimd::detail::make_index_sequence<512 / 64>());
|
||||
__m512i xr = _mm512_srli_epi64(x, 8);
|
||||
__m512i xl = _mm512_slli_epi64(x, 56);
|
||||
xl = _mm512_permutex2var_epi64(xl, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
|
||||
xx = _mm512_or_si512(xr, xl);
|
||||
if (N == 1)
|
||||
return xx;
|
||||
}
|
||||
else
|
||||
{
|
||||
xx = x;
|
||||
}
|
||||
alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
|
||||
alignas(A::alignment()) auto slide_mask = detail::make_slide_right_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
|
||||
return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
|
||||
}
|
||||
|
||||
// ssub
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_subs_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_subs_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return ssub(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_subs_epu8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_subs_epu16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return ssub(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sub
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_sub_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_sub_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return sub(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// swizzle
|
||||
|
||||
template <class A, uint16_t... Vs>
|
||||
inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return _mm512_permutexvar_epi16((batch<uint16_t, A>)mask, self);
|
||||
}
|
||||
|
||||
template <class A, uint16_t... Vs>
|
||||
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return bitwise_cast<batch<int16_t, A>>(swizzle(bitwise_cast<batch<uint16_t, A>>(self), mask, avx512bw {}));
|
||||
}
|
||||
|
||||
template <class A, uint8_t... Vs>
|
||||
inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return _mm512_shuffle_epi8(self, (batch<uint8_t, A>)mask);
|
||||
}
|
||||
|
||||
template <class A, uint8_t... Vs>
|
||||
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return bitwise_cast<batch<int8_t, A>>(swizzle(bitwise_cast<batch<uint8_t, A>>(self), mask, avx512bw {}));
|
||||
}
|
||||
|
||||
// zip_hi
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
__m512i lo, hi;
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
lo = _mm512_unpacklo_epi8(self, other);
|
||||
hi = _mm512_unpackhi_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
lo = _mm512_unpacklo_epi16(self, other);
|
||||
hi = _mm512_unpackhi_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return zip_hi(self, other, avx512f {});
|
||||
}
|
||||
return _mm512_inserti32x4(
|
||||
_mm512_inserti32x4(
|
||||
_mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0),
|
||||
_mm512_extracti32x4_epi32(lo, 3),
|
||||
2),
|
||||
_mm512_extracti32x4_epi32(hi, 2),
|
||||
1);
|
||||
}
|
||||
|
||||
// zip_lo
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
__m512i lo, hi;
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
lo = _mm512_unpacklo_epi8(self, other);
|
||||
hi = _mm512_unpackhi_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
lo = _mm512_unpacklo_epi16(self, other);
|
||||
hi = _mm512_unpackhi_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return zip_lo(self, other, avx512f {});
|
||||
}
|
||||
return _mm512_inserti32x4(
|
||||
_mm512_inserti32x4(
|
||||
_mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1),
|
||||
_mm512_extracti32x4_epi32(hi, 1),
|
||||
3),
|
||||
_mm512_extracti32x4_epi32(lo, 1),
|
||||
2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
|
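The swizzle kernels above take their index pattern as a batch_constant, so the pattern is fixed at compile time and lowers to a single permute. A call-site sketch, assuming the public xsimd::make_batch_constant helper whose generator exposes get(index, size):

#include <cstddef>
#include <cstdint>
#include "xsimd/xsimd.hpp"

// Hypothetical generator producing a lane-reversal pattern.
struct reverse_index
{
    static constexpr uint16_t get(size_t index, size_t size)
    {
        return static_cast<uint16_t>(size - 1 - index);
    }
};

inline xsimd::batch<uint16_t, xsimd::avx512bw>
reverse_lanes(xsimd::batch<uint16_t, xsimd::avx512bw> const& x)
{
    auto mask = xsimd::make_batch_constant<xsimd::batch<uint16_t, xsimd::avx512bw>,
                                           reverse_index>();
    // Dispatches to the _mm512_permutexvar_epi16 kernel above.
    return xsimd::swizzle(x, mask);
}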
@@ -0,0 +1,28 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_AVX512CD_HPP
|
||||
#define XSIMD_AVX512CD_HPP
|
||||
|
||||
#include "../types/xsimd_avx512cd_register.hpp"
|
||||
|
||||
namespace xsimd
|
||||
{
|
||||
|
||||
namespace kernel
|
||||
{
|
||||
// Nothing there yet.
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
|
@@ -0,0 +1,212 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_AVX512DQ_HPP
|
||||
#define XSIMD_AVX512DQ_HPP
|
||||
|
||||
#include "../types/xsimd_avx512dq_register.hpp"
|
||||
|
||||
namespace xsimd
|
||||
{
|
||||
|
||||
namespace kernel
|
||||
{
|
||||
using namespace types;
|
||||
|
||||
// bitwise_and
|
||||
template <class A>
|
||||
inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_and_ps(self, other);
|
||||
}
|
||||
template <class A>
|
||||
inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_and_pd(self, other);
|
||||
}
|
||||
|
||||
// bitwise_andnot
|
||||
template <class A>
|
||||
inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_andnot_ps(other, self);
|
||||
}
|
||||
template <class A>
|
||||
inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_andnot_pd(other, self);
|
||||
}
|
||||
|
||||
// bitwise_not
|
||||
template <class A>
|
||||
inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
|
||||
}
|
||||
template <class A>
|
||||
inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1)));
|
||||
}
|
||||
|
||||
// bitwise_or
|
||||
template <class A>
|
||||
inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_or_ps(self, other);
|
||||
}
|
||||
template <class A>
|
||||
inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_or_pd(self, other);
|
||||
}
|
||||
|
||||
template <class A, class T>
|
||||
inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
using register_type = typename batch_bool<T, A>::register_type;
|
||||
return register_type(self.data | other.data);
|
||||
}
|
||||
|
||||
// bitwise_xor
|
||||
template <class A>
|
||||
inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_xor_ps(self, other);
|
||||
}
|
||||
template <class A>
|
||||
inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_xor_pd(self, other);
|
||||
}
|
||||
|
||||
// haddp
|
||||
template <class A>
|
||||
inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
// The following folds over the vector once:
|
||||
// tmp1 = [a0..8, b0..8]
|
||||
// tmp2 = [a8..f, b8..f]
|
||||
#define XSIMD_AVX512_HADDP_STEP1(I, a, b) \
|
||||
batch<float, avx512f> res##I; \
|
||||
{ \
|
||||
auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
|
||||
auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
|
||||
res##I = _mm512_add_ps(tmp1, tmp2); \
|
||||
}
|
||||
|
||||
XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
|
||||
XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]);
|
||||
XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]);
|
||||
XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]);
|
||||
XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]);
|
||||
XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]);
|
||||
XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]);
|
||||
XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]);
|
||||
|
||||
#undef XSIMD_AVX512_HADDP_STEP1
|
||||
|
||||
// The following folds the code and shuffles so that hadd_ps produces the correct result
|
||||
// tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3)
|
||||
// tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4)
|
||||
// tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ...
|
||||
#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \
|
||||
batch<float, avx2> halfx##I; \
|
||||
{ \
|
||||
auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \
|
||||
auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \
|
||||
\
|
||||
auto resx1 = _mm512_add_ps(tmp1, tmp2); \
|
||||
\
|
||||
auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
|
||||
auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
|
||||
\
|
||||
auto resx2 = _mm512_add_ps(tmp3, tmp4); \
|
||||
\
|
||||
auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \
|
||||
auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \
|
||||
\
|
||||
auto resx3 = _mm512_add_ps(tmp5, tmp6); \
|
||||
\
|
||||
halfx##I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \
|
||||
_mm512_extractf32x8_ps(resx3, 1)); \
|
||||
}
|
||||
|
||||
XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3);
|
||||
XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7);
|
||||
|
||||
#undef XSIMD_AVX512_HADDP_STEP2
|
||||
|
||||
auto concat = _mm512_castps256_ps512(halfx0);
|
||||
concat = _mm512_insertf32x8(concat, halfx1, 1);
|
||||
return concat;
|
||||
}
|
||||
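// Net effect of haddp above (illustration): given sixteen rows of sixteen
// floats, lane i of the result holds the horizontal sum of row[i].
// Scalar reference:
//   float r[16] = {};
//   for (size_t i = 0; i < 16; ++i)
//       for (size_t j = 0; j < 16; ++j)
//           r[i] += row[i].get(j);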
|
||||
// ldexp
|
||||
template <class A>
|
||||
inline batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_scalef_pd(self, _mm512_cvtepi64_pd(other));
|
||||
}
|
||||
|
||||
// mul
|
||||
template <class A>
|
||||
inline batch<uint64_t, A> mul(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_mullo_epi64(self, other);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<int64_t, A> mul(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_mullo_epi64(self, other);
|
||||
}
|
||||
|
||||
// nearbyint_as_int
|
||||
template <class A>
|
||||
inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
|
||||
requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_cvtpd_epi64(self);
|
||||
}
|
||||
|
||||
// reduce_add
|
||||
template <class A>
|
||||
inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
__m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
|
||||
__m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
|
||||
__m256 res1 = _mm256_add_ps(tmp1, tmp2);
|
||||
return reduce_add(batch<float, avx2>(res1), avx2 {});
|
||||
}
|
||||
|
||||
// convert
|
||||
namespace detail
|
||||
{
|
||||
template <class A>
|
||||
inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_cvtepi64_pd(x);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<int64_t, A> fast_cast(batch<double, A> const& self, batch<int64_t, A> const&, requires_arch<avx512dq>) noexcept
|
||||
{
|
||||
return _mm512_cvttpd_epi64(self);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
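The avx512dq ldexp kernel above maps scaling by a power of two onto a single _mm512_scalef_pd. A minimal usage sketch, assuming the public xsimd::ldexp entry point and the batch::get accessor:

#include <cstdint>
#include <cstdio>
#include "xsimd/xsimd.hpp"

int main()
{
    using bd = xsimd::batch<double, xsimd::avx512dq>;
    using bi = xsimd::batch<int64_t, xsimd::avx512dq>;
    bd x(1.5);
    bi e(3);
    // ldexp(x, e) == x * 2^e per lane; here 1.5 * 8 == 12.
    std::printf("%g\n", xsimd::ldexp(x, e).get(0));
}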
File diff not shown because of its large size.
|
@@ -0,0 +1,384 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_NUMERICAL_CONSTANT_HPP
|
||||
#define XSIMD_NUMERICAL_CONSTANT_HPP
|
||||
|
||||
#include <limits>
|
||||
|
||||
#include "../types/xsimd_utils.hpp"
|
||||
|
||||
namespace xsimd
|
||||
{
|
||||
|
||||
namespace constants
|
||||
{
|
||||
|
||||
#define XSIMD_DEFINE_CONSTANT(NAME, SINGLE, DOUBLE) \
|
||||
template <class T> \
|
||||
inline T NAME() noexcept \
|
||||
{ \
|
||||
return T(NAME<typename T::value_type>()); \
|
||||
} \
|
||||
template <> \
|
||||
inline float NAME<float>() noexcept \
|
||||
{ \
|
||||
return SINGLE; \
|
||||
} \
|
||||
template <> \
|
||||
inline double NAME<double>() noexcept \
|
||||
{ \
|
||||
return DOUBLE; \
|
||||
}
|
||||
|
||||
#define XSIMD_DEFINE_CONSTANT_HEX(NAME, SINGLE, DOUBLE) \
|
||||
template <class T> \
|
||||
inline T NAME() noexcept \
|
||||
{ \
|
||||
return T(NAME<typename T::value_type>()); \
|
||||
} \
|
||||
template <> \
|
||||
inline float NAME<float>() noexcept \
|
||||
{ \
|
||||
return bit_cast<float>((uint32_t)SINGLE); \
|
||||
} \
|
||||
template <> \
|
||||
inline double NAME<double>() noexcept \
|
||||
{ \
|
||||
return bit_cast<double>((uint64_t)DOUBLE); \
|
||||
}
|
||||
|
||||
XSIMD_DEFINE_CONSTANT(infinity, (std::numeric_limits<float>::infinity()), (std::numeric_limits<double>::infinity()))
|
||||
XSIMD_DEFINE_CONSTANT(invlog_2, 1.442695040888963407359924681001892137426645954152986f, 1.442695040888963407359924681001892137426645954152986)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(invlog_2hi, 0x3fb8b000, 0x3ff7154765200000)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(invlog_2lo, 0xb9389ad4, 0x3de705fc2eefa200)
|
||||
XSIMD_DEFINE_CONSTANT(invlog10_2, 3.32192809488736234787031942949f, 3.32192809488736234787031942949)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(invpi, 0x3ea2f983, 0x3fd45f306dc9c883)
|
||||
XSIMD_DEFINE_CONSTANT(log_2, 0.6931471805599453094172321214581765680755001343602553f, 0.6931471805599453094172321214581765680755001343602553)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(log_2hi, 0x3f318000, 0x3fe62e42fee00000)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(log_2lo, 0xb95e8083, 0x3dea39ef35793c76)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(log10_2hi, 0x3e9a0000, 0x3fd3440000000000)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(log10_2lo, 0x39826a14, 0x3ed3509f79fef312)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(logeps, 0xc17f1402, 0xc04205966f2b4f12)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(logpi, 0x3f928682, 0x3ff250d048e7a1bd)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(logsqrt2pi, 0x3f6b3f8e, 0x3fed67f1c864beb5)
|
||||
XSIMD_DEFINE_CONSTANT(maxflint, 16777216.0f, 9007199254740992.0)
|
||||
XSIMD_DEFINE_CONSTANT(maxlog, 88.3762626647949f, 709.78271289338400)
|
||||
XSIMD_DEFINE_CONSTANT(maxlog2, 127.0f, 1023.)
|
||||
XSIMD_DEFINE_CONSTANT(maxlog10, 38.23080825805664f, 308.2547155599167)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(mediumpi, 0x43490fdb, 0x412921fb54442d18)
|
||||
XSIMD_DEFINE_CONSTANT(minlog, -88.3762626647949f, -708.3964185322641)
|
||||
XSIMD_DEFINE_CONSTANT(minlog2, -127.0f, -1023.)
|
||||
XSIMD_DEFINE_CONSTANT(minlog10, -37.89999771118164f, -308.2547155599167)
|
||||
XSIMD_DEFINE_CONSTANT(minusinfinity, (-infinity<float>()), (-infinity<double>()))
|
||||
XSIMD_DEFINE_CONSTANT(minuszero, -0.0f, -0.0)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(nan, 0xffffffff, 0xffffffffffffffff)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(oneosqrteps, 0x453504f3, 0x4190000000000000)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(oneotwoeps, 0x4a800000, 0x4320000000000000)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pi, 0x40490fdb, 0x400921fb54442d18)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio_2lo, 0xb33bbd2e, 0x3c91a62633145c07)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio_4lo, 0xb2bbbd2e, 0x3c81a62633145c07)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio2, 0x3fc90fdb, 0x3ff921fb54442d18)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio2_1, 0x3fc90f80, 0x3ff921fb54400000)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio2_1t, 0x37354443, 0x3dd0b4611a626331)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio2_2, 0x37354400, 0x3dd0b4611a600000)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio2_2t, 0x2e85a308, 0x3ba3198a2e037073)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio2_3, 0x2e85a300, 0x3ba3198a2e000000)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio2_3t, 0x248d3132, 0x397b839a252049c1)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(pio4, 0x3f490fdb, 0x3fe921fb54442d18)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(signmask, 0x80000000, 0x8000000000000000)
|
||||
XSIMD_DEFINE_CONSTANT(smallestposval, std::numeric_limits<float>::min(), std::numeric_limits<double>::min())
|
||||
XSIMD_DEFINE_CONSTANT_HEX(sqrt_2pi, 0x40206c99, 0x40040d931ff62704)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(sqrteps, 0x39b504f3, 0x3e50000000000000)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(tanpio8, 0x3ed413cd, 0x3fda827999fcef31)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(tan3pio8, 0x401a827a, 0x4003504f333f9de6)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(twentypi, 0x427b53d1, 0x404f6a7a2955385e)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(twoopi, 0x3f22f983, 0x3fe45f306dc9c883)
|
||||
XSIMD_DEFINE_CONSTANT(twotonmb, 8388608.0f, 4503599627370496.0)
|
||||
XSIMD_DEFINE_CONSTANT_HEX(twotonmbo3, 0x3ba14518, 0x3ed428a2f98d7286)
|
||||
|
||||
#undef XSIMD_DEFINE_CONSTANT
|
||||
#undef XSIMD_DEFINE_CONSTANT_HEX
|
||||
|
||||
template <class T>
|
||||
constexpr T allbits() noexcept;
|
||||
|
||||
template <class T>
|
||||
constexpr as_integer_t<T> mask1frexp() noexcept;
|
||||
|
||||
template <class T>
|
||||
constexpr as_integer_t<T> mask2frexp() noexcept;
|
||||
|
||||
template <class T>
|
||||
constexpr as_integer_t<T> maxexponent() noexcept;
|
||||
|
||||
template <class T>
|
||||
constexpr as_integer_t<T> maxexponentm1() noexcept;
|
||||
|
||||
template <class T>
|
||||
constexpr int32_t nmb() noexcept;
|
||||
|
||||
template <class T>
|
||||
constexpr T zero() noexcept;
|
||||
|
||||
template <class T>
|
||||
constexpr T minvalue() noexcept;
|
||||
|
||||
template <class T>
|
||||
constexpr T maxvalue() noexcept;
|
||||
|
||||
/**************************
|
||||
* allbits implementation *
|
||||
**************************/
|
||||
|
||||
namespace detail
|
||||
{
|
||||
template <class T, bool = std::is_integral<T>::value>
|
||||
struct allbits_impl
|
||||
{
|
||||
static constexpr T get_value() noexcept
|
||||
{
|
||||
return T(~0);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct allbits_impl<T, false>
|
||||
{
|
||||
static constexpr T get_value() noexcept
|
||||
{
|
||||
return nan<T>();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline constexpr T allbits() noexcept
|
||||
{
|
||||
return T(detail::allbits_impl<typename T::value_type>::get_value());
|
||||
}
|
||||
|
||||
/*****************************
|
||||
* mask1frexp implementation *
|
||||
*****************************/
|
||||
|
||||
template <class T>
|
||||
inline constexpr as_integer_t<T> mask1frexp() noexcept
|
||||
{
|
||||
return as_integer_t<T>(mask1frexp<typename T::value_type>());
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int32_t mask1frexp<float>() noexcept
|
||||
{
|
||||
return 0x7f800000;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int64_t mask1frexp<double>() noexcept
|
||||
{
|
||||
return 0x7ff0000000000000;
|
||||
}
|
||||
|
||||
/*****************************
|
||||
* mask2frexp implementation *
|
||||
*****************************/
|
||||
|
||||
template <class T>
|
||||
inline constexpr as_integer_t<T> mask2frexp() noexcept
|
||||
{
|
||||
return as_integer_t<T>(mask2frexp<typename T::value_type>());
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int32_t mask2frexp<float>() noexcept
|
||||
{
|
||||
return 0x3f000000;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int64_t mask2frexp<double>() noexcept
|
||||
{
|
||||
return 0x3fe0000000000000;
|
||||
}
|
||||
|
||||
/******************************
|
||||
* maxexponent implementation *
|
||||
******************************/
|
||||
|
||||
template <class T>
|
||||
inline constexpr as_integer_t<T> maxexponent() noexcept
|
||||
{
|
||||
return as_integer_t<T>(maxexponent<typename T::value_type>());
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int32_t maxexponent<float>() noexcept
|
||||
{
|
||||
return 127;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int64_t maxexponent<double>() noexcept
|
||||
{
|
||||
return 1023;
|
||||
}
|
||||
|
||||
/*******************************
|
||||
* maxexponentm1 implementation *
|
||||
*******************************/
|
||||
|
||||
template <class T>
|
||||
inline constexpr as_integer_t<T> maxexponentm1() noexcept
|
||||
{
|
||||
return as_integer_t<T>(maxexponentm1<typename T::value_type>());
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int32_t maxexponentm1<float>() noexcept
|
||||
{
|
||||
return 126;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int64_t maxexponentm1<double>() noexcept
|
||||
{
|
||||
return 1022;
|
||||
}
|
||||
|
||||
/**********************
|
||||
* nmb implementation *
|
||||
**********************/
|
||||
|
||||
template <class T>
|
||||
inline constexpr int32_t nmb() noexcept
|
||||
{
|
||||
return nmb<typename T::value_type>();
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int32_t nmb<float>() noexcept
|
||||
{
|
||||
return 23;
|
||||
}
|
||||
|
||||
template <>
|
||||
inline constexpr int32_t nmb<double>() noexcept
|
||||
{
|
||||
return 52;
|
||||
}
|
||||
|
||||
/***********************
|
||||
* zero implementation *
|
||||
***********************/
|
||||
|
||||
template <class T>
|
||||
inline constexpr T zero() noexcept
|
||||
{
|
||||
return T(typename T::value_type(0));
|
||||
}
|
||||
|
||||
/***************************
|
||||
* minvalue implementation *
|
||||
***************************/
|
||||
|
||||
namespace detail
|
||||
{
|
||||
template <class T>
|
||||
struct minvalue_impl
|
||||
{
|
||||
static constexpr T get_value() noexcept
|
||||
{
|
||||
return std::numeric_limits<typename T::value_type>::min();
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct minvalue_common
|
||||
{
|
||||
static constexpr T get_value() noexcept
|
||||
{
|
||||
return std::numeric_limits<T>::min();
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct minvalue_impl<int8_t> : minvalue_common<int8_t>
|
||||
{
|
||||
};
|
||||
template <>
|
||||
struct minvalue_impl<uint8_t> : minvalue_common<uint8_t>
|
||||
{
|
||||
};
|
||||
template <>
|
||||
struct minvalue_impl<int16_t> : minvalue_common<int16_t>
|
||||
{
|
||||
};
|
||||
template <>
|
||||
struct minvalue_impl<uint16_t> : minvalue_common<uint16_t>
|
||||
{
|
||||
};
|
||||
template <>
|
||||
struct minvalue_impl<int32_t> : minvalue_common<int32_t>
|
||||
{
|
||||
};
|
||||
template <>
|
||||
struct minvalue_impl<uint32_t> : minvalue_common<uint32_t>
|
||||
{
|
||||
};
|
||||
template <>
|
||||
struct minvalue_impl<int64_t> : minvalue_common<int64_t>
|
||||
{
|
||||
};
|
||||
template <>
|
||||
struct minvalue_impl<uint64_t> : minvalue_common<uint64_t>
|
||||
{
|
||||
};
|
||||
|
||||
template <>
|
||||
struct minvalue_impl<float>
|
||||
{
|
||||
static float get_value() noexcept
|
||||
{
|
||||
return bit_cast<float>((uint32_t)0xff7fffff);
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct minvalue_impl<double>
|
||||
{
|
||||
static double get_value() noexcept
|
||||
{
|
||||
return bit_cast<double>((uint64_t)0xffefffffffffffff);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
template <class T>
|
||||
inline constexpr T minvalue() noexcept
|
||||
{
|
||||
return T(detail::minvalue_impl<typename T::value_type>::get_value());
|
||||
}
|
||||
|
||||
/***************************
|
||||
* maxvalue implementation *
|
||||
***************************/
|
||||
|
||||
template <class T>
|
||||
inline constexpr T maxvalue() noexcept
|
||||
{
|
||||
return T(std::numeric_limits<typename T::value_type>::max());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
|
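XSIMD_DEFINE_CONSTANT_HEX above pins each constant to an exact IEEE-754 bit pattern instead of a decimal literal, so the stored value cannot drift through literal rounding. A standalone sketch of the underlying trick with a memcpy-based bit_cast (the hex value below is the bit pattern of pi, matching the table above):

#include <cstdint>
#include <cstdio>
#include <cstring>

template <class To, class From>
To bit_cast(From from)
{
    static_assert(sizeof(To) == sizeof(From), "size mismatch");
    To to;
    std::memcpy(&to, &from, sizeof(To));
    return to;
}

int main()
{
    double pi = bit_cast<double>(static_cast<uint64_t>(0x400921fb54442d18));
    std::printf("%.17g\n", pi); // 3.1415926535897931
}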
@@ -0,0 +1,80 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_FMA3_AVX_HPP
|
||||
#define XSIMD_FMA3_AVX_HPP
|
||||
|
||||
#include "../types/xsimd_fma3_avx_register.hpp"
|
||||
|
||||
namespace xsimd
|
||||
{
|
||||
|
||||
namespace kernel
|
||||
{
|
||||
using namespace types;
|
||||
|
||||
// fnma
|
||||
template <class A>
|
||||
inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
|
||||
{
|
||||
return _mm256_fnmadd_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
|
||||
{
|
||||
return _mm256_fnmadd_pd(x, y, z);
|
||||
}
|
||||
|
||||
// fnms
|
||||
template <class A>
|
||||
inline batch<float, A> fnms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
|
||||
{
|
||||
return _mm256_fnmsub_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fnms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
|
||||
{
|
||||
return _mm256_fnmsub_pd(x, y, z);
|
||||
}
|
||||
|
||||
// fma
|
||||
template <class A>
|
||||
inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
|
||||
{
|
||||
return _mm256_fmadd_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
|
||||
{
|
||||
return _mm256_fmadd_pd(x, y, z);
|
||||
}
|
||||
|
||||
// fms
|
||||
template <class A>
|
||||
inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
|
||||
{
|
||||
return _mm256_fmsub_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
|
||||
{
|
||||
return _mm256_fmsub_pd(x, y, z);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
|
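The four kernel families above differ only in where the signs land. Scalar equivalents of the naming scheme, plus a small usage sketch assuming the public xsimd::fma and xsimd::fnma entry points:

#include <cstdio>
#include "xsimd/xsimd.hpp"

int main()
{
    // fma(x, y, z)  ==  x * y + z     fms(x, y, z)  ==  x * y - z
    // fnma(x, y, z) == -x * y + z     fnms(x, y, z) == -x * y - z
    using bf = xsimd::batch<float, xsimd::fma3<xsimd::avx>>;
    bf x(2.0f), y(3.0f), z(1.0f);
    std::printf("%g %g\n",
                xsimd::fma(x, y, z).get(0),    // 7
                xsimd::fnma(x, y, z).get(0));  // -5
}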
@@ -0,0 +1,46 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_FMA3_AVX2_HPP
|
||||
#define XSIMD_FMA3_AVX2_HPP
|
||||
|
||||
#include "../types/xsimd_fma3_avx2_register.hpp"
|
||||
|
||||
// Allow inclusion of xsimd_fma3_avx.hpp
|
||||
#ifdef XSIMD_FMA3_AVX_HPP
|
||||
#undef XSIMD_FMA3_AVX_HPP
|
||||
#define XSIMD_FORCE_FMA3_AVX_HPP
|
||||
#endif
|
||||
|
||||
// Disallow inclusion of ./xsimd_fma3_avx_register.hpp
|
||||
#ifndef XSIMD_FMA3_AVX_REGISTER_HPP
|
||||
#define XSIMD_FMA3_AVX_REGISTER_HPP
|
||||
#define XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
|
||||
#endif
|
||||
|
||||
// Include ./xsimd_fma3_avx.hpp but s/avx/avx2
|
||||
#define avx avx2
|
||||
#include "./xsimd_fma3_avx.hpp"
|
||||
#undef avx
|
||||
#undef XSIMD_FMA3_AVX_HPP
|
||||
|
||||
// Carefully restore guards
|
||||
#ifdef XSIMD_FORCE_FMA3_AVX_HPP
|
||||
#define XSIMD_FMA3_AVX_HPP
|
||||
#undef XSIMD_FORCE_FMA3_AVX_HPP
|
||||
#endif
|
||||
|
||||
#ifdef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
|
||||
#undef XSIMD_FMA3_AVX_REGISTER_HPP
|
||||
#undef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
|
||||
#endif
|
||||
|
||||
#endif
|
|
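The guard dance above re-instantiates the fma3<avx> kernels for fma3<avx2> by plain token substitution rather than duplicating the file. A stripped-down sketch of the same pattern, with hypothetical file names:

// impl.hpp (hypothetical), written once against the token ARCH_TAG:
//   inline int lanes(requires_arch<ARCH_TAG>) noexcept { return 8; }
//
// consumer.hpp, instantiating it for two architecture tags:
//   #define ARCH_TAG avx
//   #include "impl.hpp"
//   #undef ARCH_TAG
//   #define ARCH_TAG avx2
//   #include "impl.hpp"
//   #undef ARCH_TAG
// Each re-inclusion must also defeat impl.hpp's include guard, which is
// what the XSIMD_FORCE_* bookkeeping above takes care of.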
@@ -0,0 +1,79 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_FMA3_SSE_HPP
|
||||
#define XSIMD_FMA3_SSE_HPP
|
||||
|
||||
#include "../types/xsimd_fma3_sse_register.hpp"
|
||||
|
||||
namespace xsimd
|
||||
{
|
||||
|
||||
namespace kernel
|
||||
{
|
||||
using namespace types;
|
||||
// fnma
|
||||
template <class A>
|
||||
inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
|
||||
{
|
||||
return _mm_fnmadd_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
|
||||
{
|
||||
return _mm_fnmadd_pd(x, y, z);
|
||||
}
|
||||
|
||||
// fnms
|
||||
template <class A>
|
||||
inline batch<float, A> fnms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
|
||||
{
|
||||
return _mm_fnmsub_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fnms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
|
||||
{
|
||||
return _mm_fnmsub_pd(x, y, z);
|
||||
}
|
||||
|
||||
// fma
|
||||
template <class A>
|
||||
inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
|
||||
{
|
||||
return _mm_fmadd_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
|
||||
{
|
||||
return _mm_fmadd_pd(x, y, z);
|
||||
}
|
||||
|
||||
// fms
|
||||
template <class A>
|
||||
inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
|
||||
{
|
||||
return _mm_fmsub_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
|
||||
{
|
||||
return _mm_fmsub_pd(x, y, z);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
|
@@ -0,0 +1,79 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_FMA4_HPP
|
||||
#define XSIMD_FMA4_HPP
|
||||
|
||||
#include "../types/xsimd_fma4_register.hpp"
|
||||
|
||||
namespace xsimd
|
||||
{
|
||||
|
||||
namespace kernel
|
||||
{
|
||||
using namespace types;
|
||||
|
||||
// fnma
|
||||
template <class A>
|
||||
inline batch<float, A> fnma(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
|
||||
{
|
||||
return _mm_nmacc_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fnma(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
|
||||
{
|
||||
return _mm_nmacc_pd(x, y, z);
|
||||
}
|
||||
|
||||
// fnms
|
||||
template <class A>
|
||||
inline batch<float, A> fnms(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
|
||||
{
|
||||
return _mm_nmsub_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fnms(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
|
||||
{
|
||||
return _mm_nmsub_pd(x, y, z);
|
||||
}
|
||||
|
||||
// fma
|
||||
template <class A>
|
||||
inline batch<float, A> fma(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
|
||||
{
|
||||
return _mm_macc_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fma(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
|
||||
{
|
||||
return _mm_macc_pd(x, y, z);
|
||||
}
|
||||
|
||||
// fms
|
||||
template <class A>
|
||||
inline batch<float, A> fms(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
|
||||
{
|
||||
return _mm_msub_ps(x, y, z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> fms(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
|
||||
{
|
||||
return _mm_msub_pd(x, y, z);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
|
@@ -0,0 +1,23 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_GENERIC_HPP
|
||||
#define XSIMD_GENERIC_HPP
|
||||
|
||||
#include "./generic/xsimd_generic_arithmetic.hpp"
|
||||
#include "./generic/xsimd_generic_complex.hpp"
|
||||
#include "./generic/xsimd_generic_logical.hpp"
|
||||
#include "./generic/xsimd_generic_math.hpp"
|
||||
#include "./generic/xsimd_generic_memory.hpp"
|
||||
#include "./generic/xsimd_generic_rounding.hpp"
|
||||
#include "./generic/xsimd_generic_trigo.hpp"
|
||||
|
||||
#endif
|
|
@@ -0,0 +1,38 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_FWD_HPP
#define XSIMD_GENERIC_FWD_HPP

#include "../types/xsimd_batch_constant.hpp"

#include <type_traits>

namespace xsimd
{
    namespace kernel
    {
        // forward declaration
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> abs(batch<T, A> const& self, requires_arch<generic>) noexcept;
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
        template <class A, class T>
        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;

    }
}

#endif
@@ -0,0 +1,86 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_ISA_HPP
#define XSIMD_ISA_HPP

#include "../config/xsimd_arch.hpp"

#include "./xsimd_generic_fwd.hpp"

#if XSIMD_WITH_SSE2
#include "./xsimd_sse2.hpp"
#endif

#if XSIMD_WITH_SSE3
#include "./xsimd_sse3.hpp"
#endif

#if XSIMD_WITH_SSSE3
#include "./xsimd_ssse3.hpp"
#endif

#if XSIMD_WITH_SSE4_1
#include "./xsimd_sse4_1.hpp"
#endif

#if XSIMD_WITH_SSE4_2
#include "./xsimd_sse4_2.hpp"
#endif

#if XSIMD_WITH_FMA3_SSE
#include "./xsimd_fma3_sse.hpp"
#endif

#if XSIMD_WITH_FMA4
#include "./xsimd_fma4.hpp"
#endif

#if XSIMD_WITH_AVX
#include "./xsimd_avx.hpp"
#endif

#if XSIMD_WITH_FMA3_AVX
#include "./xsimd_fma3_avx.hpp"
#endif

#if XSIMD_WITH_AVX2
#include "./xsimd_avx2.hpp"
#endif

#if XSIMD_WITH_FMA3_AVX2
#include "./xsimd_fma3_avx2.hpp"
#endif

#if XSIMD_WITH_AVX512F
#include "./xsimd_avx512f.hpp"
#endif

#if XSIMD_WITH_AVX512BW
#include "./xsimd_avx512bw.hpp"
#endif

#if XSIMD_WITH_NEON
#include "./xsimd_neon.hpp"
#endif

#if XSIMD_WITH_NEON64
#include "./xsimd_neon64.hpp"
#endif

#if XSIMD_WITH_SVE
#include "./xsimd_sve.hpp"
#endif

// Must come last to have access to all conversion specializations.
#include "./xsimd_generic.hpp"

#endif
(File diff not shown because of its large size)
(File diff not shown because of its large size)
(File diff not shown because of its large size)
(File diff not shown because of its large size)
@@ -0,0 +1,64 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSE3_HPP
#define XSIMD_SSE3_HPP

#include "../types/xsimd_sse3_register.hpp"
#include <type_traits>

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // haddp
        template <class A>
        inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse3>) noexcept
        {
            return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]),
                               _mm_hadd_ps(row[2], row[3]));
        }
        template <class A>
        inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse3>) noexcept
        {
            return _mm_hadd_pd(row[0], row[1]);
        }

        // load_unaligned
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse3>) noexcept
        {
            return _mm_lddqu_si128((__m128i const*)mem);
        }

        // reduce_add
        template <class A>
        inline float reduce_add(batch<float, A> const& self, requires_arch<sse3>) noexcept
        {
            __m128 tmp0 = _mm_hadd_ps(self, self);
            __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0);
            return _mm_cvtss_f32(tmp1);
        }
        template <class A>
        inline double reduce_add(batch<double, A> const& self, requires_arch<sse3>) noexcept
        {
            __m128d tmp0 = _mm_hadd_pd(self, self);
            return _mm_cvtsd_f64(tmp0);
        }

    }

}

#endif
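The reduce_add kernels above rely on pairwise horizontal adds: each _mm_hadd_ps halves the number of distinct partial sums, so two rounds collapse four lanes into one. A worked trace and usage sketch, assuming an SSE3 build (values are illustrative):

#include "xsimd/xsimd.hpp"

int main()
{
    // self = {a, b, c, d}
    // tmp0 = hadd(self, self) -> {a+b, c+d, a+b, c+d}
    // tmp1 = hadd(tmp0, tmp0) -> {a+b+c+d, a+b+c+d, ...}
    xsimd::batch<float, xsimd::sse3> v(1.f, 2.f, 3.f, 4.f);
    return xsimd::reduce_add(v) == 10.f ? 0 : 1;
}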
@@ -0,0 +1,350 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSE4_1_HPP
#define XSIMD_SSE4_1_HPP

#include <type_traits>

#include "../types/xsimd_sse4_1_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;
        // any
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline bool any(batch<T, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return !_mm_testz_si128(self, self);
        }
        // ceil
        template <class A>
        inline batch<float, A> ceil(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_ceil_ps(self);
        }
        template <class A>
        inline batch<double, A> ceil(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_ceil_pd(self);
        }

        // fast_cast
        namespace detail
        {
            template <class A>
            inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
            {
                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
                __m128i xH = _mm_srai_epi32(x, 16);
                xH = _mm_blend_epi16(xH, _mm_setzero_si128(), 0x33);
                xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67
                __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0x88); // 2^52
                __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
                return _mm_add_pd(f, _mm_castsi128_pd(xL));
            }

            template <class A>
            inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
            {
                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
                __m128i xH = _mm_srli_epi64(x, 32);
                xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
                __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
                __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
                return _mm_add_pd(f, _mm_castsi128_pd(xL));
            }

            template <class A>
            inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse4_1>) noexcept
            {
                return _mm_castps_si128(
                    _mm_blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(self)),
                                  _mm_castsi128_ps(_mm_xor_si128(
                                      _mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
                                      _mm_set1_epi32(1u << 31))),
                                  _mm_cmpge_ps(self, _mm_set1_ps(1u << 31))));
            }
        }

        // eq
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm_cmpeq_epi64(self, other);
            }
            else
            {
                return eq(self, other, ssse3 {});
            }
        }

        // floor
        template <class A>
        inline batch<float, A> floor(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_floor_ps(self);
        }
        template <class A>
        inline batch<double, A> floor(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_floor_pd(self);
        }

        // insert
        template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse4_1>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_insert_epi8(self, val, I);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_insert_epi32(self, val, I);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
#if (!defined(_MSC_VER) && __x86_64__) || (_MSC_VER > 1900 && defined(_M_X64))
                return _mm_insert_epi64(self, val, I);
#else
                uint32_t lo, hi;
                memcpy(&lo, (reinterpret_cast<uint32_t*>(&val)), sizeof(lo));
                memcpy(&hi, (reinterpret_cast<uint32_t*>(&val)) + 1, sizeof(hi));
                return _mm_insert_epi32(_mm_insert_epi32(self, lo, 2 * I), hi, 2 * I + 1);
#endif
            }
            else
            {
                return insert(self, val, pos, ssse3 {});
            }
        }

        // max
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_max_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_max_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_max_epi32(self, other);
                }
                else
                {
                    return max(self, other, ssse3 {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_max_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_max_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_max_epu32(self, other);
                }
                else
                {
                    return max(self, other, ssse3 {});
                }
            }
        }

        // min
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_min_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_min_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_min_epi32(self, other);
                }
                else
                {
                    return min(self, other, ssse3 {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_min_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_min_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_min_epu32(self, other);
                }
                else
                {
                    return min(self, other, ssse3 {});
                }
            }
        }

        // mul
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_or_si128(
                    _mm_and_si128(_mm_mullo_epi16(self, other), _mm_srli_epi16(_mm_cmpeq_epi8(self, self), 8)),
                    _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_epi16(self, 8), _mm_srli_epi16(other, 8)), 8));
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_mullo_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_mullo_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm_add_epi64(
                    _mm_mul_epu32(self, other),
                    _mm_slli_epi64(
                        _mm_add_epi64(
                            _mm_mul_epu32(other, _mm_shuffle_epi32(self, _MM_SHUFFLE(2, 3, 0, 1))),
                            _mm_mul_epu32(self, _mm_shuffle_epi32(other, _MM_SHUFFLE(2, 3, 0, 1)))),
                        32));
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }

        // nearbyint
        template <class A>
        inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_ps(self, _MM_FROUND_TO_NEAREST_INT);
        }
        template <class A>
        inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_pd(self, _MM_FROUND_TO_NEAREST_INT);
        }

        // select
        namespace detail
        {
            template <class T>
            inline constexpr T interleave(T const& cond) noexcept
            {
                return (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 49) & 0x5555) | (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 48) & 0xAAAA);
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            return _mm_blendv_epi8(false_br, true_br, cond);
        }
        template <class A>
        inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            return _mm_blendv_ps(false_br, true_br, cond);
        }
        template <class A>
        inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            return _mm_blendv_pd(false_br, true_br, cond);
        }

        template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_blend_epi16(false_br, true_br, mask);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                constexpr int imask = detail::interleave(mask);
                return _mm_blend_epi16(false_br, true_br, imask);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                constexpr int imask = detail::interleave(mask);
                constexpr int imask2 = detail::interleave(imask);
                return _mm_blend_epi16(false_br, true_br, imask2);
            }
            else
            {
                return select(batch_bool_constant<batch<T, A>, Values...>(), true_br, false_br, ssse3 {});
            }
        }
        template <class A, bool... Values>
        inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            constexpr int mask = batch_bool_constant<batch<float, A>, Values...>::mask();
            return _mm_blend_ps(false_br, true_br, mask);
        }
        template <class A, bool... Values>
        inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            constexpr int mask = batch_bool_constant<batch<double, A>, Values...>::mask();
            return _mm_blend_pd(false_br, true_br, mask);
        }

        // trunc
        template <class A>
        inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_ps(self, _MM_FROUND_TO_ZERO);
        }
        template <class A>
        inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_pd(self, _MM_FROUND_TO_ZERO);
        }

    }

}

#endif
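The int64->double fast_cast above is worth unpacking: each 64-bit lane is split into 32-bit halves, each half is smuggled into a double's mantissa by blending in a large power-of-two exponent (3*2^67 for the high half, 2^52 for the low half), and subtracting the bias back out recovers the half's value as a double. A hedged scalar sketch of why splitting into exactly representable halves works (illustrative, not part of xsimd):

#include <cstdint>

// Scalar analogue of the split used by the SSE kernel: both halves convert
// to double exactly, so one final addition reproduces (double)v.
double int64_to_double_ref(int64_t v)
{
    int64_t hi = v >> 32;                   // high half; arithmetic shift keeps the sign
    uint32_t lo = static_cast<uint32_t>(v); // low 32 bits, always non-negative
    return static_cast<double>(hi) * 4294967296.0 // hi * 2^32 is exact in a double
        + static_cast<double>(lo);                // single rounding, as in the kernel
}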
@@ -0,0 +1,44 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSE4_2_HPP
#define XSIMD_SSE4_2_HPP

#include <limits>

#include "../types/xsimd_sse4_2_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // lt
        template <class A>
        inline batch_bool<int64_t, A> lt(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<sse4_2>) noexcept
        {
            return _mm_cmpgt_epi64(other, self);
        }
        template <class A>
        inline batch_bool<uint64_t, A> lt(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<sse4_2>) noexcept
        {
            auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
            auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
            return _mm_cmpgt_epi64(xother, xself);
        }

    }

}

#endif
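SSE4.2 only provides a signed 64-bit compare, so the unsigned lt above first XORs both operands with 2^63: flipping the sign bit maps the unsigned order onto the signed order. A scalar sketch of the identity (illustrative):

#include <cstdint>

// For any a, b: a < b (unsigned)  ==  (a ^ bias) < (b ^ bias) (signed),
// where bias = 0x8000000000000000 flips the sign bit.
bool ult_ref(uint64_t a, uint64_t b)
{
    const uint64_t bias = 1ull << 63;
    return static_cast<int64_t>(a ^ bias) < static_cast<int64_t>(b ^ bias);
}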
@@ -0,0 +1,142 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSSE3_HPP
#define XSIMD_SSSE3_HPP

#include <cstddef>
#include <type_traits>

#include "../types/xsimd_ssse3_register.hpp"
#include "../types/xsimd_utils.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // abs
        template <class A, class T, typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
        inline batch<T, A> abs(batch<T, A> const& self, requires_arch<ssse3>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_abs_epi8(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_abs_epi16(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_abs_epi32(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm_abs_epi64(self);
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }

        // extract_pair
        namespace detail
        {

            template <class T, class A>
            inline batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& other, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
            {
                return other;
            }

            template <class T, class A, std::size_t I, std::size_t... Is>
            inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, ::xsimd::detail::index_sequence<I, Is...>) noexcept
            {
                if (i == I)
                {
                    return _mm_alignr_epi8(self, other, sizeof(T) * I);
                }
                else
                    return extract_pair(self, other, i, ::xsimd::detail::index_sequence<Is...>());
            }
        }

        template <class A, class T, class _ = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<ssse3>) noexcept
        {
            constexpr std::size_t size = batch<T, A>::size;
            assert(0 <= i && i < size && "index in bounds");
            return detail::extract_pair(self, other, i, ::xsimd::detail::make_index_sequence<size>());
        }

        // reduce_add
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline T reduce_add(batch<T, A> const& self, requires_arch<ssse3>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                __m128i tmp1 = _mm_hadd_epi16(self, self);
                __m128i tmp2 = _mm_hadd_epi16(tmp1, tmp1);
                __m128i tmp3 = _mm_hadd_epi16(tmp2, tmp2);
                return _mm_cvtsi128_si32(tmp3) & 0xFFFF;
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                __m128i tmp1 = _mm_hadd_epi32(self, self);
                __m128i tmp2 = _mm_hadd_epi32(tmp1, tmp1);
                return _mm_cvtsi128_si32(tmp2);
            }
            else
            {
                return reduce_add(self, sse3 {});
            }
        }

        // swizzle
        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
        inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<ssse3>) noexcept
        {
            constexpr batch_constant<batch<uint8_t, A>, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1,
                                     2 * V4, 2 * V4 + 1, 2 * V5, 2 * V5 + 1, 2 * V6, 2 * V6 + 1, 2 * V7, 2 * V7 + 1>
                mask8;
            return _mm_shuffle_epi8(self, (batch<uint8_t, A>)mask8);
        }

        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
        inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept
        {
            return bitwise_cast<batch<int16_t, A>>(swizzle(bitwise_cast<batch<uint16_t, A>>(self), mask, ssse3 {}));
        }

        template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
                  uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
        inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
        {
            return _mm_shuffle_epi8(self, (batch<uint8_t, A>)mask);
        }

        template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
                  uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
        inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
        {
            return bitwise_cast<batch<int8_t, A>>(swizzle(bitwise_cast<batch<uint8_t, A>>(self), mask, ssse3 {}));
        }

    }

}

#endif
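The 16-bit swizzle above lowers to _mm_shuffle_epi8, which only shuffles bytes, so each 16-bit lane index V is expanded at compile time into the byte pair (2V, 2V+1). A usage sketch, assuming an SSSE3 build (lane values are illustrative):

#include "xsimd/xsimd.hpp"

int main()
{
    using u16 = xsimd::batch<uint16_t, xsimd::ssse3>;
    // Lane order {7,...,0} expands to the byte mask {14,15, 12,13, ..., 0,1}.
    auto rev = xsimd::batch_constant<u16, 7, 6, 5, 4, 3, 2, 1, 0> {};
    u16 v(0, 1, 2, 3, 4, 5, 6, 7);
    u16 r = xsimd::swizzle(v, rev); // {7, 6, 5, 4, 3, 2, 1, 0}
    return r.get(0) == 7 ? 0 : 1;
}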
(File diff not shown because of its large size)
@@ -0,0 +1,249 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_ARCH_HPP
#define XSIMD_ARCH_HPP

#include <initializer_list>
#include <type_traits>
#include <utility>

#include "../types/xsimd_all_registers.hpp"
#include "./xsimd_config.hpp"
#include "./xsimd_cpuid.hpp"

namespace xsimd
{

    namespace detail
    {
        // Checks whether T appears in Tys.
        template <class T, class... Tys>
        struct contains;

        template <class T>
        struct contains<T> : std::false_type
        {
        };

        template <class T, class Ty, class... Tys>
        struct contains<T, Ty, Tys...>
            : std::conditional<std::is_same<Ty, T>::value, std::true_type,
                               contains<T, Tys...>>::type
        {
        };

        template <class... Archs>
        struct is_sorted;

        template <>
        struct is_sorted<> : std::true_type
        {
        };

        template <class Arch>
        struct is_sorted<Arch> : std::true_type
        {
        };

        template <class A0, class A1, class... Archs>
        struct is_sorted<A0, A1, Archs...>
            : std::conditional<(A0::version() >= A1::version()), is_sorted<Archs...>,
                               std::false_type>::type
        {
        };

        template <typename T>
        inline constexpr T max_of(T value) noexcept
        {
            return value;
        }

        template <typename T, typename... Ts>
        inline constexpr T max_of(T head0, T head1, Ts... tail) noexcept
        {
            return max_of((head0 > head1 ? head0 : head1), tail...);
        }

    } // namespace detail

    // An arch_list is a list of architectures, sorted by version number.
    template <class... Archs>
    struct arch_list
    {
#ifndef NDEBUG
        static_assert(detail::is_sorted<Archs...>::value,
                      "architecture list must be sorted by version");
#endif

        template <class Arch>
        using add = arch_list<Archs..., Arch>;

        template <class... OtherArchs>
        using extend = arch_list<Archs..., OtherArchs...>;

        template <class Arch>
        static constexpr bool contains() noexcept
        {
            return detail::contains<Arch, Archs...>::value;
        }

        template <class F>
        static void for_each(F&& f) noexcept
        {
            (void)std::initializer_list<bool> { (f(Archs {}), true)... };
        }

        static constexpr std::size_t alignment() noexcept
        {
            // all alignments are a power of two
            return detail::max_of(Archs::alignment()..., static_cast<size_t>(0));
        }
    };

    struct unavailable
    {
        static constexpr bool supported() noexcept { return false; }
        static constexpr bool available() noexcept { return false; }
        static constexpr unsigned version() noexcept { return 0; }
        static constexpr std::size_t alignment() noexcept { return 0; }
        static constexpr bool requires_alignment() noexcept { return false; }
        static constexpr char const* name() noexcept { return "<none>"; }
    };

    namespace detail
    {
        // Pick the best architecture in arch_list L, which is the last
        // because architectures are sorted by version.
        template <class L>
        struct best;

        template <>
        struct best<arch_list<>>
        {
            using type = unavailable;
        };

        template <class Arch, class... Archs>
        struct best<arch_list<Arch, Archs...>>
        {
            using type = Arch;
        };

        // Filter archlists Archs, picking only supported archs and adding
        // them to L.
        template <class L, class... Archs>
        struct supported_helper;

        template <class L>
        struct supported_helper<L, arch_list<>>
        {
            using type = L;
        };

        template <class L, class Arch, class... Archs>
        struct supported_helper<L, arch_list<Arch, Archs...>>
            : supported_helper<
                  typename std::conditional<Arch::supported(),
                                            typename L::template add<Arch>, L>::type,
                  arch_list<Archs...>>
        {
        };

        template <class... Archs>
        struct supported : supported_helper<arch_list<>, Archs...>
        {
        };

        // Joins all arch_list Archs in a single arch_list.
        template <class... Archs>
        struct join;

        template <class Arch>
        struct join<Arch>
        {
            using type = Arch;
        };

        template <class Arch, class... Archs, class... Args>
        struct join<Arch, arch_list<Archs...>, Args...>
            : join<typename Arch::template extend<Archs...>, Args...>
        {
        };
    } // namespace detail

    struct unsupported
    {
    };
    using all_x86_architectures = arch_list<avx512bw, avx512dq, avx512cd, avx512f, fma3<avx2>, avx2, fma3<avx>, avx, fma4, fma3<sse4_2>, sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>;
    using all_sve_architectures = arch_list<detail::sve<512>, detail::sve<256>, detail::sve<128>>;
    using all_arm_architectures = typename detail::join<all_sve_architectures, arch_list<neon64, neon>>::type;
    using all_architectures = typename detail::join<all_arm_architectures, all_x86_architectures>::type;

    using supported_architectures = typename detail::supported<all_architectures>::type;

    using x86_arch = typename detail::best<typename detail::supported<all_x86_architectures>::type>::type;
    using arm_arch = typename detail::best<typename detail::supported<all_arm_architectures>::type>::type;
    // using default_arch = typename detail::best<typename detail::supported<arch_list</*arm_arch,*/ x86_arch>>::type>::type;
    using default_arch = typename std::conditional<std::is_same<x86_arch, unavailable>::value,
                                                   arm_arch,
                                                   x86_arch>::type;

    namespace detail
    {
        template <class F, class ArchList>
        class dispatcher
        {

            const unsigned best_arch;
            F functor;

            template <class Arch, class... Tys>
            auto walk_archs(arch_list<Arch>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
            {
                assert(Arch::available() && "At least one arch must be supported during dispatch");
                return functor(Arch {}, std::forward<Tys>(args)...);
            }

            template <class Arch, class ArchNext, class... Archs, class... Tys>
            auto walk_archs(arch_list<Arch, ArchNext, Archs...>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
            {
                if (Arch::version() <= best_arch)
                    return functor(Arch {}, std::forward<Tys>(args)...);
                else
                    return walk_archs(arch_list<ArchNext, Archs...> {}, std::forward<Tys>(args)...);
            }

        public:
            dispatcher(F f) noexcept
                : best_arch(available_architectures().best)
                , functor(f)
            {
            }

            template <class... Tys>
            auto operator()(Tys&&... args) noexcept -> decltype(functor(default_arch {}, std::forward<Tys>(args)...))
            {
                return walk_archs(ArchList {}, std::forward<Tys>(args)...);
            }
        };
    }

    // Generic function dispatch, à la ifunc
    template <class ArchList = supported_architectures, class F>
    inline detail::dispatcher<F, ArchList> dispatch(F&& f) noexcept
    {
        return { std::forward<F>(f) };
    }

} // namespace xsimd

#endif
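dispatch() wraps a functor whose operator() takes the architecture tag as its first argument, then at runtime walks the sorted arch_list until it reaches one whose version the current CPU supports. A minimal usage sketch (the functor name and kernel are illustrative, and n is assumed to be a multiple of the batch size):

#include "xsimd/xsimd.hpp"
#include <cstddef>

// Functor dispatched at runtime: the first parameter selects the kernel.
struct sum_squares
{
    template <class Arch>
    float operator()(Arch, float const* data, std::size_t n)
    {
        using b = xsimd::batch<float, Arch>;
        b acc(0.f);
        for (std::size_t i = 0; i < n; i += b::size)
            acc += b::load_unaligned(data + i) * b::load_unaligned(data + i);
        return xsimd::reduce_add(acc);
    }
};

float sum_squares_best(float const* data, std::size_t n)
{
    // Picks the best available architecture once, then calls the kernel.
    return xsimd::dispatch(sum_squares {})(data, n);
}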
@@ -0,0 +1,341 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_CONFIG_HPP
#define XSIMD_CONFIG_HPP

#define XSIMD_VERSION_MAJOR 10
#define XSIMD_VERSION_MINOR 0
#define XSIMD_VERSION_PATCH 0

/**
 * high level free functions
 *
 * @defgroup xsimd_config_macro Instruction Set Detection
 */

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE2 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE2__
#define XSIMD_WITH_SSE2 1
#else
#define XSIMD_WITH_SSE2 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE3 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE3__
#define XSIMD_WITH_SSE3 1
#else
#define XSIMD_WITH_SSE3 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSSE3 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSSE3__
#define XSIMD_WITH_SSSE3 1
#else
#define XSIMD_WITH_SSSE3 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE4.1 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE4_1__
#define XSIMD_WITH_SSE4_1 1
#else
#define XSIMD_WITH_SSE4_1 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE4.2 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE4_2__
#define XSIMD_WITH_SSE4_2 1
#else
#define XSIMD_WITH_SSE4_2 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX__
#define XSIMD_WITH_AVX 1
#else
#define XSIMD_WITH_AVX 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX2 is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX2__
#define XSIMD_WITH_AVX2 1
#else
#define XSIMD_WITH_AVX2 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if FMA3 for SSE is available at compile-time, to 0 otherwise.
 */
#ifdef __FMA__

#if defined(__SSE__)
#ifndef XSIMD_WITH_FMA3_SSE // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_SSE 1
#endif
#else

#if XSIMD_WITH_FMA3_SSE
#error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_SSE 0
#endif

#else

#if XSIMD_WITH_FMA3_SSE
#error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_SSE 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if FMA3 for AVX is available at compile-time, to 0 otherwise.
 */
#ifdef __FMA__

#if defined(__AVX__)
#ifndef XSIMD_WITH_FMA3_AVX // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_AVX 1
#endif
#else

#if XSIMD_WITH_FMA3_AVX
#error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_AVX 0
#endif

#if defined(__AVX2__)
#ifndef XSIMD_WITH_FMA3_AVX2 // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_AVX2 1
#endif
#else

#if XSIMD_WITH_FMA3_AVX2
#error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_AVX2 0
#endif

#else

#if XSIMD_WITH_FMA3_AVX
#error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags"
#endif

#if XSIMD_WITH_FMA3_AVX2
#error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_AVX 0
#define XSIMD_WITH_FMA3_AVX2 0

#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if FMA4 is available at compile-time, to 0 otherwise.
 */
#ifdef __FMA4__
#define XSIMD_WITH_FMA4 1
#else
#define XSIMD_WITH_FMA4 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512F is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512F__
// AVX512 instructions are supported starting with gcc 6
// see https://www.gnu.org/software/gcc/gcc-6/changes.html
// check clang first, newer clang always defines __GNUC__ = 4
#if defined(__clang__) && __clang_major__ >= 6
#define XSIMD_WITH_AVX512F 1
#elif defined(__GNUC__) && __GNUC__ < 6
#define XSIMD_WITH_AVX512F 0
#else
#define XSIMD_WITH_AVX512F 1
#if __GNUC__ == 6
#define XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY 1
#endif
#endif
#else
#define XSIMD_WITH_AVX512F 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512CD is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512CD__
// Avoids repeating the GCC workaround over and over
#define XSIMD_WITH_AVX512CD XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512CD 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512DQ is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512DQ__
#define XSIMD_WITH_AVX512DQ XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512DQ 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512BW is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512BW__
#define XSIMD_WITH_AVX512BW XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512BW 0
#endif

#ifdef __ARM_NEON

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if NEON is available at compile-time, to 0 otherwise.
 */
#if __ARM_ARCH >= 7
#define XSIMD_WITH_NEON 1
#else
#define XSIMD_WITH_NEON 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if NEON64 is available at compile-time, to 0 otherwise.
 */
#ifdef __aarch64__
#define XSIMD_WITH_NEON64 1
#else
#define XSIMD_WITH_NEON64 0
#endif
#else
#define XSIMD_WITH_NEON 0
#define XSIMD_WITH_NEON64 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SVE is available and bit width is pre-set at compile-time, to 0 otherwise.
 */
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
#define XSIMD_WITH_SVE 1
#define XSIMD_SVE_BITS __ARM_FEATURE_SVE_BITS
#else
#define XSIMD_WITH_SVE 0
#define XSIMD_SVE_BITS 0
#endif

// Workaround for MSVC compiler
#ifdef _MSC_VER

#if XSIMD_WITH_AVX512
#undef XSIMD_WITH_AVX2
#define XSIMD_WITH_AVX2 1
#endif

#if XSIMD_WITH_AVX2
#undef XSIMD_WITH_AVX
#define XSIMD_WITH_AVX 1
#undef XSIMD_WITH_FMA3_AVX
#define XSIMD_WITH_FMA3_AVX 1
#undef XSIMD_WITH_FMA3_AVX2
#define XSIMD_WITH_FMA3_AVX2 1
#endif

#if XSIMD_WITH_AVX
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif

#if !defined(__clang__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif

#if XSIMD_WITH_SSE4_2
#undef XSIMD_WITH_SSE4_1
#define XSIMD_WITH_SSE4_1 1
#endif

#if XSIMD_WITH_SSE4_1
#undef XSIMD_WITH_SSSE3
#define XSIMD_WITH_SSSE3 1
#endif

#if XSIMD_WITH_SSSE3
#undef XSIMD_WITH_SSE3
#define XSIMD_WITH_SSE3 1
#endif

#if XSIMD_WITH_SSE3 || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#undef XSIMD_WITH_SSE2
#define XSIMD_WITH_SSE2 1
#endif

#endif

#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE
#define XSIMD_NO_SUPPORTED_ARCHITECTURE
#endif

#endif
@@ -0,0 +1,341 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_CONFIG_HPP
#define XSIMD_CONFIG_HPP

#define XSIMD_VERSION_MAJOR 10
#define XSIMD_VERSION_MINOR 0
#define XSIMD_VERSION_PATCH 0

/**
 * high level free functions
 *
 * @defgroup xsimd_config_macro Instruction Set Detection
 */

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE2 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE2__
#define XSIMD_WITH_SSE2 1
#else
#define XSIMD_WITH_SSE2 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE3 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE3__
#define XSIMD_WITH_SSE3 1
#else
#define XSIMD_WITH_SSE3 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSSE3 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSSE3__
#define XSIMD_WITH_SSSE3 1
#else
#define XSIMD_WITH_SSSE3 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE4.1 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE4_1__
#define XSIMD_WITH_SSE4_1 1
#else
#define XSIMD_WITH_SSE4_1 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE4.2 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE4_2__
#define XSIMD_WITH_SSE4_2 1
#else
#define XSIMD_WITH_SSE4_2 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX__
#define XSIMD_WITH_AVX 1
#else
#define XSIMD_WITH_AVX 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX2 is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX2__
#define XSIMD_WITH_AVX2 1
#else
#define XSIMD_WITH_AVX2 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if FMA3 for SSE is available at compile-time, to 0 otherwise.
 */
#ifdef __FMA__

#if defined(__SSE__) && !defined(__AVX__)
#ifndef XSIMD_WITH_FMA3_SSE // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_SSE 1
#endif
#else

#if XSIMD_WITH_FMA3_SSE
#error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_SSE 0
#endif

#else

#if XSIMD_WITH_FMA3_SSE
#error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_SSE 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if FMA3 for AVX is available at compile-time, to 0 otherwise.
 */
#ifdef __FMA__

#if defined(__AVX__)
#ifndef XSIMD_WITH_FMA3_AVX // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_AVX 1
#endif
#else

#if XSIMD_WITH_FMA3_AVX
#error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_AVX 0
#endif

#if defined(__AVX2__)
#ifndef XSIMD_WITH_FMA3_AVX2 // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_AVX2 1
#endif
#else

#if XSIMD_WITH_FMA3_AVX2
#error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_AVX2 0
#endif

#else

#if XSIMD_WITH_FMA3_AVX
#error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags"
#endif

#if XSIMD_WITH_FMA3_AVX2
#error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_AVX 0
#define XSIMD_WITH_FMA3_AVX2 0

#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if FMA4 is available at compile-time, to 0 otherwise.
 */
#ifdef __FMA4__
#define XSIMD_WITH_FMA4 1
#else
#define XSIMD_WITH_FMA4 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512F is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512F__
// AVX512 instructions are supported starting with gcc 6
// see https://www.gnu.org/software/gcc/gcc-6/changes.html
// check clang first, newer clang always defines __GNUC__ = 4
#if defined(__clang__) && __clang_major__ >= 6
#define XSIMD_WITH_AVX512F 1
#elif defined(__GNUC__) && __GNUC__ < 6
#define XSIMD_WITH_AVX512F 0
#else
#define XSIMD_WITH_AVX512F 1
#if __GNUC__ == 6
#define XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY 1
#endif
#endif
#else
#define XSIMD_WITH_AVX512F 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512CD is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512CD__
// Avoids repeating the GCC workaround over and over
#define XSIMD_WITH_AVX512CD XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512CD 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512DQ is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512DQ__
#define XSIMD_WITH_AVX512DQ XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512DQ 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512BW is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512BW__
#define XSIMD_WITH_AVX512BW XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512BW 0
#endif

#ifdef __ARM_NEON

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if NEON is available at compile-time, to 0 otherwise.
 */
#if __ARM_ARCH >= 7
#define XSIMD_WITH_NEON 1
#else
#define XSIMD_WITH_NEON 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if NEON64 is available at compile-time, to 0 otherwise.
 */
#ifdef __aarch64__
#define XSIMD_WITH_NEON64 1
#else
#define XSIMD_WITH_NEON64 0
#endif
#else
#define XSIMD_WITH_NEON 0
#define XSIMD_WITH_NEON64 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SVE is available and bit width is pre-set at compile-time, to 0 otherwise.
 */
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
#define XSIMD_WITH_SVE 1
#define XSIMD_SVE_BITS __ARM_FEATURE_SVE_BITS
#else
#define XSIMD_WITH_SVE 0
#define XSIMD_SVE_BITS 0
#endif

// Workaround for MSVC compiler
#ifdef _MSC_VER

#if XSIMD_WITH_AVX512
#undef XSIMD_WITH_AVX2
#define XSIMD_WITH_AVX2 1
#endif

#if XSIMD_WITH_AVX2
#undef XSIMD_WITH_AVX
#define XSIMD_WITH_AVX 1
#undef XSIMD_WITH_FMA3_AVX
#define XSIMD_WITH_FMA3_AVX 1
#undef XSIMD_WITH_FMA3_AVX2
#define XSIMD_WITH_FMA3_AVX2 1
#endif

#if XSIMD_WITH_AVX
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif

#if !defined(__clang__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif

#if XSIMD_WITH_SSE4_2
#undef XSIMD_WITH_SSE4_1
#define XSIMD_WITH_SSE4_1 1
#endif

#if XSIMD_WITH_SSE4_1
#undef XSIMD_WITH_SSSE3
#define XSIMD_WITH_SSSE3 1
#endif

#if XSIMD_WITH_SSSE3
#undef XSIMD_WITH_SSE3
#define XSIMD_WITH_SSE3 1
#endif

#if XSIMD_WITH_SSE3 || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#undef XSIMD_WITH_SSE2
#define XSIMD_WITH_SSE2 1
#endif

#endif

#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE
#define XSIMD_NO_SUPPORTED_ARCHITECTURE
#endif

#endif
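These XSIMD_WITH_* macros are plain 0/1 constants, so both the preprocessor and ordinary if statements can branch on them. A usage sketch (illustrative, not part of this import):

#include "xsimd/xsimd.hpp"
#include <cstdio>

int main()
{
#if XSIMD_WITH_AVX2
    std::puts("compiled with AVX2 kernels");
#elif XSIMD_WITH_SSE2
    std::puts("compiled with SSE2 kernels");
#else
    std::puts("no x86 SIMD kernels compiled in");
#endif
    return 0;
}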
@ -0,0 +1,181 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_CPUID_HPP
#define XSIMD_CPUID_HPP

#include <algorithm>
#include <cstring>

#if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM))
#include <asm/hwcap.h>
#include <sys/auxv.h>
#endif

#if defined(_MSC_VER)
// Contains the definition of __cpuidex
#include <intrin.h>
#endif

#include "../types/xsimd_all_registers.hpp"

namespace xsimd
{
    namespace detail
    {
        struct supported_arch
        {
            unsigned sse2 : 1;
            unsigned sse3 : 1;
            unsigned ssse3 : 1;
            unsigned sse4_1 : 1;
            unsigned sse4_2 : 1;
            unsigned sse4a : 1;
            unsigned fma3_sse : 1;
            unsigned fma4 : 1;
            unsigned xop : 1;
            unsigned avx : 1;
            unsigned fma3_avx : 1;
            unsigned avx2 : 1;
            unsigned fma3_avx2 : 1;
            unsigned avx512f : 1;
            unsigned avx512cd : 1;
            unsigned avx512dq : 1;
            unsigned avx512bw : 1;
            unsigned neon : 1;
            unsigned neon64 : 1;

            // version number of the best arch available
            unsigned best;

            supported_arch() noexcept
            {
                memset(this, 0, sizeof(supported_arch));

#if defined(__aarch64__) || defined(_M_ARM64)
                neon = 1;
                neon64 = 1;
                best = neon64::version();
#elif defined(__ARM_NEON) || defined(_M_ARM)

#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
                neon = bool(getauxval(AT_HWCAP) & HWCAP_NEON);
#else
                // that's very conservative :-/
                neon = 0;
#endif
                neon64 = 0;
                best = neon::version() * neon;

#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)
                auto get_cpuid = [](int reg[4], int func_id) noexcept
                {

#if defined(_MSC_VER)
                    __cpuidex(reg, func_id, 0);

#elif defined(__INTEL_COMPILER)
                    __cpuid(reg, func_id);

#elif defined(__GNUC__) || defined(__clang__)

#if defined(__i386__) && defined(__PIC__)
                    // %ebx may be the PIC register
                    __asm__("xchg{l}\t{%%}ebx, %1\n\t"
                            "cpuid\n\t"
                            "xchg{l}\t{%%}ebx, %1\n\t"
                            : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]),
                              "=d"(reg[3])
                            : "a"(func_id), "c"(0));

#else
                    __asm__("cpuid\n\t"
                            : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]),
                              "=d"(reg[3])
                            : "a"(func_id), "c"(0));
#endif

#else
#error "Unsupported configuration"
#endif
                };

                int regs[4];

                get_cpuid(regs, 0x1);

                sse2 = regs[3] >> 26 & 1;
                best = std::max(best, sse2::version() * sse2);

                sse3 = regs[2] >> 0 & 1;
                best = std::max(best, sse3::version() * sse3);

                ssse3 = regs[2] >> 9 & 1;
                best = std::max(best, ssse3::version() * ssse3);

                sse4_1 = regs[2] >> 19 & 1;
                best = std::max(best, sse4_1::version() * sse4_1);

                sse4_2 = regs[2] >> 20 & 1;
                best = std::max(best, sse4_2::version() * sse4_2);

                fma3_sse = regs[2] >> 12 & 1;
                if (sse4_2)
                    best = std::max(best, fma3<xsimd::sse4_2>::version() * fma3_sse);

                get_cpuid(regs, 0x80000001);
                fma4 = regs[2] >> 16 & 1;
                best = std::max(best, fma4::version() * fma4);

                // sse4a = regs[2] >> 6 & 1;
                // best = std::max(best, XSIMD_X86_AMD_SSE4A_VERSION * sse4a);

                // xop = regs[2] >> 11 & 1;
                // best = std::max(best, XSIMD_X86_AMD_XOP_VERSION * xop);

                avx = regs[2] >> 28 & 1;
                best = std::max(best, avx::version() * avx);

                fma3_avx = avx && fma3_sse;
                best = std::max(best, fma3<xsimd::avx>::version() * fma3_avx);

                get_cpuid(regs, 0x7);
                avx2 = regs[1] >> 5 & 1;
                best = std::max(best, avx2::version() * avx2);

                fma3_avx2 = avx2 && fma3_sse;
                best = std::max(best, fma3<xsimd::avx2>::version() * fma3_avx2);

                avx512f = regs[1] >> 16 & 1;
                best = std::max(best, avx512f::version() * avx512f);

                avx512cd = regs[1] >> 28 & 1;
                best = std::max(best, avx512cd::version() * avx512cd * avx512f);

                avx512dq = regs[1] >> 17 & 1;
                best = std::max(best, avx512dq::version() * avx512dq * avx512cd * avx512f);

                avx512bw = regs[1] >> 30 & 1;
                best = std::max(best, avx512bw::version() * avx512bw * avx512dq * avx512cd * avx512f);

#endif
            }
        };
    }

    inline detail::supported_arch available_architectures() noexcept
    {
        static detail::supported_arch supported;
        return supported;
    }
}

#endif
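Usage sketch (a hypothetical caller, not part of the imported sources): the detection result is cached in a function-local static, so repeated queries only cost a copy of the bitfield struct.

#include <cstdio>

int main()
{
    auto caps = xsimd::available_architectures();
    std::printf("avx2=%u avx512f=%u best=%u\n",
                unsigned(caps.avx2), unsigned(caps.avx512f), caps.best);
    return 0;
}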
@ -0,0 +1,719 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#include <cmath>
#include <cstdint>
#include <cstring>

namespace xsimd
{
    namespace detail
    {

        /* origin: boost/simd/arch/common/scalar/function/rem_pio2.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */
#if defined(_MSC_VER)
#define ONCE0 \
    __pragma(warning(push)) \
    __pragma(warning(disable : 4127)) while (0) \
    __pragma(warning(pop)) /**/
#else
#define ONCE0 while (0)
#endif

        /*
         * ====================================================
         * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
         *
         * Developed at SunPro, a Sun Microsystems, Inc. business.
         * Permission to use, copy, modify, and distribute this
         * software is freely granted, provided that this notice
         * is preserved.
         * ====================================================
         */

#if defined(__GNUC__) && defined(__BYTE_ORDER__)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define XSIMD_LITTLE_ENDIAN
#endif
#elif defined(_WIN32)
// We can safely assume that Windows is always little endian
#define XSIMD_LITTLE_ENDIAN
#elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
#define XSIMD_LITTLE_ENDIAN
#endif

#ifdef XSIMD_LITTLE_ENDIAN
#define LOW_WORD_IDX 0
#define HIGH_WORD_IDX sizeof(std::uint32_t)
#else
#define LOW_WORD_IDX sizeof(std::uint32_t)
#define HIGH_WORD_IDX 0
#endif

#define GET_HIGH_WORD(i, d) \
    do \
    { \
        double f = (d); \
        std::memcpy(&(i), reinterpret_cast<char*>(&f) + HIGH_WORD_IDX, \
                    sizeof(std::uint32_t)); \
    } \
    ONCE0 \
    /**/

#define GET_LOW_WORD(i, d) \
    do \
    { \
        double f = (d); \
        std::memcpy(&(i), reinterpret_cast<char*>(&f) + LOW_WORD_IDX, \
                    sizeof(std::uint32_t)); \
    } \
    ONCE0 \
    /**/

#define SET_HIGH_WORD(d, v) \
    do \
    { \
        double f = (d); \
        std::uint32_t value = (v); \
        std::memcpy(reinterpret_cast<char*>(&f) + HIGH_WORD_IDX, \
                    &value, sizeof(std::uint32_t)); \
        (d) = f; \
    } \
    ONCE0 \
    /**/

#define SET_LOW_WORD(d, v) \
    do \
    { \
        double f = (d); \
        std::uint32_t value = (v); \
        std::memcpy(reinterpret_cast<char*>(&f) + LOW_WORD_IDX, \
                    &value, sizeof(std::uint32_t)); \
        (d) = f; \
    } \
    ONCE0 \
    /**/

        /*
         * __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
         * double x[],y[]; int e0,nx,prec; int ipio2[];
         *
         * __kernel_rem_pio2 returns the last three digits of N with
         *      y = x - N*pi/2
         * so that |y| < pi/2.
         *
         * The method is to compute the integer (mod 8) and fraction parts of
         * (2/pi)*x without doing the full multiplication. In general we
         * skip the parts of the product that are known to be a huge integer (
         * more accurately, = 0 mod 8 ). Thus the number of operations is
         * independent of the exponent of the input.
         *
         * (2/pi) is represented by an array of 24-bit integers in ipio2[].
         *
         * Input parameters:
         * x[]    The input value (must be positive) is broken into nx
         *        pieces of 24-bit integers in double precision format.
         *        x[i] will be the i-th 24 bit of x. The scaled exponent
         *        of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
         *        matches x up to 24 bits).
         *
         *        Example of breaking a double positive z into x[0]+x[1]+x[2]:
         *            e0 = ilogb(z)-23
         *            z  = scalbn(z,-e0)
         *            for i = 0,1,2
         *                x[i] = floor(z)
         *                z    = (z-x[i])*2**24
         *
         * y[]    output result in an array of double precision numbers.
         *        The dimension of y[] is:
         *            24-bit precision    1
         *            53-bit precision    2
         *            64-bit precision    2
         *            113-bit precision   3
         *        The actual value is the sum of them. Thus for 113-bit
         *        precision, one may have to do something like:
         *
         *        long double t,w,r_head, r_tail;
         *        t = (long double)y[2] + (long double)y[1];
         *        w = (long double)y[0];
         *        r_head = t+w;
         *        r_tail = w - (r_head - t);
         *
         * e0     The exponent of x[0]
         *
         * nx     dimension of x[]
         *
         * prec   an integer indicating the precision:
         *            0   24 bits (single)
         *            1   53 bits (double)
         *            2   64 bits (extended)
         *            3   113 bits (quad)
         *
         * ipio2[]
         *        integer array, contains the (24*i)-th to (24*i+23)-th
         *        bit of 2/pi after binary point. The corresponding
         *        floating value is
         *
         *            ipio2[i] * 2^(-24(i+1)).
         *
         * External function:
         *        double scalbn(), floor();
         *
         *
         * Here is the description of some local variables:
         *
         * jk     jk+1 is the initial number of terms of ipio2[] needed
         *        in the computation. The recommended value is 2,3,4,
         *        6 for single, double, extended, and quad.
         *
         * jz     local integer variable indicating the number of
         *        terms of ipio2[] used.
         *
         * jx     nx - 1
         *
         * jv     index for pointing to the suitable ipio2[] for the
         *        computation. In general, we want
         *            ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
         *        is an integer. Thus
         *            e0-3-24*jv >= 0 or (e0-3)/24 >= jv
         *        Hence jv = max(0,(e0-3)/24).
         *
         * jp     jp+1 is the number of terms in PIo2[] needed, jp = jk.
         *
         * q[]    double array with integral value, representing the
         *        24-bits chunk of the product of x and 2/pi.
         *
         * q0     the corresponding exponent of q[0]. Note that the
         *        exponent for q[i] would be q0-24*i.
         *
         * PIo2[] double precision array, obtained by cutting pi/2
         *        into 24 bits chunks.
         *
         * f[]    ipio2[] in floating point
         *
         * iq[]   integer array by breaking up q[] in 24-bits chunk.
         *
         * fq[]   final product of x*(2/pi) in fq[0],..,fq[jk]
         *
         * ih     integer. If >0 it indicates q[] is >= 0.5, hence
         *        it also indicates the *sign* of the result.
         *
         */

        inline int32_t __kernel_rem_pio2(double* x, double* y, int32_t e0, int32_t nx, int32_t prec, const int32_t* ipio2) noexcept
        {
            static const int32_t init_jk[] = { 2, 3, 4, 6 }; /* initial value for jk */

            static const double PIo2[] = {
                1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
                7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
                5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
                3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
                1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
                1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
                2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
                2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
            };

            static const double zero = 0.0, one = 1.0,
                                two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
                                twon24 = 5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */

            int32_t jz, jx, jv, jp, jk, carry, n, iq[20], i, j, k, m, q0, ih;
            double z, fw, f[20], fq[20], q[20];

            /* initialize jk */
            jk = init_jk[prec];
            jp = jk;

            /* determine jx,jv,q0, note that 3>q0 */
            jx = nx - 1;
            jv = (e0 - 3) / 24;
            if (jv < 0)
                jv = 0;
            q0 = e0 - 24 * (jv + 1);

            /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
            j = jv - jx;
            m = jx + jk;
            for (i = 0; i <= m; i++, j++)
                f[i] = (j < 0) ? zero : (double)ipio2[j];

            /* compute q[0],q[1],...q[jk] */
            for (i = 0; i <= jk; i++)
            {
                for (j = 0, fw = 0.0; j <= jx; j++)
                    fw += x[j] * f[jx + i - j];
                q[i] = fw;
            }

            jz = jk;

        recompute:
            /* distill q[] into iq[] in reverse order */
            for (i = 0, j = jz, z = q[jz]; j > 0; i++, j--)
            {
                fw = (double)((int32_t)(twon24 * z));
                iq[i] = (int)(z - two24 * fw);
                z = q[j - 1] + fw;
            }

            /* compute n */
            z = std::scalbn(z, q0); /* actual value of z */
            z -= 8.0 * std::floor(z * 0.125); /* trim off integer >= 8 */
            n = (int32_t)z;
            z -= (double)n;
            ih = 0;
            if (q0 > 0)
            { /* need iq[jz-1] to determine n */
                i = (iq[jz - 1] >> (24 - q0));
                n += i;
                iq[jz - 1] -= i << (24 - q0);
                ih = iq[jz - 1] >> (23 - q0);
            }
            else if (q0 == 0)
                ih = iq[jz - 1] >> 23;
            else if (z >= 0.5)
                ih = 2;

            if (ih > 0)
            { /* q > 0.5 */
                n += 1;
                carry = 0;
                for (i = 0; i < jz; i++)
                { /* compute 1-q */
                    j = iq[i];
                    if (carry == 0)
                    {
                        if (j != 0)
                        {
                            carry = 1;
                            iq[i] = 0x1000000 - j;
                        }
                    }
                    else
                        iq[i] = 0xffffff - j;
                }
                if (q0 > 0)
                { /* rare case: chance is 1 in 12 */
                    switch (q0)
                    {
                    case 1:
                        iq[jz - 1] &= 0x7fffff;
                        break;
                    case 2:
                        iq[jz - 1] &= 0x3fffff;
                        break;
                    }
                }
                if (ih == 2)
                {
                    z = one - z;
                    if (carry != 0)
                        z -= std::scalbn(one, q0);
                }
            }

            /* check if recomputation is needed */
            if (z == zero)
            {
                j = 0;
                for (i = jz - 1; i >= jk; i--)
                    j |= iq[i];
                if (j == 0)
                { /* need recomputation */
                    for (k = 1; iq[jk - k] == 0; k++)
                        ; /* k = no. of terms needed */

                    for (i = jz + 1; i <= jz + k; i++)
                    { /* add q[jz+1] to q[jz+k] */
                        f[jx + i] = (double)ipio2[jv + i];
                        for (j = 0, fw = 0.0; j <= jx; j++)
                            fw += x[j] * f[jx + i - j];
                        q[i] = fw;
                    }
                    jz += k;
                    goto recompute;
                }
            }

            /* chop off zero terms */
            if (z == 0.0)
            {
                jz -= 1;
                q0 -= 24;
                while (iq[jz] == 0)
                {
                    jz--;
                    q0 -= 24;
                }
            }
            else
            { /* break z into 24-bit if necessary */
                z = std::scalbn(z, -q0);
                if (z >= two24)
                {
                    fw = (double)((int32_t)(twon24 * z));
                    iq[jz] = (int32_t)(z - two24 * fw);
                    jz += 1;
                    q0 += 24;
                    iq[jz] = (int32_t)fw;
                }
                else
                    iq[jz] = (int32_t)z;
            }

            /* convert integer "bit" chunk to floating-point value */
            fw = scalbn(one, q0);
            for (i = jz; i >= 0; i--)
            {
                q[i] = fw * (double)iq[i];
                fw *= twon24;
            }

            /* compute PIo2[0,...,jp]*q[jz,...,0] */
            for (i = jz; i >= 0; i--)
            {
                for (fw = 0.0, k = 0; k <= jp && k <= jz - i; k++)
                    fw += PIo2[k] * q[i + k];
                fq[jz - i] = fw;
            }

            /* compress fq[] into y[] */
            switch (prec)
            {
            case 0:
                fw = 0.0;
                for (i = jz; i >= 0; i--)
                    fw += fq[i];
                y[0] = (ih == 0) ? fw : -fw;
                break;
            case 1:
            case 2:
                fw = 0.0;
                for (i = jz; i >= 0; i--)
                    fw += fq[i];
                y[0] = (ih == 0) ? fw : -fw;
                fw = fq[0] - fw;
                for (i = 1; i <= jz; i++)
                    fw += fq[i];
                y[1] = (ih == 0) ? fw : -fw;
                break;
            case 3: /* painful */
                for (i = jz; i > 0; i--)
                {
                    fw = fq[i - 1] + fq[i];
                    fq[i] += fq[i - 1] - fw;
                    fq[i - 1] = fw;
                }
                for (i = jz; i > 1; i--)
                {
                    fw = fq[i - 1] + fq[i];
                    fq[i] += fq[i - 1] - fw;
                    fq[i - 1] = fw;
                }
                for (fw = 0.0, i = jz; i >= 2; i--)
                    fw += fq[i];
                if (ih == 0)
                {
                    y[0] = fq[0];
                    y[1] = fq[1];
                    y[2] = fw;
                }
                else
                {
                    y[0] = -fq[0];
                    y[1] = -fq[1];
                    y[2] = -fw;
                }
            }
            return n & 7;
        }

        inline std::int32_t __ieee754_rem_pio2(double x, double* y) noexcept
        {
            static const std::int32_t two_over_pi[] = {
                0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62,
                0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A,
                0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
                0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41,
                0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8,
                0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
                0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5,
                0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08,
                0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
                0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880,
                0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B,
            };

            static const std::int32_t npio2_hw[] = {
                0x3FF921FB, 0x400921FB, 0x4012D97C, 0x401921FB,
                0x401F6A7A, 0x4022D97C, 0x4025FDBB, 0x402921FB,
                0x402C463A, 0x402F6A7A, 0x4031475C, 0x4032D97C,
                0x40346B9C, 0x4035FDBB, 0x40378FDB, 0x403921FB,
                0x403AB41B, 0x403C463A, 0x403DD85A, 0x403F6A7A,
                0x40407E4C, 0x4041475C, 0x4042106C, 0x4042D97C,
                0x4043A28C, 0x40446B9C, 0x404534AC, 0x4045FDBB,
                0x4046C6CB, 0x40478FDB, 0x404858EB, 0x404921FB,
            };

            /*
             * invpio2:  53 bits of 2/pi
             * pio2_1:   first 33 bit of pi/2
             * pio2_1t:  pi/2 - pio2_1
             * pio2_2:   second 33 bit of pi/2
             * pio2_2t:  pi/2 - (pio2_1+pio2_2)
             * pio2_3:   third 33 bit of pi/2
             * pio2_3t:  pi/2 - (pio2_1+pio2_2+pio2_3)
             */

            static const double zero = 0.00000000000000000000e+00, /* 0x00000000, 0x00000000 */
                half = 5.00000000000000000000e-01, /* 0x3FE00000, 0x00000000 */
                two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
                invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
                pio2_1 = 1.57079632673412561417e+00, /* 0x3FF921FB, 0x54400000 */
                pio2_1t = 6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
                pio2_2 = 6.07710050630396597660e-11, /* 0x3DD0B461, 0x1A600000 */
                pio2_2t = 2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 */
                pio2_3 = 2.02226624871116645580e-21, /* 0x3BA3198A, 0x2E000000 */
                pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */

            double z = 0., w, t, r, fn;
            double tx[3];
            std::int32_t e0, i, j, nx, n, ix, hx;
            std::uint32_t low;

            GET_HIGH_WORD(hx, x); /* high word of x */
            ix = hx & 0x7fffffff;
            if (ix <= 0x3fe921fb) /* |x| ~<= pi/4, no need for reduction */
            {
                y[0] = x;
                y[1] = 0;
                return 0;
            }
            if (ix < 0x4002d97c)
            { /* |x| < 3pi/4, special case with n=+-1 */
                if (hx > 0)
                {
                    z = x - pio2_1;
                    if (ix != 0x3ff921fb)
                    { /* 33+53 bit pi is good enough */
                        y[0] = z - pio2_1t;
                        y[1] = (z - y[0]) - pio2_1t;
                    }
                    else
                    { /* near pi/2, use 33+33+53 bit pi */
                        z -= pio2_2;
                        y[0] = z - pio2_2t;
                        y[1] = (z - y[0]) - pio2_2t;
                    }
                    return 1;
                }
                else
                { /* negative x */
                    z = x + pio2_1;
                    if (ix != 0x3ff921fb)
                    { /* 33+53 bit pi is good enough */
                        y[0] = z + pio2_1t;
                        y[1] = (z - y[0]) + pio2_1t;
                    }
                    else
                    { /* near pi/2, use 33+33+53 bit pi */
                        z += pio2_2;
                        y[0] = z + pio2_2t;
                        y[1] = (z - y[0]) + pio2_2t;
                    }

                    return -1;
                }
            }
            if (ix <= 0x413921fb)
            { /* |x| ~<= 2^19*(pi/2), medium size */
                t = std::fabs(x);
                n = (std::int32_t)(t * invpio2 + half);
                fn = (double)n;
                r = t - fn * pio2_1;
                w = fn * pio2_1t; /* 1st round good to 85 bit */
                if ((n < 32) && (n > 0) && (ix != npio2_hw[n - 1]))
                {
                    y[0] = r - w; /* quick check no cancellation */
                }
                else
                {
                    std::uint32_t high;
                    j = ix >> 20;
                    y[0] = r - w;
                    GET_HIGH_WORD(high, y[0]);
                    i = j - static_cast<int32_t>((high >> 20) & 0x7ff);
                    if (i > 16)
                    { /* 2nd iteration needed, good to 118 */
                        t = r;
                        w = fn * pio2_2;
                        r = t - w;
                        w = fn * pio2_2t - ((t - r) - w);
                        y[0] = r - w;
                        GET_HIGH_WORD(high, y[0]);
                        i = j - static_cast<int32_t>((high >> 20) & 0x7ff);
                        if (i > 49)
                        { /* 3rd iteration needed, 151 bits acc */
                            t = r; /* will cover all possible cases */
                            w = fn * pio2_3;
                            r = t - w;
                            w = fn * pio2_3t - ((t - r) - w);
                            y[0] = r - w;
                        }
                    }
                }
                y[1] = (r - y[0]) - w;
                if (hx < 0)
                {
                    y[0] = -y[0];
                    y[1] = -y[1];
                    return -n;
                }
                else
                    return n;
            }
            /*
             * all other (large) arguments
             */
            if (ix >= 0x7ff00000)
            { /* x is inf or NaN */
                y[0] = y[1] = x - x;
                return 0;
            }
            /* set z = scalbn(|x|,ilogb(x)-23) */
            GET_LOW_WORD(low, x);
            SET_LOW_WORD(z, low);
            e0 = (ix >> 20) - 1046; /* e0 = ilogb(z)-23; */
            SET_HIGH_WORD(z, static_cast<uint32_t>(ix - (e0 << 20)));
            for (i = 0; i < 2; i++)
            {
                tx[i] = (double)((std::int32_t)(z));
                z = (z - tx[i]) * two24;
            }
            tx[2] = z;
            nx = 3;
            while (tx[nx - 1] == zero)
                nx--; /* skip zero term */
            n = __kernel_rem_pio2(tx, y, e0, nx, 2, two_over_pi);
            if (hx < 0)
            {
                y[0] = -y[0];
                y[1] = -y[1];
                return -n;
            }
            return n;
        }
    }

#undef XSIMD_LITTLE_ENDIAN
#undef SET_LOW_WORD
#undef SET_HIGH_WORD
#undef GET_LOW_WORD
#undef GET_HIGH_WORD
#undef HIGH_WORD_IDX
#undef LOW_WORD_IDX
#undef ONCE0
}
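Worked sketch of what the reduction delivers (a hypothetical caller, not part of the import): for a finite x, y[0] + y[1] approximates x - n * pi/2 in double-double precision with |y[0]| <= pi/4, and n modulo 4 selects the quadrant, which is all a sin/cos kernel needs.

#include <cstdio>

int main()
{
    double y[2];
    int n = xsimd::detail::__ieee754_rem_pio2(1.0e6, y);
    // y[0] + y[1] ~= 1.0e6 - n * (pi / 2)
    std::printf("quadrant=%d reduced=%.17g\n", n & 3, y[0] + y[1]);
    return 0;
}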
@ -0,0 +1,349 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_ALIGNED_ALLOCATOR_HPP
#define XSIMD_ALIGNED_ALLOCATOR_HPP

#include <algorithm>
#include <cstddef>
#include <utility>
#ifdef _WIN32
#include <malloc.h>
#else
#include <cstdlib>
#endif

#include <cassert>
#include <memory>

#include "../config/xsimd_arch.hpp"

namespace xsimd
{

    /**
     * @class aligned_allocator
     * @brief Allocator for aligned memory
     *
     * The aligned_allocator class template is an allocator that
     * performs memory allocation aligned by the specified value.
     *
     * @tparam T type of objects to allocate.
     * @tparam Align alignment in bytes.
     */
    template <class T, size_t Align = default_arch::alignment()>
    class aligned_allocator
    {
    public:
        using value_type = T;
        using pointer = T*;
        using const_pointer = const T*;
        using reference = T&;
        using const_reference = const T&;
        using size_type = size_t;
        using difference_type = ptrdiff_t;

        static constexpr size_t alignment = Align;

        template <class U>
        struct rebind
        {
            using other = aligned_allocator<U, Align>;
        };

        aligned_allocator() noexcept;
        aligned_allocator(const aligned_allocator& rhs) noexcept;

        template <class U>
        aligned_allocator(const aligned_allocator<U, Align>& rhs) noexcept;

        ~aligned_allocator();

        pointer address(reference) noexcept;
        const_pointer address(const_reference) const noexcept;

        pointer allocate(size_type n, const void* hint = 0);
        void deallocate(pointer p, size_type n);

        size_type max_size() const noexcept;
        size_type size_max() const noexcept;

        template <class U, class... Args>
        void construct(U* p, Args&&... args);

        template <class U>
        void destroy(U* p);
    };

    template <class T1, size_t Align1, class T2, size_t Align2>
    bool operator==(const aligned_allocator<T1, Align1>& lhs,
                    const aligned_allocator<T2, Align2>& rhs) noexcept;

    template <class T1, size_t Align1, class T2, size_t Align2>
    bool operator!=(const aligned_allocator<T1, Align1>& lhs,
                    const aligned_allocator<T2, Align2>& rhs) noexcept;

    void* aligned_malloc(size_t size, size_t alignment);
    void aligned_free(void* ptr);

    template <class T>
    size_t get_alignment_offset(const T* p, size_t size, size_t block_size);

    /************************************
     * aligned_allocator implementation *
     ************************************/

    /**
     * Default constructor.
     */
    template <class T, size_t A>
    inline aligned_allocator<T, A>::aligned_allocator() noexcept
    {
    }

    /**
     * Copy constructor.
     */
    template <class T, size_t A>
    inline aligned_allocator<T, A>::aligned_allocator(const aligned_allocator&) noexcept
    {
    }

    /**
     * Extended copy constructor.
     */
    template <class T, size_t A>
    template <class U>
    inline aligned_allocator<T, A>::aligned_allocator(const aligned_allocator<U, A>&) noexcept
    {
    }

    /**
     * Destructor.
     */
    template <class T, size_t A>
    inline aligned_allocator<T, A>::~aligned_allocator()
    {
    }

    /**
     * Returns the actual address of \c r even in presence of overloaded \c operator&.
     * @param r the object to acquire address of.
     * @return the actual address of \c r.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::address(reference r) noexcept -> pointer
    {
        return &r;
    }

    /**
     * Returns the actual address of \c r even in presence of overloaded \c operator&.
     * @param r the object to acquire address of.
     * @return the actual address of \c r.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::address(const_reference r) const noexcept -> const_pointer
    {
        return &r;
    }

    /**
     * Allocates <tt>n * sizeof(T)</tt> bytes of uninitialized memory, aligned by \c A.
     * The alignment may require some extra memory allocation.
     * @param n the number of objects to allocate storage for.
     * @param hint unused parameter provided for standard compliance.
     * @return a pointer to the first byte of a memory block suitably aligned and sufficient to
     * hold an array of \c n objects of type \c T.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::allocate(size_type n, const void*) -> pointer
    {
        pointer res = reinterpret_cast<pointer>(aligned_malloc(sizeof(T) * n, A));
#if defined(_CPPUNWIND) || defined(__cpp_exceptions)
        if (res == nullptr)
            throw std::bad_alloc();
#endif
        return res;
    }

    /**
     * Deallocates the storage referenced by the pointer p, which must be a pointer obtained by
     * an earlier call to allocate(). The argument \c n must be equal to the first argument of the call
     * to allocate() that originally produced \c p; otherwise, the behavior is undefined.
     * @param p pointer obtained from allocate().
     * @param n number of objects earlier passed to allocate().
     */
    template <class T, size_t A>
    inline void aligned_allocator<T, A>::deallocate(pointer p, size_type)
    {
        aligned_free(p);
    }

    /**
     * Returns the maximum theoretically possible value of \c n, for which the
     * call allocate(n, 0) could succeed.
     * @return the maximum supported allocated size.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::max_size() const noexcept -> size_type
    {
        return size_type(-1) / sizeof(T);
    }

    /**
     * This method is deprecated, use max_size() instead.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::size_max() const noexcept -> size_type
    {
        return size_type(-1) / sizeof(T);
    }

    /**
     * Constructs an object of type \c T in allocated uninitialized memory
     * pointed to by \c p, using placement-new.
     * @param p pointer to allocated uninitialized memory.
     * @param args the constructor arguments to use.
     */
    template <class T, size_t A>
    template <class U, class... Args>
    inline void aligned_allocator<T, A>::construct(U* p, Args&&... args)
    {
        new ((void*)p) U(std::forward<Args>(args)...);
    }

    /**
     * Calls the destructor of the object pointed to by \c p.
     * @param p pointer to the object that is going to be destroyed.
     */
    template <class T, size_t A>
    template <class U>
    inline void aligned_allocator<T, A>::destroy(U* p)
    {
        p->~U();
    }

    /**
     * @defgroup allocator_comparison Comparison operators
     */

    /**
     * @ingroup allocator_comparison
     * Compares two aligned memory allocators for equality. Since allocators
     * are stateless, returns \c true iff <tt>A1 == A2</tt>.
     * @param lhs aligned_allocator to compare.
     * @param rhs aligned_allocator to compare.
     * @return true if the allocators have the same alignment.
     */
    template <class T1, size_t A1, class T2, size_t A2>
    inline bool operator==(const aligned_allocator<T1, A1>& lhs,
                           const aligned_allocator<T2, A2>& rhs) noexcept
    {
        return lhs.alignment == rhs.alignment;
    }

    /**
     * @ingroup allocator_comparison
     * Compares two aligned memory allocators for inequality. Since allocators
     * are stateless, returns \c true iff <tt>A1 != A2</tt>.
     * @param lhs aligned_allocator to compare.
     * @param rhs aligned_allocator to compare.
     * @return true if the allocators have different alignments.
     */
    template <class T1, size_t A1, class T2, size_t A2>
    inline bool operator!=(const aligned_allocator<T1, A1>& lhs,
                           const aligned_allocator<T2, A2>& rhs) noexcept
    {
        return !(lhs == rhs);
    }

    /****************************************
     * aligned malloc / free implementation *
     ****************************************/

    namespace detail
    {
        inline void* xaligned_malloc(size_t size, size_t alignment)
        {
            assert(((alignment & (alignment - 1)) == 0) && "alignment must be a power of two");
            assert((alignment >= sizeof(void*)) && "alignment must be at least the size of a pointer");
            void* res = nullptr;
#ifdef _WIN32
            res = _aligned_malloc(size, alignment);
#else
            if (posix_memalign(&res, alignment, size) != 0)
            {
                res = nullptr;
            }
#endif
            return res;
        }

        inline void xaligned_free(void* ptr)
        {
#ifdef _WIN32
            _aligned_free(ptr);
#else
            free(ptr);
#endif
        }
    }

    inline void* aligned_malloc(size_t size, size_t alignment)
    {
        return detail::xaligned_malloc(size, alignment);
    }

    inline void aligned_free(void* ptr)
    {
        detail::xaligned_free(ptr);
    }

    template <class T>
    inline size_t get_alignment_offset(const T* p, size_t size, size_t block_size)
    {
        // size_t block_size = simd_traits<T>::size;
        if (block_size == 1)
        {
            // The simd_block consists of exactly one scalar, so all
            // elements of the array are "well" aligned.
            return 0;
        }
        else if (size_t(p) & (sizeof(T) - 1))
        {
            // The array is not aligned to the size of a single element,
            // so no element of the array is well aligned.
            return size;
        }
        else
        {
            size_t block_mask = block_size - 1;
            return std::min<size_t>(
                (block_size - ((size_t(p) / sizeof(T)) & block_mask)) & block_mask,
                size);
        }
    }

    template <class T, class A = default_arch>
    using default_allocator = typename std::conditional<A::requires_alignment(),
                                                        aligned_allocator<T, A::alignment()>,
                                                        std::allocator<T>>::type;
}

#endif
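Usage sketch (the 32-byte alignment is an assumption matching AVX; pick the target's value or rely on the default): the allocator drops straight into standard containers, which is what makes aligned SIMD loads from a std::vector safe.

#include <vector>

using avec = std::vector<float, xsimd::aligned_allocator<float, 32>>;

avec v(64, 1.0f); // v.data() is 32-byte aligned
// With 8-float blocks, no peeling is needed before the vectorized loop:
// xsimd::get_alignment_offset(v.data(), v.size(), 8) == 0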
@ -0,0 +1,76 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_ALIGNMENT_HPP
#define XSIMD_ALIGNMENT_HPP

#include "../types/xsimd_utils.hpp"
#include "xsimd_aligned_allocator.hpp"

namespace xsimd
{
    /**
     * @struct aligned_mode
     * @brief tag for load and store of aligned memory.
     */
    struct aligned_mode
    {
    };

    /**
     * @struct unaligned_mode
     * @brief tag for load and store of unaligned memory.
     */
    struct unaligned_mode
    {
    };

    /***********************
     * Allocator alignment *
     ***********************/

    template <class A>
    struct allocator_alignment
    {
        using type = unaligned_mode;
    };

    template <class T>
    struct allocator_alignment<aligned_allocator<T>>
    {
        using type = aligned_mode;
    };

    template <class A>
    using allocator_alignment_t = typename allocator_alignment<A>::type;

    /***********************
     * container alignment *
     ***********************/

    template <class C, class = void>
    struct container_alignment
    {
        using type = unaligned_mode;
    };

    template <class C>
    struct container_alignment<C, detail::void_t<typename C::allocator_type>>
    {
        using type = allocator_alignment_t<typename C::allocator_type>;
    };

    template <class C>
    using container_alignment_t = typename container_alignment<C>::type;

}

#endif
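Sketch of the intended tag dispatch (hypothetical static checks, not part of the import): a container whose allocator is the default-aligned xsimd allocator advertises aligned_mode, and anything else falls back to unaligned_mode.

#include <type_traits>
#include <vector>

static_assert(std::is_same<xsimd::container_alignment_t<
                               std::vector<float, xsimd::aligned_allocator<float>>>,
                           xsimd::aligned_mode>::value,
              "aligned allocator yields the aligned tag");
static_assert(std::is_same<xsimd::container_alignment_t<std::vector<float>>,
                           xsimd::unaligned_mode>::value,
              "the default allocator yields the unaligned tag");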
@ -0,0 +1,32 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#include "xsimd_fma3_sse_register.hpp"
#include "xsimd_fma4_register.hpp"
#include "xsimd_sse2_register.hpp"
#include "xsimd_sse3_register.hpp"
#include "xsimd_sse4_1_register.hpp"
#include "xsimd_sse4_2_register.hpp"

#include "xsimd_avx2_register.hpp"
#include "xsimd_avx_register.hpp"
#include "xsimd_fma3_avx2_register.hpp"
#include "xsimd_fma3_avx_register.hpp"

#include "xsimd_avx512bw_register.hpp"
#include "xsimd_avx512cd_register.hpp"
#include "xsimd_avx512dq_register.hpp"
#include "xsimd_avx512f_register.hpp"

#include "xsimd_neon64_register.hpp"
#include "xsimd_neon_register.hpp"

#include "xsimd_sve_register.hpp"

Diff not shown because of its large size.
@ -0,0 +1,40 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_AVX2_REGISTER_HPP
#define XSIMD_AVX2_REGISTER_HPP

#include "./xsimd_avx_register.hpp"

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * AVX2 instructions
     */
    struct avx2 : avx
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX2; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(2, 2, 0); }
        static constexpr char const* name() noexcept { return "avx2"; }
    };

#if XSIMD_WITH_AVX2
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx2, avx);
    }
#endif
}

#endif
@ -0,0 +1,48 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_AVX512BW_REGISTER_HPP
#define XSIMD_AVX512BW_REGISTER_HPP

#include "./xsimd_avx512dq_register.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX512BW instructions
     */
    struct avx512bw : avx512dq
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512BW; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(3, 4, 0); }
        static constexpr char const* name() noexcept { return "avx512bw"; }
    };

#if XSIMD_WITH_AVX512BW

    namespace types
    {
        template <class T>
        struct get_bool_simd_register<T, avx512bw>
        {
            using type = simd_avx512_bool_register<T>;
        };

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512bw, avx512dq);

    }
#endif
}
#endif
@ -0,0 +1,48 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_AVX512CD_REGISTER_HPP
#define XSIMD_AVX512CD_REGISTER_HPP

#include "./xsimd_avx512f_register.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX512CD instructions
     */
    struct avx512cd : avx512f
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512CD; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(3, 2, 0); }
        static constexpr char const* name() noexcept { return "avx512cd"; }
    };

#if XSIMD_WITH_AVX512CD

    namespace types
    {
        template <class T>
        struct get_bool_simd_register<T, avx512cd>
        {
            using type = simd_avx512_bool_register<T>;
        };

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512cd, avx512f);

    }
#endif
}
#endif
@ -0,0 +1,48 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_AVX512DQ_REGISTER_HPP
#define XSIMD_AVX512DQ_REGISTER_HPP

#include "./xsimd_avx512cd_register.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX512DQ instructions
     */
    struct avx512dq : avx512cd
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512DQ; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(3, 3, 0); }
        static constexpr char const* name() noexcept { return "avx512dq"; }
    };

#if XSIMD_WITH_AVX512DQ

    namespace types
    {
        template <class T>
        struct get_bool_simd_register<T, avx512dq>
        {
            using type = simd_avx512_bool_register<T>;
        };

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512dq, avx512cd);

    }
#endif
}
#endif
@ -0,0 +1,75 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_AVX512F_REGISTER_HPP
#define XSIMD_AVX512F_REGISTER_HPP

#include "./xsimd_generic_arch.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX512F instructions
     */
    struct avx512f : generic
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512F; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(3, 1, 0); }
        static constexpr std::size_t alignment() noexcept { return 64; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr char const* name() noexcept { return "avx512f"; }
    };

#if XSIMD_WITH_AVX512F

    namespace types
    {
        template <class T>
        struct simd_avx512_bool_register
        {
            using register_type = typename std::conditional<
                (sizeof(T) < 4), std::conditional<(sizeof(T) == 1), __mmask64, __mmask32>,
                std::conditional<(sizeof(T) == 4), __mmask16, __mmask8>>::type::type;
            register_type data;
            simd_avx512_bool_register() = default;
            simd_avx512_bool_register(register_type r) { data = r; }
            operator register_type() const noexcept { return data; }
        };
        template <class T>
        struct get_bool_simd_register<T, avx512f>
        {
            using type = simd_avx512_bool_register<T>;
        };

        XSIMD_DECLARE_SIMD_REGISTER(bool, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(signed char, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(char, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(short, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(long int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(float, avx512f, __m512);
        XSIMD_DECLARE_SIMD_REGISTER(double, avx512f, __m512d);

    }
#endif
}

#endif
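Sketch of the mask-width selection above (guarded, since the types only exist when AVX512F is enabled): the nested std::conditional picks one mask bit per lane, so 8-bit lanes of a 512-bit register need __mmask64 while 64-bit lanes need only __mmask8.

#if XSIMD_WITH_AVX512F
#include <type_traits>
static_assert(std::is_same<xsimd::types::simd_avx512_bool_register<char>::register_type,
                           __mmask64>::value,
              "64 x 8-bit lanes need a 64-bit mask");
static_assert(std::is_same<xsimd::types::simd_avx512_bool_register<double>::register_type,
                           __mmask8>::value,
              "8 x 64-bit lanes need an 8-bit mask");
#endif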
@ -0,0 +1,62 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_AVX_REGISTER_HPP
#define XSIMD_AVX_REGISTER_HPP

#include "./xsimd_generic_arch.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX instructions
     */
    struct avx : generic
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(2, 1, 0); }
        static constexpr std::size_t alignment() noexcept { return 32; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr char const* name() noexcept { return "avx"; }
    };
}

#if XSIMD_WITH_AVX

#include <immintrin.h>

namespace xsimd
{
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER(bool, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(signed char, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(char, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(short, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(long int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(float, avx, __m256);
        XSIMD_DECLARE_SIMD_REGISTER(double, avx, __m256d);
    }
}
#endif
#endif

Diff not shown because of its large size.
@ -0,0 +1,147 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_BATCH_CONSTANT_HPP
#define XSIMD_BATCH_CONSTANT_HPP

#include "./xsimd_batch.hpp"
#include "./xsimd_utils.hpp"

namespace xsimd
{
    /**
     * @brief batch of boolean constants
     *
     * Abstract representation of a batch of boolean constants.
     *
     * @tparam batch_type the type of the associated batch values.
     * @tparam Values boolean constants represented by this batch
     **/
    template <class batch_type, bool... Values>
    struct batch_bool_constant
    {
        static constexpr std::size_t size = sizeof...(Values);
        using arch_type = typename batch_type::arch_type;
        using value_type = bool;
        static_assert(sizeof...(Values) == batch_type::size, "consistent batch size");

        operator batch_bool<typename batch_type::value_type, arch_type>() const noexcept { return { Values... }; }

        bool get(size_t i) const noexcept
        {
            return std::array<value_type, size> { { Values... } }[i];
        }

        static constexpr int mask() noexcept
        {
            return mask_helper(0, static_cast<int>(Values)...);
        }

    private:
        static constexpr int mask_helper(int acc) noexcept { return acc; }
        template <class... Tys>
        static constexpr int mask_helper(int acc, int mask, Tys... masks) noexcept
        {
            return mask_helper(acc | mask, (masks << 1)...);
        }
    };

    /**
     * @brief batch of integral constants
     *
     * Abstract representation of a batch of integral constants.
     *
     * @tparam batch_type the type of the associated batch values.
     * @tparam Values constants represented by this batch
     **/
    template <class batch_type, typename batch_type::value_type... Values>
    struct batch_constant
    {
        static constexpr std::size_t size = sizeof...(Values);
        using arch_type = typename batch_type::arch_type;
        using value_type = typename batch_type::value_type;
        static_assert(sizeof...(Values) == batch_type::size, "consistent batch size");

        /**
         * @brief Generate a batch of @p batch_type from this @p batch_constant
         */
        operator batch_type() const noexcept { return { Values... }; }

        /**
         * @brief Get the @p i th element of this @p batch_constant
         */
        constexpr value_type get(size_t i) const noexcept
        {
            return get(i, std::array<value_type, size> { Values... });
        }

    private:
        constexpr value_type get(size_t i, std::array<value_type, size> const& values) const noexcept
        {
            return values[i];
        }
    };

    namespace detail
    {
        template <class batch_type, class G, std::size_t... Is>
        inline constexpr auto make_batch_constant(detail::index_sequence<Is...>) noexcept
            -> batch_constant<batch_type, (typename batch_type::value_type)G::get(Is, sizeof...(Is))...>
        {
            return {};
        }
        template <class batch_type, class G, std::size_t... Is>
        inline constexpr auto make_batch_bool_constant(detail::index_sequence<Is...>) noexcept
            -> batch_bool_constant<batch_type, G::get(Is, sizeof...(Is))...>
        {
            return {};
        }

    } // namespace detail

    /**
     * @brief Build a @c batch_constant out of a generator function
     *
     * @tparam batch_type type of the (non-constant) batch to build
     * @tparam G type used to generate that batch. That type must have a static
     * member @c get that's used to generate the batch constant. Conversely, the
     * generated batch_constant has value `{G::get(0, batch_size), ... , G::get(batch_size - 1, batch_size)}`
     *
     * The following generator produces a batch of `(n - 1, 0, 1, ... n-2)`
     *
     * @code
     * struct Rot
     * {
     *     static constexpr unsigned get(unsigned i, unsigned n)
     *     {
     *         return (i + n - 1) % n;
     *     }
     * };
     * @endcode
     */
    template <class batch_type, class G>
    inline constexpr auto make_batch_constant() noexcept -> decltype(detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>()))
    {
        return detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>());
    }

    template <class batch_type, class G>
    inline constexpr auto make_batch_bool_constant() noexcept
        -> decltype(detail::make_batch_bool_constant<batch_type, G>(
            detail::make_index_sequence<batch_type::size>()))
    {
        return detail::make_batch_bool_constant<batch_type, G>(
            detail::make_index_sequence<batch_type::size>());
    }

} // namespace xsimd

#endif
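Sketch tying the Rot generator from the comment above to a concrete batch (the unsigned AVX2 batch type is an assumption; any enabled arch works the same way):

struct Rot
{
    static constexpr unsigned get(unsigned i, unsigned n) { return (i + n - 1) % n; }
};

#if XSIMD_WITH_AVX2
// An 8-lane compile-time constant { 7, 0, 1, 2, 3, 4, 5, 6 }.
auto rot_idx = xsimd::make_batch_constant<xsimd::batch<unsigned, xsimd::avx2>, Rot>();
#endif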
@ -0,0 +1,46 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_FMA3_AVX2_REGISTER_HPP
#define XSIMD_FMA3_AVX2_REGISTER_HPP

#include "./xsimd_avx2_register.hpp"

namespace xsimd
{
    template <typename arch>
    struct fma3;

    /**
     * @ingroup arch
     *
     * AVX2 + FMA instructions
     */
    template <>
    struct fma3<avx2> : avx2
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_AVX2; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(2, 2, 1); }
        static constexpr char const* name() noexcept { return "fma3+avx2"; }
    };

#if XSIMD_WITH_FMA3_AVX2
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<avx2>, avx2);

    }
#endif

}
#endif
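Compile-time sketch of querying the wrapper (guarded on the build macro; the ordering check assumes generic::version encodes major/minor/patch into a single increasing integer, which the generic arch header, not shown here, is expected to do):

#if XSIMD_WITH_FMA3_AVX2
static_assert(xsimd::fma3<xsimd::avx2>::supported(), "consistent with the build flags");
static_assert(xsimd::fma3<xsimd::avx2>::version() > xsimd::avx2::version(),
              "the fused variant outranks plain avx2 during dispatch");
#endif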
@ -0,0 +1,46 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#ifndef XSIMD_FMA3_AVX_REGISTER_HPP
|
||||
#define XSIMD_FMA3_AVX_REGISTER_HPP
|
||||
|
||||
#include "./xsimd_avx_register.hpp"
|
||||
|
||||
namespace xsimd
|
||||
{
|
||||
template <typename arch>
|
||||
struct fma3;
|
||||
|
||||
/**
|
||||
* @ingroup arch
|
||||
*
|
||||
* AVX + FMA instructions
|
||||
*/
|
||||
template <>
|
||||
struct fma3<avx> : avx
|
||||
{
|
||||
static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_AVX; }
|
||||
static constexpr bool available() noexcept { return true; }
|
||||
static constexpr unsigned version() noexcept { return generic::version(2, 1, 1); }
|
||||
static constexpr char const* name() noexcept { return "fma3+avx"; }
|
||||
};
|
||||
|
||||
#if XSIMD_WITH_FMA3_AVX
|
||||
namespace types
|
||||
{
|
||||
|
||||
XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<avx>, avx);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
#endif
@@ -0,0 +1,46 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_FMA3_SSE_REGISTER_HPP
#define XSIMD_FMA3_SSE_REGISTER_HPP

#include "./xsimd_sse4_2_register.hpp"

namespace xsimd
{
    template <typename arch>
    struct fma3;

    /**
     * @ingroup arch
     *
     * SSE4.2 + FMA instructions
     */
    template <>
    struct fma3<sse4_2> : sse4_2
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_SSE; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 4, 3); }
        static constexpr char const* name() noexcept { return "fma3+sse4.2"; }
    };

#if XSIMD_WITH_FMA3_SSE
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<sse4_2>, sse4_2);

    }
#endif

}
#endif
@@ -0,0 +1,42 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_FMA4_REGISTER_HPP
#define XSIMD_FMA4_REGISTER_HPP

#include "./xsimd_sse4_2_register.hpp"

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * FMA4 instructions
     */
    struct fma4 : sse4_2
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_FMA4; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 4, 4); }
        static constexpr char const* name() noexcept { return "fma4"; }
    };

#if XSIMD_WITH_FMA4
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma4, sse4_2);

    }
#endif

}
#endif
@@ -0,0 +1,35 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_GENERIC_ARCH_HPP
#define XSIMD_GENERIC_ARCH_HPP

#include "../config/xsimd_config.hpp"

/**
 * @defgroup arch Architecture description
 */
namespace xsimd
{
    struct generic
    {
        static constexpr bool supported() noexcept { return true; }
        static constexpr bool available() noexcept { return true; }
        static constexpr std::size_t alignment() noexcept { return 0; }
        static constexpr bool requires_alignment() noexcept { return false; }
        static constexpr unsigned version() noexcept { return generic::version(0, 0, 0); }

    protected:
        static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch) noexcept { return major * 10000u + minor * 100u + patch; }
    };
}

#endif
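
A quick worked instance of the encoding above (editor's illustration, not in the commit): version(major, minor, patch) collapses a dotted version into one comparable integer, so NEON64 below, at 8.1.0, reports 80100.

// Illustration: 8.1.0 encodes to 8 * 10000 + 1 * 100 + 0 = 80100.
#include "xsimd/xsimd.hpp"

static_assert(xsimd::neon64::version() == 80100u,
              "version() packs (major, minor, patch) positionally");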
@@ -0,0 +1,52 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_NEON64_REGISTER_HPP
#define XSIMD_NEON64_REGISTER_HPP

#include "xsimd_neon_register.hpp"

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * NEON instructions for arm64
     */
    struct neon64 : neon
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_NEON64; }
        static constexpr bool available() noexcept { return true; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr std::size_t alignment() noexcept { return 16; }
        static constexpr unsigned version() noexcept { return generic::version(8, 1, 0); }
        static constexpr char const* name() noexcept { return "arm64+neon"; }
    };

#if XSIMD_WITH_NEON64

    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(neon64, neon);
        XSIMD_DECLARE_SIMD_REGISTER(double, neon64, float64x2_t);

        template <class T>
        struct get_bool_simd_register<T, neon64>
            : detail::neon_bool_simd_register<T, neon64>
        {
        };
    }

#endif

}

#endif
@@ -0,0 +1,155 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_NEON_REGISTER_HPP
#define XSIMD_NEON_REGISTER_HPP

#include "xsimd_generic_arch.hpp"
#include "xsimd_register.hpp"

#if XSIMD_WITH_NEON
#include <arm_neon.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * NEON instructions for arm32
     */
    struct neon : generic
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_NEON; }
        static constexpr bool available() noexcept { return true; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr std::size_t alignment() noexcept { return 16; }
        static constexpr unsigned version() noexcept { return generic::version(7, 0, 0); }
        static constexpr char const* name() noexcept { return "arm32+neon"; }
    };

#if XSIMD_WITH_NEON
    namespace types
    {
        namespace detail
        {
            template <size_t S>
            struct neon_vector_type_impl;

            template <>
            struct neon_vector_type_impl<8>
            {
                using signed_type = int8x16_t;
                using unsigned_type = uint8x16_t;
            };

            template <>
            struct neon_vector_type_impl<16>
            {
                using signed_type = int16x8_t;
                using unsigned_type = uint16x8_t;
            };

            template <>
            struct neon_vector_type_impl<32>
            {
                using signed_type = int32x4_t;
                using unsigned_type = uint32x4_t;
            };

            template <>
            struct neon_vector_type_impl<64>
            {
                using signed_type = int64x2_t;
                using unsigned_type = uint64x2_t;
            };

            template <class T>
            using signed_neon_vector_type = typename neon_vector_type_impl<8 * sizeof(T)>::signed_type;

            template <class T>
            using unsigned_neon_vector_type = typename neon_vector_type_impl<8 * sizeof(T)>::unsigned_type;

            template <class T>
            using neon_vector_type = typename std::conditional<std::is_signed<T>::value,
                                                               signed_neon_vector_type<T>,
                                                               unsigned_neon_vector_type<T>>::type;

            using char_neon_vector_type = typename std::conditional<std::is_signed<char>::value,
                                                                    signed_neon_vector_type<char>,
                                                                    unsigned_neon_vector_type<char>>::type;
        }

        XSIMD_DECLARE_SIMD_REGISTER(signed char, neon, detail::neon_vector_type<signed char>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, neon, detail::neon_vector_type<unsigned char>);
        XSIMD_DECLARE_SIMD_REGISTER(char, neon, detail::char_neon_vector_type);
        XSIMD_DECLARE_SIMD_REGISTER(short, neon, detail::neon_vector_type<short>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, neon, detail::neon_vector_type<unsigned short>);
        XSIMD_DECLARE_SIMD_REGISTER(int, neon, detail::neon_vector_type<int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, neon, detail::neon_vector_type<unsigned int>);
        XSIMD_DECLARE_SIMD_REGISTER(long int, neon, detail::neon_vector_type<long int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, neon, detail::neon_vector_type<unsigned long int>);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, neon, detail::neon_vector_type<long long int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, neon, detail::neon_vector_type<unsigned long long int>);
        XSIMD_DECLARE_SIMD_REGISTER(float, neon, float32x4_t);
        XSIMD_DECLARE_INVALID_SIMD_REGISTER(double, neon);

        namespace detail
        {
            template <size_t S>
            struct get_unsigned_type;

            template <>
            struct get_unsigned_type<1>
            {
                using type = uint8_t;
            };

            template <>
            struct get_unsigned_type<2>
            {
                using type = uint16_t;
            };

            template <>
            struct get_unsigned_type<4>
            {
                using type = uint32_t;
            };

            template <>
            struct get_unsigned_type<8>
            {
                using type = uint64_t;
            };

            template <size_t S>
            using get_unsigned_type_t = typename get_unsigned_type<S>::type;

            template <class T, class A>
            struct neon_bool_simd_register
            {
                using type = simd_register<get_unsigned_type_t<sizeof(T)>, A>;
            };
        }

        template <class T>
        struct get_bool_simd_register<T, neon>
            : detail::neon_bool_simd_register<T, neon>
        {
        };

    }
#endif

}

#endif
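
Editor's sketch, not part of the header: the width/signedness mapping done by detail::neon_vector_type can be spot-checked with static assertions. The checks assume a NEON-enabled build.

// Hypothetical spot checks of the scalar-to-vector mapping above.
#include "xsimd/xsimd.hpp"
#include <cstdint>
#include <type_traits>

#if XSIMD_WITH_NEON
static_assert(std::is_same<xsimd::types::detail::neon_vector_type<int32_t>,
                           int32x4_t>::value,
              "4-byte signed lanes map to int32x4_t");
static_assert(std::is_same<xsimd::types::detail::neon_vector_type<uint8_t>,
                           uint8x16_t>::value,
              "1-byte unsigned lanes map to uint8x16_t");
#endif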
@@ -0,0 +1,94 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_REGISTER_HPP
#define XSIMD_REGISTER_HPP

#include <type_traits>

namespace xsimd
{
    namespace types
    {
        template <class T, class A>
        struct has_simd_register : std::false_type
        {
        };

        template <class T, class Arch>
        struct simd_register
        {
            struct register_type
            {
            };
        };

#define XSIMD_DECLARE_SIMD_REGISTER(SCALAR_TYPE, ISA, VECTOR_TYPE)  \
    template <>                                                     \
    struct simd_register<SCALAR_TYPE, ISA>                          \
    {                                                               \
        using register_type = VECTOR_TYPE;                          \
        register_type data;                                         \
        operator register_type() const noexcept                     \
        {                                                           \
            return data;                                            \
        }                                                           \
    };                                                              \
    template <>                                                     \
    struct has_simd_register<SCALAR_TYPE, ISA> : std::true_type     \
    {                                                               \
    }

#define XSIMD_DECLARE_INVALID_SIMD_REGISTER(SCALAR_TYPE, ISA)       \
    template <>                                                     \
    struct has_simd_register<SCALAR_TYPE, ISA> : std::false_type    \
    {                                                               \
    }

#define XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ISA, ISA_BASE)                          \
    template <class T>                                                            \
    struct simd_register<T, ISA> : simd_register<T, ISA_BASE>                     \
    {                                                                             \
        using register_type = typename simd_register<T, ISA_BASE>::register_type; \
        simd_register(register_type reg) noexcept                                 \
            : simd_register<T, ISA_BASE> { reg }                                  \
        {                                                                         \
        }                                                                         \
        simd_register() = default;                                                \
    };                                                                            \
    template <class T>                                                            \
    struct has_simd_register<T, ISA> : has_simd_register<T, ISA_BASE>             \
    {                                                                             \
    }

        template <class T, class Arch>
        struct get_bool_simd_register
        {
            using type = simd_register<T, Arch>;
        };

        template <class T, class Arch>
        using get_bool_simd_register_t = typename get_bool_simd_register<T, Arch>::type;
    }

    namespace kernel
    {
        // Makes requires_arch equal to A const&, using type_traits functions.
        template <class A>
        using requires_arch = typename std::add_lvalue_reference<typename std::add_const<A>::type>::type;

        template <class T>
        struct convert
        {
        };
    }
}

#endif
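
To make the macros above concrete, here is what a single expansion amounts to, using float on SSE2 as the example pair. This is an editor's sketch; the compile-time checks only hold on an SSE2-enabled build.

// What XSIMD_DECLARE_SIMD_REGISTER(float, sse2, __m128) expands to,
// inside namespace xsimd::types:
//
//     template <>
//     struct simd_register<float, sse2>
//     {
//         using register_type = __m128;
//         register_type data;
//         operator register_type() const noexcept { return data; }
//     };
//     template <>
//     struct has_simd_register<float, sse2> : std::true_type
//     {
//     };
//
// Hypothetical compile-time checks of that wiring:
#include "xsimd/xsimd.hpp"
#include <type_traits>

#if XSIMD_WITH_SSE2
static_assert(xsimd::types::has_simd_register<float, xsimd::sse2>::value,
              "float is vectorizable on SSE2");
static_assert(std::is_same<xsimd::types::simd_register<float, xsimd::sse2>::register_type,
                           __m128>::value,
              "and its storage is a raw __m128");
#endif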
@@ -0,0 +1,61 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_SSE2_REGISTER_HPP
#define XSIMD_SSE2_REGISTER_HPP

#include "./xsimd_generic_arch.hpp"
#include "./xsimd_register.hpp"

#if XSIMD_WITH_SSE2
#include <emmintrin.h>
#include <xmmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSE2 instructions
     */
    struct sse2 : generic
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE2; }
        static constexpr bool available() noexcept { return true; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 2, 0); }
        static constexpr std::size_t alignment() noexcept { return 16; }
        static constexpr char const* name() noexcept { return "sse2"; }
    };

#if XSIMD_WITH_SSE2
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER(bool, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(signed char, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(char, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(short, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(long int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(float, sse2, __m128);
        XSIMD_DECLARE_SIMD_REGISTER(double, sse2, __m128d);
    }
#endif
}

#endif
@@ -0,0 +1,45 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_SSE3_REGISTER_HPP
#define XSIMD_SSE3_REGISTER_HPP

#include "./xsimd_sse2_register.hpp"

#if XSIMD_WITH_SSE3
#include <pmmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSE3 instructions
     */
    struct sse3 : sse2
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE3; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 3, 0); }
        static constexpr char const* name() noexcept { return "sse3"; }
    };

#if XSIMD_WITH_SSE3
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse3, sse2);
    }
#endif
}

#endif
@@ -0,0 +1,44 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_SSE4_1_REGISTER_HPP
#define XSIMD_SSE4_1_REGISTER_HPP

#include "./xsimd_ssse3_register.hpp"

#if XSIMD_WITH_SSE4_1
#include <smmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSE4.1 instructions
     */
    struct sse4_1 : ssse3
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_1; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 4, 1); }
        static constexpr char const* name() noexcept { return "sse4.1"; }
    };

#if XSIMD_WITH_SSE4_1
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse4_1, ssse3);
    }
#endif
}

#endif
@@ -0,0 +1,44 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_SSE4_2_REGISTER_HPP
#define XSIMD_SSE4_2_REGISTER_HPP

#include "./xsimd_sse4_1_register.hpp"

#if XSIMD_WITH_SSE4_2
#include <nmmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSE4.2 instructions
     */
    struct sse4_2 : sse4_1
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_2; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 4, 2); }
        static constexpr char const* name() noexcept { return "sse4.2"; }
    };

#if XSIMD_WITH_SSE4_2
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse4_2, sse4_1);
    }
#endif
}

#endif
@@ -0,0 +1,44 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_SSSE3_REGISTER_HPP
#define XSIMD_SSSE3_REGISTER_HPP

#include "./xsimd_sse3_register.hpp"

#if XSIMD_WITH_SSSE3
#include <tmmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSSE3 instructions
     */
    struct ssse3 : sse3
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSSE3; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 3, 1); }
        static constexpr char const* name() noexcept { return "ssse3"; }
    };

#if XSIMD_WITH_SSSE3
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ssse3, sse3);
    }
#endif
}

#endif
@@ -0,0 +1,155 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 * Copyright (c) Yibo Cai                                                  *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_SVE_REGISTER_HPP
#define XSIMD_SVE_REGISTER_HPP

#include "xsimd_generic_arch.hpp"
#include "xsimd_register.hpp"

#if XSIMD_WITH_SVE
#include <arm_sve.h>
#endif

namespace xsimd
{
    namespace detail
    {
        /**
         * @ingroup arch
         *
         * SVE instructions (fixed vector size) for arm64
         */
        template <size_t Width>
        struct sve : xsimd::generic
        {
            static constexpr bool supported() noexcept { return Width == XSIMD_SVE_BITS; }
            static constexpr bool available() noexcept { return true; }
            static constexpr bool requires_alignment() noexcept { return true; }
            static constexpr std::size_t alignment() noexcept { return 16; }
            static constexpr unsigned version() noexcept { return generic::version(9, 0, 0); }
            static constexpr char const* name() noexcept { return "arm64+sve"; }
        };
    }

#if XSIMD_WITH_SVE

    using sve = detail::sve<__ARM_FEATURE_SVE_BITS>;

    namespace types
    {
        namespace detail
        {
// define fixed size alias per SVE sizeless type
#define SVE_TO_FIXED_SIZE(ty) ty __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)))
            using sve_int8_t = SVE_TO_FIXED_SIZE(svint8_t);
            using sve_uint8_t = SVE_TO_FIXED_SIZE(svuint8_t);
            using sve_int16_t = SVE_TO_FIXED_SIZE(svint16_t);
            using sve_uint16_t = SVE_TO_FIXED_SIZE(svuint16_t);
            using sve_int32_t = SVE_TO_FIXED_SIZE(svint32_t);
            using sve_uint32_t = SVE_TO_FIXED_SIZE(svuint32_t);
            using sve_int64_t = SVE_TO_FIXED_SIZE(svint64_t);
            using sve_uint64_t = SVE_TO_FIXED_SIZE(svuint64_t);
            using sve_float32_t = SVE_TO_FIXED_SIZE(svfloat32_t);
            using sve_float64_t = SVE_TO_FIXED_SIZE(svfloat64_t);
            using sve_bool_t = SVE_TO_FIXED_SIZE(svbool_t);
#undef SVE_TO_FIXED_SIZE

            template <size_t S>
            struct sve_vector_type_impl;

            template <>
            struct sve_vector_type_impl<8>
            {
                using signed_type = sve_int8_t;
                using unsigned_type = sve_uint8_t;
                using floating_point_type = void;
            };

            template <>
            struct sve_vector_type_impl<16>
            {
                using signed_type = sve_int16_t;
                using unsigned_type = sve_uint16_t;
                using floating_point_type = void;
            };

            template <>
            struct sve_vector_type_impl<32>
            {
                using signed_type = sve_int32_t;
                using unsigned_type = sve_uint32_t;
                using floating_point_type = sve_float32_t;
            };

            template <>
            struct sve_vector_type_impl<64>
            {
                using signed_type = sve_int64_t;
                using unsigned_type = sve_uint64_t;
                using floating_point_type = sve_float64_t;
            };

            template <class T>
            using signed_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::signed_type;

            template <class T>
            using unsigned_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::unsigned_type;

            template <class T>
            using floating_point_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::floating_point_type;

            template <class T>
            using signed_int_or_floating_point_sve_vector_type = typename std::conditional<std::is_floating_point<T>::value,
                                                                                           floating_point_sve_vector_type<T>,
                                                                                           signed_int_sve_vector_type<T>>::type;

            template <class T>
            using sve_vector_type = typename std::conditional<std::is_signed<T>::value,
                                                              signed_int_or_floating_point_sve_vector_type<T>,
                                                              unsigned_int_sve_vector_type<T>>::type;
        } // namespace detail

        XSIMD_DECLARE_SIMD_REGISTER(signed char, sve, detail::sve_vector_type<signed char>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, sve, detail::sve_vector_type<unsigned char>);
        XSIMD_DECLARE_SIMD_REGISTER(char, sve, detail::sve_vector_type<char>);
        XSIMD_DECLARE_SIMD_REGISTER(short, sve, detail::sve_vector_type<short>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, sve, detail::sve_vector_type<unsigned short>);
        XSIMD_DECLARE_SIMD_REGISTER(int, sve, detail::sve_vector_type<int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, sve, detail::sve_vector_type<unsigned int>);
        XSIMD_DECLARE_SIMD_REGISTER(long int, sve, detail::sve_vector_type<long int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, sve, detail::sve_vector_type<unsigned long int>);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, sve, detail::sve_vector_type<long long int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, sve, detail::sve_vector_type<unsigned long long int>);
        XSIMD_DECLARE_SIMD_REGISTER(float, sve, detail::sve_vector_type<float>);
        XSIMD_DECLARE_SIMD_REGISTER(double, sve, detail::sve_vector_type<double>);

        namespace detail
        {
            struct sve_bool_simd_register
            {
                using register_type = sve_bool_t;
                register_type data;
                operator register_type() const noexcept { return data; }
            };
        } // namespace detail

        template <class T>
        struct get_bool_simd_register<T, sve>
        {
            using type = detail::sve_bool_simd_register;
        };
    } // namespace types
#endif
} // namespace xsimd

#endif
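
A hedged usage note from the editor: the fixed-size aliases above only exist when the vector length is pinned at build time, which is what defines __ARM_FEATURE_SVE_BITS in the first place.

// Hypothetical build line pinning the SVE width (clang/gcc flag):
//   clang++ -march=armv8-a+sve -msve-vector-bits=256 foo.cpp
// With the width pinned, the fixed-size types have a real sizeof:
#include "xsimd/xsimd.hpp"

#if XSIMD_WITH_SVE
static_assert(sizeof(xsimd::types::detail::sve_int32_t) * 8 == __ARM_FEATURE_SVE_BITS,
              "fixed-size SVE vectors carry the pinned bit width");
#endif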
@@ -0,0 +1,251 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_TRAITS_HPP
#define XSIMD_TRAITS_HPP

#include <type_traits>

#include "xsimd_batch.hpp"

namespace xsimd
{

    /**************************************
     * simd_traits and revert_simd_traits *
     **************************************/

    template <class T, class A = default_arch>
    struct has_simd_register : types::has_simd_register<T, A>
    {
    };

    namespace detail
    {
        template <class T, bool>
        struct simd_traits_impl;

        template <class T>
        struct simd_traits_impl<T, false>
        {
            using type = T;
            using bool_type = bool;
            static constexpr size_t size = 1;
        };

        template <class T>
        constexpr size_t simd_traits_impl<T, false>::size;

        template <class T>
        struct simd_traits_impl<T, true>
        {
            using type = batch<T>;
            using bool_type = typename type::batch_bool_type;
            static constexpr size_t size = type::size;
        };

        template <class T>
        constexpr size_t simd_traits_impl<T, true>::size;

        template <class T, class A>
        struct static_check_supported_config_emitter
        {

            static_assert(A::supported(),
                          "usage of batch type with unsupported architecture");
            static_assert(!A::supported() || xsimd::has_simd_register<T, A>::value,
                          "usage of batch type with unsupported type");
        };

        template <class T, class A>
        struct static_check_supported_config_emitter<std::complex<T>, A> : static_check_supported_config_emitter<T, A>
        {
        };

#ifdef XSIMD_ENABLE_XTL_COMPLEX
        template <class T, class A, bool i3ec>
        struct static_check_supported_config_emitter<xtl::xcomplex<T, T, i3ec>, A> : static_check_supported_config_emitter<T, A>
        {
        };
#endif

        // consistency checker
        template <class T, class A>
        void static_check_supported_config()
        {
            (void)static_check_supported_config_emitter<T, A>();
        }
    }

    template <class T>
    struct simd_traits : detail::simd_traits_impl<T, xsimd::has_simd_register<T>::value>
    {
    };

    template <class T>
    struct simd_traits<std::complex<T>>
        : detail::simd_traits_impl<std::complex<T>, xsimd::has_simd_register<T>::value>
    {
    };

#ifdef XSIMD_ENABLE_XTL_COMPLEX
    template <class T, bool i3ec>
    struct simd_traits<xtl::xcomplex<T, T, i3ec>>
        : detail::simd_traits_impl<std::complex<T>, xsimd::has_simd_register<T>::value>
    {
    };
#endif

    template <class T>
    struct revert_simd_traits
    {
        using type = T;
        static constexpr size_t size = simd_traits<type>::size;
    };

    template <class T>
    constexpr size_t revert_simd_traits<T>::size;

    template <class T>
    struct revert_simd_traits<batch<T>>
    {
        using type = T;
        static constexpr size_t size = batch<T>::size;
    };

    template <class T>
    constexpr size_t revert_simd_traits<batch<T>>::size;

    template <class T>
    using simd_type = typename simd_traits<T>::type;

    template <class T>
    using simd_bool_type = typename simd_traits<T>::bool_type;

    template <class T>
    using revert_simd_type = typename revert_simd_traits<T>::type;

    /********************
     * simd_return_type *
     ********************/

    namespace detail
    {
        template <class T1, class T2>
        struct simd_condition
        {
            static constexpr bool value =
                (std::is_same<T1, T2>::value && !std::is_same<T1, bool>::value)
                || (std::is_same<T1, bool>::value && !std::is_same<T2, bool>::value)
                || std::is_same<T1, float>::value
                || std::is_same<T1, double>::value
                || std::is_same<T1, int8_t>::value
                || std::is_same<T1, uint8_t>::value
                || std::is_same<T1, int16_t>::value
                || std::is_same<T1, uint16_t>::value
                || std::is_same<T1, int32_t>::value
                || std::is_same<T1, uint32_t>::value
                || std::is_same<T1, int64_t>::value
                || std::is_same<T1, uint64_t>::value
                || std::is_same<T1, char>::value
                || detail::is_complex<T1>::value;
        };

        template <class T1, class T2, class A>
        struct simd_return_type_impl
            : std::enable_if<simd_condition<T1, T2>::value, batch<T2, A>>
        {
        };

        template <class T2, class A>
        struct simd_return_type_impl<bool, T2, A>
            : std::enable_if<simd_condition<bool, T2>::value, batch_bool<T2, A>>
        {
        };

        template <class T2, class A>
        struct simd_return_type_impl<bool, std::complex<T2>, A>
            : std::enable_if<simd_condition<bool, T2>::value, batch_bool<T2, A>>
        {
        };

        template <class T1, class T2, class A>
        struct simd_return_type_impl<std::complex<T1>, T2, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

        template <class T1, class T2, class A>
        struct simd_return_type_impl<std::complex<T1>, std::complex<T2>, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

#ifdef XSIMD_ENABLE_XTL_COMPLEX
        template <class T1, class T2, bool I3EC, class A>
        struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, T2, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

        template <class T1, class T2, bool I3EC, class A>
        struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, std::complex<T2>, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

        template <class T1, class T2, bool I3EC, class A>
        struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, xtl::xcomplex<T2, T2, I3EC>, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

        template <class T1, class T2, bool I3EC, class A>
        struct simd_return_type_impl<std::complex<T1>, xtl::xcomplex<T2, T2, I3EC>, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };
#endif
    }

    template <class T1, class T2, class A = default_arch>
    using simd_return_type = typename detail::simd_return_type_impl<T1, T2, A>::type;

    /************
     * is_batch *
     ************/

    template <class V>
    struct is_batch : std::false_type
    {
    };

    template <class T, class A>
    struct is_batch<batch<T, A>> : std::true_type
    {
    };

    /*****************
     * is_batch_bool *
     *****************/

    template <class V>
    struct is_batch_bool : std::false_type
    {
    };

    template <class T, class A>
    struct is_batch_bool<batch_bool<T, A>> : std::true_type
    {
    };

    /********************
     * is_batch_complex *
     ********************/

    template <class V>
    struct is_batch_complex : std::false_type
    {
    };

    template <class T, class A>
    struct is_batch_complex<batch<std::complex<T>, A>> : std::true_type
    {
    };
}

#endif
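
Editor's sketch of how these traits are consumed, assuming a build with at least one supported architecture; only names defined in this header (plus default_arch) are used.

// Hypothetical demo: simd_traits picks a batch when one exists and
// falls back to the scalar type (size == 1) otherwise.
#include "xsimd/xsimd.hpp"

void traits_demo()
{
    using traits = xsimd::simd_traits<float>;
    static_assert(traits::size >= 1, "at least the scalar fallback");
    // bool as the first argument selects the predicate type:
    using cond_t = xsimd::simd_return_type<bool, float>;
    static_assert(xsimd::is_batch_bool<cond_t>::value,
                  "bool inputs map to batch_bool<float>");
}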
@@ -0,0 +1,530 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_UTILS_HPP
#define XSIMD_UTILS_HPP

#include <complex>
#include <cstdint>
#include <cstring>
#include <tuple>
#include <type_traits>

#ifdef XSIMD_ENABLE_XTL_COMPLEX
#include "xtl/xcomplex.hpp"
#endif

namespace xsimd
{

    template <class T, class A>
    class batch;

    template <class T, class A>
    class batch_bool;

    /*********
     * index *
     *********/

    template <size_t I>
    using index = std::integral_constant<size_t, I>;

    /**************
     * as_integer *
     **************/

    template <class T>
    struct as_integer : std::make_signed<T>
    {
    };

    template <>
    struct as_integer<float>
    {
        using type = int32_t;
    };

    template <>
    struct as_integer<double>
    {
        using type = int64_t;
    };

    template <class T, class A>
    struct as_integer<batch<T, A>>
    {
        using type = batch<typename as_integer<T>::type, A>;
    };

    template <class B>
    using as_integer_t = typename as_integer<B>::type;

    /***********************
     * as_unsigned_integer *
     ***********************/

    template <class T>
    struct as_unsigned_integer : std::make_unsigned<T>
    {
    };

    template <>
    struct as_unsigned_integer<float>
    {
        using type = uint32_t;
    };

    template <>
    struct as_unsigned_integer<double>
    {
        using type = uint64_t;
    };

    template <class T, class A>
    struct as_unsigned_integer<batch<T, A>>
    {
        using type = batch<typename as_unsigned_integer<T>::type, A>;
    };

    template <class T>
    using as_unsigned_integer_t = typename as_unsigned_integer<T>::type;

    /*********************
     * as_signed_integer *
     *********************/

    template <class T>
    struct as_signed_integer : std::make_signed<T>
    {
    };

    template <class T>
    using as_signed_integer_t = typename as_signed_integer<T>::type;

    /******************
     * flip_sign_type *
     ******************/

    namespace detail
    {
        template <class T, bool is_signed>
        struct flipped_sign_type_impl : std::make_signed<T>
        {
        };

        template <class T>
        struct flipped_sign_type_impl<T, true> : std::make_unsigned<T>
        {
        };
    }

    template <class T>
    struct flipped_sign_type
        : detail::flipped_sign_type_impl<T, std::is_signed<T>::value>
    {
    };

    template <class T>
    using flipped_sign_type_t = typename flipped_sign_type<T>::type;

    /************
     * as_float *
     ************/

    template <class T>
    struct as_float;

    template <>
    struct as_float<int32_t>
    {
        using type = float;
    };

    template <>
    struct as_float<int64_t>
    {
        using type = double;
    };

    template <class T, class A>
    struct as_float<batch<T, A>>
    {
        using type = batch<typename as_float<T>::type, A>;
    };

    template <class T>
    using as_float_t = typename as_float<T>::type;

    /**************
     * as_logical *
     **************/

    template <class T>
    struct as_logical;

    template <class T, class A>
    struct as_logical<batch<T, A>>
    {
        using type = batch_bool<T, A>;
    };

    template <class T>
    using as_logical_t = typename as_logical<T>::type;

    /************
     * bit_cast *
     ************/

    template <class To, class From>
    inline To bit_cast(From val) noexcept
    {
        static_assert(sizeof(From) == sizeof(To), "casting between compatible layout");
        // FIXME: Some old versions of GCC don't support that trait
        // static_assert(std::is_trivially_copyable<From>::value, "input type is trivially copyable");
        // static_assert(std::is_trivially_copyable<To>::value, "output type is trivially copyable");
        To res;
        std::memcpy(&res, &val, sizeof(val));
        return res;
    }
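
Editor's illustration, not in the header: the classic use of a memcpy-based bit_cast like the one above.

// Hypothetical usage: type-punning a float to its bit pattern without UB.
#include <cstdint>

inline std::uint32_t float_bits(float f) noexcept
{
    return xsimd::bit_cast<std::uint32_t>(f); // 1.0f yields 0x3f800000
}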

    namespace kernel
    {
        namespace detail
        {
            /**************************************
             * enabling / disabling metafunctions *
             **************************************/

            template <class T>
            using enable_integral_t = typename std::enable_if<std::is_integral<T>::value, int>::type;

            template <class T, size_t S>
            using enable_sized_signed_t = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value && sizeof(T) == S, int>::type;

            template <class T, size_t S>
            using enable_sized_unsigned_t = typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value && sizeof(T) == S, int>::type;

            template <class T, size_t S>
            using enable_sized_integral_t = typename std::enable_if<std::is_integral<T>::value && sizeof(T) == S, int>::type;

            template <class T, size_t S>
            using enable_sized_t = typename std::enable_if<sizeof(T) == S, int>::type;

            template <class T, size_t S>
            using enable_max_sized_integral_t = typename std::enable_if<std::is_integral<T>::value && sizeof(T) <= S, int>::type;

            /********************************
             * Matching & mismatching sizes *
             ********************************/

            template <class T, class U, class B = int>
            using sizes_match_t = typename std::enable_if<sizeof(T) == sizeof(U), B>::type;

            template <class T, class U, class B = int>
            using sizes_mismatch_t = typename std::enable_if<sizeof(T) != sizeof(U), B>::type;

            template <class T, class U, class B = int>
            using stride_match_t = typename std::enable_if<!std::is_same<T, U>::value && sizeof(T) == sizeof(U), B>::type;
        } // namespace detail
    } // namespace kernel

    /*****************************************
     * Backport of index_sequence from c++14 *
     *****************************************/

    // TODO: Remove this once we drop C++11 support
    namespace detail
    {
        template <typename T>
        struct identity
        {
            using type = T;
        };

#ifdef __cpp_lib_integer_sequence
        using std::index_sequence;
        using std::integer_sequence;
        using std::make_index_sequence;
        using std::make_integer_sequence;

        using std::index_sequence_for;
#else
        template <typename T, T... Is>
        struct integer_sequence
        {
            using value_type = T;
            static constexpr std::size_t size() noexcept { return sizeof...(Is); }
        };

        template <typename Lhs, typename Rhs>
        struct make_integer_sequence_concat;

        template <typename T, T... Lhs, T... Rhs>
        struct make_integer_sequence_concat<integer_sequence<T, Lhs...>,
                                            integer_sequence<T, Rhs...>>
            : identity<integer_sequence<T, Lhs..., (sizeof...(Lhs) + Rhs)...>>
        {
        };

        template <typename T>
        struct make_integer_sequence_impl;

        template <typename T>
        struct make_integer_sequence_impl<std::integral_constant<T, (T)0>> : identity<integer_sequence<T>>
        {
        };

        template <typename T>
        struct make_integer_sequence_impl<std::integral_constant<T, (T)1>> : identity<integer_sequence<T, 0>>
        {
        };

        template <typename T, T N>
        struct make_integer_sequence_impl<std::integral_constant<T, N>>
            : make_integer_sequence_concat<typename make_integer_sequence_impl<std::integral_constant<T, N / 2>>::type,
                                           typename make_integer_sequence_impl<std::integral_constant<T, N - (N / 2)>>::type>
        {
        };

        template <typename T, T N>
        using make_integer_sequence = typename make_integer_sequence_impl<std::integral_constant<T, N>>::type;

        template <std::size_t... Is>
        using index_sequence = integer_sequence<std::size_t, Is...>;

        template <std::size_t N>
        using make_index_sequence = make_integer_sequence<std::size_t, N>;

        template <typename... Ts>
        using index_sequence_for = make_index_sequence<sizeof...(Ts)>;

#endif

        template <int... Is>
        using int_sequence = integer_sequence<int, Is...>;

        template <int N>
        using make_int_sequence = make_integer_sequence<int, N>;

        template <typename... Ts>
        using int_sequence_for = make_int_sequence<(int)sizeof...(Ts)>;

        // Type-casted index sequence.
        template <class P, size_t... Is>
        inline P indexes_from(index_sequence<Is...>) noexcept
        {
            return { static_cast<typename P::value_type>(Is)... };
        }

        template <class P>
        inline P make_sequence_as_batch() noexcept
        {
            return indexes_from<P>(make_index_sequence<P::size>());
        }
    }
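
Editor's check, not part of the header: the halving-and-concatenating construction above really does yield a contiguous pack, and the assertion below holds on both the std and the backport branches.

// Illustration: make_index_sequence<4> is exactly {0, 1, 2, 3}.
#include <type_traits>

static_assert(std::is_same<xsimd::detail::make_index_sequence<4>,
                           xsimd::detail::index_sequence<0, 1, 2, 3>>::value,
              "divide-and-conquer construction yields 0..N-1");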

    /***********************************
     * Backport of std::get from C++14 *
     ***********************************/

    namespace detail
    {
        template <class T, class... Types, size_t I, size_t... Is>
        inline const T& get_impl(const std::tuple<Types...>& t, std::is_same<T, T>, index_sequence<I, Is...>) noexcept
        {
            return std::get<I>(t);
        }

        template <class T, class U, class... Types, size_t I, size_t... Is>
        inline const T& get_impl(const std::tuple<Types...>& t, std::is_same<T, U>, index_sequence<I, Is...>) noexcept
        {
            using tuple_elem = typename std::tuple_element<I + 1, std::tuple<Types...>>::type;
            return get_impl<T>(t, std::is_same<T, tuple_elem>(), index_sequence<Is...>());
        }

        template <class T, class... Types>
        inline const T& get(const std::tuple<Types...>& t) noexcept
        {
            using tuple_elem = typename std::tuple_element<0, std::tuple<Types...>>::type;
            return get_impl<T>(t, std::is_same<T, tuple_elem>(), make_index_sequence<sizeof...(Types)>());
        }
    }

    /*********************************
     * Backport of void_t from C++17 *
     *********************************/

    namespace detail
    {
        template <class... T>
        struct make_void
        {
            using type = void;
        };

        template <class... T>
        using void_t = typename make_void<T...>::type;
    }

    /**************************************************
     * Equivalent of void_t but with size_t parameter *
     **************************************************/

    namespace detail
    {
        template <std::size_t>
        struct check_size
        {
            using type = void;
        };

        template <std::size_t S>
        using check_size_t = typename check_size<S>::type;
    }

    /*****************************************
     * Supplementary std::array constructors *
     *****************************************/

    namespace detail
    {
        // std::array constructor from scalar value ("broadcast")
        template <typename T, std::size_t... Is>
        inline constexpr std::array<T, sizeof...(Is)>
        array_from_scalar_impl(const T& scalar, index_sequence<Is...>) noexcept
        {
            // You can safely ignore this silly ternary, the "scalar" is all
            // that matters. The rest is just a dirty workaround...
            return std::array<T, sizeof...(Is)> { (Is + 1) ? scalar : T()... };
        }

        template <typename T, std::size_t N>
        inline constexpr std::array<T, N>
        array_from_scalar(const T& scalar) noexcept
        {
            return array_from_scalar_impl(scalar, make_index_sequence<N>());
        }

        // std::array constructor from C-style pointer (handled as an array)
        template <typename T, std::size_t... Is>
        inline constexpr std::array<T, sizeof...(Is)>
        array_from_pointer_impl(const T* c_array, index_sequence<Is...>) noexcept
        {
            return std::array<T, sizeof...(Is)> { c_array[Is]... };
        }

        template <typename T, std::size_t N>
        inline constexpr std::array<T, N>
        array_from_pointer(const T* c_array) noexcept
        {
            return array_from_pointer_impl(c_array, make_index_sequence<N>());
        }
    }

    /************************
     * is_array_initializer *
     ************************/

    namespace detail
    {
        template <bool...>
        struct bool_pack;

        template <bool... bs>
        using all_true = std::is_same<
            bool_pack<bs..., true>, bool_pack<true, bs...>>;

        template <typename T, typename... Args>
        using is_all_convertible = all_true<std::is_convertible<Args, T>::value...>;

        template <typename T, std::size_t N, typename... Args>
        using is_array_initializer = std::enable_if<
            (sizeof...(Args) == N) && is_all_convertible<T, Args...>::value>;

        // Check that a variadic argument pack is a list of N values of type T,
        // as usable for instantiating a value of type std::array<T, N>.
        template <typename T, std::size_t N, typename... Args>
        using is_array_initializer_t = typename is_array_initializer<T, N, Args...>::type;
    }

    /**************
     * is_complex *
     **************/

    // This is used in both xsimd_complex_base.hpp and xsimd_traits.hpp
    // However xsimd_traits.hpp indirectly includes xsimd_complex_base.hpp
    // so we cannot define is_complex in xsimd_traits.hpp. Besides, if
    // no file defining batches is included, we still need this definition
    // in xsimd_traits.hpp, so let's define it here.

    namespace detail
    {
        template <class T>
        struct is_complex : std::false_type
        {
        };

        template <class T>
        struct is_complex<std::complex<T>> : std::true_type
        {
        };

#ifdef XSIMD_ENABLE_XTL_COMPLEX
        template <class T, bool i3ec>
        struct is_complex<xtl::xcomplex<T, T, i3ec>> : std::true_type
        {
        };
#endif
    }

    /*******************
     * real_batch_type *
     *******************/

    template <class B>
    struct real_batch_type
    {
        using type = B;
    };

    template <class T, class A>
    struct real_batch_type<batch<std::complex<T>, A>>
    {
        using type = batch<T, A>;
    };

    template <class B>
    using real_batch_type_t = typename real_batch_type<B>::type;

    /**********************
     * complex_batch_type *
     **********************/

    template <class B>
    struct complex_batch_type
    {
        using real_value_type = typename B::value_type;
        using arch_type = typename B::arch_type;
        using type = batch<std::complex<real_value_type>, arch_type>;
    };

    template <class T, class A>
    struct complex_batch_type<batch<std::complex<T>, A>>
    {
        using type = batch<std::complex<T>, A>;
    };

    template <class B>
    using complex_batch_type_t = typename complex_batch_type<B>::type;
}

#endif
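
Editor's illustration of the by-type tuple accessor defined above: detail::get walks the element types left to right and returns the first exact match.

// Hypothetical use of detail::get<T>: fetch the first int of the tuple.
#include "xsimd/xsimd.hpp"
#include <tuple>

inline int first_int(std::tuple<char, int, double> const& t) noexcept
{
    return xsimd::detail::get<int>(t); // selects the second element
}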
@@ -0,0 +1,68 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software.*
 ***************************************************************************/

#ifndef XSIMD_HPP
#define XSIMD_HPP

#if defined(__has_cpp_attribute)
// if this check passes, then the compiler supports feature test macros
#if __has_cpp_attribute(nodiscard) >= 201603L
// if this check passes, then the compiler supports [[nodiscard]] without a message
#define XSIMD_NO_DISCARD [[nodiscard]]
#endif
#endif

#if !defined(XSIMD_NO_DISCARD) && __cplusplus >= 201703L
// this means that the previous tests failed, but we are using C++17 or higher
#define XSIMD_NO_DISCARD [[nodiscard]]
#endif

#if !defined(XSIMD_NO_DISCARD) && (defined(__GNUC__) || defined(__clang__))
// this means that the previous checks failed, but we are using GCC or Clang
#define XSIMD_NO_DISCARD __attribute__((warn_unused_result))
#endif

#if !defined(XSIMD_NO_DISCARD)
// this means that all the previous checks failed, so we fall back to doing nothing
#define XSIMD_NO_DISCARD
#endif

#ifdef __cpp_if_constexpr
// this means that the compiler supports the `if constexpr` construct
#define XSIMD_IF_CONSTEXPR if constexpr
#endif

#if !defined(XSIMD_IF_CONSTEXPR) && __cplusplus >= 201703L
// this means that the previous test failed, but we are using C++17 or higher
#define XSIMD_IF_CONSTEXPR if constexpr
#endif

#if !defined(XSIMD_IF_CONSTEXPR)
// this means that all the previous checks failed, so we fall back to a normal `if`
#define XSIMD_IF_CONSTEXPR if
#endif

#include "config/xsimd_config.hpp"

#include "arch/xsimd_scalar.hpp"
#include "memory/xsimd_aligned_allocator.hpp"

#if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
// no type definitions nor anything apart from the scalar definitions and the aligned allocator
#else
#include "types/xsimd_batch.hpp"
#include "types/xsimd_batch_constant.hpp"
#include "types/xsimd_traits.hpp"

// This include must come last
#include "types/xsimd_api.hpp"
#endif
#endif
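
A last editor's sketch of the portability macros defined in this header; both uses are hypothetical.

// XSIMD_NO_DISCARD warns when a caller drops the result (where supported);
// XSIMD_IF_CONSTEXPR degrades to a plain `if` before C++17.
#include "xsimd/xsimd.hpp"

XSIMD_NO_DISCARD inline int checked_sum(int a, int b) noexcept
{
    return a + b;
}

template <bool UseFastPath>
inline int dispatch() noexcept
{
    XSIMD_IF_CONSTEXPR(UseFastPath) { return 1; }
    return 0;
}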
@@ -0,0 +1,37 @@
schema: 1

bugzilla:
  product: Toolkit
  component: "General"

origin:
  name: xsimd
  description: C++ wrappers for SIMD intrinsics

  url: https://github.com/QuantStack/xsimd

  release: 75b043b8e031f1ada8053fe80d5ba635e2a75588 (2023-01-05T06:45:23Z).
  revision: 75b043b8e031f1ada8053fe80d5ba635e2a75588

  license: BSD-3-Clause

vendoring:
  url: https://github.com/QuantStack/xsimd
  source-hosting: github
  tracking: commit

  exclude:
    - ".*"
    - "*.md"
    - "*.yml"
    - "*.txt"
    - "*.in"
    - "*.sh"
    - benchmark
    - cmake
    - docs
    - examples
    - test

  keep:
    - include/
@@ -2029,6 +2029,7 @@ into source code and to files in the following directories:
#ifdef MOZ_JXL
    <li><code>third_party/jpeg-xl/</code></li>
#endif
    <li><code>third_party/xsimd/</code></li>
  </ul>
  See the individual LICENSE files for copyright owners.</p>