Mirror of https://github.com/mozilla/gecko-dev.git
Backed out 3 changesets (bug 1804226, bug 1801557) for AudioNodeEngine-related bustages. CLOSED TREE
Backed out changeset 55a4d00bc8b2 (bug 1804226) Backed out changeset f47bedfe0f5f (bug 1801557) Backed out changeset cccb159a5b46 (bug 1801557)
Parent: 1481a588ef
Commit: 848e2cfb4b
@@ -3091,5 +3091,3 @@ set_config("MMX_FLAGS", ["-mmmx"])
 set_config("SSE_FLAGS", ["-msse"])
 set_config("SSE2_FLAGS", ["-msse2"])
 set_config("SSSE3_FLAGS", ["-mssse3"])
-set_config("SSE4_2_FLAGS", ["-msse4.2"])
-set_config("FMA_FLAGS", ["-mfma"])

@@ -9,15 +9,12 @@
 #include "mozilla/AbstractThread.h"
 #ifdef USE_NEON
 #  include "mozilla/arm.h"
-#  include "AudioNodeEngineGeneric.h"
+#  include "AudioNodeEngineNEON.h"
 #endif
 #ifdef USE_SSE2
 #  include "mozilla/SSE.h"
-#  include "AudioNodeEngineGeneric.h"
-#endif
-#if defined(USE_SSE42) && defined(USE_FMA3)
-#  include "mozilla/SSE.h"
-#  include "AudioNodeEngineGeneric.h"
+#  include "AlignmentUtils.h"
+#  include "AudioNodeEngineSSE2.h"
 #endif
 #include "AudioBlock.h"
 #include "Tracing.h"
@@ -70,8 +67,7 @@ void AudioBufferAddWithScale(const float* aInput, float aScale, float* aOutput,
                              uint32_t aSize) {
 #ifdef USE_NEON
   if (mozilla::supports_neon()) {
-    Engine<xsimd::neon>::AudioBufferAddWithScale(aInput, aScale, aOutput,
-                                                 aSize);
+    AudioBufferAddWithScale_NEON(aInput, aScale, aOutput, aSize);
     return;
   }
 #endif
@@ -97,16 +93,7 @@ void AudioBufferAddWithScale(const float* aInput, float aScale, float* aOutput,
     // we need to round aSize down to the nearest multiple of 16
     uint32_t alignedSize = aSize & ~0x0F;
     if (alignedSize > 0) {
-#  if defined(USE_SSE42) && defined(USE_FMA3)
-      if (mozilla::supports_fma3() && mozilla::supports_sse4_2()) {
-        Engine<xsimd::fma3<xsimd::sse4_2>>::AudioBufferAddWithScale(
-            aInput, aScale, aOutput, alignedSize);
-      } else
-#  endif
-      {
-        Engine<xsimd::sse2>::AudioBufferAddWithScale(aInput, aScale, aOutput,
-                                                     alignedSize);
-      }
+      AudioBufferAddWithScale_SSE(aInput, aScale, aOutput, alignedSize);
 
       // adjust parameters for use with scalar operations below
       aInput += alignedSize;
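
A note on the SSE path restored above: aSize & ~0x0F clears the low four bits, rounding the sample count down to a multiple of 16 (four 4-lane SSE vectors); the vector kernel runs on that prefix and the scalar code after the hunk consumes the remainder. A minimal sketch of the same prefix/tail split, with hypothetical ProcessVector/ProcessScalar helpers standing in for the real kernels:

    #include <cstdint>

    // Hypothetical helpers (not from the tree): ProcessVector consumes 16
    // floats per call, ProcessScalar consumes one float at a time.
    void ProcessVector(float* aData, uint32_t aCount);
    void ProcessScalar(float* aData, uint32_t aCount);

    void Process(float* aData, uint32_t aSize) {
      // Clear the low four bits: 37 & ~0x0F == 32, i.e. round down to the
      // nearest multiple of 16.
      uint32_t alignedSize = aSize & ~0x0F;
      if (alignedSize > 0) {
        ProcessVector(aData, alignedSize);
        aData += alignedSize;  // advance past the vectorized prefix
        aSize -= alignedSize;  // the scalar tail handles the remainder
      }
      ProcessScalar(aData, aSize);
    }
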
@@ -140,16 +127,14 @@ void AudioBlockCopyChannelWithScale(const float* aInput, float aScale,
   } else {
 #ifdef USE_NEON
     if (mozilla::supports_neon()) {
-      Engine<xsimd::neon>::AudioBlockCopyChannelWithScale(aInput, aScale,
-                                                          aOutput);
+      AudioBlockCopyChannelWithScale_NEON(aInput, aScale, aOutput);
       return;
     }
 #endif
 
 #ifdef USE_SSE2
     if (mozilla::supports_sse2()) {
-      Engine<xsimd::sse2>::AudioBlockCopyChannelWithScale(aInput, aScale,
-                                                          aOutput);
+      AudioBlockCopyChannelWithScale_SSE(aInput, aScale, aOutput);
       return;
     }
 #endif
@@ -164,16 +149,7 @@ void BufferComplexMultiply(const float* aInput, const float* aScale,
                            float* aOutput, uint32_t aSize) {
 #ifdef USE_SSE2
   if (mozilla::supports_sse()) {
-#  if defined(USE_SSE42) && defined(USE_FMA3)
-    if (mozilla::supports_fma3() && mozilla::supports_sse4_2()) {
-      Engine<xsimd::fma3<xsimd::sse4_2>>::BufferComplexMultiply(aInput, aScale,
-                                                                aOutput, aSize);
-    } else
-#  endif
-    {
-      Engine<xsimd::sse2>::BufferComplexMultiply(aInput, aScale, aOutput,
-                                                 aSize);
-    }
+    BufferComplexMultiply_SSE(aInput, aScale, aOutput, aSize);
     return;
   }
 #endif
@@ -206,16 +182,14 @@ void AudioBlockCopyChannelWithScale(const float aInput[WEBAUDIO_BLOCK_SIZE],
                                     float aOutput[WEBAUDIO_BLOCK_SIZE]) {
 #ifdef USE_NEON
   if (mozilla::supports_neon()) {
-    Engine<xsimd::neon>::AudioBlockCopyChannelWithScale(aInput, aScale,
-                                                        aOutput);
+    AudioBlockCopyChannelWithScale_NEON(aInput, aScale, aOutput);
     return;
   }
 #endif
 
 #ifdef USE_SSE2
   if (mozilla::supports_sse2()) {
-    Engine<xsimd::sse2>::AudioBlockCopyChannelWithScale(aInput, aScale,
-                                                        aOutput);
+    AudioBlockCopyChannelWithScale_SSE(aInput, aScale, aOutput);
     return;
   }
 #endif
@@ -240,14 +214,14 @@ void AudioBufferInPlaceScale(float* aBlock, float aScale, uint32_t aSize) {
   }
 #ifdef USE_NEON
   if (mozilla::supports_neon()) {
-    Engine<xsimd::neon>::AudioBufferInPlaceScale(aBlock, aScale, aSize);
+    AudioBufferInPlaceScale_NEON(aBlock, aScale, aSize);
     return;
   }
 #endif
 
 #ifdef USE_SSE2
   if (mozilla::supports_sse2()) {
-    Engine<xsimd::sse2>::AudioBufferInPlaceScale(aBlock, aScale, aSize);
+    AudioBufferInPlaceScale_SSE(aBlock, aScale, aSize);
     return;
   }
 #endif
@@ -260,14 +234,14 @@ void AudioBufferInPlaceScale(float* aBlock, float aScale, uint32_t aSize) {
 void AudioBufferInPlaceScale(float* aBlock, float* aScale, uint32_t aSize) {
 #ifdef USE_NEON
   if (mozilla::supports_neon()) {
-    Engine<xsimd::neon>::AudioBufferInPlaceScale(aBlock, aScale, aSize);
+    AudioBufferInPlaceScale_NEON(aBlock, aScale, aSize);
     return;
   }
 #endif
 
 #ifdef USE_SSE2
   if (mozilla::supports_sse2()) {
-    Engine<xsimd::sse2>::AudioBufferInPlaceScale(aBlock, aScale, aSize);
+    AudioBufferInPlaceScale_SSE(aBlock, aScale, aSize);
     return;
   }
 #endif
@@ -301,24 +275,16 @@ void AudioBlockPanStereoToStereo(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                  float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
 #ifdef USE_NEON
   if (mozilla::supports_neon()) {
-    Engine<xsimd::neon>::AudioBlockPanStereoToStereo(
-        aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
+    AudioBlockPanStereoToStereo_NEON(aInputL, aInputR, aGainL, aGainR,
+                                     aIsOnTheLeft, aOutputL, aOutputR);
     return;
   }
 #endif
 
 #ifdef USE_SSE2
   if (mozilla::supports_sse2()) {
-#  if defined(USE_SSE42) && defined(USE_FMA3)
-    if (mozilla::supports_fma3() && mozilla::supports_sse4_2()) {
-      Engine<xsimd::fma3<xsimd::sse4_2>>::AudioBlockPanStereoToStereo(
-          aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
-    } else
-#  endif
-    {
-      Engine<xsimd::sse2>::AudioBlockPanStereoToStereo(
-          aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
-    }
+    AudioBlockPanStereoToStereo_SSE(aInputL, aInputR, aGainL, aGainR,
+                                    aIsOnTheLeft, aOutputL, aOutputR);
     return;
   }
 #endif
@@ -347,24 +313,8 @@ void AudioBlockPanStereoToStereo(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                  float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
 #ifdef USE_NEON
   if (mozilla::supports_neon()) {
-    Engine<xsimd::neon>::AudioBlockPanStereoToStereo(
-        aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
-    return;
-  }
-#endif
-
-#ifdef USE_SSE2
-  if (mozilla::supports_sse2()) {
-#  if defined(USE_SSE42) && defined(USE_FMA3)
-    if (mozilla::supports_fma3() && mozilla::supports_sse4_2()) {
-      Engine<xsimd::fma3<xsimd::sse2>>::AudioBlockPanStereoToStereo(
-          aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
-    } else
-#  endif
-    {
-      Engine<xsimd::sse2>::AudioBlockPanStereoToStereo(
-          aInputL, aInputR, aGainL, aGainR, aIsOnTheLeft, aOutputL, aOutputR);
-    }
+    AudioBlockPanStereoToStereo_NEON(aInputL, aInputR, aGainL, aGainR,
+                                     aIsOnTheLeft, aOutputL, aOutputR);
     return;
   }
 #endif
@@ -399,16 +349,7 @@ float AudioBufferSumOfSquares(const float* aInput, uint32_t aLength) {
     }
 
     uint32_t vLength = (aLength >> 4) << 4;
-#  if defined(USE_SSE42) && defined(USE_FMA3)
-    if (mozilla::supports_fma3() && mozilla::supports_sse4_2()) {
-      sum += Engine<xsimd::fma3<xsimd::sse4_2>>::AudioBufferSumOfSquares(
-          alignedInput, vLength);
-    } else
-#  endif
-    {
-      sum +=
-          Engine<xsimd::sse2>::AudioBufferSumOfSquares(alignedInput, vLength);
-    }
+    sum += AudioBufferSumOfSquares_SSE(alignedInput, vLength);
 
     // adjust aInput and aLength to use scalar operations for any
     // remaining values
@@ -427,7 +368,7 @@ float AudioBufferSumOfSquares(const float* aInput, uint32_t aLength) {
 void NaNToZeroInPlace(float* aSamples, size_t aCount) {
 #ifdef USE_SSE2
   if (mozilla::supports_sse2()) {
-    Engine<xsimd::sse2>::NaNToZeroInPlace(aSamples, aCount);
+    NaNToZeroInPlace_SSE(aSamples, aCount);
     return;
   }
 #endif
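
Every hunk above has the same shape: the templated xsimd call (Engine<Arch>::...) is swapped back for a direct call into a hand-written _NEON/_SSE kernel, while the compile-time guards (#ifdef USE_SSE2) and the runtime CPU checks (mozilla::supports_sse2(), from the mozilla/SSE.h seen later in this diff) stay in place. A minimal sketch of that dispatch pattern, with a hypothetical Kernel_SSE standing in for a per-arch entry point:

    #include <cstdint>

    // Hypothetical per-arch kernel, built in its own translation unit with
    // -msse2 so the intrinsics are available there and only there.
    void Kernel_SSE(float* aData, uint32_t aSize);

    void Kernel(float* aData, uint32_t aSize) {
    #ifdef USE_SSE2
      // Compile-time guard: the SSE unit exists only in x86 builds.
      if (mozilla::supports_sse2()) {
        // Runtime guard: taken only if the CPU actually reports SSE2.
        Kernel_SSE(aData, aSize);
        return;
      }
    #endif
      // Portable scalar fallback.
      for (uint32_t i = 0; i < aSize; ++i) {
        aData[i] *= 2.0f;
      }
    }
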
@@ -1,261 +0,0 @@
-/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
-/* this source code form is subject to the terms of the mozilla public
- * license, v. 2.0. if a copy of the mpl was not distributed with this file,
- * You can obtain one at http://mozilla.org/MPL/2.0/. */
-
-#ifndef MOZILLA_AUDIONODEENGINEGENERIC_H_
-#define MOZILLA_AUDIONODEENGINEGENERIC_H_
-
-#include "AudioNodeEngine.h"
-#include "AlignmentUtils.h"
-
-#include "xsimd/xsimd.hpp"
-
-namespace mozilla {
-
-template <class Arch>
-struct Engine {
-  static void AudioBufferAddWithScale(const float* aInput, float aScale,
-                                      float* aOutput, uint32_t aSize) {
-    ASSERT_ALIGNED16(aInput);
-    ASSERT_ALIGNED16(aOutput);
-    ASSERT_MULTIPLE16(aSize);
-
-    xsimd::batch<float, Arch> vgain(aScale);
-
-#pragma GCC unroll(4)
-    for (unsigned i = 0; i < aSize; i += 4 * xsimd::batch<float, Arch>::size) {
-      auto vin1 = xsimd::batch<float, Arch>::load_aligned(&aInput[i]);
-      auto vin2 = xsimd::batch<float, Arch>::load_aligned(&aOutput[i]);
-      auto vout = xsimd::fma(vin1, vgain, vin2);
-      vout.store_aligned(&aOutput[i]);
-    }
-  };
-
-  static void AudioBlockCopyChannelWithScale(const float* aInput, float aScale,
-                                             float* aOutput) {
-    ASSERT_ALIGNED16(aInput);
-    ASSERT_ALIGNED16(aOutput);
-
-    xsimd::batch<float, Arch> vgain = (aScale);
-
-#pragma GCC unroll(4)
-    for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
-         i += xsimd::batch<float, Arch>::size) {
-      auto vin = xsimd::batch<float, Arch>::load_aligned(&aInput[i]);
-      auto vout = vin * vgain;
-      vout.store_aligned(&aOutput[i]);
-    }
-  };
-
-  static void AudioBlockCopyChannelWithScale(
-      const float aInput[WEBAUDIO_BLOCK_SIZE],
-      const float aScale[WEBAUDIO_BLOCK_SIZE],
-      float aOutput[WEBAUDIO_BLOCK_SIZE]) {
-    ASSERT_ALIGNED16(aInput);
-    ASSERT_ALIGNED16(aScale);
-    ASSERT_ALIGNED16(aOutput);
-
-#pragma GCC unroll(4)
-    for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
-         i += xsimd::batch<float, Arch>::size) {
-      auto vscaled = xsimd::batch<float, Arch>::load_aligned(&aScale[i]);
-      auto vin = xsimd::batch<float, Arch>::load_aligned(&aInput[i]);
-      auto vout = vin * vscaled;
-      vout.store_aligned(&aOutput[i]);
-    }
-  };
-
-  static void AudioBufferInPlaceScale(float* aBlock, float aScale,
-                                      uint32_t aSize) {
-    ASSERT_ALIGNED16(aBlock);
-    ASSERT_MULTIPLE16(aSize);
-
-    xsimd::batch<float, Arch> vgain(aScale);
-
-#pragma GCC unroll(4)
-    for (unsigned i = 0; i < aSize; i += xsimd::batch<float, Arch>::size) {
-      auto vin = xsimd::batch<float, Arch>::load_aligned(&aBlock[i]);
-      auto vout = vin * vgain;
-      vout.store_aligned(&aBlock[i]);
-    }
-  };
-
-  static void AudioBufferInPlaceScale(float* aBlock, float* aScale,
-                                      uint32_t aSize) {
-    ASSERT_ALIGNED16(aBlock);
-    ASSERT_MULTIPLE16(aSize);
-
-#pragma GCC unroll(4)
-    for (unsigned i = 0; i < aSize; i += xsimd::batch<float, Arch>::size) {
-      auto vin = xsimd::batch<float, Arch>::load_aligned(&aBlock[i]);
-      auto vgain = xsimd::batch<float, Arch>::load_aligned(&aScale[i]);
-      auto vout = vin * vgain;
-      vout.store_aligned(&aBlock[i]);
-    }
-  };
-
-  static void AudioBlockPanStereoToStereo(
-      const float aInputL[WEBAUDIO_BLOCK_SIZE],
-      const float aInputR[WEBAUDIO_BLOCK_SIZE], float aGainL, float aGainR,
-      bool aIsOnTheLeft, float aOutputL[WEBAUDIO_BLOCK_SIZE],
-      float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
-    ASSERT_ALIGNED16(aInputL);
-    ASSERT_ALIGNED16(aInputR);
-    ASSERT_ALIGNED16(aOutputL);
-    ASSERT_ALIGNED16(aOutputR);
-
-    xsimd::batch<float, Arch> vgainl(aGainL);
-    xsimd::batch<float, Arch> vgainr(aGainR);
-
-    if (aIsOnTheLeft) {
-#pragma GCC unroll(2)
-      for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
-           i += xsimd::batch<float, Arch>::size) {
-        auto vinl = xsimd::batch<float, Arch>::load_aligned(&aInputL[i]);
-        auto vinr = xsimd::batch<float, Arch>::load_aligned(&aInputR[i]);
-
-        /* left channel : aOutputL = aInputL + aInputR * gainL */
-        auto vout = xsimd::fma(vinr, vgainl, vinl);
-        vout.store_aligned(&aOutputL[i]);
-
-        /* right channel : aOutputR = aInputR * gainR */
-        auto vscaled = vinr * vgainr;
-        vscaled.store_aligned(&aOutputR[i]);
-      }
-    } else {
-#pragma GCC unroll(2)
-      for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
-           i += xsimd::batch<float, Arch>::size) {
-        auto vinl = xsimd::batch<float, Arch>::load_aligned(&aInputL[i]);
-        auto vinr = xsimd::batch<float, Arch>::load_aligned(&aInputR[i]);
-
-        /* left channel : aInputL * gainL */
-        auto vscaled = vinl * vgainl;
-        vscaled.store_aligned(&aOutputL[i]);
-
-        /* right channel: aOutputR = aInputR + aInputL * gainR */
-        auto vout = xsimd::fma(vinl, vgainr, vinr);
-        vout.store_aligned(&aOutputR[i]);
-      }
-    }
-  };
-
-  static void BufferComplexMultiply(const float* aInput, const float* aScale,
-                                    float* aOutput, uint32_t aSize) {
-    ASSERT_ALIGNED16(aInput);
-    ASSERT_ALIGNED16(aScale);
-    ASSERT_ALIGNED16(aOutput);
-    ASSERT_MULTIPLE16(aSize);
-
-#pragma GCC unroll(2)
-    for (unsigned i = 0; i < aSize * 2;
-         i += 2 * xsimd::batch<std::complex<float>>::size) {
-      auto in1 = xsimd::batch<std::complex<float>>::load_aligned(
-          reinterpret_cast<const std::complex<float>*>(&aInput[i]));
-      auto in2 = xsimd::batch<std::complex<float>>::load_aligned(
-          reinterpret_cast<const std::complex<float>*>(&aScale[i]));
-      auto out = in1 * in2;
-      out.store_aligned(reinterpret_cast<std::complex<float>*>(&aOutput[i]));
-    }
-  };
-
-  static float AudioBufferSumOfSquares(const float* aInput, uint32_t aLength) {
-    ASSERT_ALIGNED16(aInput);
-    ASSERT_MULTIPLE16(aLength);
-
-    constexpr uint32_t unroll_factor = 4;
-    xsimd::batch<float, Arch> accs[unroll_factor] = {0.f, 0.f, 0.f, 0.f};
-
-    for (uint32_t i = 0; i < aLength;
-         i += unroll_factor * xsimd::batch<float, Arch>::size) {
-#pragma GCC unroll
-      for (uint32_t j = 0; j < unroll_factor; ++j) {
-        auto in = xsimd::batch<float, Arch>::load_aligned(
-            &aInput[i + xsimd::batch<float, Arch>::size * j]);
-        accs[j] = xsimd::fma(in, in, accs[j]);
-      }
-    }
-
-    return reduce_add((accs[0] + accs[1]) + (accs[2] + accs[3]));
-  };
-
-  static void NaNToZeroInPlace(float* aSamples, size_t aCount) {
-    float* samplesAligned16 = ALIGNED16(aSamples);
-    size_t leadingElementsScalar =
-        std::min(static_cast<size_t>(samplesAligned16 - aSamples), aCount);
-    size_t remainingElements = aCount - leadingElementsScalar;
-    size_t vectoredEnd =
-        aCount - remainingElements % (4 * xsimd::batch<float, Arch>::size);
-
-    MOZ_ASSERT(!((vectoredEnd - leadingElementsScalar) %
-                 (4 * xsimd::batch<float, Arch>::size)));
-
-    size_t i = 0;
-    for (; i < leadingElementsScalar; i++) {
-      if (aSamples[i] != aSamples[i]) {
-        aSamples[i] = 0.0;
-      }
-    }
-
-    ASSERT_ALIGNED16(&aSamples[i]);
-
-#pragma GCC unroll(4)
-    for (; i < vectoredEnd; i += xsimd::batch<float, Arch>::size) {
-      auto vin = xsimd::batch<float, Arch>::load_aligned(&aSamples[i]);
-      auto vout =
-          xsimd::select(xsimd::isnan(vin), xsimd::batch<float, Arch>(0.f), vin);
-      vout.store_aligned(&aSamples[i]);
-    }
-    for (; i < aCount; i++) {
-      if (aSamples[i] != aSamples[i]) {
-        aSamples[i] = 0.0;
-      }
-    }
-  };
-
-  static void AudioBlockPanStereoToStereo(
-      const float aInputL[WEBAUDIO_BLOCK_SIZE],
-      const float aInputR[WEBAUDIO_BLOCK_SIZE],
-      const float aGainL[WEBAUDIO_BLOCK_SIZE],
-      const float aGainR[WEBAUDIO_BLOCK_SIZE],
-      const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
-      float aOutputL[WEBAUDIO_BLOCK_SIZE],
-      float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
-    ASSERT_ALIGNED16(aInputL);
-    ASSERT_ALIGNED16(aInputR);
-    ASSERT_ALIGNED16(aGainL);
-    ASSERT_ALIGNED16(aGainR);
-    ASSERT_ALIGNED16(aIsOnTheLeft);
-    ASSERT_ALIGNED16(aOutputL);
-    ASSERT_ALIGNED16(aOutputR);
-
-#pragma GCC unroll(2)
-    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE;
-         i += xsimd::batch<float, Arch>::size) {
-      auto mask =
-          xsimd::batch_bool<float, Arch>::load_aligned(&aIsOnTheLeft[i]);
-
-      auto inputL = xsimd::batch<float, Arch>::load_aligned(&aInputL[i]);
-      auto inputR = xsimd::batch<float, Arch>::load_aligned(&aInputR[i]);
-      auto gainL = xsimd::batch<float, Arch>::load_aligned(&aGainL[i]);
-      auto gainR = xsimd::batch<float, Arch>::load_aligned(&aGainR[i]);
-
-      auto outL_true = xsimd::fma(inputR, gainL, inputL);
-      auto outR_true = inputR * gainR;
-
-      auto outL_false = inputL * gainL;
-      auto outR_false = xsimd::fma(inputL, gainR, inputR);
-
-      auto outL = xsimd::select(mask, outL_true, outL_false);
-      auto outR = xsimd::select(mask, outR_true, outR_false);
-
-      outL.store_aligned(&aOutputL[i]);
-      outR.store_aligned(&aOutputR[i]);
-    }
-  }
-};
-
-}  // namespace mozilla
-
-#endif
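
The header deleted above carried the design being backed out: each kernel is written once against xsimd's batch<float, Arch> type and instantiated per target (Engine<xsimd::sse2>, Engine<xsimd::neon>, ...). A standalone sketch of that style, assuming only xsimd's documented batch API (load_aligned, store_aligned, arithmetic operators):

    #include <cstdint>
    #include "xsimd/xsimd.hpp"

    // Scale aBlock by aScale in place; aBlock must be suitably aligned and
    // aSize a multiple of the batch width, as the deleted header asserted.
    template <class Arch>
    void InPlaceScale(float* aBlock, float aScale, uint32_t aSize) {
      xsimd::batch<float, Arch> vgain(aScale);  // broadcast gain to all lanes
      for (uint32_t i = 0; i < aSize; i += xsimd::batch<float, Arch>::size) {
        auto vin = xsimd::batch<float, Arch>::load_aligned(&aBlock[i]);
        (vin * vgain).store_aligned(&aBlock[i]);
      }
    }

    // One explicit instantiation per SIMD flavor, each compiled in its own
    // translation unit with the matching compiler flags, e.g.:
    // template void InPlaceScale<xsimd::sse2>(float*, float, uint32_t);
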
@@ -3,7 +3,350 @@
  * license, v. 2.0. if a copy of the mpl was not distributed with this file,
  * You can obtain one at http://mozilla.org/MPL/2.0/. */
 
-#include "AudioNodeEngineGeneric.h"
+#include "AudioNodeEngineNEON.h"
+#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__)
+#  include <arm64_neon.h>
+#else
+#  include <arm_neon.h>
+#endif
+
+//#ifdef DEBUG
+#if 0  // see bug 921099
+#  define ASSERT_ALIGNED(ptr)                                       \
+    MOZ_ASSERT((((uintptr_t)ptr + 15) & ~0x0F) == (uintptr_t)ptr,   \
+               #ptr " has to be aligned 16-bytes aligned.");
+#else
+#  define ASSERT_ALIGNED(ptr)
+#endif
+
+#define ADDRESS_OF(array, index) ((float32_t*)&array[index])
+
 namespace mozilla {
-template struct Engine<xsimd::neon>;
+void AudioBufferAddWithScale_NEON(const float* aInput, float aScale,
+                                  float* aOutput, uint32_t aSize) {
+  ASSERT_ALIGNED(aInput);
+  ASSERT_ALIGNED(aOutput);
+
+  float32x4_t vin0, vin1, vin2, vin3;
+  float32x4_t vout0, vout1, vout2, vout3;
+  float32x4_t vscale = vmovq_n_f32(aScale);
+
+  uint32_t dif = aSize % 16;
+  aSize -= dif;
+  unsigned i = 0;
+  for (; i < aSize; i += 16) {
+    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
+    vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
+    vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
+    vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));
+
+    vout0 = vld1q_f32(ADDRESS_OF(aOutput, i));
+    vout1 = vld1q_f32(ADDRESS_OF(aOutput, i + 4));
+    vout2 = vld1q_f32(ADDRESS_OF(aOutput, i + 8));
+    vout3 = vld1q_f32(ADDRESS_OF(aOutput, i + 12));
+
+    vout0 = vmlaq_f32(vout0, vin0, vscale);
+    vout1 = vmlaq_f32(vout1, vin1, vscale);
+    vout2 = vmlaq_f32(vout2, vin2, vscale);
+    vout3 = vmlaq_f32(vout3, vin3, vscale);
+
+    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
+    vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
+    vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
+    vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
+  }
+
+  for (unsigned j = 0; j < dif; ++i, ++j) {
+    aOutput[i] += aInput[i] * aScale;
+  }
+}
+void AudioBlockCopyChannelWithScale_NEON(const float* aInput, float aScale,
+                                         float* aOutput) {
+  ASSERT_ALIGNED(aInput);
+  ASSERT_ALIGNED(aOutput);
+
+  float32x4_t vin0, vin1, vin2, vin3;
+  float32x4_t vout0, vout1, vout2, vout3;
+  float32x4_t vscale = vmovq_n_f32(aScale);
+
+  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
+    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
+    vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
+    vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
+    vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));
+
+    vout0 = vmulq_f32(vin0, vscale);
+    vout1 = vmulq_f32(vin1, vscale);
+    vout2 = vmulq_f32(vin2, vscale);
+    vout3 = vmulq_f32(vin3, vscale);
+
+    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
+    vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
+    vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
+    vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
+  }
+}
+
+void AudioBlockCopyChannelWithScale_NEON(
+    const float aInput[WEBAUDIO_BLOCK_SIZE],
+    const float aScale[WEBAUDIO_BLOCK_SIZE],
+    float aOutput[WEBAUDIO_BLOCK_SIZE]) {
+  ASSERT_ALIGNED(aInput);
+  ASSERT_ALIGNED(aScale);
+  ASSERT_ALIGNED(aOutput);
+
+  float32x4_t vin0, vin1, vin2, vin3;
+  float32x4_t vout0, vout1, vout2, vout3;
+  float32x4_t vscale0, vscale1, vscale2, vscale3;
+
+  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
+    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
+    vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
+    vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
+    vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));
+
+    vscale0 = vld1q_f32(ADDRESS_OF(aScale, i));
+    vscale1 = vld1q_f32(ADDRESS_OF(aScale, i + 4));
+    vscale2 = vld1q_f32(ADDRESS_OF(aScale, i + 8));
+    vscale3 = vld1q_f32(ADDRESS_OF(aScale, i + 12));
+
+    vout0 = vmulq_f32(vin0, vscale0);
+    vout1 = vmulq_f32(vin1, vscale1);
+    vout2 = vmulq_f32(vin2, vscale2);
+    vout3 = vmulq_f32(vin3, vscale3);
+
+    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
+    vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
+    vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
+    vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
+  }
+}
+
+void AudioBufferInPlaceScale_NEON(float* aBlock, float aScale, uint32_t aSize) {
+  ASSERT_ALIGNED(aBlock);
+
+  float32x4_t vin0, vin1, vin2, vin3;
+  float32x4_t vout0, vout1, vout2, vout3;
+  float32x4_t vscale = vmovq_n_f32(aScale);
+
+  uint32_t dif = aSize % 16;
+  uint32_t vectorSize = aSize - dif;
+  uint32_t i = 0;
+  for (; i < vectorSize; i += 16) {
+    vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
+    vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
+    vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
+    vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));
+
+    vout0 = vmulq_f32(vin0, vscale);
+    vout1 = vmulq_f32(vin1, vscale);
+    vout2 = vmulq_f32(vin2, vscale);
+    vout3 = vmulq_f32(vin3, vscale);
+
+    vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
+    vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
+    vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
+    vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
+  }
+
+  for (unsigned j = 0; j < dif; ++i, ++j) {
+    aBlock[i] *= aScale;
+  }
+}
+
+void AudioBufferInPlaceScale_NEON(float* aBlock, float* aScale,
+                                  uint32_t aSize) {
+  ASSERT_ALIGNED(aBlock);
+
+  float32x4_t vin0, vin1, vin2, vin3;
+  float32x4_t vout0, vout1, vout2, vout3;
+  float32x4_t vscale0, vscale1, vscale2, vscale3;
+
+  uint32_t dif = aSize % 16;
+  uint32_t vectorSize = aSize - dif;
+  uint32_t i = 0;
+  for (; i < vectorSize; i += 16) {
+    vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
+    vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
+    vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
+    vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));
+
+    vscale0 = vld1q_f32(ADDRESS_OF(aScale, i));
+    vscale1 = vld1q_f32(ADDRESS_OF(aScale, i + 4));
+    vscale2 = vld1q_f32(ADDRESS_OF(aScale, i + 8));
+    vscale3 = vld1q_f32(ADDRESS_OF(aScale, i + 12));
+
+    vout0 = vmulq_f32(vin0, vscale0);
+    vout1 = vmulq_f32(vin1, vscale1);
+    vout2 = vmulq_f32(vin2, vscale2);
+    vout3 = vmulq_f32(vin3, vscale3);
+
+    vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
+    vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
+    vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
+    vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
+  }
+
+  for (unsigned j = 0; j < dif; ++i, ++j) {
+    aBlock[i] *= aScale[i];
+  }
+}
+
+void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
+                                      const float aInputR[WEBAUDIO_BLOCK_SIZE],
+                                      float aGainL, float aGainR,
+                                      bool aIsOnTheLeft,
+                                      float aOutputL[WEBAUDIO_BLOCK_SIZE],
+                                      float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
+  ASSERT_ALIGNED(aInputL);
+  ASSERT_ALIGNED(aInputR);
+  ASSERT_ALIGNED(aOutputL);
+  ASSERT_ALIGNED(aOutputR);
+
+  float32x4_t vinL0, vinL1;
+  float32x4_t vinR0, vinR1;
+  float32x4_t voutL0, voutL1;
+  float32x4_t voutR0, voutR1;
+  float32x4_t vscaleL = vmovq_n_f32(aGainL);
+  float32x4_t vscaleR = vmovq_n_f32(aGainR);
+
+  if (aIsOnTheLeft) {
+    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
+      vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
+      vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));
+
+      vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
+      vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));
+
+      voutL0 = vmlaq_f32(vinL0, vinR0, vscaleL);
+      voutL1 = vmlaq_f32(vinL1, vinR1, vscaleL);
+
+      vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
+      vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
+
+      voutR0 = vmulq_f32(vinR0, vscaleR);
+      voutR1 = vmulq_f32(vinR1, vscaleR);
+
+      vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
+      vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
+    }
+  } else {
+    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
+      vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
+      vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));
+
+      vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
+      vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));
+
+      voutL0 = vmulq_f32(vinL0, vscaleL);
+      voutL1 = vmulq_f32(vinL1, vscaleL);
+
+      vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
+      vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
+
+      voutR0 = vmlaq_f32(vinR0, vinL0, vscaleR);
+      voutR1 = vmlaq_f32(vinR1, vinL1, vscaleR);
+
+      vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
+      vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
+    }
+  }
+}
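
A reading aid for these restored NEON kernels: vmlaq_f32(a, b, c) is a per-lane multiply-accumulate computing a + b * c, so vmlaq_f32(vinL0, vinR0, vscaleL) is exactly the commented math aOutputL = aInputL + aInputR * gainL. A scalar model of one lane, as a sketch:

    // Scalar model of vmlaq_f32(a, b, c): per-lane multiply-accumulate.
    // (On ARMv7 this lowers to VMLA, which may round twice; the math intended
    // by the kernel is the same either way.)
    inline float MlaLane(float a, float b, float c) { return a + b * c; }
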
+
+void AudioBlockPanStereoToStereo_NEON(
+    const float aInputL[WEBAUDIO_BLOCK_SIZE],
+    const float aInputR[WEBAUDIO_BLOCK_SIZE],
+    const float aGainL[WEBAUDIO_BLOCK_SIZE],
+    const float aGainR[WEBAUDIO_BLOCK_SIZE],
+    const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
+    float aOutputL[WEBAUDIO_BLOCK_SIZE], float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
+  ASSERT_ALIGNED(aInputL);
+  ASSERT_ALIGNED(aInputR);
+  ASSERT_ALIGNED(aGainL);
+  ASSERT_ALIGNED(aGainR);
+  ASSERT_ALIGNED(aIsOnTheLeft);
+  ASSERT_ALIGNED(aOutputL);
+  ASSERT_ALIGNED(aOutputR);
+
+  float32x4_t vinL0, vinL1;
+  float32x4_t vinR0, vinR1;
+  float32x4_t voutL0, voutL1;
+  float32x4_t voutR0, voutR1;
+  float32x4_t vscaleL0, vscaleL1;
+  float32x4_t vscaleR0, vscaleR1;
+  float32x4_t onleft0, onleft1, notonleft0, notonleft1;
+
+  float32x4_t zero = vmovq_n_f32(0);
+  uint8x8_t isOnTheLeft;
+
+  // Although MSVC throws uninitialized value warning for voutL0 and voutL1,
+  // since we fill all lanes by vsetq_lane_f32, we can ignore it. But to avoid
+  // compiler warning, set zero.
+  voutL0 = zero;
+  voutL1 = zero;
+
+  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
+    vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
+    vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));
+
+    vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
+    vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));
+
+    vscaleL0 = vld1q_f32(ADDRESS_OF(aGainL, i));
+    vscaleL1 = vld1q_f32(ADDRESS_OF(aGainL, i + 4));
+
+    vscaleR0 = vld1q_f32(ADDRESS_OF(aGainR, i));
+    vscaleR1 = vld1q_f32(ADDRESS_OF(aGainR, i + 4));
+
+    // Load output with boolean "on the left" values. This assumes that
+    // bools are stored as a single byte.
+    isOnTheLeft = vld1_u8((uint8_t*)&aIsOnTheLeft[i]);
+    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 0), voutL0, 0);
+    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 1), voutL0, 1);
+    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 2), voutL0, 2);
+    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 3), voutL0, 3);
+    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 4), voutL1, 0);
+    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 5), voutL1, 1);
+    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 6), voutL1, 2);
+    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 7), voutL1, 3);
+
+    // Convert the boolean values into masks by setting all bits to 1
+    // if true.
+    voutL0 = (float32x4_t)vcgtq_f32(voutL0, zero);
+    voutL1 = (float32x4_t)vcgtq_f32(voutL1, zero);
+
+    // The right output masks are the same as the left masks
+    voutR0 = voutL0;
+    voutR1 = voutL1;
+
+    // Calculate left channel assuming isOnTheLeft
+    onleft0 = vmlaq_f32(vinL0, vinR0, vscaleL0);
+    onleft1 = vmlaq_f32(vinL1, vinR1, vscaleL1);
+
+    // Calculate left channel assuming not isOnTheLeft
+    notonleft0 = vmulq_f32(vinL0, vscaleL0);
+    notonleft1 = vmulq_f32(vinL1, vscaleL1);
+
+    // Write results using previously stored masks
+    voutL0 = vbslq_f32((uint32x4_t)voutL0, onleft0, notonleft0);
+    voutL1 = vbslq_f32((uint32x4_t)voutL1, onleft1, notonleft1);
+
+    // Calculate right channel assuming isOnTheLeft
+    onleft0 = vmulq_f32(vinR0, vscaleR0);
+    onleft1 = vmulq_f32(vinR1, vscaleR1);
+
+    // Calculate right channel assuming not isOnTheLeft
+    notonleft0 = vmlaq_f32(vinR0, vinL0, vscaleR0);
+    notonleft1 = vmlaq_f32(vinR1, vinL1, vscaleR1);
+
+    // Write results using previously stored masks
+    voutR0 = vbslq_f32((uint32x4_t)voutR0, onleft0, notonleft0);
+    voutR1 = vbslq_f32((uint32x4_t)voutR1, onleft1, notonleft1);
+
+    vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
+    vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
+    vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
+    vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
+  }
+}
 }  // namespace mozilla
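
The bool-array pan variant above cannot branch per sample, so it widens each bool byte into a float lane, turns it into an all-ones/all-zero mask with vcgtq_f32 against zero, and uses vbslq_f32 to bit-select between the two precomputed results. A scalar model of one left-channel lane, as a sketch:

    #include <cstdint>

    // Scalar model of one lane of the NEON mask-and-select above.
    float PanLeftLane(float inL, float inR, float gainL, bool isOnTheLeft) {
      // vcgtq_f32(bool-as-float, 0) -> all-ones mask when the bool was true.
      uint32_t mask = isOnTheLeft ? 0xFFFFFFFFu : 0u;
      float onLeft = inL + inR * gainL;  // vmlaq_f32 path
      float notOnLeft = inL * gainL;     // vmulq_f32 path
      // vbslq_f32 takes bits from onLeft where mask bits are set.
      return mask ? onLeft : notOnLeft;
    }
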
@@ -0,0 +1,42 @@
+/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* this source code form is subject to the terms of the mozilla public
+ * license, v. 2.0. if a copy of the mpl was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifndef MOZILLA_AUDIONODEENGINENEON_H_
+#define MOZILLA_AUDIONODEENGINENEON_H_
+
+#include "AudioNodeEngine.h"
+
+namespace mozilla {
+void AudioBufferAddWithScale_NEON(const float* aInput, float aScale,
+                                  float* aOutput, uint32_t aSize);
+
+void AudioBlockCopyChannelWithScale_NEON(const float* aInput, float aScale,
+                                         float* aOutput);
+
+void AudioBlockCopyChannelWithScale_NEON(
+    const float aInput[WEBAUDIO_BLOCK_SIZE],
+    const float aScale[WEBAUDIO_BLOCK_SIZE],
+    float aOutput[WEBAUDIO_BLOCK_SIZE]);
+
+void AudioBufferInPlaceScale_NEON(float* aBlock, float aScale, uint32_t aSize);
+void AudioBufferInPlaceScale_NEON(float* aBlock, float* aScale, uint32_t aSize);
+
+void AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
+                                      const float aInputR[WEBAUDIO_BLOCK_SIZE],
+                                      float aGainL, float aGainR,
+                                      bool aIsOnTheLeft,
+                                      float aOutputL[WEBAUDIO_BLOCK_SIZE],
+                                      float aOutputR[WEBAUDIO_BLOCK_SIZE]);
+
+void AudioBlockPanStereoToStereo_NEON(
+    const float aInputL[WEBAUDIO_BLOCK_SIZE],
+    const float aInputR[WEBAUDIO_BLOCK_SIZE],
+    const float aGainL[WEBAUDIO_BLOCK_SIZE],
+    const float aGainR[WEBAUDIO_BLOCK_SIZE],
+    const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
+    float aOutputL[WEBAUDIO_BLOCK_SIZE], float aOutputR[WEBAUDIO_BLOCK_SIZE]);
+}  // namespace mozilla
+
+#endif /* MOZILLA_AUDIONODEENGINENEON_H_ */
@@ -3,8 +3,361 @@
  * license, v. 2.0. if a copy of the mpl was not distributed with this file,
  * You can obtain one at http://mozilla.org/MPL/2.0/. */
 
-#include "AudioNodeEngineGeneric.h"
+#include "AudioNodeEngineSSE2.h"
+#include "AlignmentUtils.h"
+#include <emmintrin.h>
 
 namespace mozilla {
-template struct Engine<xsimd::sse2>;
+void AudioBufferAddWithScale_SSE(const float* aInput, float aScale,
+                                 float* aOutput, uint32_t aSize) {
+  __m128 vin0, vin1, vin2, vin3, vscaled0, vscaled1, vscaled2, vscaled3, vout0,
+      vout1, vout2, vout3, vgain;
+
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_ALIGNED16(aOutput);
+  ASSERT_MULTIPLE16(aSize);
+
+  vgain = _mm_load1_ps(&aScale);
+
+  for (unsigned i = 0; i < aSize; i += 16) {
+    vin0 = _mm_load_ps(&aInput[i]);
+    vin1 = _mm_load_ps(&aInput[i + 4]);
+    vin2 = _mm_load_ps(&aInput[i + 8]);
+    vin3 = _mm_load_ps(&aInput[i + 12]);
+
+    vscaled0 = _mm_mul_ps(vin0, vgain);
+    vscaled1 = _mm_mul_ps(vin1, vgain);
+    vscaled2 = _mm_mul_ps(vin2, vgain);
+    vscaled3 = _mm_mul_ps(vin3, vgain);
+
+    vin0 = _mm_load_ps(&aOutput[i]);
+    vin1 = _mm_load_ps(&aOutput[i + 4]);
+    vin2 = _mm_load_ps(&aOutput[i + 8]);
+    vin3 = _mm_load_ps(&aOutput[i + 12]);
+
+    vout0 = _mm_add_ps(vin0, vscaled0);
+    vout1 = _mm_add_ps(vin1, vscaled1);
+    vout2 = _mm_add_ps(vin2, vscaled2);
+    vout3 = _mm_add_ps(vin3, vscaled3);
+
+    _mm_store_ps(&aOutput[i], vout0);
+    _mm_store_ps(&aOutput[i + 4], vout1);
+    _mm_store_ps(&aOutput[i + 8], vout2);
+    _mm_store_ps(&aOutput[i + 12], vout3);
+  }
+}
+
+void AudioBlockCopyChannelWithScale_SSE(const float* aInput, float aScale,
+                                        float* aOutput) {
+  __m128 vin0, vin1, vin2, vin3, vout0, vout1, vout2, vout3;
+
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_ALIGNED16(aOutput);
+
+  __m128 vgain = _mm_load1_ps(&aScale);
+
+  for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
+    vin0 = _mm_load_ps(&aInput[i]);
+    vin1 = _mm_load_ps(&aInput[i + 4]);
+    vin2 = _mm_load_ps(&aInput[i + 8]);
+    vin3 = _mm_load_ps(&aInput[i + 12]);
+    vout0 = _mm_mul_ps(vin0, vgain);
+    vout1 = _mm_mul_ps(vin1, vgain);
+    vout2 = _mm_mul_ps(vin2, vgain);
+    vout3 = _mm_mul_ps(vin3, vgain);
+    _mm_store_ps(&aOutput[i], vout0);
+    _mm_store_ps(&aOutput[i + 4], vout1);
+    _mm_store_ps(&aOutput[i + 8], vout2);
+    _mm_store_ps(&aOutput[i + 12], vout3);
+  }
+}
+
+void AudioBlockCopyChannelWithScale_SSE(const float aInput[WEBAUDIO_BLOCK_SIZE],
+                                        const float aScale[WEBAUDIO_BLOCK_SIZE],
+                                        float aOutput[WEBAUDIO_BLOCK_SIZE]) {
+  __m128 vin0, vin1, vin2, vin3, vscaled0, vscaled1, vscaled2, vscaled3, vout0,
+      vout1, vout2, vout3;
+
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_ALIGNED16(aScale);
+  ASSERT_ALIGNED16(aOutput);
+
+  for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 16) {
+    vscaled0 = _mm_load_ps(&aScale[i]);
+    vscaled1 = _mm_load_ps(&aScale[i + 4]);
+    vscaled2 = _mm_load_ps(&aScale[i + 8]);
+    vscaled3 = _mm_load_ps(&aScale[i + 12]);
+
+    vin0 = _mm_load_ps(&aInput[i]);
+    vin1 = _mm_load_ps(&aInput[i + 4]);
+    vin2 = _mm_load_ps(&aInput[i + 8]);
+    vin3 = _mm_load_ps(&aInput[i + 12]);
+
+    vout0 = _mm_mul_ps(vin0, vscaled0);
+    vout1 = _mm_mul_ps(vin1, vscaled1);
+    vout2 = _mm_mul_ps(vin2, vscaled2);
+    vout3 = _mm_mul_ps(vin3, vscaled3);
+
+    _mm_store_ps(&aOutput[i], vout0);
+    _mm_store_ps(&aOutput[i + 4], vout1);
+    _mm_store_ps(&aOutput[i + 8], vout2);
+    _mm_store_ps(&aOutput[i + 12], vout3);
+  }
+}
+
+void AudioBufferInPlaceScale_SSE(float* aBlock, float aScale, uint32_t aSize) {
+  __m128 vout0, vout1, vout2, vout3, vin0, vin1, vin2, vin3;
+
+  ASSERT_ALIGNED16(aBlock);
+  ASSERT_MULTIPLE16(aSize);
+
+  __m128 vgain = _mm_load1_ps(&aScale);
+
+  for (unsigned i = 0; i < aSize; i += 16) {
+    vin0 = _mm_load_ps(&aBlock[i]);
+    vin1 = _mm_load_ps(&aBlock[i + 4]);
+    vin2 = _mm_load_ps(&aBlock[i + 8]);
+    vin3 = _mm_load_ps(&aBlock[i + 12]);
+    vout0 = _mm_mul_ps(vin0, vgain);
+    vout1 = _mm_mul_ps(vin1, vgain);
+    vout2 = _mm_mul_ps(vin2, vgain);
+    vout3 = _mm_mul_ps(vin3, vgain);
+    _mm_store_ps(&aBlock[i], vout0);
+    _mm_store_ps(&aBlock[i + 4], vout1);
+    _mm_store_ps(&aBlock[i + 8], vout2);
+    _mm_store_ps(&aBlock[i + 12], vout3);
+  }
+}
+
+void AudioBufferInPlaceScale_SSE(float* aBlock, float* aScale, uint32_t aSize) {
+  __m128 vout0, vout1, vout2, vout3, vgain0, vgain1, vgain2, vgain3, vin0, vin1,
+      vin2, vin3;
+
+  ASSERT_ALIGNED16(aBlock);
+  ASSERT_MULTIPLE16(aSize);
+
+  for (unsigned i = 0; i < aSize; i += 16) {
+    vin0 = _mm_load_ps(&aBlock[i]);
+    vin1 = _mm_load_ps(&aBlock[i + 4]);
+    vin2 = _mm_load_ps(&aBlock[i + 8]);
+    vin3 = _mm_load_ps(&aBlock[i + 12]);
+    vgain0 = _mm_load_ps(&aScale[i]);
+    vgain1 = _mm_load_ps(&aScale[i + 4]);
+    vgain2 = _mm_load_ps(&aScale[i + 8]);
+    vgain3 = _mm_load_ps(&aScale[i + 12]);
+    vout0 = _mm_mul_ps(vin0, vgain0);
+    vout1 = _mm_mul_ps(vin1, vgain1);
+    vout2 = _mm_mul_ps(vin2, vgain2);
+    vout3 = _mm_mul_ps(vin3, vgain3);
+    _mm_store_ps(&aBlock[i], vout0);
+    _mm_store_ps(&aBlock[i + 4], vout1);
+    _mm_store_ps(&aBlock[i + 8], vout2);
+    _mm_store_ps(&aBlock[i + 12], vout3);
+  }
+}
+
+void AudioBlockPanStereoToStereo_SSE(const float aInputL[WEBAUDIO_BLOCK_SIZE],
+                                     const float aInputR[WEBAUDIO_BLOCK_SIZE],
+                                     float aGainL, float aGainR,
+                                     bool aIsOnTheLeft,
+                                     float aOutputL[WEBAUDIO_BLOCK_SIZE],
+                                     float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
+  __m128 vinl0, vinr0, vinl1, vinr1, vout0, vout1, vscaled0, vscaled1, vgainl,
+      vgainr;
+
+  ASSERT_ALIGNED16(aInputL);
+  ASSERT_ALIGNED16(aInputR);
+  ASSERT_ALIGNED16(aOutputL);
+  ASSERT_ALIGNED16(aOutputR);
+
+  vgainl = _mm_load1_ps(&aGainL);
+  vgainr = _mm_load1_ps(&aGainR);
+
+  if (aIsOnTheLeft) {
+    for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
+      vinl0 = _mm_load_ps(&aInputL[i]);
+      vinr0 = _mm_load_ps(&aInputR[i]);
+      vinl1 = _mm_load_ps(&aInputL[i + 4]);
+      vinr1 = _mm_load_ps(&aInputR[i + 4]);
+
+      /* left channel : aOutputL = aInputL + aInputR * gainL */
+      vscaled0 = _mm_mul_ps(vinr0, vgainl);
+      vscaled1 = _mm_mul_ps(vinr1, vgainl);
+      vout0 = _mm_add_ps(vscaled0, vinl0);
+      vout1 = _mm_add_ps(vscaled1, vinl1);
+      _mm_store_ps(&aOutputL[i], vout0);
+      _mm_store_ps(&aOutputL[i + 4], vout1);
+
+      /* right channel : aOutputR = aInputR * gainR */
+      vscaled0 = _mm_mul_ps(vinr0, vgainr);
+      vscaled1 = _mm_mul_ps(vinr1, vgainr);
+      _mm_store_ps(&aOutputR[i], vscaled0);
+      _mm_store_ps(&aOutputR[i + 4], vscaled1);
+    }
+  } else {
+    for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
+      vinl0 = _mm_load_ps(&aInputL[i]);
+      vinr0 = _mm_load_ps(&aInputR[i]);
+      vinl1 = _mm_load_ps(&aInputL[i + 4]);
+      vinr1 = _mm_load_ps(&aInputR[i + 4]);
+
+      /* left channel : aInputL * gainL */
+      vscaled0 = _mm_mul_ps(vinl0, vgainl);
+      vscaled1 = _mm_mul_ps(vinl1, vgainl);
+      _mm_store_ps(&aOutputL[i], vscaled0);
+      _mm_store_ps(&aOutputL[i + 4], vscaled1);
+
+      /* right channel: aOutputR = aInputR + aInputL * gainR */
+      vscaled0 = _mm_mul_ps(vinl0, vgainr);
+      vscaled1 = _mm_mul_ps(vinl1, vgainr);
+      vout0 = _mm_add_ps(vscaled0, vinr0);
+      vout1 = _mm_add_ps(vscaled1, vinr1);
+      _mm_store_ps(&aOutputR[i], vout0);
+      _mm_store_ps(&aOutputR[i + 4], vout1);
+    }
+  }
+}
+
+void BufferComplexMultiply_SSE(const float* aInput, const float* aScale,
+                               float* aOutput, uint32_t aSize) {
+  unsigned i;
+  __m128 in0, in1, in2, in3, outreal0, outreal1, outreal2, outreal3, outimag0,
+      outimag1, outimag2, outimag3;
+
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_ALIGNED16(aScale);
+  ASSERT_ALIGNED16(aOutput);
+  ASSERT_MULTIPLE16(aSize);
+
+  for (i = 0; i < aSize * 2; i += 16) {
+    in0 = _mm_load_ps(&aInput[i]);
+    in1 = _mm_load_ps(&aInput[i + 4]);
+    in2 = _mm_load_ps(&aInput[i + 8]);
+    in3 = _mm_load_ps(&aInput[i + 12]);
+
+    outreal0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
+    outimag0 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
+    outreal2 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(2, 0, 2, 0));
+    outimag2 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 1, 3, 1));
+
+    in0 = _mm_load_ps(&aScale[i]);
+    in1 = _mm_load_ps(&aScale[i + 4]);
+    in2 = _mm_load_ps(&aScale[i + 8]);
+    in3 = _mm_load_ps(&aScale[i + 12]);
+
+    outreal1 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(2, 0, 2, 0));
+    outimag1 = _mm_shuffle_ps(in0, in1, _MM_SHUFFLE(3, 1, 3, 1));
+    outreal3 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(2, 0, 2, 0));
+    outimag3 = _mm_shuffle_ps(in2, in3, _MM_SHUFFLE(3, 1, 3, 1));
+
+    in0 = _mm_sub_ps(_mm_mul_ps(outreal0, outreal1),
+                     _mm_mul_ps(outimag0, outimag1));
+    in1 = _mm_add_ps(_mm_mul_ps(outreal0, outimag1),
+                     _mm_mul_ps(outimag0, outreal1));
+    in2 = _mm_sub_ps(_mm_mul_ps(outreal2, outreal3),
+                     _mm_mul_ps(outimag2, outimag3));
+    in3 = _mm_add_ps(_mm_mul_ps(outreal2, outimag3),
+                     _mm_mul_ps(outimag2, outreal3));
+
+    outreal0 = _mm_unpacklo_ps(in0, in1);
+    outreal1 = _mm_unpackhi_ps(in0, in1);
+    outreal2 = _mm_unpacklo_ps(in2, in3);
+    outreal3 = _mm_unpackhi_ps(in2, in3);
+
+    _mm_store_ps(&aOutput[i], outreal0);
+    _mm_store_ps(&aOutput[i + 4], outreal1);
+    _mm_store_ps(&aOutput[i + 8], outreal2);
+    _mm_store_ps(&aOutput[i + 12], outreal3);
+  }
+}
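
BufferComplexMultiply_SSE operates on interleaved re,im pairs: the _mm_shuffle_ps calls de-interleave four complex values into a vector of reals and a vector of imaginaries, the mul/add/sub lines compute (a+bi)(c+di) = (ac-bd) + (ad+bc)i, and _mm_unpacklo_ps/_mm_unpackhi_ps re-interleave the products. The per-element math, as a scalar sketch:

    // Scalar model of one complex product from the SSE kernel above;
    // inputs and outputs are interleaved as {re, im}.
    void ComplexMultiplyScalar(const float in[2], const float scale[2],
                               float out[2]) {
      out[0] = in[0] * scale[0] - in[1] * scale[1];  // ac - bd
      out[1] = in[0] * scale[1] + in[1] * scale[0];  // ad + bc
    }
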
+
+float AudioBufferSumOfSquares_SSE(const float* aInput, uint32_t aLength) {
+  unsigned i;
+  __m128 in0, in1, in2, in3, acc0, acc1, acc2, acc3;
+  float out[4];
+
+  ASSERT_ALIGNED16(aInput);
+  ASSERT_MULTIPLE16(aLength);
+
+  acc0 = _mm_setzero_ps();
+  acc1 = _mm_setzero_ps();
+  acc2 = _mm_setzero_ps();
+  acc3 = _mm_setzero_ps();
+
+  for (i = 0; i < aLength; i += 16) {
+    in0 = _mm_load_ps(&aInput[i]);
+    in1 = _mm_load_ps(&aInput[i + 4]);
+    in2 = _mm_load_ps(&aInput[i + 8]);
+    in3 = _mm_load_ps(&aInput[i + 12]);
+
+    in0 = _mm_mul_ps(in0, in0);
+    in1 = _mm_mul_ps(in1, in1);
+    in2 = _mm_mul_ps(in2, in2);
+    in3 = _mm_mul_ps(in3, in3);
+
+    acc0 = _mm_add_ps(acc0, in0);
+    acc1 = _mm_add_ps(acc1, in1);
+    acc2 = _mm_add_ps(acc2, in2);
+    acc3 = _mm_add_ps(acc3, in3);
+  }
+
+  acc0 = _mm_add_ps(acc0, acc1);
+  acc0 = _mm_add_ps(acc0, acc2);
+  acc0 = _mm_add_ps(acc0, acc3);
+
+  _mm_store_ps(out, acc0);
+
+  return out[0] + out[1] + out[2] + out[3];
+}
+
+void NaNToZeroInPlace_SSE(float* aSamples, size_t aCount) {
+  __m128 vin0, vin1, vin2, vin3;
+  __m128 vmask0, vmask1, vmask2, vmask3;
+  __m128 vout0, vout1, vout2, vout3;
+
+  float* samplesAligned16 = ALIGNED16(aSamples);
+  size_t leadingElementsScalar =
+      std::min(static_cast<size_t>(samplesAligned16 - aSamples), aCount);
+  size_t remainingElements = aCount - leadingElementsScalar;
+  size_t vectoredEnd = aCount - remainingElements % 16;
+
+  MOZ_ASSERT(!((vectoredEnd - leadingElementsScalar) % 16));
+
+  size_t i = 0;
+  for (; i < leadingElementsScalar; i++) {
+    if (aSamples[i] != aSamples[i]) {
+      aSamples[i] = 0.0;
+    }
+  }
+
+  ASSERT_ALIGNED16(&aSamples[i]);
+
+  for (; i < vectoredEnd; i += 16) {
+    vin0 = _mm_load_ps(&aSamples[i + 0]);
+    vin1 = _mm_load_ps(&aSamples[i + 4]);
+    vin2 = _mm_load_ps(&aSamples[i + 8]);
+    vin3 = _mm_load_ps(&aSamples[i + 12]);
+
+    vmask0 = _mm_cmpord_ps(vin0, vin0);
+    vmask1 = _mm_cmpord_ps(vin1, vin1);
+    vmask2 = _mm_cmpord_ps(vin2, vin2);
+    vmask3 = _mm_cmpord_ps(vin3, vin3);
+
+    vout0 = _mm_and_ps(vin0, vmask0);
+    vout1 = _mm_and_ps(vin1, vmask1);
+    vout2 = _mm_and_ps(vin2, vmask2);
+    vout3 = _mm_and_ps(vin3, vmask3);
+
+    _mm_store_ps(&aSamples[i + 0], vout0);
+    _mm_store_ps(&aSamples[i + 4], vout1);
+    _mm_store_ps(&aSamples[i + 8], vout2);
+    _mm_store_ps(&aSamples[i + 12], vout3);
+  }
+  for (; i < aCount; i++) {
+    if (aSamples[i] != aSamples[i]) {
+      aSamples[i] = 0.0;
+    }
+  }
+}
+
 }  // namespace mozilla
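
NaNToZeroInPlace_SSE leans on NaN being unordered with itself: _mm_cmpord_ps(v, v) yields an all-ones lane exactly when the lane is not NaN, so ANDing the input with that mask passes finite values through and flushes NaN lanes to +0.0f; the surrounding scalar loops use the equivalent aSamples[i] != aSamples[i] test. A scalar model of one lane, as a sketch:

    #include <cstdint>
    #include <cstring>

    // Scalar model of one SSE lane of the NaN flush above.
    float FlushNaNLane(float v) {
      // _mm_cmpord_ps(v, v): all-ones iff v is ordered (not NaN).
      uint32_t mask = (v == v) ? 0xFFFFFFFFu : 0u;
      uint32_t bits;
      std::memcpy(&bits, &v, sizeof bits);
      bits &= mask;  // _mm_and_ps: NaN lanes become 0x00000000 == +0.0f
      std::memcpy(&v, &bits, sizeof v);
      return v;
    }
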
@@ -0,0 +1,35 @@
+/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* this source code form is subject to the terms of the mozilla public
+ * license, v. 2.0. if a copy of the mpl was not distributed with this file,
+ * You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "AudioNodeEngine.h"
+
+namespace mozilla {
+void AudioBufferAddWithScale_SSE(const float* aInput, float aScale,
+                                 float* aOutput, uint32_t aSize);
+
+void AudioBlockCopyChannelWithScale_SSE(const float* aInput, float aScale,
+                                        float* aOutput);
+
+void AudioBlockCopyChannelWithScale_SSE(const float aInput[WEBAUDIO_BLOCK_SIZE],
+                                        const float aScale[WEBAUDIO_BLOCK_SIZE],
+                                        float aOutput[WEBAUDIO_BLOCK_SIZE]);
+
+void AudioBufferInPlaceScale_SSE(float* aBlock, float aScale, uint32_t aSize);
+void AudioBufferInPlaceScale_SSE(float* aBlock, float* aScale, uint32_t aSize);
+
+void AudioBlockPanStereoToStereo_SSE(const float aInputL[WEBAUDIO_BLOCK_SIZE],
+                                     const float aInputR[WEBAUDIO_BLOCK_SIZE],
+                                     float aGainL, float aGainR,
+                                     bool aIsOnTheLeft,
+                                     float aOutputL[WEBAUDIO_BLOCK_SIZE],
+                                     float aOutputR[WEBAUDIO_BLOCK_SIZE]);
+
+float AudioBufferSumOfSquares_SSE(const float* aInput, uint32_t aLength);
+
+void BufferComplexMultiply_SSE(const float* aInput, const float* aScale,
+                               float* aOutput, uint32_t aSize);
+
+void NaNToZeroInPlace_SSE(float* aSamples, size_t aCount);
+}  // namespace mozilla
@ -1,10 +0,0 @@
|
|||
/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
|
||||
/* this source code form is subject to the terms of the mozilla public
|
||||
* license, v. 2.0. if a copy of the mpl was not distributed with this file,
|
||||
* You can obtain one at http://mozilla.org/MPL/2.0/. */
|
||||
|
||||
#include "AudioNodeEngineGeneric.h"
|
||||
|
||||
namespace mozilla {
|
||||
template struct Engine<xsimd::fma3<xsimd::sse4_2>>;
|
||||
} // namespace mozilla
|
|
@@ -127,23 +127,16 @@ UNIFIED_SOURCES += [
 
 if CONFIG["CPU_ARCH"] == "aarch64" or CONFIG["BUILD_ARM_NEON"]:
     DEFINES["USE_NEON"] = True
-    LOCAL_INCLUDES += ["/third_party/xsimd/include"]
     SOURCES += ["AudioNodeEngineNEON.cpp"]
     SOURCES["AudioNodeEngineNEON.cpp"].flags += CONFIG["NEON_FLAGS"]
     if CONFIG["BUILD_ARM_NEON"]:
         LOCAL_INCLUDES += ["/media/openmax_dl/dl/api/"]
 
-# Are we targeting x86 or x64? If so, build SSEX files.
+# Are we targeting x86 or x64? If so, build SSE2 files.
 if CONFIG["INTEL_ARCHITECTURE"]:
-    SOURCES += ["AudioNodeEngineSSE2.cpp", "AudioNodeEngineSSE4_2_FMA3.cpp"]
+    SOURCES += ["AudioNodeEngineSSE2.cpp"]
     DEFINES["USE_SSE2"] = True
-    DEFINES["USE_SSE4_2"] = True
-    DEFINES["USE_FMA3"] = True
-    LOCAL_INCLUDES += ["/third_party/xsimd/include"]
     SOURCES["AudioNodeEngineSSE2.cpp"].flags += CONFIG["SSE2_FLAGS"]
-    SOURCES["AudioNodeEngineSSE4_2_FMA3.cpp"].flags += (
-        CONFIG["SSE4_2_FLAGS"] + CONFIG["FMA_FLAGS"]
-    )
 
 include("/ipc/chromium/chromium-config.mozbuild")
 
@@ -147,10 +147,6 @@ bool sse4_1_enabled = has_cpuid_bits(1u, ecx, (1u << 19));
 bool sse4_2_enabled = has_cpuid_bits(1u, ecx, (1u << 20));
 #  endif
 
-#  if !defined(MOZILLA_PRESUME_FMA3)
-bool fma3_enabled = has_cpuid_bits(1u, ecx, (1u << 12));
-#  endif
-
 #  if !defined(MOZILLA_PRESUME_AVX) || !defined(MOZILLA_PRESUME_AVX2)
 static bool has_avx() {
 #  if defined(MOZILLA_PRESUME_AVX)
@@ -215,9 +215,6 @@ extern bool MFBT_DATA sse4_1_enabled;
 #  if !defined(MOZILLA_PRESUME_SSE4_2)
 extern bool MFBT_DATA sse4_2_enabled;
 #  endif
-#  if !defined(MOZILLA_PRESUME_FMA3)
-extern bool MFBT_DATA fma3_enabled;
-#  endif
 #  if !defined(MOZILLA_PRESUME_AVX)
 extern bool MFBT_DATA avx_enabled;
 #  endif
@@ -320,16 +317,6 @@ inline bool supports_sse4_2() { return sse_private::sse4_2_enabled; }
 inline bool supports_sse4_2() { return false; }
 #endif
 
-#if defined(MOZILLA_PRESUME_FMA3)
-#  define MOZILLA_MAY_SUPPORT_FMA3 1
-inline bool supports_fma3() { return true; }
-#elif defined(MOZILLA_SSE_HAVE_CPUID_DETECTION)
-#  define MOZILLA_MAY_SUPPORT_FMA3 1
-inline bool supports_fma3() { return sse_private::fma3_enabled; }
-#else
-inline bool supports_fma3() { return false; }
-#endif
-
 #if defined(MOZILLA_PRESUME_AVX)
 #  define MOZILLA_MAY_SUPPORT_AVX 1
 inline bool supports_avx() { return true; }
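
The detection code removed above reads the FMA3 flag from CPUID leaf 1: ECX bit 12, alongside the SSE4.1/4.2 bits 19 and 20 kept in the first hunk. A standalone sketch of the same check using GCC/Clang's <cpuid.h> (x86-only):

    #include <cpuid.h>

    // Leaf 1, ECX bit 12 is the FMA3 feature flag; bits 19/20 are
    // SSE4.1/SSE4.2, as in the retained has_cpuid_bits() calls.
    bool CpuHasFma3() {
      unsigned eax, ebx, ecx, edx;
      if (!__get_cpuid(1u, &eax, &ebx, &ecx, &edx)) {
        return false;  // CPUID leaf 1 not supported
      }
      return (ecx & (1u << 12)) != 0;
    }
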
@ -1,29 +0,0 @@
|
|||
Copyright (c) 2016, Johan Mabille, Sylvain Corlay, Wolf Vollprecht and Martin Renou
|
||||
Copyright (c) 2016, QuantStack
|
||||
Copyright (c) 2018, Serge Guelton
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
|
||||
* Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
* Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
* Neither the name of the copyright holder nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
@@ -1,152 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_ARITHMETIC_HPP
#define XSIMD_GENERIC_ARITHMETIC_HPP

#include <complex>
#include <type_traits>

#include "./xsimd_generic_details.hpp"

namespace xsimd
{

    namespace kernel
    {

        using namespace types;

        // bitwise_lshift
        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept
                                 { return x << y; },
                                 self, other);
        }

        // bitwise_rshift
        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept
                                 { return x >> y; },
                                 self, other);
        }

        // div
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> div(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept -> T
                                 { return x / y; },
                                 self, other);
        }

        // fma
        template <class A, class T>
        inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
        {
            return x * y + z;
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> fma(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            auto res_r = fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real()));
            auto res_i = fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag()));
            return { res_r, res_i };
        }

        // fms
        template <class A, class T>
        inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
        {
            return x * y - z;
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> fms(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            auto res_r = fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real()));
            auto res_i = fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag()));
            return { res_r, res_i };
        }

        // fnma
        template <class A, class T>
        inline batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
        {
            return -x * y + z;
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> fnma(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            auto res_r = -fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real()));
            auto res_i = -fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag()));
            return { res_r, res_i };
        }

        // fnms
        template <class A, class T>
        inline batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
        {
            return -x * y - z;
        }

        template <class A, class T>
        inline batch<std::complex<T>, A> fnms(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
        {
            auto res_r = -fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real()));
            auto res_i = -fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag()));
            return { res_r, res_i };
        }

        // mul
        template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept -> T
                                 { return x * y; },
                                 self, other);
        }

        // sadd
        template <class A>
        inline batch<float, A> sadd(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
        {
            return add(self, other); // no saturated arithmetic on floating point numbers
        }
        template <class A>
        inline batch<double, A> sadd(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
        {
            return add(self, other); // no saturated arithmetic on floating point numbers
        }

        // ssub
        template <class A>
        inline batch<float, A> ssub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
        {
            return sub(self, other); // no saturated arithmetic on floating point numbers
        }
        template <class A>
        inline batch<double, A> ssub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
        {
            return sub(self, other); // no saturated arithmetic on floating point numbers
        }

    }

}

#endif
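Note on the complex fma kernel removed above: rather than forming the complex product separately, its real part nests two fms calls and its imaginary part two fma calls. A minimal scalar sketch of that decomposition (not from the removed header; the _s helpers are illustrative stand-ins for the batch fma/fms):

#include <cmath>
#include <complex>
#include <iostream>

// Scalar stand-ins for the batch fma/fms used by the kernel.
static double fma_s(double x, double y, double z) { return std::fma(x, y, z); }
static double fms_s(double x, double y, double z) { return std::fma(x, y, -z); }

int main()
{
    std::complex<double> x(1.5, -2.0), y(0.25, 3.0), z(-1.0, 0.5);
    // Same decomposition as the generic complex fma kernel:
    // real: x.r*y.r - (x.i*y.i - z.r), imag: x.r*y.i + (x.i*y.r + z.i)
    double res_r = fms_s(x.real(), y.real(), fms_s(x.imag(), y.imag(), z.real()));
    double res_i = fma_s(x.real(), y.imag(), fma_s(x.imag(), y.real(), z.imag()));
    std::complex<double> expected = x * y + z;
    // Both differences print as (approximately) zero.
    std::cout << res_r - expected.real() << ' ' << res_i - expected.imag() << '\n';
}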
@@ -1,96 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_COMPLEX_HPP
#define XSIMD_GENERIC_COMPLEX_HPP

#include <complex>

#include "./xsimd_generic_details.hpp"

namespace xsimd
{

    namespace kernel
    {

        using namespace types;

        // real
        template <class A, class T>
        inline batch<T, A> real(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return self;
        }

        template <class A, class T>
        inline batch<T, A> real(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
        {
            return self.real();
        }

        // imag
        template <class A, class T>
        inline batch<T, A> imag(batch<T, A> const& /*self*/, requires_arch<generic>) noexcept
        {
            return batch<T, A>(T(0));
        }

        template <class A, class T>
        inline batch<T, A> imag(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
        {
            return self.imag();
        }

        // arg
        template <class A, class T>
        inline real_batch_type_t<batch<T, A>> arg(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return atan2(imag(self), real(self));
        }

        // conj
        template <class A, class T>
        inline complex_batch_type_t<batch<T, A>> conj(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return { real(self), -imag(self) };
        }

        // norm
        template <class A, class T>
        inline real_batch_type_t<batch<T, A>> norm(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return { fma(real(self), real(self), imag(self) * imag(self)) };
        }

        // proj
        template <class A, class T>
        inline complex_batch_type_t<batch<T, A>> proj(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = complex_batch_type_t<batch<T, A>>;
            using real_batch = typename batch_type::real_batch;
            using real_value_type = typename real_batch::value_type;
            auto cond = xsimd::isinf(real(self)) || xsimd::isinf(imag(self));
            return select(cond,
                          batch_type(constants::infinity<real_batch>(),
                                     copysign(real_batch(real_value_type(0)), imag(self))),
                          batch_type(self));
        }

        template <class A, class T>
        inline batch_bool<T, A> isnan(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
        {
            return batch_bool<T, A>(isnan(self.real()) || isnan(self.imag()));
        }
    }
}

#endif
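The proj kernel above mirrors std::proj lane by lane: any operand with an infinite component collapses to positive infinity, with a zero imaginary part carrying the sign of imag(self). A scalar sketch under that reading (proj_s is a hypothetical helper, not part of the header):

#include <cmath>
#include <complex>
#include <iostream>
#include <limits>

// Scalar rendition of the generic proj kernel: any complex number with an
// infinite component maps to (+inf, copysign(0, imag)).
static std::complex<double> proj_s(std::complex<double> z)
{
    if (std::isinf(z.real()) || std::isinf(z.imag()))
        return { std::numeric_limits<double>::infinity(),
                 std::copysign(0.0, z.imag()) };
    return z;
}

int main()
{
    std::complex<double> z(-std::numeric_limits<double>::infinity(), -2.0);
    std::cout << proj_s(z).real() << ' ' << proj_s(z).imag() << '\n'; // inf -0
    // Matches the standard library's definition:
    std::cout << std::proj(z).real() << ' ' << std::proj(z).imag() << '\n';
}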
@@ -1,239 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_DETAILS_HPP
#define XSIMD_GENERIC_DETAILS_HPP

#include <complex>

#include "../../math/xsimd_rem_pio2.hpp"
#include "../../types/xsimd_generic_arch.hpp"
#include "../../types/xsimd_utils.hpp"
#include "../xsimd_constants.hpp"

namespace xsimd
{
    // Forward declarations. Should we put them in a separate file?
    template <class T, class A>
    inline batch<T, A> abs(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> abs(batch<std::complex<T>, A> const& self) noexcept;
    template <class T, class A>
    inline bool any(batch_bool<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other) noexcept;
    template <class A, class T_out, class T_in>
    inline batch<T_out, A> batch_cast(batch<T_in, A> const&, batch<T_out, A> const& out) noexcept;
    template <class T, class A>
    inline batch<T, A> bitofsign(batch<T, A> const& self) noexcept;
    template <class B, class T, class A>
    inline B bitwise_cast(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> cos(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> cosh(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> exp(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
    template <class T, class A>
    inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
    template <class T, class A>
    inline batch<T, A> frexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
    template <class T, class A, uint64_t... Coefs>
    inline batch<T, A> horner(const batch<T, A>& self) noexcept;
    template <class T, class A>
    inline batch<T, A> hypot(const batch<T, A>& self) noexcept;
    template <class T, class A>
    inline batch_bool<T, A> is_even(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch_bool<T, A> is_flint(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch_bool<T, A> is_odd(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch_bool<T, A> isinf(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline typename batch<T, A>::batch_bool_type isnan(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
    template <class T, class A>
    inline batch<T, A> log(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> nearbyint(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<as_integer_t<T>, A> nearbyint_as_int(const batch<T, A>& x) noexcept;
    template <class T, class A>
    inline T reduce_add(batch<T, A> const&) noexcept;
    template <class T, class A>
    inline batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
    template <class T, class A>
    inline batch<std::complex<T>, A> select(batch_bool<T, A> const&, batch<std::complex<T>, A> const&, batch<std::complex<T>, A> const&) noexcept;
    template <class T, class A>
    inline batch<T, A> sign(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> signnz(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> sin(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> sinh(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> sqrt(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> tan(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<as_float_t<T>, A> to_float(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<as_integer_t<T>, A> to_int(batch<T, A> const& self) noexcept;
    template <class T, class A>
    inline batch<T, A> trunc(batch<T, A> const& self) noexcept;

    namespace kernel
    {

        namespace detail
        {
            template <class F, class A, class T, class... Batches>
            inline batch<T, A> apply(F&& func, batch<T, A> const& self, batch<T, A> const& other) noexcept
            {
                constexpr std::size_t size = batch<T, A>::size;
                alignas(A::alignment()) T self_buffer[size];
                alignas(A::alignment()) T other_buffer[size];
                self.store_aligned(&self_buffer[0]);
                other.store_aligned(&other_buffer[0]);
                for (std::size_t i = 0; i < size; ++i)
                {
                    self_buffer[i] = func(self_buffer[i], other_buffer[i]);
                }
                return batch<T, A>::load_aligned(self_buffer);
            }

            template <class U, class F, class A, class T>
            inline batch<U, A> apply_transform(F&& func, batch<T, A> const& self) noexcept
            {
                static_assert(batch<T, A>::size == batch<U, A>::size,
                              "Source and destination sizes must match");
                constexpr std::size_t src_size = batch<T, A>::size;
                constexpr std::size_t dest_size = batch<U, A>::size;
                alignas(A::alignment()) T self_buffer[src_size];
                alignas(A::alignment()) U other_buffer[dest_size];
                self.store_aligned(&self_buffer[0]);
                for (std::size_t i = 0; i < src_size; ++i)
                {
                    other_buffer[i] = func(self_buffer[i]);
                }
                return batch<U, A>::load_aligned(other_buffer);
            }
        }

        namespace detail
        {
            // Generic conversion handling machinery. Each architecture must define
            // a conversion function when such a conversion exists in the form of an
            // intrinsic. We then use that information to automatically decide whether
            // to use scalar or vector conversion when doing load / store / batch_cast.
            struct with_fast_conversion
            {
            };
            struct with_slow_conversion
            {
            };

            template <class A, class From, class To, class = void>
            struct conversion_type_impl
            {
                using type = with_slow_conversion;
            };

            using xsimd::detail::void_t;

            template <class A, class From, class To>
            struct conversion_type_impl<A, From, To,
                                        void_t<decltype(fast_cast(std::declval<const batch<From, A>&>(),
                                                                  std::declval<const batch<To, A>&>(),
                                                                  std::declval<const A&>()))>>
            {
                using type = with_fast_conversion;
            };

            template <class A, class From, class To>
            using conversion_type = typename conversion_type_impl<A, From, To>::type;
        }

        namespace detail
        {
            /* origin: boost/simdfunction/horn.hpp*/
            /*
             * ====================================================
             * copyright 2016 NumScale SAS
             *
             * Distributed under the Boost Software License, Version 1.0.
             * (See copy at http://boost.org/LICENSE_1_0.txt)
             * ====================================================
             */
            template <class B, uint64_t c>
            inline B coef() noexcept
            {
                using value_type = typename B::value_type;
                return B(bit_cast<value_type>(as_unsigned_integer_t<value_type>(c)));
            }
            template <class B>
            inline B horner(const B&) noexcept
            {
                return B(typename B::value_type(0.));
            }

            template <class B, uint64_t c0>
            inline B horner(const B&) noexcept
            {
                return coef<B, c0>();
            }

            template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
            inline B horner(const B& self) noexcept
            {
                return fma(self, horner<B, c1, args...>(self), coef<B, c0>());
            }

            /* origin: boost/simdfunction/horn1.hpp*/
            /*
             * ====================================================
             * copyright 2016 NumScale SAS
             *
             * Distributed under the Boost Software License, Version 1.0.
             * (See copy at http://boost.org/LICENSE_1_0.txt)
             * ====================================================
             */
            template <class B>
            inline B horner1(const B&) noexcept
            {
                return B(1.);
            }

            template <class B, uint64_t c0>
            inline B horner1(const B& x) noexcept
            {
                return x + detail::coef<B, c0>();
            }

            template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
            inline B horner1(const B& x) noexcept
            {
                return fma(x, horner1<B, c1, args...>(x), detail::coef<B, c0>());
            }
        }

    }

}

#endif
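The coef/horner helpers above carry polynomial coefficients as raw IEEE-754 bit patterns in the template parameter list and unroll into a fused-multiply-add Horner chain. A scalar sketch of the same idea (coef_s and horner3_s are illustrative names, and std::memcpy stands in for bit_cast):

#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>

// Decode a coefficient carried as a raw IEEE-754 bit pattern.
static double coef_s(uint64_t bits)
{
    double d;
    std::memcpy(&d, &bits, sizeof d);
    return d;
}

// horner<c0, c1, c2>(x) unrolls to fma(x, fma(x, c2, c1), c0),
// i.e. c0 + c1*x + c2*x^2 evaluated in Horner form.
static double horner3_s(double x, uint64_t c0, uint64_t c1, uint64_t c2)
{
    return std::fma(x, std::fma(x, coef_s(c2), coef_s(c1)), coef_s(c0));
}

int main()
{
    // 1.0, 2.0 and 3.0 as bit patterns: p(x) = 1 + 2x + 3x^2.
    double p = horner3_s(2.0, 0x3ff0000000000000ull,
                         0x4000000000000000ull, 0x4008000000000000ull);
    std::cout << p << '\n'; // 17
}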
@@ -1,163 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_LOGICAL_HPP
#define XSIMD_GENERIC_LOGICAL_HPP

#include "./xsimd_generic_details.hpp"

namespace xsimd
{

    namespace kernel
    {

        using namespace types;

        // from mask
        template <class A, class T>
        inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
            // This is inefficient but should never be called. It's just a
            // temporary implementation until arm support is added.
            for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
                buffer[i] = mask & (1ull << i);
            return batch_bool<T, A>::load_aligned(buffer);
        }

        // ge
        template <class A, class T>
        inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return other <= self;
        }

        // gt
        template <class A, class T>
        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return other < self;
        }

        // is_even
        template <class A, class T>
        inline batch_bool<T, A> is_even(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return is_flint(self * T(0.5));
        }

        // is_flint
        template <class A, class T>
        inline batch_bool<T, A> is_flint(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            auto frac = select(isnan(self - self), constants::nan<batch<T, A>>(), self - trunc(self));
            return frac == T(0.);
        }

        // is_odd
        template <class A, class T>
        inline batch_bool<T, A> is_odd(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return is_even(self - T(1.));
        }

        // isinf
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> isinf(batch<T, A> const&, requires_arch<generic>) noexcept
        {
            return batch_bool<T, A>(false);
        }
        template <class A>
        inline batch_bool<float, A> isinf(batch<float, A> const& self, requires_arch<generic>) noexcept
        {
            return abs(self) == std::numeric_limits<float>::infinity();
        }
        template <class A>
        inline batch_bool<double, A> isinf(batch<double, A> const& self, requires_arch<generic>) noexcept
        {
            return abs(self) == std::numeric_limits<double>::infinity();
        }

        // isfinite
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> isfinite(batch<T, A> const&, requires_arch<generic>) noexcept
        {
            return batch_bool<T, A>(true);
        }
        template <class A>
        inline batch_bool<float, A> isfinite(batch<float, A> const& self, requires_arch<generic>) noexcept
        {
            return (self - self) == 0.f;
        }
        template <class A>
        inline batch_bool<double, A> isfinite(batch<double, A> const& self, requires_arch<generic>) noexcept
        {
            return (self - self) == 0.;
        }

        // isnan
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> isnan(batch<T, A> const&, requires_arch<generic>) noexcept
        {
            return batch_bool<T, A>(false);
        }

        // le
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return (self < other) || (self == other);
        }

        // neq
        template <class A, class T>
        inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return !(other == self);
        }

        // logical_and
        template <class A, class T>
        inline batch<T, A> logical_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept
                                 { return x && y; },
                                 self, other);
        }

        // logical_or
        template <class A, class T>
        inline batch<T, A> logical_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            return detail::apply([](T x, T y) noexcept
                                 { return x || y; },
                                 self, other);
        }

        // mask
        template <class A, class T>
        inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
            self.store_aligned(buffer);
            // This is inefficient but should never be called. It's just a
            // temporary implementation until arm support is added.
            uint64_t res = 0;
            for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
                if (buffer[i])
                    res |= 1ull << i;
            return res;
        }
    }
}

#endif
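The mask/from_mask pair above fixes the generic bool-batch to integer-mask mapping: lane i corresponds to bit i. (The shift constant is written 1ull here; the removed file used 1ul, which is only 32 bits wide on some platforms.) A scalar sketch of that round trip for a hypothetical 4-lane batch, using plain arrays instead of the xsimd types:

#include <array>
#include <cstdint>
#include <iostream>

constexpr std::size_t lanes = 4;

// Lane i of the bool "batch" maps to bit i of the mask, as in mask() above.
static uint64_t to_mask(const std::array<bool, lanes>& b)
{
    uint64_t res = 0;
    for (std::size_t i = 0; i < lanes; ++i)
        if (b[i])
            res |= 1ull << i;
    return res;
}

// Inverse mapping, as in from_mask() above.
static std::array<bool, lanes> from_mask(uint64_t mask)
{
    std::array<bool, lanes> b {};
    for (std::size_t i = 0; i < lanes; ++i)
        b[i] = mask & (1ull << i);
    return b;
}

int main()
{
    std::array<bool, lanes> b { true, false, true, true };
    uint64_t m = to_mask(b);                  // 0b1101
    std::cout << m << '\n';                   // 13
    std::cout << (from_mask(m) == b) << '\n'; // round-trips: prints 1
}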
File diff not shown because it is too large
@@ -1,397 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_MEMORY_HPP
#define XSIMD_GENERIC_MEMORY_HPP

#include <algorithm>
#include <cassert>
#include <complex>
#include <stdexcept>

#include "../../types/xsimd_batch_constant.hpp"
#include "./xsimd_generic_details.hpp"

namespace xsimd
{
    template <class batch_type, typename batch_type::value_type... Values>
    struct batch_constant;

    namespace kernel
    {

        using namespace types;

        // extract_pair
        template <class A, class T>
        inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<generic>) noexcept
        {
            constexpr std::size_t size = batch<T, A>::size;
            assert(i < size && "index in bounds");

            alignas(A::alignment()) T self_buffer[size];
            self.store_aligned(self_buffer);

            alignas(A::alignment()) T other_buffer[size];
            other.store_aligned(other_buffer);

            alignas(A::alignment()) T concat_buffer[size];

            for (std::size_t j = 0; j < (size - i); ++j)
            {
                concat_buffer[j] = other_buffer[i + j];
                if (j < i)
                {
                    concat_buffer[size - 1 - j] = self_buffer[i - 1 - j];
                }
            }
            return batch<T, A>::load_aligned(concat_buffer);
        }

        // gather
        namespace detail
        {
            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
            inline batch<T, A> gather(U const* src, batch<V, A> const& index,
                                      ::xsimd::index<N> I) noexcept
            {
                return insert(batch<T, A> {}, static_cast<T>(src[index.get(I)]), I);
            }

            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
            inline batch<T, A>
            gather(U const* src, batch<V, A> const& index, ::xsimd::index<N> I) noexcept
            {
                static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");

                const auto test = gather<N - 1, T, A>(src, index, {});
                return insert(test, static_cast<T>(src[index.get(I)]), I);
            }
        } // namespace detail

        template <typename T, typename A, typename V>
        inline batch<T, A>
        gather(batch<T, A> const&, T const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
        }

        // Gather with runtime indexes and mismatched strides.
        template <typename T, typename A, typename U, typename V>
        inline detail::sizes_mismatch_t<T, U, batch<T, A>>
        gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
        }

        // Gather with runtime indexes and matching strides.
        template <typename T, typename A, typename U, typename V>
        inline detail::stride_match_t<T, U, batch<T, A>>
        gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return batch_cast<T>(kernel::gather(batch<U, A> {}, src, index, A {}));
        }

        // insert
        template <class A, class T, size_t I>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept
        {
            struct index_mask
            {
                static constexpr bool get(size_t index, size_t /* size*/)
                {
                    return index != I;
                }
            };
            batch<T, A> tmp(val);
            return select(make_batch_bool_constant<batch<T, A>, index_mask>(), self, tmp);
        }

        // get
        template <class A, size_t I, class T>
        inline T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, size_t I, class T>
        inline T get(batch_bool<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch_bool<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, size_t I, class T>
        inline auto get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
        {
            alignas(A::alignment()) T buffer[batch<std::complex<T>, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, class T>
        inline T get(batch<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        template <class A, class T>
        inline T get(batch_bool<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        template <class A, class T>
        inline auto get(batch<std::complex<T>, A> const& self, std::size_t i, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
        {
            using T2 = typename batch<std::complex<T>, A>::value_type;
            alignas(A::alignment()) T2 buffer[batch<std::complex<T>, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        // load_aligned
        namespace detail
        {
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
            {
                using batch_type_in = batch<T_in, A>;
                using batch_type_out = batch<T_out, A>;
                return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A {});
            }
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_slow_conversion) noexcept
            {
                static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
                using batch_type_out = batch<T_out, A>;
                alignas(A::alignment()) T_out buffer[batch_type_out::size];
                std::copy(mem, mem + batch_type_out::size, std::begin(buffer));
                return batch_type_out::load_aligned(buffer);
            }
        }
        template <class A, class T_in, class T_out>
        inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
        {
            return detail::load_aligned<A>(mem, cvt, A {}, detail::conversion_type<A, T_in, T_out> {});
        }

        // load_unaligned
        namespace detail
        {
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
            {
                using batch_type_in = batch<T_in, A>;
                using batch_type_out = batch<T_out, A>;
                return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A {});
            }

            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>, with_slow_conversion) noexcept
            {
                static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
                return load_aligned<A>(mem, cvt, generic {}, with_slow_conversion {});
            }
        }
        template <class A, class T_in, class T_out>
        inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
        {
            return detail::load_unaligned<A>(mem, cvt, generic {}, detail::conversion_type<A, T_in, T_out> {});
        }

        namespace detail
        {
            // Scatter with runtime indexes.
            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
            inline void scatter(batch<T, A> const& src, U* dst,
                                batch<V, A> const& index,
                                ::xsimd::index<N> I) noexcept
            {
                dst[index.get(I)] = static_cast<U>(src.get(I));
            }

            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
            inline void
            scatter(batch<T, A> const& src, U* dst, batch<V, A> const& index,
                    ::xsimd::index<N> I) noexcept
            {
                static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");

                kernel::detail::scatter<N - 1, T, A, U, V>(
                    src, dst, index, {});
                dst[index.get(I)] = static_cast<U>(src.get(I));
            }
        } // namespace detail

        template <typename A, typename T, typename V>
        inline void
        scatter(batch<T, A> const& src, T* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            kernel::detail::scatter<batch<V, A>::size - 1, T, A, T, V>(
                src, dst, index, {});
        }

        template <typename A, typename T, typename U, typename V>
        inline detail::sizes_mismatch_t<T, U, void>
        scatter(batch<T, A> const& src, U* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            kernel::detail::scatter<batch<V, A>::size - 1, T, A, U, V>(
                src, dst, index, {});
        }

        template <typename A, typename T, typename U, typename V>
        inline detail::stride_match_t<T, U, void>
        scatter(batch<T, A> const& src, U* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            const auto tmp = batch_cast<U>(src);
            kernel::scatter<A>(tmp, dst, index, A {});
        }

        // store
        template <class T, class A>
        inline void store(batch_bool<T, A> const& self, bool* mem, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            constexpr auto size = batch_bool<T, A>::size;
            alignas(A::alignment()) T buffer[size];
            kernel::store_aligned<A>(&buffer[0], batch_type(self), A {});
            for (std::size_t i = 0; i < size; ++i)
                mem[i] = bool(buffer[i]);
        }

        // store_aligned
        template <class A, class T_in, class T_out>
        inline void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
        {
            static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
            alignas(A::alignment()) T_in buffer[batch<T_in, A>::size];
            store_aligned(&buffer[0], self);
            std::copy(std::begin(buffer), std::end(buffer), mem);
        }

        // store_unaligned
        template <class A, class T_in, class T_out>
        inline void store_unaligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
        {
            static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
            return store_aligned<A>(mem, self, generic {});
        }

        // swizzle
        template <class A, class T, class ITy, ITy... Vs>
        inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<batch<ITy, A>, Vs...> mask, requires_arch<generic>) noexcept
        {
            return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
        }

        namespace detail
        {
            template <class A, class T>
            inline batch<std::complex<T>, A> load_complex(batch<T, A> const& /*hi*/, batch<T, A> const& /*lo*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "load_complex not implemented for the required architecture");
            }

            template <class A, class T>
            inline batch<T, A> complex_high(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "complex_high not implemented for the required architecture");
            }

            template <class A, class T>
            inline batch<T, A> complex_low(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "complex_low not implemented for the required architecture");
            }
        }

        // load_complex_aligned
        template <class A, class T_out, class T_in>
        inline batch<std::complex<T_out>, A> load_complex_aligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_out, A>;
            T_in const* buffer = reinterpret_cast<T_in const*>(mem);
            real_batch hi = real_batch::load_aligned(buffer),
                       lo = real_batch::load_aligned(buffer + real_batch::size);
            return detail::load_complex(hi, lo, A {});
        }

        // load_complex_unaligned
        template <class A, class T_out, class T_in>
        inline batch<std::complex<T_out>, A> load_complex_unaligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_out, A>;
            T_in const* buffer = reinterpret_cast<T_in const*>(mem);
            real_batch hi = real_batch::load_unaligned(buffer),
                       lo = real_batch::load_unaligned(buffer + real_batch::size);
            return detail::load_complex(hi, lo, A {});
        }

        // store_complex_aligned
        template <class A, class T_out, class T_in>
        inline void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_in, A>;
            real_batch hi = detail::complex_high(src, A {});
            real_batch lo = detail::complex_low(src, A {});
            T_out* buffer = reinterpret_cast<T_out*>(dst);
            lo.store_aligned(buffer);
            hi.store_aligned(buffer + real_batch::size);
        }

        // store_complex_unaligned
        template <class A, class T_out, class T_in>
        inline void store_complex_unaligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_in, A>;
            real_batch hi = detail::complex_high(src, A {});
            real_batch lo = detail::complex_low(src, A {});
            T_out* buffer = reinterpret_cast<T_out*>(dst);
            lo.store_unaligned(buffer);
            hi.store_unaligned(buffer + real_batch::size);
        }

    }

}

#endif
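The gather and scatter kernels above recurse over lanes, but their net effect is the usual indexed copy: gather reads dst[i] = src[index[i]] and scatter writes dst[index[i]] = src[i]. A scalar sketch of those semantics (plain arrays stand in for batches):

#include <array>
#include <cstddef>
#include <iostream>

constexpr std::size_t lanes = 4;

int main()
{
    std::array<double, 8> table { 10, 11, 12, 13, 14, 15, 16, 17 };
    std::array<std::size_t, lanes> index { 7, 2, 5, 0 };

    std::array<double, lanes> gathered {};
    for (std::size_t i = 0; i < lanes; ++i) // gather: read through the index
        gathered[i] = table[index[i]];

    std::array<double, 8> out {};
    for (std::size_t i = 0; i < lanes; ++i) // scatter: write through the index
        out[index[i]] = gathered[i];

    for (double v : gathered)
        std::cout << v << ' ';              // 17 12 15 10
    std::cout << '\n';
}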
@@ -1,72 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_ROUNDING_HPP
#define XSIMD_GENERIC_ROUNDING_HPP

#include "./xsimd_generic_details.hpp"

namespace xsimd
{

    namespace kernel
    {

        using namespace types;

        // ceil
        template <class A, class T>
        inline batch<T, A> ceil(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            batch<T, A> truncated_self = trunc(self);
            return select(truncated_self < self, truncated_self + 1, truncated_self);
        }

        // floor
        template <class A, class T>
        inline batch<T, A> floor(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            batch<T, A> truncated_self = trunc(self);
            return select(truncated_self > self, truncated_self - 1, truncated_self);
        }

        // round
        template <class A, class T>
        inline batch<T, A> round(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            auto v = abs(self);
            auto c = ceil(v);
            auto cp = select(c - 0.5 > v, c - 1, c);
            return select(v > constants::maxflint<batch<T, A>>(), self, copysign(cp, self));
        }

        // trunc
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> trunc(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            return self;
        }
        template <class A>
        inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<generic>) noexcept
        {
            return select(abs(self) < constants::maxflint<batch<float, A>>(), to_float(to_int(self)), self);
        }
        template <class A>
        inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<generic>) noexcept
        {
            return select(abs(self) < constants::maxflint<batch<double, A>>(), to_float(to_int(self)), self);
        }

    }

}

#endif
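The round kernel above implements round-half-away-from-zero: it takes the ceiling of the absolute value, steps back by one unless the value lies within half a unit of that ceiling, and restores the sign with copysign. A scalar sketch of the same logic (round_s is an illustrative name; the maxflint guard for values too large to carry a fractional part is omitted):

#include <cmath>
#include <iostream>

// Round half away from zero, built from ceil on the absolute value
// plus a sign copy, mirroring the generic kernel.
static double round_s(double x)
{
    double v = std::fabs(x);
    double c = std::ceil(v);
    double cp = (c - 0.5 > v) ? c - 1.0 : c; // keep c only if v >= c - 0.5
    return std::copysign(cp, x);
}

int main()
{
    std::cout << round_s(2.5) << ' '   // 3
              << round_s(-2.5) << ' '  // -3
              << round_s(2.4) << '\n'; // 2
}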
@@ -1,969 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_TRIGO_HPP
#define XSIMD_GENERIC_TRIGO_HPP

#include "./xsimd_generic_details.hpp"

#include <array>

namespace xsimd
{

    namespace kernel
    {
        /* origin: boost/simd/arch/common/detail/simd/trig_base.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */

        using namespace types;

        // acos
        template <class A, class T>
        inline batch<T, A> acos(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            batch_type x = abs(self);
            auto x_larger_05 = x > batch_type(0.5);
            x = select(x_larger_05, sqrt(fma(batch_type(-0.5), x, batch_type(0.5))), self);
            x = asin(x);
            x = select(x_larger_05, x + x, x);
            x = select(self < batch_type(-0.5), constants::pi<batch_type>() - x, x);
            return select(x_larger_05, x, constants::pio2<batch_type>() - x);
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> acos(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            using real_batch = typename batch_type::real_batch;
            batch_type tmp = asin(z);
            return { constants::pio2<real_batch>() - tmp.real(), -tmp.imag() };
        }

        // acosh
        /* origin: boost/simd/arch/common/simd/function/acosh.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */
        template <class A, class T>
        inline batch<T, A> acosh(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            batch_type x = self - batch_type(1.);
            auto test = x > constants::oneotwoeps<batch_type>();
            batch_type z = select(test, self, x + sqrt(x + x + x * x));
            batch_type l1pz = log1p(z);
            return select(test, l1pz + constants::log_2<batch_type>(), l1pz);
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> acosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            batch_type w = acos(z);
            w = batch_type(-w.imag(), w.real());
            return w;
        }

        // asin
        template <class A>
        inline batch<float, A> asin(batch<float, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<float, A>;
            batch_type x = abs(self);
            batch_type sign = bitofsign(self);
            auto x_larger_05 = x > batch_type(0.5);
            batch_type z = select(x_larger_05, batch_type(0.5) * (batch_type(1.) - x), x * x);
            x = select(x_larger_05, sqrt(z), x);
            batch_type z1 = detail::horner<batch_type,
                                           0x3e2aaae4,
                                           0x3d9980f6,
                                           0x3d3a3ec7,
                                           0x3cc617e3,
                                           0x3d2cb352>(z);
            z1 = fma(z1, z * x, x);
            z = select(x_larger_05, constants::pio2<batch_type>() - (z1 + z1), z1);
            return z ^ sign;
        }
        template <class A>
        inline batch<double, A> asin(batch<double, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<double, A>;
            batch_type x = abs(self);
            auto small_cond = x < constants::sqrteps<batch_type>();
            batch_type ct1 = batch_type(bit_cast<double>(int64_t(0x3fe4000000000000)));
            batch_type zz1 = batch_type(1.) - x;
            batch_type vp = zz1 * detail::horner<batch_type, 0x403c896240f3081dull, 0xc03991aaac01ab68ull, 0x401bdff5baf33e6aull, 0xbfe2079259f9290full, 0x3f684fc3988e9f08ull>(zz1) / detail::horner1<batch_type, 0x40756709b0b644beull, 0xc077fe08959063eeull, 0x40626219af6a7f42ull, 0xc035f2a2b6bf5d8cull>(zz1);
            zz1 = sqrt(zz1 + zz1);
            batch_type z = constants::pio4<batch_type>() - zz1;
            zz1 = fms(zz1, vp, constants::pio_2lo<batch_type>());
            z = z - zz1;
            zz1 = z + constants::pio4<batch_type>();
            batch_type zz2 = self * self;
            z = zz2 * detail::horner<batch_type, 0xc020656c06ceafd5ull, 0x40339007da779259ull, 0xc0304331de27907bull, 0x4015c74b178a2dd9ull, 0xbfe34341333e5c16ull, 0x3f716b9b0bd48ad3ull>(zz2) / detail::horner1<batch_type, 0xc04898220a3607acull, 0x4061705684ffbf9dull, 0xc06265bb6d3576d7ull, 0x40519fc025fe9054ull, 0xc02d7b590b5e0eabull>(zz2);
            zz2 = fma(x, z, x);
            return select(x > batch_type(1.), constants::nan<batch_type>(),
                          select(small_cond, x,
                                 select(x > ct1, zz1, zz2))
                              ^ bitofsign(self));
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> asin(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            using real_batch = typename batch_type::real_batch;
            real_batch x = z.real();
            real_batch y = z.imag();

            batch_type ct(-y, x);
            batch_type zz(real_batch(1.) - (x - y) * (x + y), -2 * x * y);
            zz = log(ct + sqrt(zz));
            batch_type resg(zz.imag(), -zz.real());

            return select(y == real_batch(0.),
                          select(fabs(x) > real_batch(1.),
                                 batch_type(constants::pio2<real_batch>(), real_batch(0.)),
                                 batch_type(asin(x), real_batch(0.))),
                          resg);
        }

        // asinh
        /* origin: boost/simd/arch/common/simd/function/asinh.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */
        namespace detail
        {
            template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
            inline batch<T, A>
            average(const batch<T, A>& x1, const batch<T, A>& x2) noexcept
            {
                return (x1 & x2) + ((x1 ^ x2) >> 1);
            }

            template <class A, class T>
            inline batch<T, A>
            averagef(const batch<T, A>& x1, const batch<T, A>& x2) noexcept
            {
                using batch_type = batch<T, A>;
                return fma(x1, batch_type(0.5), x2 * batch_type(0.5));
            }
            template <class A>
            inline batch<float, A> average(batch<float, A> const& x1, batch<float, A> const& x2) noexcept
            {
                return averagef(x1, x2);
            }
            template <class A>
            inline batch<double, A> average(batch<double, A> const& x1, batch<double, A> const& x2) noexcept
            {
                return averagef(x1, x2);
            }
        }
        template <class A>
        inline batch<float, A> asinh(batch<float, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<float, A>;
            batch_type x = abs(self);
            auto lthalf = x < batch_type(0.5);
            batch_type x2 = x * x;
            batch_type bts = bitofsign(self);
            batch_type z(0.);
            if (any(lthalf))
            {
                z = detail::horner<batch_type,
                                   0x3f800000,
                                   0xbe2aa9ad,
                                   0x3d9949b1,
                                   0xbd2ee581,
                                   0x3ca4d6e6>(x2)
                    * x;
                if (all(lthalf))
                    return z ^ bts;
            }
            batch_type tmp = select(x > constants::oneosqrteps<batch_type>(), x, detail::average(x, hypot(batch_type(1.), x)));
#ifndef XSIMD_NO_NANS
            return select(isnan(self), constants::nan<batch_type>(), select(lthalf, z, log(tmp) + constants::log_2<batch_type>()) ^ bts);
#else
            return select(lthalf, z, log(tmp) + constants::log_2<batch_type>()) ^ bts;
#endif
        }
        template <class A>
        inline batch<double, A> asinh(batch<double, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<double, A>;
            batch_type x = abs(self);
            auto test = x > constants::oneosqrteps<batch_type>();
            batch_type z = select(test, x - batch_type(1.), x + x * x / (batch_type(1.) + hypot(batch_type(1.), x)));
#ifndef XSIMD_NO_INFINITIES
            z = select(x == constants::infinity<batch_type>(), x, z);
#endif
            batch_type l1pz = log1p(z);
            z = select(test, l1pz + constants::log_2<batch_type>(), l1pz);
            return bitofsign(self) ^ z;
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> asinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            batch_type w = asin(batch_type(-z.imag(), z.real()));
            w = batch_type(w.imag(), -w.real());
            return w;
        }

        // atan
        namespace detail
        {
            template <class A>
            static inline batch<float, A> kernel_atan(const batch<float, A>& x, const batch<float, A>& recx) noexcept
            {
                using batch_type = batch<float, A>;
                const auto flag1 = x < constants::tan3pio8<batch_type>();
                const auto flag2 = (x >= batch_type(bit_cast<float>((uint32_t)0x3ed413cd))) && flag1;
                batch_type yy = select(flag1, batch_type(0.), constants::pio2<batch_type>());
                yy = select(flag2, constants::pio4<batch_type>(), yy);
                batch_type xx = select(flag1, x, -recx);
                xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx);
                const batch_type z = xx * xx;
                batch_type z1 = detail::horner<batch_type,
                                               0xbeaaaa2aul,
                                               0x3e4c925ful,
                                               0xbe0e1b85ul,
                                               0x3da4f0d1ul>(z);
                z1 = fma(xx, z1 * z, xx);
                z1 = select(flag2, z1 + constants::pio_4lo<batch_type>(), z1);
                z1 = select(!flag1, z1 + constants::pio_2lo<batch_type>(), z1);
                return yy + z1;
            }
            template <class A>
            static inline batch<double, A> kernel_atan(const batch<double, A>& x, const batch<double, A>& recx) noexcept
            {
                using batch_type = batch<double, A>;
                const auto flag1 = x < constants::tan3pio8<batch_type>();
                const auto flag2 = (x >= constants::tanpio8<batch_type>()) && flag1;
                batch_type yy = select(flag1, batch_type(0.), constants::pio2<batch_type>());
                yy = select(flag2, constants::pio4<batch_type>(), yy);
                batch_type xx = select(flag1, x, -recx);
                xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx);
                batch_type z = xx * xx;
                z *= detail::horner<batch_type,
                                    0xc0503669fd28ec8eull,
                                    0xc05eb8bf2d05ba25ull,
                                    0xc052c08c36880273ull,
                                    0xc03028545b6b807aull,
                                    0xbfec007fa1f72594ull>(z)
                    / detail::horner1<batch_type,
                                      0x4068519efbbd62ecull,
                                      0x407e563f13b049eaull,
                                      0x407b0e18d2e2be3bull,
                                      0x4064a0dd43b8fa25ull,
                                      0x4038dbc45b14603cull>(z);
                z = fma(xx, z, xx);
                z = select(flag2, z + constants::pio_4lo<batch_type>(), z);
                z = z + select(flag1, batch_type(0.), constants::pio_2lo<batch_type>());
                return yy + z;
            }
        }
        template <class A, class T>
        inline batch<T, A> atan(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            const batch_type absa = abs(self);
            const batch_type x = detail::kernel_atan(absa, batch_type(1.) / absa);
            return x ^ bitofsign(self);
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> atan(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            using real_batch = typename batch_type::real_batch;
            real_batch x = z.real();
            real_batch y = z.imag();
            real_batch x2 = x * x;
            real_batch one(1.);
            real_batch a = one - x2 - (y * y);
            real_batch w = 0.5 * atan2(2. * x, a);
            real_batch num = y + one;
            num = x2 + num * num;
            real_batch den = y - one;
            den = x2 + den * den;
            batch_type res = select((x == real_batch(0.)) && (y == real_batch(1.)),
                                    batch_type(real_batch(0.), constants::infinity<real_batch>()),
                                    batch_type(w, 0.25 * log(num / den)));
            return res;
        }

        // atanh
        /* origin: boost/simd/arch/common/simd/function/acosh.hpp */
        /*
         * ====================================================
         * copyright 2016 NumScale SAS
         *
         * Distributed under the Boost Software License, Version 1.0.
         * (See copy at http://boost.org/LICENSE_1_0.txt)
         * ====================================================
         */
        template <class A, class T>
        inline batch<T, A> atanh(batch<T, A> const& self, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            batch_type x = abs(self);
            batch_type t = x + x;
            batch_type z = batch_type(1.) - x;
            auto test = x < batch_type(0.5);
            batch_type tmp = select(test, x, t) / z;
            return bitofsign(self) ^ (batch_type(0.5) * log1p(select(test, fma(t, tmp, t), tmp)));
        }
        template <class A, class T>
        inline batch<std::complex<T>, A> atanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
        {
            using batch_type = batch<std::complex<T>, A>;
            batch_type w = atan(batch_type(-z.imag(), z.real()));
            w = batch_type(w.imag(), -w.real());
            return w;
        }

        // atan2
        template <class A, class T>
        inline batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            const batch_type q = abs(self / other);
            const batch_type z = detail::kernel_atan(q, batch_type(1.) / q);
            return select(other > batch_type(0.), z, constants::pi<batch_type>() - z) * signnz(self);
        }
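        // Worked example for the quadrant fix-up above (editorial sketch,
        // not part of the original header). For self = 1 and other = -1:
        // q = |1 / -1| = 1, so z = kernel_atan(1, 1) = pi/4; other > 0 is
        // false, so the result is (pi - pi/4) * signnz(1) = 3*pi/4,
        // matching std::atan2(1., -1.). For self = -1 the same magnitude
        // comes back negated, since signnz(-1) = -1; signnz treats +0 as
        // positive, so atan2(+0., -1.) yields pi rather than 0.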
|
||||
// cos
|
||||
namespace detail
|
||||
{
|
||||
template <class T, class A>
|
||||
inline batch<T, A> quadrant(const batch<T, A>& x) noexcept
|
||||
{
|
||||
return x & batch<T, A>(3);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<float, A> quadrant(const batch<float, A>& x) noexcept
|
||||
{
|
||||
return to_float(quadrant(to_int(x)));
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<double, A> quadrant(const batch<double, A>& x) noexcept
|
||||
{
|
||||
using batch_type = batch<double, A>;
|
||||
batch_type a = x * batch_type(0.25);
|
||||
return (a - floor(a)) * batch_type(4.);
|
||||
}
|
||||
/* origin: boost/simd/arch/common/detail/simd/f_trig_evaluation.hpp */
|
||||
/*
|
||||
* ====================================================
|
||||
* copyright 2016 NumScale SAS
|
||||
*
|
||||
* Distributed under the Boost Software License, Version 1.0.
|
||||
* (See copy at http://boost.org/LICENSE_1_0.txt)
|
||||
* ====================================================
|
||||
*/
|
||||
|
||||
template <class A>
|
||||
inline batch<float, A> cos_eval(const batch<float, A>& z) noexcept
|
||||
{
|
||||
using batch_type = batch<float, A>;
|
||||
batch_type y = detail::horner<batch_type,
|
||||
0x3d2aaaa5,
|
||||
0xbab60619,
|
||||
0x37ccf5ce>(z);
|
||||
return batch_type(1.) + fma(z, batch_type(-0.5), y * z * z);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
inline batch<float, A> sin_eval(const batch<float, A>& z, const batch<float, A>& x) noexcept
|
||||
{
|
||||
using batch_type = batch<float, A>;
|
||||
batch_type y = detail::horner<batch_type,
|
||||
0xbe2aaaa2,
|
||||
0x3c08839d,
|
||||
0xb94ca1f9>(z);
|
||||
return fma(y * z, x, x);
|
||||
}
|
||||
|
||||
template <class A>
|
||||
static inline batch<float, A> base_tancot_eval(const batch<float, A>& z) noexcept
|
||||
{
|
||||
using batch_type = batch<float, A>;
|
||||
batch_type zz = z * z;
|
||||
batch_type y = detail::horner<batch_type,
|
||||
0x3eaaaa6f,
|
||||
0x3e0896dd,
|
||||
0x3d5ac5c9,
|
||||
0x3cc821b5,
|
||||
0x3b4c779c,
|
||||
0x3c19c53b>(zz);
|
||||
return fma(y, zz * z, z);
|
||||
}
|
||||
|
||||
template <class A, class BB>
|
||||
static inline batch<float, A> tan_eval(const batch<float, A>& z, const BB& test) noexcept
|
||||
{
|
||||
using batch_type = batch<float, A>;
|
||||
batch_type y = base_tancot_eval(z);
|
||||
return select(test, y, -batch_type(1.) / y);
|
||||
}
|
||||
|
||||
template <class A, class BB>
|
||||
static inline batch<float, A> cot_eval(const batch<float, A>& z, const BB& test) noexcept
|
||||
{
|
||||
using batch_type = batch<float, A>;
|
||||
batch_type y = base_tancot_eval(z);
|
||||
return select(test, batch_type(1.) / y, -y);
|
||||
}

    /* origin: boost/simd/arch/common/detail/simd/d_trig_evaluation.hpp */
    /*
     * ====================================================
     * copyright 2016 NumScale SAS
     *
     * Distributed under the Boost Software License, Version 1.0.
     * (See copy at http://boost.org/LICENSE_1_0.txt)
     * ====================================================
     */
    template <class A>
    static inline batch<double, A> cos_eval(const batch<double, A>& z) noexcept
    {
        using batch_type = batch<double, A>;
        batch_type y = detail::horner<batch_type,
                                      0x3fe0000000000000ull,
                                      0xbfa5555555555551ull,
                                      0x3f56c16c16c15d47ull,
                                      0xbefa01a019ddbcd9ull,
                                      0x3e927e4f8e06d9a5ull,
                                      0xbe21eea7c1e514d4ull,
                                      0x3da8ff831ad9b219ull>(z);
        return batch_type(1.) - y * z;
    }

    template <class A>
    static inline batch<double, A> sin_eval(const batch<double, A>& z, const batch<double, A>& x) noexcept
    {
        using batch_type = batch<double, A>;
        batch_type y = detail::horner<batch_type,
                                      0xbfc5555555555548ull,
                                      0x3f8111111110f7d0ull,
                                      0xbf2a01a019bfdf03ull,
                                      0x3ec71de3567d4896ull,
                                      0xbe5ae5e5a9291691ull,
                                      0x3de5d8fd1fcf0ec1ull>(z);
        return fma(y * z, x, x);
    }

    template <class A>
    static inline batch<double, A> base_tancot_eval(const batch<double, A>& z) noexcept
    {
        using batch_type = batch<double, A>;
        batch_type zz = z * z;
        batch_type num = detail::horner<batch_type,
                                        0xc1711fead3299176ull,
                                        0x413199eca5fc9dddull,
                                        0xc0c992d8d24f3f38ull>(zz);
        batch_type den = detail::horner1<batch_type,
                                         0xc189afe03cbe5a31ull,
                                         0x4177d98fc2ead8efull,
                                         0xc13427bc582abc96ull,
                                         0x40cab8a5eeb36572ull>(zz);
        return fma(z, (zz * (num / den)), z);
    }

    template <class A, class BB>
    static inline batch<double, A> tan_eval(const batch<double, A>& z, const BB& test) noexcept
    {
        using batch_type = batch<double, A>;
        batch_type y = base_tancot_eval(z);
        return select(test, y, -batch_type(1.) / y);
    }

    template <class A, class BB>
    static inline batch<double, A> cot_eval(const batch<double, A>& z, const BB& test) noexcept
    {
        using batch_type = batch<double, A>;
        batch_type y = base_tancot_eval(z);
        return select(test, batch_type(1.) / y, -y);
    }
    /* origin: boost/simd/arch/common/detail/simd/trig_reduction.hpp */
    /*
     * ====================================================
     * copyright 2016 NumScale SAS
     *
     * Distributed under the Boost Software License, Version 1.0.
     * (See copy at http://boost.org/LICENSE_1_0.txt)
     * ====================================================
     */

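    // The reducer maps an angle x to a reduced argument xr in [-pi/4, pi/4] and
    // returns the quadrant of x as a batch in {0, 1, 2, 3}. It picks the
    // cheapest scheme that is accurate for the whole batch: a no-op below pi/4,
    // one Cody-Waite step below pi/2, a multi-term Cody-Waite step below 20*pi,
    // an extended-precision variant below mediumpi, and a scalar loop through
    // __ieee754_rem_pio2 for arbitrarily large (or infinite) inputs.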
    struct trigo_radian_tag
    {
    };
    struct trigo_pi_tag
    {
    };

    template <class B, class Tag = trigo_radian_tag>
    struct trigo_reducer
    {
        static inline B reduce(const B& x, B& xr) noexcept
        {
            if (all(x <= constants::pio4<B>()))
            {
                xr = x;
                return B(0.);
            }
            else if (all(x <= constants::pio2<B>()))
            {
                auto test = x > constants::pio4<B>();
                xr = x - constants::pio2_1<B>();
                xr -= constants::pio2_2<B>();
                xr -= constants::pio2_3<B>();
                xr = select(test, xr, x);
                return select(test, B(1.), B(0.));
            }
            else if (all(x <= constants::twentypi<B>()))
            {
                B xi = nearbyint(x * constants::twoopi<B>());
                xr = fnma(xi, constants::pio2_1<B>(), x);
                xr -= xi * constants::pio2_2<B>();
                xr -= xi * constants::pio2_3<B>();
                return quadrant(xi);
            }
            else if (all(x <= constants::mediumpi<B>()))
            {
                B fn = nearbyint(x * constants::twoopi<B>());
                B r = x - fn * constants::pio2_1<B>();
                B w = fn * constants::pio2_1t<B>();
                B t = r;
                w = fn * constants::pio2_2<B>();
                r = t - w;
                w = fn * constants::pio2_2t<B>() - ((t - r) - w);
                t = r;
                w = fn * constants::pio2_3<B>();
                r = t - w;
                w = fn * constants::pio2_3t<B>() - ((t - r) - w);
                xr = r - w;
                return quadrant(fn);
            }
            else
            {
                static constexpr std::size_t size = B::size;
                using value_type = typename B::value_type;
                alignas(B) std::array<value_type, size> tmp;
                alignas(B) std::array<value_type, size> txr;
                alignas(B) std::array<value_type, size> args;
                x.store_aligned(args.data());

                for (std::size_t i = 0; i < size; ++i)
                {
                    double arg = args[i];
                    if (arg == std::numeric_limits<value_type>::infinity())
                    {
                        tmp[i] = 0.;
                        txr[i] = std::numeric_limits<value_type>::quiet_NaN();
                    }
                    else
                    {
                        double y[2];
                        std::int32_t n = ::xsimd::detail::__ieee754_rem_pio2(arg, y);
                        tmp[i] = value_type(n & 3);
                        txr[i] = value_type(y[0]);
                    }
                }
                xr = B::load_aligned(&txr[0]);
                B res = B::load_aligned(&tmp[0]);
                return res;
            }
        }
    };

    template <class B>
    struct trigo_reducer<B, trigo_pi_tag>
    {
        static inline B reduce(const B& x, B& xr) noexcept
        {
            B xi = nearbyint(x * B(2.));
            B x2 = x - xi * B(0.5);
            xr = x2 * constants::pi<B>();
            return quadrant(xi);
        }
    };

}
template <class A, class T>
inline batch<T, A> cos(batch<T, A> const& self, requires_arch<generic>) noexcept
{
    using batch_type = batch<T, A>;
    const batch_type x = abs(self);
    batch_type xr = constants::nan<batch_type>();
    const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
    auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
    auto swap_bit = fma(batch_type(-2.), tmp, n);
    auto sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
    const batch_type z = xr * xr;
    const batch_type se = detail::sin_eval(z, xr);
    const batch_type ce = detail::cos_eval(z);
    const batch_type z1 = select(swap_bit != batch_type(0.), se, ce);
    return z1 ^ sign_bit;
}
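// With n the quadrant index in {0, 1, 2, 3}: swap_bit = n mod 2 picks the sin
// or cos polynomial for the reduced argument, and sign_bit flips the result
// exactly when swap_bit differs from (n >= 2), i.e. in quadrants 1 and 2 where
// cos(pi/2 + t) = -sin(t) and cos(pi + t) = -cos(t) contribute a minus sign.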

template <class A, class T>
inline batch<std::complex<T>, A> cos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
    return { cos(z.real()) * cosh(z.imag()), -sin(z.real()) * sinh(z.imag()) };
}

// cosh

/* origin: boost/simd/arch/common/simd/function/cosh.hpp */
/*
 * ====================================================
 * copyright 2016 NumScale SAS
 *
 * Distributed under the Boost Software License, Version 1.0.
 * (See copy at http://boost.org/LICENSE_1_0.txt)
 * ====================================================
 */

template <class A, class T>
inline batch<T, A> cosh(batch<T, A> const& self, requires_arch<generic>) noexcept
{
    using batch_type = batch<T, A>;
    batch_type x = abs(self);
    auto test1 = x > (constants::maxlog<batch_type>() - constants::log_2<batch_type>());
    batch_type fac = select(test1, batch_type(0.5), batch_type(1.));
    batch_type tmp = exp(x * fac);
    batch_type tmp1 = batch_type(0.5) * tmp;
    return select(test1, tmp1 * tmp, detail::average(tmp, batch_type(1.) / tmp));
}
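// For moderate x, cosh(x) = (e^x + e^-x) / 2 is the average of tmp and 1/tmp.
// Near maxlog, e^x itself would overflow, so e^(x/2) is computed instead and
// cosh(x) is recovered as (0.5 * e^(x/2)) * e^(x/2).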
template <class A, class T>
inline batch<std::complex<T>, A> cosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
    auto x = z.real();
    auto y = z.imag();
    return { cosh(x) * cos(y), sinh(x) * sin(y) };
}

// sin
namespace detail
{
    template <class A, class T, class Tag = trigo_radian_tag>
    inline batch<T, A> sin(batch<T, A> const& self, Tag = Tag()) noexcept
    {
        using batch_type = batch<T, A>;
        const batch_type x = abs(self);
        batch_type xr = constants::nan<batch_type>();
        const batch_type n = detail::trigo_reducer<batch_type, Tag>::reduce(x, xr);
        auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
        auto swap_bit = fma(batch_type(-2.), tmp, n);
        auto sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
        const batch_type z = xr * xr;
        const batch_type se = detail::sin_eval(z, xr);
        const batch_type ce = detail::cos_eval(z);
        const batch_type z1 = select(swap_bit == batch_type(0.), se, ce);
        return z1 ^ sign_bit;
    }
}

template <class A, class T>
inline batch<T, A> sin(batch<T, A> const& self, requires_arch<generic>) noexcept
{
    return detail::sin(self);
}

template <class A, class T>
inline batch<std::complex<T>, A> sin(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
    return { sin(z.real()) * cosh(z.imag()), cos(z.real()) * sinh(z.imag()) };
}

// sincos
template <class A, class T>
inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self, requires_arch<generic>) noexcept
{
    using batch_type = batch<T, A>;
    const batch_type x = abs(self);
    batch_type xr = constants::nan<batch_type>();
    const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
    auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
    auto swap_bit = fma(batch_type(-2.), tmp, n);
    const batch_type z = xr * xr;
    const batch_type se = detail::sin_eval(z, xr);
    const batch_type ce = detail::cos_eval(z);
    auto sin_sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
    const batch_type sin_z1 = select(swap_bit == batch_type(0.), se, ce);
    auto cos_sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
    const batch_type cos_z1 = select(swap_bit != batch_type(0.), se, ce);
    return std::make_pair(sin_z1 ^ sin_sign_bit, cos_z1 ^ cos_sign_bit);
}

template <class A, class T>
inline std::pair<batch<std::complex<T>, A>, batch<std::complex<T>, A>>
sincos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
    using batch_type = batch<std::complex<T>, A>;
    using real_batch = typename batch_type::real_batch;
    real_batch rcos = cos(z.real());
    real_batch rsin = sin(z.real());
    real_batch icosh = cosh(z.imag());
    real_batch isinh = sinh(z.imag());
    return std::make_pair(batch_type(rsin * icosh, rcos * isinh), batch_type(rcos * icosh, -rsin * isinh));
}

// sinh
namespace detail
{
    /* origin: boost/simd/arch/common/detail/generic/sinh_kernel.hpp */
    /*
     * ====================================================
     * copyright 2016 NumScale SAS
     *
     * Distributed under the Boost Software License, Version 1.0.
     * (See copy at http://boost.org/LICENSE_1_0.txt)
     * ====================================================
     */
    template <class A>
    inline batch<float, A> sinh_kernel(batch<float, A> const& self) noexcept
    {
        using batch_type = batch<float, A>;
        batch_type sqr_self = self * self;
        return detail::horner<batch_type,
                              0x3f800000, // 1.0f
                              0x3e2aaacc, // 1.66667160211E-1f
                              0x3c087bbe, // 8.33028376239E-3f
                              0x39559e2f // 2.03721912945E-4f
                              >(sqr_self)
            * self;
    }

    template <class A>
    inline batch<double, A> sinh_kernel(batch<double, A> const& self) noexcept
    {
        using batch_type = batch<double, A>;
        batch_type sqrself = self * self;
        return fma(self, (detail::horner<batch_type,
                                         0xc115782bdbf6ab05ull, // -3.51754964808151394800E5
                                         0xc0c694b8c71d6182ull, // -1.15614435765005216044E4,
                                         0xc064773a398ff4feull, // -1.63725857525983828727E2,
                                         0xbfe9435fe8bb3cd6ull // -7.89474443963537015605E-1
                                         >(sqrself)
                          / detail::horner1<batch_type,
                                            0xc1401a20e4f90044ull, // -2.11052978884890840399E6
                                            0x40e1a7ba7ed72245ull, // 3.61578279834431989373E4,
                                            0xc0715b6096e96484ull // -2.77711081420602794433E2,
                                            >(sqrself))
                       * sqrself,
                   self);
    }
}
/* origin: boost/simd/arch/common/simd/function/sinh.hpp */
/*
 * ====================================================
 * copyright 2016 NumScale SAS
 *
 * Distributed under the Boost Software License, Version 1.0.
 * (See copy at http://boost.org/LICENSE_1_0.txt)
 * ====================================================
 */
template <class A, class T>
inline batch<T, A> sinh(batch<T, A> const& a, requires_arch<generic>) noexcept
{
    using batch_type = batch<T, A>;
    batch_type half(0.5);
    batch_type x = abs(a);
    auto lt1 = x < batch_type(1.);
    batch_type bts = bitofsign(a);
    batch_type z(0.);
    if (any(lt1))
    {
        z = detail::sinh_kernel(x);
        if (all(lt1))
            return z ^ bts;
    }
    auto test1 = x > (constants::maxlog<batch_type>() - constants::log_2<batch_type>());
    batch_type fac = select(test1, half, batch_type(1.));
    batch_type tmp = exp(x * fac);
    batch_type tmp1 = half * tmp;
    batch_type r = select(test1, tmp1 * tmp, tmp1 - half / tmp);
    return select(lt1, z, r) ^ bts;
}
template <class A, class T>
inline batch<std::complex<T>, A> sinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
    auto x = z.real();
    auto y = z.imag();
    return { sinh(x) * cos(y), cosh(x) * sin(y) };
}

// tan
template <class A, class T>
inline batch<T, A> tan(batch<T, A> const& self, requires_arch<generic>) noexcept
{
    using batch_type = batch<T, A>;
    const batch_type x = abs(self);
    batch_type xr = constants::nan<batch_type>();
    const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
    auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
    auto swap_bit = fma(batch_type(-2.), tmp, n);
    auto test = (swap_bit == batch_type(0.));
    const batch_type y = detail::tan_eval(xr, test);
    return y ^ bitofsign(self);
}
template <class A, class T>
inline batch<std::complex<T>, A> tan(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
{
    using batch_type = batch<std::complex<T>, A>;
    using real_batch = typename batch_type::real_batch;
    real_batch d = cos(2 * z.real()) + cosh(2 * z.imag());
    batch_type winf(constants::infinity<real_batch>(), constants::infinity<real_batch>());
    real_batch wreal = sin(2 * z.real()) / d;
    real_batch wimag = sinh(2 * z.imag());
    batch_type wres = select(isinf(wimag), batch_type(wreal, real_batch(1.)), batch_type(wreal, wimag / d));
    return select(d == real_batch(0.), winf, wres);
}

// tanh
namespace detail
{
    /* origin: boost/simd/arch/common/detail/generic/tanh_kernel.hpp */
    /*
     * ====================================================
     * copyright 2016 NumScale SAS
     *
     * Distributed under the Boost Software License, Version 1.0.
     * (See copy at http://boost.org/LICENSE_1_0.txt)
     * ====================================================
     */
    template <class B>
    struct tanh_kernel;

    template <class A>
    struct tanh_kernel<batch<float, A>>
    {
        using batch_type = batch<float, A>;
        static inline batch_type tanh(const batch_type& x) noexcept
        {
            batch_type sqrx = x * x;
            return fma(detail::horner<batch_type,
                                      0xbeaaaa99, // -3.33332819422E-1F
                                      0x3e088393, // +1.33314422036E-1F
                                      0xbd5c1e2d, // -5.37397155531E-2F
                                      0x3ca9134e, // +2.06390887954E-2F
                                      0xbbbaf0ea // -5.70498872745E-3F
                                      >(sqrx)
                           * sqrx,
                       x, x);
        }

        static inline batch_type cotanh(const batch_type& x) noexcept
        {
            return batch_type(1.) / tanh(x);
        }
    };

    template <class A>
    struct tanh_kernel<batch<double, A>>
    {
        using batch_type = batch<double, A>;
        static inline batch_type tanh(const batch_type& x) noexcept
        {
            batch_type sqrx = x * x;
            return fma(sqrx * p(sqrx) / q(sqrx), x, x);
        }

        static inline batch_type cotanh(const batch_type& x) noexcept
        {
            batch_type sqrx = x * x;
            batch_type qval = q(sqrx);
            return qval / (x * fma(p(sqrx), sqrx, qval));
        }

        static inline batch_type p(const batch_type& x) noexcept
        {
            return detail::horner<batch_type,
                                  0xc0993ac030580563, // -1.61468768441708447952E3
                                  0xc058d26a0e26682d, // -9.92877231001918586564E1,
                                  0xbfeedc5baafd6f4b // -9.64399179425052238628E-1
                                  >(x);
        }

        static inline batch_type q(const batch_type& x) noexcept
        {
            return detail::horner1<batch_type,
                                   0x40b2ec102442040c, // 4.84406305325125486048E3
                                   0x40a176fa0e5535fa, // 2.23548839060100448583E3,
                                   0x405c33f28a581b86 // 1.12811678491632931402E2,
                                   >(x);
        }
    };

}
/* origin: boost/simd/arch/common/simd/function/tanh.hpp */
/*
 * ====================================================
 * copyright 2016 NumScale SAS
 *
 * Distributed under the Boost Software License, Version 1.0.
 * (See copy at http://boost.org/LICENSE_1_0.txt)
 * ====================================================
 */
template <class A, class T>
inline batch<T, A> tanh(batch<T, A> const& self, requires_arch<generic>) noexcept
{
    using batch_type = batch<T, A>;
    batch_type one(1.);
    batch_type x = abs(self);
    auto test = x < (batch_type(5.) / batch_type(8.));
    batch_type bts = bitofsign(self);
    batch_type z = one;
    if (any(test))
    {
        z = detail::tanh_kernel<batch_type>::tanh(x);
        if (all(test))
            return z ^ bts;
    }
    batch_type r = fma(batch_type(-2.), one / (one + exp(x + x)), one);
    return select(test, z, r) ^ bts;
}
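// For |x| >= 5/8 the identity tanh(x) = 1 - 2 / (1 + e^(2x)) is used, evaluated
// as a single fused multiply-add; smaller arguments go through the polynomial /
// rational kernel, where this form would lose accuracy near zero.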
template <class A, class T>
inline batch<std::complex<T>, A> tanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
{
    using real_batch = typename batch<std::complex<T>, A>::real_batch;
    auto x = z.real();
    auto y = z.imag();
    real_batch two(2);
    auto d = cosh(two * x) + cos(two * y);
    return { sinh(two * x) / d, sin(two * y) / d };
}

} // namespace kernel

} // namespace xsimd

#endif
The diff for this file is not shown because of its size.
@ -1,940 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX2_HPP
#define XSIMD_AVX2_HPP

#include <complex>
#include <type_traits>

#include "../types/xsimd_avx2_register.hpp"

namespace xsimd
{

namespace kernel
{
    using namespace types;

    // abs
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx2>) noexcept
    {
        if (std::is_signed<T>::value)
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_abs_epi8(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_abs_epi16(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_abs_epi32(self);
            }
            else
            {
                return abs(self, avx {});
            }
        }
        return self;
    }

    // add
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
        {
            return _mm256_add_epi8(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
        {
            return _mm256_add_epi16(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            return _mm256_add_epi32(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            return _mm256_add_epi64(self, other);
        }
        else
        {
            return add(self, other, avx {});
        }
    }

    // bitwise_and
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        return _mm256_and_si256(self, other);
    }
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
    {
        return _mm256_and_si256(self, other);
    }

    // bitwise_andnot
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        return _mm256_andnot_si256(other, self);
    }
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
    {
        return _mm256_andnot_si256(other, self);
    }

    // bitwise_not
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx2>) noexcept
    {
        return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
    }
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
    {
        return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
    }

    // bitwise_lshift
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
        {
            return _mm256_slli_epi16(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            return _mm256_slli_epi32(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            return _mm256_slli_epi64(self, other);
        }
        else
        {
            return bitwise_lshift(self, other, avx {});
        }
    }

    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            return _mm256_sllv_epi32(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            return _mm256_sllv_epi64(self, other);
        }
        else
        {
            return bitwise_lshift(self, other, avx {});
        }
    }

    // bitwise_or
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        return _mm256_or_si256(self, other);
    }
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
    {
        return _mm256_or_si256(self, other);
    }

    // bitwise_rshift
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
    {
        if (std::is_signed<T>::value)
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF);
                __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self);
                __m256i res = _mm256_srai_epi16(self, other);
                return _mm256_or_si256(
                    detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
                                       { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
                                       sign_mask, cmp_is_negative),
                    _mm256_andnot_si256(sign_mask, res));
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_srai_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_srai_epi32(self, other);
            }
            else
            {
                return bitwise_rshift(self, other, avx {});
            }
        }
        else
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_srli_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_srli_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_srli_epi64(self, other);
            }
            else
            {
                return bitwise_rshift(self, other, avx {});
            }
        }
    }
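    // AVX2 has no _mm256_srai_epi8, so the signed 8-bit case shifts 16-bit lanes
    // instead: bits dragged in from the neighbouring byte are cleared with
    // sign_mask, and the same mask re-extends the sign for bytes that compare
    // negative.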

    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        if (std::is_signed<T>::value)
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_srav_epi32(self, other);
            }
            else
            {
                return bitwise_rshift(self, other, avx {});
            }
        }
        else
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_srlv_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_srlv_epi64(self, other);
            }
            else
            {
                return bitwise_rshift(self, other, avx {});
            }
        }
    }

    // bitwise_xor
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        return _mm256_xor_si256(self, other);
    }
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
    {
        return _mm256_xor_si256(self, other);
    }

    // complex_low
    template <class A>
    inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
    {
        __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0));
        __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0));
        return _mm256_blend_pd(tmp0, tmp1, 10);
    }

    // complex_high
    template <class A>
    inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
    {
        __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2));
        __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0));
        return _mm256_blend_pd(tmp0, tmp1, 10);
    }

    // fast_cast
    namespace detail
    {

        template <class A>
        inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<avx2>) noexcept
        {
            // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
            __m256i msk_lo = _mm256_set1_epi32(0xFFFF);
            __m256 cnst65536f = _mm256_set1_ps(65536.0f);

            __m256i v_lo = _mm256_and_si256(v, msk_lo); /* extract the 16 lowest significant bits of self */
            __m256i v_hi = _mm256_srli_epi32(v, 16); /* 16 most significant bits of v */
            __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding */
            __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */
            v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */
            return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
        }

        template <class A>
        inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
        {
            // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
            // adapted to avx
            __m256i xH = _mm256_srli_epi64(x, 32);
            xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84
            __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000,
                                             0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
            __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
            __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
            return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
        }

        template <class A>
        inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
        {
            // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
            // adapted to avx
            __m256i xH = _mm256_srai_epi32(x, 16);
            xH = _mm256_and_si256(xH, _mm256_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
            xH = _mm256_add_epi64(xH, _mm256_castpd_si256(_mm256_set1_pd(442721857769029238784.))); // 3*2^67
            __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000,
                                             0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
            __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
            __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
            return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
        }
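        // Both 64-bit conversions rely on the bias trick: the high 32 bits are
        // tagged with a large power-of-two constant and the low 32 bits with
        // 2^52, so each half is an exactly representable double; subtracting
        // the combined constant removes the tags and the final add performs
        // the single rounding step.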
    }

    // eq
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
        {
            return _mm256_cmpeq_epi8(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
        {
            return _mm256_cmpeq_epi16(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            return _mm256_cmpeq_epi32(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            return _mm256_cmpeq_epi64(self, other);
        }
        else
        {
            return eq(self, other, avx {});
        }
    }

    // gather
    template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
    inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
                              kernel::requires_arch<avx2>) noexcept
    {
        // scatter for this one is AVX512F+AVX512VL
        return _mm256_i32gather_epi32(reinterpret_cast<const int*>(src), index, sizeof(T));
    }

    template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
    inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
                              kernel::requires_arch<avx2>) noexcept
    {
        // scatter for this one is AVX512F+AVX512VL
        return _mm256_i64gather_epi64(reinterpret_cast<const long long int*>(src), index, sizeof(T));
    }

    template <class A, class U,
              detail::enable_sized_integral_t<U, 4> = 0>
    inline batch<float, A> gather(batch<float, A> const&, float const* src,
                                  batch<U, A> const& index,
                                  kernel::requires_arch<avx2>) noexcept
    {
        // scatter for this one is AVX512F+AVX512VL
        return _mm256_i32gather_ps(src, index, sizeof(float));
    }

    template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
    inline batch<double, A> gather(batch<double, A> const&, double const* src,
                                   batch<U, A> const& index,
                                   requires_arch<avx2>) noexcept
    {
        // scatter for this one is AVX512F+AVX512VL
        return _mm256_i64gather_pd(src, index, sizeof(double));
    }

    // gather: handmade conversions
    template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
    inline batch<float, A> gather(batch<float, A> const&, double const* src,
                                  batch<V, A> const& index,
                                  requires_arch<avx2>) noexcept
    {
        const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
        const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
        return detail::merge_sse(_mm256_cvtpd_ps(low.data), _mm256_cvtpd_ps(high.data));
    }

    template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
    inline batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
                                    batch<V, A> const& index,
                                    requires_arch<avx2>) noexcept
    {
        const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
        const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
        return detail::merge_sse(_mm256_cvtpd_epi32(low.data), _mm256_cvtpd_epi32(high.data));
    }

    // lt
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        if (std::is_signed<T>::value)
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_cmpgt_epi8(other, self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_cmpgt_epi16(other, self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_cmpgt_epi32(other, self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm256_cmpgt_epi64(other, self);
            }
            else
            {
                return lt(self, other, avx {});
            }
        }
        else
        {
            return lt(self, other, avx {});
        }
    }

    // load_complex
    template <class A>
    inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx2>) noexcept
    {
        using batch_type = batch<float, A>;
        batch_type real = _mm256_castpd_ps(
            _mm256_permute4x64_pd(
                _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))),
                _MM_SHUFFLE(3, 1, 2, 0)));
        batch_type imag = _mm256_castpd_ps(
            _mm256_permute4x64_pd(
                _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))),
                _MM_SHUFFLE(3, 1, 2, 0)));
        return { real, imag };
    }
    template <class A>
    inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx2>) noexcept
    {
        using batch_type = batch<double, A>;
        batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
        batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
        return { real, imag };
    }
    // mask
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
        {
            return 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
        {
            uint64_t mask8 = 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
            return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4) | (detail::mask_lut(mask8 >> 16) << 8) | (detail::mask_lut(mask8 >> 24) << 12);
        }
        else
        {
            return mask(self, avx {});
        }
    }
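    // There is no movemask variant for 16-bit lanes, so the 8-bit mask (two
    // identical bits per 16-bit lane) is compacted with detail::mask_lut, one
    // byte (four lanes) at a time, into one bit per lane.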

    // max
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        if (std::is_signed<T>::value)
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_max_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_max_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_max_epi32(self, other);
            }
            else
            {
                return max(self, other, avx {});
            }
        }
        else
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_max_epu8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_max_epu16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_max_epu32(self, other);
            }
            else
            {
                return max(self, other, avx {});
            }
        }
    }

    // min
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        if (std::is_signed<T>::value)
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_min_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_min_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_min_epi32(self, other);
            }
            else
            {
                return min(self, other, avx {});
            }
        }
        else
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_min_epu8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_min_epu16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm256_min_epu32(self, other);
            }
            else
            {
                return min(self, other, avx {});
            }
        }
    }

    // mul
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
        {
            return _mm256_mullo_epi16(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            return _mm256_mullo_epi32(self, other);
        }
        else
        {
            return mul(self, other, avx {});
        }
    }

    // reduce_add
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            __m256i tmp1 = _mm256_hadd_epi32(self, self);
            __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1);
            __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
            __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3);
            return _mm_cvtsi128_si32(tmp4);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            __m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E);
            __m256i tmp2 = _mm256_add_epi64(self, tmp1);
            __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
            __m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3);
#if defined(__x86_64__)
            return _mm_cvtsi128_si64(res);
#else
            __m128i m;
            _mm_storel_epi64(&m, res);
            int64_t i;
            std::memcpy(&i, &m, sizeof(i));
            return i;
#endif
        }
        else
        {
            return reduce_add(self, avx {});
        }
    }
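    // The 32-bit reduction sums within each 128-bit lane with two hadd passes,
    // then folds the upper lane into the lower one; the 64-bit path uses a
    // shuffle + add instead, with a memcpy fallback on 32-bit targets where
    // _mm_cvtsi128_si64 is not available.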

    // sadd
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        if (std::is_signed<T>::value)
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_adds_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_adds_epi16(self, other);
            }
            else
            {
                return sadd(self, other, avx {});
            }
        }
        else
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_adds_epu8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_adds_epu16(self, other);
            }
            else
            {
                return sadd(self, other, avx {});
            }
        }
    }

    // select
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
        {
            return _mm256_blendv_epi8(false_br, true_br, cond);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
        {
            return _mm256_blendv_epi8(false_br, true_br, cond);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            return _mm256_blendv_epi8(false_br, true_br, cond);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            return _mm256_blendv_epi8(false_br, true_br, cond);
        }
        else
        {
            return select(cond, true_br, false_br, avx {});
        }
    }
    template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
    {
        constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
        // FIXME: for some reason mask here is not considered as an immediate,
        // but it's okay for _mm256_blend_epi32
        // case 2: return _mm256_blend_epi16(false_br, true_br, mask);
        XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            return _mm256_blend_epi32(false_br, true_br, mask);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            constexpr int imask = detail::interleave(mask);
            return _mm256_blend_epi32(false_br, true_br, imask);
        }
        else
        {
            return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
        }
    }

    // slide_left
    template <size_t N, class A, class T>
    inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx2>) noexcept
    {
        constexpr unsigned BitCount = N * 8;
        if (BitCount == 0)
        {
            return x;
        }
        if (BitCount >= 256)
        {
            return batch<T, A>(T(0));
        }
        if (BitCount > 128)
        {
            constexpr unsigned M = (BitCount - 128) / 8;
            auto y = _mm256_bslli_epi128(x, M);
            return _mm256_permute2x128_si256(y, y, 0x28);
        }
        if (BitCount == 128)
        {
            return _mm256_permute2x128_si256(x, x, 0x28);
        }
        // shifting by [0, 128[ bits
        constexpr unsigned M = BitCount / 8;
        auto y = _mm256_bslli_epi128(x, M);
        auto z = _mm256_bsrli_epi128(x, 16 - M);
        auto w = _mm256_permute2x128_si256(z, z, 0x28);
        return _mm256_or_si256(y, w);
    }
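    // _mm256_bslli_epi128 shifts each 128-bit lane independently, so bytes that
    // should cross into the upper lane are recovered separately: bsrli extracts
    // them, permute2x128 with 0x28 moves them into the upper lane (zeroing the
    // lower one), and the final OR merges both parts. Shifts of 128 bits or
    // more reduce to a lane permute alone.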

    // slide_right
    template <size_t N, class A, class T>
    inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx2>) noexcept
    {
        constexpr unsigned BitCount = N * 8;
        if (BitCount == 0)
        {
            return x;
        }
        if (BitCount >= 256)
        {
            return batch<T, A>(T(0));
        }
        if (BitCount > 128)
        {
            constexpr unsigned M = (BitCount - 128) / 8;
            auto y = _mm256_bsrli_epi128(x, M);
            return _mm256_permute2x128_si256(y, y, 0x81);
        }
        if (BitCount == 128)
        {
            return _mm256_permute2x128_si256(x, x, 0x81);
        }
        // shifting by [0, 128[ bits
        constexpr unsigned M = BitCount / 8;
        auto y = _mm256_bsrli_epi128(x, M);
        auto z = _mm256_bslli_epi128(x, 16 - M);
        auto w = _mm256_permute2x128_si256(z, z, 0x81);
        return _mm256_or_si256(y, w);
    }

    // ssub
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        if (std::is_signed<T>::value)
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_subs_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_subs_epi16(self, other);
            }
            else
            {
                return ssub(self, other, avx {});
            }
        }
        else
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm256_subs_epu8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm256_subs_epu16(self, other);
            }
            else
            {
                return ssub(self, other, avx {});
            }
        }
    }

    // sub
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
        {
            return _mm256_sub_epi8(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
        {
            return _mm256_sub_epi16(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            return _mm256_sub_epi32(self, other);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            return _mm256_sub_epi64(self, other);
        }
        else
        {
            return sub(self, other, avx {});
        }
    }

    // swizzle
    template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
    inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
    {
        return _mm256_permutevar8x32_ps(self, (batch<uint32_t, A>)mask);
    }

    template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
    inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
    {
        constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
        return _mm256_permute4x64_pd(self, mask);
    }

    template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
    inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
    {
        constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
        return _mm256_permute4x64_epi64(self, mask);
    }
    template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
    inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
    {
        return bitwise_cast<batch<int64_t, A>>(swizzle(bitwise_cast<batch<uint64_t, A>>(self), mask, avx2 {}));
    }
    template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
    inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
    {
        return _mm256_permutevar8x32_epi32(self, (batch<uint32_t, A>)mask);
    }
    template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
    inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
    {
        return bitwise_cast<batch<int32_t, A>>(swizzle(bitwise_cast<batch<uint32_t, A>>(self), mask, avx2 {}));
    }

    // zip_hi
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
        {
            auto lo = _mm256_unpacklo_epi8(self, other);
            auto hi = _mm256_unpackhi_epi8(self, other);
            return _mm256_permute2f128_si256(lo, hi, 0x31);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
        {
            auto lo = _mm256_unpacklo_epi16(self, other);
            auto hi = _mm256_unpackhi_epi16(self, other);
            return _mm256_permute2f128_si256(lo, hi, 0x31);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            auto lo = _mm256_unpacklo_epi32(self, other);
            auto hi = _mm256_unpackhi_epi32(self, other);
            return _mm256_permute2f128_si256(lo, hi, 0x31);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            auto lo = _mm256_unpacklo_epi64(self, other);
            auto hi = _mm256_unpackhi_epi64(self, other);
            return _mm256_permute2f128_si256(lo, hi, 0x31);
        }
        else
        {
            assert(false && "unsupported arch/op combination");
            return {};
        }
    }

    // zip_lo
    template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
    inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
    {
        XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
        {
            auto lo = _mm256_unpacklo_epi8(self, other);
            auto hi = _mm256_unpackhi_epi8(self, other);
            return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
        {
            auto lo = _mm256_unpacklo_epi16(self, other);
            auto hi = _mm256_unpackhi_epi16(self, other);
            return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
        {
            auto lo = _mm256_unpacklo_epi32(self, other);
            auto hi = _mm256_unpackhi_epi32(self, other);
            return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
        }
        else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
        {
            auto lo = _mm256_unpacklo_epi64(self, other);
            auto hi = _mm256_unpackhi_epi64(self, other);
            return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
        }
        else
        {
            assert(false && "unsupported arch/op combination");
            return {};
        }
    }
} // namespace kernel
} // namespace xsimd

#endif

@ -1,627 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX512BW_HPP
#define XSIMD_AVX512BW_HPP

#include <array>
#include <type_traits>

#include "../types/xsimd_avx512bw_register.hpp"

namespace xsimd
{

namespace kernel
{
    using namespace types;

    namespace detail
    {
        template <class A, class T, int Cmp>
        inline batch_bool<T, A> compare_int_avx512bw(batch<T, A> const& self, batch<T, A> const& other) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return (register_type)_mm512_cmp_epi8_mask(self, other, Cmp);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return (register_type)_mm512_cmp_epi16_mask(self, other, Cmp);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                {
                    return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp);
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return (register_type)_mm512_cmp_epu8_mask(self, other, Cmp);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return (register_type)_mm512_cmp_epu16_mask(self, other, Cmp);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
                {
                    return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp);
                }
            }
        }
|
||||
}
|
||||
|
||||
// abs
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_unsigned<T>::value)
|
||||
{
|
||||
return self;
|
||||
}
|
||||
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_abs_epi8(self);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_abs_epi16(self);
|
||||
}
|
||||
else
|
||||
{
|
||||
return abs(self, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// add
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_add_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_add_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return add(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// bitwise_lshift
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_sllv_epi16(self, _mm512_set1_epi16(other));
|
||||
#else
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_slli_epi16(self, other);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
return bitwise_lshift(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// bitwise_rshift
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
__m512i sign_mask = _mm512_set1_epi16((0xFF00 >> other) & 0x00FF);
|
||||
__m512i zeros = _mm512_setzero_si512();
|
||||
__mmask64 cmp_is_negative_mask = _mm512_cmpgt_epi8_mask(zeros, self);
|
||||
__m512i cmp_sign_mask = _mm512_mask_blend_epi8(cmp_is_negative_mask, zeros, sign_mask);
|
||||
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
|
||||
__m512i res = _mm512_srav_epi16(self, _mm512_set1_epi16(other));
|
||||
#else
|
||||
__m512i res = _mm512_srai_epi16(self, other);
|
||||
#endif
|
||||
return _mm512_or_si512(cmp_sign_mask, _mm512_andnot_si512(sign_mask, res));
|
||||
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_srav_epi16(self, _mm512_set1_epi16(other));
|
||||
#else
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_srai_epi16(self, other);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
return bitwise_rshift(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_srlv_epi16(self, _mm512_set1_epi16(other));
|
||||
#else
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_srli_epi16(self, other);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
return bitwise_rshift(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// eq
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_EQ>(self, other);
|
||||
}
|
||||
|
||||
// ge
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GE>(self, other);
|
||||
}
|
||||
|
||||
// gt
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GT>(self, other);
|
||||
}
|
||||
|
||||
// le
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LE>(self, other);
|
||||
}
|
||||
|
||||
// lt
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LT>(self, other);
|
||||
}
|
||||
|
||||
// max
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_max_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_max_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return max(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_max_epu8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_max_epu16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return max(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// min
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_min_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_min_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return min(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_min_epu8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_min_epu16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return min(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// mul
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
__m512i upper = _mm512_and_si512(_mm512_mullo_epi16(self, other), _mm512_srli_epi16(_mm512_set1_epi16(-1), 8));
|
||||
__m512i lower = _mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(self, 8), _mm512_srli_epi16(other, 8)), 8);
|
||||
return _mm512_or_si512(upper, lower);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_mullo_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return mul(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// neq
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
return detail::compare_int_avx512bw<A, T, _MM_CMPINT_NE>(self, other);
|
||||
}
|
||||
|
||||
// sadd
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
if (std::is_signed<T>::value)
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_adds_epi8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_adds_epi16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return sadd(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_adds_epu8(self, other);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_adds_epu16(self, other);
|
||||
}
|
||||
else
|
||||
{
|
||||
return sadd(self, other, avx512dq {});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// select
|
||||
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
|
||||
inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
|
||||
{
|
||||
return _mm512_mask_blend_epi8(cond, false_br.data, true_br.data);
|
||||
}
|
||||
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
|
||||
{
|
||||
return _mm512_mask_blend_epi16(cond, false_br.data, true_br.data);
|
||||
}
|
||||
else
|
||||
{
|
||||
return select(cond, true_br, false_br, avx512dq {});
|
||||
}
|
||||
}
|
||||
|
||||
// slide_left
|
||||
namespace detail
|
||||
{
|
||||
template <size_t... Is>
|
||||
constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_hi(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is == 0 ? 8 : Is - 1)... };
|
||||
}
|
||||
|
||||
template <size_t N, size_t... Is>
|
||||
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_pattern(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is >= N ? Is - N : 0)... };
|
||||
}
|
||||
template <size_t N, size_t... Is>
|
||||
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_mask(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is >= N ? 0xFFFF : 0x0000)... };
|
||||
}
|
||||
}
|
||||
|
||||
template <size_t N, class A, class T>
|
||||
inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
constexpr unsigned BitCount = N * 8;
|
||||
if (BitCount == 0)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
if (BitCount >= 512)
|
||||
{
|
||||
return batch<T, A>(T(0));
|
||||
}
|
||||
batch<T, A> xx;
|
||||
if (N & 1)
|
||||
{
|
||||
alignas(A::alignment()) uint64_t buffer[8];
|
||||
_mm512_store_epi64(&buffer[0], x);
|
||||
for (int i = 7; i > 0; --i)
|
||||
buffer[i] = (buffer[i] << 8) | (buffer[i - 1] >> 56);
|
||||
buffer[0] = buffer[0] << 8;
|
||||
xx = _mm512_load_epi64(&buffer[0]);
|
||||
|
||||
alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_hi(::xsimd::detail::make_index_sequence<512 / 64>());
|
||||
__m512i xl = _mm512_slli_epi64(x, 8);
|
||||
__m512i xr = _mm512_srli_epi64(x, 56);
|
||||
xr = _mm512_permutex2var_epi64(xr, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
|
||||
xx = _mm512_or_si512(xr, xl);
|
||||
if (N == 1)
|
||||
return xx;
|
||||
}
|
||||
else
|
||||
{
|
||||
xx = x;
|
||||
}
|
||||
alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
|
||||
alignas(A::alignment()) auto slide_mask = detail::make_slide_left_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
|
||||
return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
|
||||
}
|
||||
|
||||
// slide_right
|
||||
namespace detail
|
||||
{
|
||||
template <size_t... Is>
|
||||
constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_low(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is + 1)... };
|
||||
}
|
||||
|
||||
template <size_t N, size_t... Is>
|
||||
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_pattern(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is < (32 - N) ? Is + N : 0)... };
|
||||
}
|
||||
template <size_t N, size_t... Is>
|
||||
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_mask(::xsimd::detail::index_sequence<Is...>)
|
||||
{
|
||||
return { (Is < 32 - N ? 0xFFFF : 0x0000)... };
|
||||
}
|
||||
}
|
||||
template <size_t N, class A, class T>
|
||||
inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
|
||||
{
|
||||
constexpr unsigned BitCount = N * 8;
|
||||
if (BitCount == 0)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
if (BitCount >= 512)
|
||||
{
|
||||
return batch<T, A>(T(0));
|
||||
}
|
||||
batch<T, A> xx;
|
||||
if (N & 1)
|
||||
{
|
||||
alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_low(::xsimd::detail::make_index_sequence<512 / 64>());
|
||||
__m512i xr = _mm512_srli_epi64(x, 8);
|
||||
__m512i xl = _mm512_slli_epi64(x, 56);
|
||||
xl = _mm512_permutex2var_epi64(xl, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
|
||||
xx = _mm512_or_si512(xr, xl);
|
||||
if (N == 1)
|
||||
return xx;
|
||||
}
|
||||
else
|
||||
{
|
||||
xx = x;
|
||||
}
|
||||
alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
|
||||
alignas(A::alignment()) auto slide_mask = detail::make_slide_right_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
|
||||
return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
|
||||
}
|
||||
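
        // Note on the two slide kernels above: AVX-512 has no cross-lane byte
        // shift, so an odd byte count is handled in two phases. Each 64-bit
        // lane is shifted by one byte and the byte that spills out of a lane
        // is carried into the neighbouring lane with
        // _mm512_permutex2var_epi64; the remaining even count is then applied
        // as a 16-bit-lane permute (_mm512_permutexvar_epi16) masked so that
        // vacated lanes read as zero.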

        // ssub
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_subs_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_subs_epi16(self, other);
                }
                else
                {
                    return ssub(self, other, avx512dq {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm512_subs_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm512_subs_epu16(self, other);
                }
                else
                {
                    return ssub(self, other, avx512dq {});
                }
            }
        }

        // sub
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm512_sub_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm512_sub_epi16(self, other);
            }
            else
            {
                return sub(self, other, avx512dq {});
            }
        }

        // swizzle

        template <class A, uint16_t... Vs>
        inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
        {
            return _mm512_permutexvar_epi16((batch<uint16_t, A>)mask, self);
        }

        template <class A, uint16_t... Vs>
        inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
        {
            return bitwise_cast<batch<int16_t, A>>(swizzle(bitwise_cast<batch<uint16_t, A>>(self), mask, avx512bw {}));
        }

        template <class A, uint8_t... Vs>
        inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
        {
            return _mm512_shuffle_epi8(self, (batch<uint8_t, A>)mask);
        }

        template <class A, uint8_t... Vs>
        inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
        {
            return bitwise_cast<batch<int8_t, A>>(swizzle(bitwise_cast<batch<uint8_t, A>>(self), mask, avx512bw {}));
        }

        // zip_hi
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            __m512i lo, hi;
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                lo = _mm512_unpacklo_epi8(self, other);
                hi = _mm512_unpackhi_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                lo = _mm512_unpacklo_epi16(self, other);
                hi = _mm512_unpackhi_epi16(self, other);
            }
            else
            {
                return zip_hi(self, other, avx512f {});
            }
            return _mm512_inserti32x4(
                _mm512_inserti32x4(
                    _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0),
                    _mm512_extracti32x4_epi32(lo, 3),
                    2),
                _mm512_extracti32x4_epi32(hi, 2),
                1);
        }

        // zip_lo
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
        {
            __m512i lo, hi;
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                lo = _mm512_unpacklo_epi8(self, other);
                hi = _mm512_unpackhi_epi8(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                lo = _mm512_unpacklo_epi16(self, other);
                hi = _mm512_unpackhi_epi16(self, other);
            }
            else
            {
                return zip_lo(self, other, avx512f {});
            }
            return _mm512_inserti32x4(
                _mm512_inserti32x4(
                    _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1),
                    _mm512_extracti32x4_epi32(hi, 1),
                    3),
                _mm512_extracti32x4_epi32(lo, 1),
                2);
        }
    }
}

#endif
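For reference, sadd and ssub in the file above are saturating rather than wrapping: out-of-range results clamp to the type's limits, which is what _mm512_adds_epi8 and friends do per lane. A scalar model of one signed 8-bit lane, with illustrative names:

    #include <algorithm>
    #include <cstdint>

    // Scalar model of _mm512_adds_epi8 on one lane: clamp instead of wrap.
    int8_t sadd_ref(int8_t a, int8_t b)
    {
        int wide = int(a) + int(b); // compute in a wider type so nothing wraps
        wide = std::max(-128, std::min(127, wide));
        return static_cast<int8_t>(wide);
    }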
@@ -1,28 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX512CD_HPP
#define XSIMD_AVX512CD_HPP

#include "../types/xsimd_avx512cd_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        // Nothing there yet.

    }

}

#endif
@@ -1,212 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX512_DQ_HPP
#define XSIMD_AVX512_DQ_HPP

#include "../types/xsimd_avx512dq_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // bitwise_and
        template <class A>
        inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_and_ps(self, other);
        }
        template <class A>
        inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_and_pd(self, other);
        }

        // bitwise_andnot
        template <class A>
        inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_andnot_ps(other, self);
        }
        template <class A>
        inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_andnot_pd(other, self);
        }

        // bitwise_not
        template <class A>
        inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512f>) noexcept
        {
            return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
        }
        template <class A>
        inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512f>) noexcept
        {
            return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1)));
        }

        // bitwise_or
        template <class A>
        inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_or_ps(self, other);
        }
        template <class A>
        inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_or_pd(self, other);
        }

        template <class A, class T>
        inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512dq>) noexcept
        {
            using register_type = typename batch_bool<T, A>::register_type;
            return register_type(self.data | other.data);
        }

        // bitwise_xor
        template <class A>
        inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_xor_ps(self, other);
        }
        template <class A>
        inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_xor_pd(self, other);
        }

        // haddp
        template <class A>
        inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512dq>) noexcept
        {
// The following folds over the vector once:
// tmp1 = [a0..8, b0..8]
// tmp2 = [a8..f, b8..f]
#define XSIMD_AVX512_HADDP_STEP1(I, a, b)                                \
    batch<float, avx512f> res##I;                                        \
    {                                                                    \
        auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
        auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
        res##I = _mm512_add_ps(tmp1, tmp2);                              \
    }

            XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
            XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]);
            XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]);
            XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]);
            XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]);
            XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]);
            XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]);
            XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]);

#undef XSIMD_AVX512_HADDP_STEP1

// The following folds the code and shuffles so that hadd_ps produces the correct result
// tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3)
// tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4)
// tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ...
#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d)                               \
    batch<float, avx2> halfx##I;                                              \
    {                                                                         \
        auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0));      \
        auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1));      \
                                                                              \
        auto resx1 = _mm512_add_ps(tmp1, tmp2);                               \
                                                                              \
        auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0));      \
        auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1));      \
                                                                              \
        auto resx2 = _mm512_add_ps(tmp3, tmp4);                               \
                                                                              \
        auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \
        auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \
                                                                              \
        auto resx3 = _mm512_add_ps(tmp5, tmp6);                               \
                                                                              \
        halfx##I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0),           \
                                  _mm512_extractf32x8_ps(resx3, 1));          \
    }

            XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3);
            XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7);

#undef XSIMD_AVX512_HADDP_STEP2

            auto concat = _mm512_castps256_ps512(halfx0);
            concat = _mm512_insertf32x8(concat, halfx1, 1);
            return concat;
        }

        // ldexp
        template <class A>
        inline batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_scalef_pd(self, _mm512_cvtepi64_pd(other));
        }

        // mul
        template <class A>
        inline batch<uint64_t, A> mul(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_mullo_epi64(self, other);
        }

        template <class A>
        inline batch<int64_t, A> mul(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512dq>) noexcept
        {
            return _mm512_mullo_epi64(self, other);
        }

        // nearbyint_as_int
        template <class A>
        inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
                                                  requires_arch<avx512dq>) noexcept
        {
            return _mm512_cvtpd_epi64(self);
        }

        // reduce_add
        template <class A>
        inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
        {
            __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
            __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
            __m256 res1 = _mm256_add_ps(tmp1, tmp2);
            return reduce_add(batch<float, avx2>(res1), avx2 {});
        }

        // convert
        namespace detail
        {
            template <class A>
            inline batch<double, A> fast_cast(batch<int64_t, A> const& self, batch<double, A> const&, requires_arch<avx512dq>) noexcept
            {
                return _mm512_cvtepi64_pd(self);
            }

            template <class A>
            inline batch<int64_t, A> fast_cast(batch<double, A> const& self, batch<int64_t, A> const&, requires_arch<avx512dq>) noexcept
            {
                return _mm512_cvttpd_epi64(self);
            }

        }

    }

}

#endif
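The ldexp kernel above is _mm512_scalef_pd applied to the converted exponents, i.e. self * 2^other per lane, with no multiplication error. A one-lane scalar model with an illustrative name:

    #include <cmath>

    // Scalar model of the avx512dq ldexp kernel: scale by a power of two.
    double ldexp_ref(double x, long long e)
    {
        return std::ldexp(x, static_cast<int>(e)); // x * 2^e
    }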
(File diff suppressed because it is too large)
@@ -1,384 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_NUMERICAL_CONSTANT_HPP
#define XSIMD_NUMERICAL_CONSTANT_HPP

#include <limits>

#include "../types/xsimd_utils.hpp"

namespace xsimd
{

    namespace constants
    {

#define XSIMD_DEFINE_CONSTANT(NAME, SINGLE, DOUBLE) \
    template <class T>                              \
    inline T NAME() noexcept                        \
    {                                               \
        return T(NAME<typename T::value_type>());   \
    }                                               \
    template <>                                     \
    inline float NAME<float>() noexcept             \
    {                                               \
        return SINGLE;                              \
    }                                               \
    template <>                                     \
    inline double NAME<double>() noexcept           \
    {                                               \
        return DOUBLE;                              \
    }

#define XSIMD_DEFINE_CONSTANT_HEX(NAME, SINGLE, DOUBLE) \
    template <class T>                                  \
    inline T NAME() noexcept                            \
    {                                                   \
        return T(NAME<typename T::value_type>());       \
    }                                                   \
    template <>                                         \
    inline float NAME<float>() noexcept                 \
    {                                                   \
        return bit_cast<float>((uint32_t)SINGLE);       \
    }                                                   \
    template <>                                         \
    inline double NAME<double>() noexcept               \
    {                                                   \
        return bit_cast<double>((uint64_t)DOUBLE);      \
    }

        XSIMD_DEFINE_CONSTANT(infinity, (std::numeric_limits<float>::infinity()), (std::numeric_limits<double>::infinity()))
        XSIMD_DEFINE_CONSTANT(invlog_2, 1.442695040888963407359924681001892137426645954152986f, 1.442695040888963407359924681001892137426645954152986)
        XSIMD_DEFINE_CONSTANT_HEX(invlog_2hi, 0x3fb8b000, 0x3ff7154765200000)
        XSIMD_DEFINE_CONSTANT_HEX(invlog_2lo, 0xb9389ad4, 0x3de705fc2eefa200)
        XSIMD_DEFINE_CONSTANT(invlog10_2, 3.32192809488736234787031942949f, 3.32192809488736234787031942949)
        XSIMD_DEFINE_CONSTANT_HEX(invpi, 0x3ea2f983, 0x3fd45f306dc9c883)
        XSIMD_DEFINE_CONSTANT(log_2, 0.6931471805599453094172321214581765680755001343602553f, 0.6931471805599453094172321214581765680755001343602553)
        XSIMD_DEFINE_CONSTANT_HEX(log_2hi, 0x3f318000, 0x3fe62e42fee00000)
        XSIMD_DEFINE_CONSTANT_HEX(log_2lo, 0xb95e8083, 0x3dea39ef35793c76)
        XSIMD_DEFINE_CONSTANT_HEX(log10_2hi, 0x3e9a0000, 0x3fd3440000000000)
        XSIMD_DEFINE_CONSTANT_HEX(log10_2lo, 0x39826a14, 0x3ed3509f79fef312)
        XSIMD_DEFINE_CONSTANT_HEX(logeps, 0xc17f1402, 0xc04205966f2b4f12)
        XSIMD_DEFINE_CONSTANT_HEX(logpi, 0x3f928682, 0x3ff250d048e7a1bd)
        XSIMD_DEFINE_CONSTANT_HEX(logsqrt2pi, 0x3f6b3f8e, 0x3fed67f1c864beb5)
        XSIMD_DEFINE_CONSTANT(maxflint, 16777216.0f, 9007199254740992.0)
        XSIMD_DEFINE_CONSTANT(maxlog, 88.3762626647949f, 709.78271289338400)
        XSIMD_DEFINE_CONSTANT(maxlog2, 127.0f, 1023.)
        XSIMD_DEFINE_CONSTANT(maxlog10, 38.23080825805664f, 308.2547155599167)
        XSIMD_DEFINE_CONSTANT_HEX(mediumpi, 0x43490fdb, 0x412921fb54442d18)
        XSIMD_DEFINE_CONSTANT(minlog, -88.3762626647949f, -708.3964185322641)
        XSIMD_DEFINE_CONSTANT(minlog2, -127.0f, -1023.)
        XSIMD_DEFINE_CONSTANT(minlog10, -37.89999771118164f, -308.2547155599167)
        XSIMD_DEFINE_CONSTANT(minusinfinity, (-infinity<float>()), (-infinity<double>()))
        XSIMD_DEFINE_CONSTANT(minuszero, -0.0f, -0.0)
        XSIMD_DEFINE_CONSTANT_HEX(nan, 0xffffffff, 0xffffffffffffffff)
        XSIMD_DEFINE_CONSTANT_HEX(oneosqrteps, 0x453504f3, 0x4190000000000000)
        XSIMD_DEFINE_CONSTANT_HEX(oneotwoeps, 0x4a800000, 0x4320000000000000)
        XSIMD_DEFINE_CONSTANT_HEX(pi, 0x40490fdb, 0x400921fb54442d18)
        XSIMD_DEFINE_CONSTANT_HEX(pio_2lo, 0xb33bbd2e, 0x3c91a62633145c07)
        XSIMD_DEFINE_CONSTANT_HEX(pio_4lo, 0xb2bbbd2e, 0x3c81a62633145c07)
        XSIMD_DEFINE_CONSTANT_HEX(pio2, 0x3fc90fdb, 0x3ff921fb54442d18)
        XSIMD_DEFINE_CONSTANT_HEX(pio2_1, 0x3fc90f80, 0x3ff921fb54400000)
        XSIMD_DEFINE_CONSTANT_HEX(pio2_1t, 0x37354443, 0x3dd0b4611a626331)
        XSIMD_DEFINE_CONSTANT_HEX(pio2_2, 0x37354400, 0x3dd0b4611a600000)
        XSIMD_DEFINE_CONSTANT_HEX(pio2_2t, 0x2e85a308, 0x3ba3198a2e037073)
        XSIMD_DEFINE_CONSTANT_HEX(pio2_3, 0x2e85a300, 0x3ba3198a2e000000)
        XSIMD_DEFINE_CONSTANT_HEX(pio2_3t, 0x248d3132, 0x397b839a252049c1)
        XSIMD_DEFINE_CONSTANT_HEX(pio4, 0x3f490fdb, 0x3fe921fb54442d18)
        XSIMD_DEFINE_CONSTANT_HEX(signmask, 0x80000000, 0x8000000000000000)
        XSIMD_DEFINE_CONSTANT(smallestposval, std::numeric_limits<float>::min(), std::numeric_limits<double>::min())
        XSIMD_DEFINE_CONSTANT_HEX(sqrt_2pi, 0x40206c99, 0x40040d931ff62704)
        XSIMD_DEFINE_CONSTANT_HEX(sqrteps, 0x39b504f3, 0x3e50000000000000)
        XSIMD_DEFINE_CONSTANT_HEX(tanpio8, 0x3ed413cd, 0x3fda827999fcef31)
        XSIMD_DEFINE_CONSTANT_HEX(tan3pio8, 0x401a827a, 0x4003504f333f9de6)
        XSIMD_DEFINE_CONSTANT_HEX(twentypi, 0x427b53d1, 0x404f6a7a2955385e)
        XSIMD_DEFINE_CONSTANT_HEX(twoopi, 0x3f22f983, 0x3fe45f306dc9c883)
        XSIMD_DEFINE_CONSTANT(twotonmb, 8388608.0f, 4503599627370496.0)
        XSIMD_DEFINE_CONSTANT_HEX(twotonmbo3, 0x3ba14518, 0x3ed428a2f98d7286)

#undef XSIMD_DEFINE_CONSTANT
#undef XSIMD_DEFINE_CONSTANT_HEX

        template <class T>
        constexpr T allbits() noexcept;

        template <class T>
        constexpr as_integer_t<T> mask1frexp() noexcept;

        template <class T>
        constexpr as_integer_t<T> mask2frexp() noexcept;

        template <class T>
        constexpr as_integer_t<T> maxexponent() noexcept;

        template <class T>
        constexpr as_integer_t<T> maxexponentm1() noexcept;

        template <class T>
        constexpr int32_t nmb() noexcept;

        template <class T>
        constexpr T zero() noexcept;

        template <class T>
        constexpr T minvalue() noexcept;

        template <class T>
        constexpr T maxvalue() noexcept;

        /**************************
         * allbits implementation *
         **************************/

        namespace detail
        {
            template <class T, bool = std::is_integral<T>::value>
            struct allbits_impl
            {
                static constexpr T get_value() noexcept
                {
                    return T(~0);
                }
            };

            template <class T>
            struct allbits_impl<T, false>
            {
                static constexpr T get_value() noexcept
                {
                    return nan<T>();
                }
            };
        }

        template <class T>
        inline constexpr T allbits() noexcept
        {
            return T(detail::allbits_impl<typename T::value_type>::get_value());
        }

        /*****************************
         * mask1frexp implementation *
         *****************************/

        template <class T>
        inline constexpr as_integer_t<T> mask1frexp() noexcept
        {
            return as_integer_t<T>(mask1frexp<typename T::value_type>());
        }

        template <>
        inline constexpr int32_t mask1frexp<float>() noexcept
        {
            return 0x7f800000;
        }

        template <>
        inline constexpr int64_t mask1frexp<double>() noexcept
        {
            return 0x7ff0000000000000;
        }

        /*****************************
         * mask2frexp implementation *
         *****************************/

        template <class T>
        inline constexpr as_integer_t<T> mask2frexp() noexcept
        {
            return as_integer_t<T>(mask2frexp<typename T::value_type>());
        }

        template <>
        inline constexpr int32_t mask2frexp<float>() noexcept
        {
            return 0x3f000000;
        }

        template <>
        inline constexpr int64_t mask2frexp<double>() noexcept
        {
            return 0x3fe0000000000000;
        }

        /******************************
         * maxexponent implementation *
         ******************************/

        template <class T>
        inline constexpr as_integer_t<T> maxexponent() noexcept
        {
            return as_integer_t<T>(maxexponent<typename T::value_type>());
        }

        template <>
        inline constexpr int32_t maxexponent<float>() noexcept
        {
            return 127;
        }

        template <>
        inline constexpr int64_t maxexponent<double>() noexcept
        {
            return 1023;
        }

        /********************************
         * maxexponentm1 implementation *
         ********************************/

        template <class T>
        inline constexpr as_integer_t<T> maxexponentm1() noexcept
        {
            return as_integer_t<T>(maxexponentm1<typename T::value_type>());
        }

        template <>
        inline constexpr int32_t maxexponentm1<float>() noexcept
        {
            return 126;
        }

        template <>
        inline constexpr int64_t maxexponentm1<double>() noexcept
        {
            return 1022;
        }

        /**********************
         * nmb implementation *
         **********************/

        template <class T>
        inline constexpr int32_t nmb() noexcept
        {
            return nmb<typename T::value_type>();
        }

        template <>
        inline constexpr int32_t nmb<float>() noexcept
        {
            return 23;
        }

        template <>
        inline constexpr int32_t nmb<double>() noexcept
        {
            return 52;
        }

        /***********************
         * zero implementation *
         ***********************/

        template <class T>
        inline constexpr T zero() noexcept
        {
            return T(typename T::value_type(0));
        }

        /***************************
         * minvalue implementation *
         ***************************/

        namespace detail
        {
            template <class T>
            struct minvalue_impl
            {
                static constexpr T get_value() noexcept
                {
                    return std::numeric_limits<typename T::value_type>::min();
                }
            };

            template <class T>
            struct minvalue_common
            {
                static constexpr T get_value() noexcept
                {
                    return std::numeric_limits<T>::min();
                }
            };

            template <>
            struct minvalue_impl<int8_t> : minvalue_common<int8_t>
            {
            };
            template <>
            struct minvalue_impl<uint8_t> : minvalue_common<uint8_t>
            {
            };
            template <>
            struct minvalue_impl<int16_t> : minvalue_common<int16_t>
            {
            };
            template <>
            struct minvalue_impl<uint16_t> : minvalue_common<uint16_t>
            {
            };
            template <>
            struct minvalue_impl<int32_t> : minvalue_common<int32_t>
            {
            };
            template <>
            struct minvalue_impl<uint32_t> : minvalue_common<uint32_t>
            {
            };
            template <>
            struct minvalue_impl<int64_t> : minvalue_common<int64_t>
            {
            };
            template <>
            struct minvalue_impl<uint64_t> : minvalue_common<uint64_t>
            {
            };

            template <>
            struct minvalue_impl<float>
            {
                static float get_value() noexcept
                {
                    return bit_cast<float>((uint32_t)0xff7fffff);
                }
            };

            template <>
            struct minvalue_impl<double>
            {
                static double get_value() noexcept
                {
                    return bit_cast<double>((uint64_t)0xffefffffffffffff);
                }
            };
        }

        template <class T>
        inline constexpr T minvalue() noexcept
        {
            return T(detail::minvalue_impl<typename T::value_type>::get_value());
        }

        /***************************
         * maxvalue implementation *
         ***************************/

        template <class T>
        inline constexpr T maxvalue() noexcept
        {
            return T(std::numeric_limits<typename T::value_type>::max());
        }
    }

}

#endif
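XSIMD_DEFINE_CONSTANT_HEX above pins each constant to an exact IEEE-754 bit pattern and reinterprets it through bit_cast, so values such as pi<float>() are reproducible bit for bit across compilers. A scalar model of that reinterpretation, with an illustrative name (memcpy is the portable way to type-pun):

    #include <cstdint>
    #include <cstring>

    // Reinterpret an exact IEEE-754 bit pattern as a float; 0x40490fdb is the
    // pattern used for pi<float>() in the header above.
    float bits_to_float(uint32_t bits)
    {
        float out;
        std::memcpy(&out, &bits, sizeof out); // well-defined type punning
        return out;
    }
    // bits_to_float(0x40490fdb) == 3.14159274f, the float nearest to pi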
@@ -1,80 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_FMA3_AVX_HPP
#define XSIMD_FMA3_AVX_HPP

#include "../types/xsimd_fma3_avx_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // fnma
        template <class A>
        inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
        {
            return _mm256_fnmadd_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
        {
            return _mm256_fnmadd_pd(x, y, z);
        }

        // fnms
        template <class A>
        inline batch<float, A> fnms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
        {
            return _mm256_fnmsub_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fnms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
        {
            return _mm256_fnmsub_pd(x, y, z);
        }

        // fma
        template <class A>
        inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
        {
            return _mm256_fmadd_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
        {
            return _mm256_fmadd_pd(x, y, z);
        }

        // fms
        template <class A>
        inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
        {
            return _mm256_fmsub_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
        {
            return _mm256_fmsub_pd(x, y, z);
        }

    }

}

#endif
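The four kernel families above map to single-rounding fused operations: fma(x, y, z) = x*y + z, fms = x*y - z, fnma = -(x*y) + z, and fnms = -(x*y) - z. Scalar models via std::fma, which also rounds once, with illustrative names:

    #include <cmath>

    // One-lane models of the four fused kernels.
    double fma_ref(double x, double y, double z) { return std::fma(x, y, z); }
    double fms_ref(double x, double y, double z) { return std::fma(x, y, -z); }
    double fnma_ref(double x, double y, double z) { return std::fma(-x, y, z); }
    double fnms_ref(double x, double y, double z) { return std::fma(-x, y, -z); }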
@@ -1,46 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_FMA3_AVX2_HPP
#define XSIMD_FMA3_AVX2_HPP

#include "../types/xsimd_fma3_avx2_register.hpp"

// Allow inclusion of xsimd_fma3_avx.hpp
#ifdef XSIMD_FMA3_AVX_HPP
#undef XSIMD_FMA3_AVX_HPP
#define XSIMD_FORCE_FMA3_AVX_HPP
#endif

// Disallow inclusion of ./xsimd_fma3_avx_register.hpp
#ifndef XSIMD_FMA3_AVX_REGISTER_HPP
#define XSIMD_FMA3_AVX_REGISTER_HPP
#define XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
#endif

// Include ./xsimd_fma3_avx.hpp but s/avx/avx2
#define avx avx2
#include "./xsimd_fma3_avx.hpp"
#undef avx
#undef XSIMD_FMA3_AVX_HPP

// Carefully restore guards
#ifdef XSIMD_FORCE_FMA3_AVX_HPP
#define XSIMD_FMA3_AVX_HPP
#undef XSIMD_FORCE_FMA3_AVX_HPP
#endif

#ifdef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
#undef XSIMD_FMA3_AVX_REGISTER_HPP
#undef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
#endif

#endif
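This header stamps out the AVX2 kernel variants by re-including xsimd_fma3_avx.hpp with the token avx redefined to avx2, saving and restoring the include guards around the trick. A stripped-down sketch of the same pattern, with invented file and macro names:

    // kernel_impl.hpp defines kernels in terms of the token AVX; including it
    // twice with the token redefined stamps out a second architecture.

    #ifdef KERNEL_IMPL_HPP        // impl was already included for plain avx:
    #undef KERNEL_IMPL_HPP        // drop its guard so it can be included again,
    #define FORCE_KERNEL_IMPL_HPP // and remember to restore it afterwards.
    #endif

    #define avx avx2              // textual s/avx/avx2/ for the re-inclusion
    #include "kernel_impl.hpp"
    #undef avx
    #undef KERNEL_IMPL_HPP

    #ifdef FORCE_KERNEL_IMPL_HPP  // restore the original guard state
    #define KERNEL_IMPL_HPP
    #undef FORCE_KERNEL_IMPL_HPP
    #endif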
@@ -1,79 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_FMA3_SSE_HPP
#define XSIMD_FMA3_SSE_HPP

#include "../types/xsimd_fma3_sse_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;
        // fnma
        template <class A>
        inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
        {
            return _mm_fnmadd_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
        {
            return _mm_fnmadd_pd(x, y, z);
        }

        // fnms
        template <class A>
        inline batch<float, A> fnms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
        {
            return _mm_fnmsub_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fnms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
        {
            return _mm_fnmsub_pd(x, y, z);
        }

        // fma
        template <class A>
        inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
        {
            return _mm_fmadd_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
        {
            return _mm_fmadd_pd(x, y, z);
        }

        // fms
        template <class A>
        inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
        {
            return _mm_fmsub_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
        {
            return _mm_fmsub_pd(x, y, z);
        }

    }

}

#endif
@@ -1,79 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_FMA4_HPP
#define XSIMD_FMA4_HPP

#include "../types/xsimd_fma4_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // fnma
        template <class A>
        inline batch<float, A> fnma(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
        {
            return _mm_nmacc_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fnma(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
        {
            return _mm_nmacc_pd(x, y, z);
        }

        // fnms
        template <class A>
        inline batch<float, A> fnms(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
        {
            return _mm_nmsub_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fnms(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
        {
            return _mm_nmsub_pd(x, y, z);
        }

        // fma
        template <class A>
        inline batch<float, A> fma(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
        {
            return _mm_macc_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fma(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
        {
            return _mm_macc_pd(x, y, z);
        }

        // fms
        template <class A>
        inline batch<float, A> fms(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
        {
            return _mm_msub_ps(x, y, z);
        }

        template <class A>
        inline batch<double, A> fms(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
        {
            return _mm_msub_pd(x, y, z);
        }
    }

}

#endif
@@ -1,23 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_HPP
#define XSIMD_GENERIC_HPP

#include "./generic/xsimd_generic_arithmetic.hpp"
#include "./generic/xsimd_generic_complex.hpp"
#include "./generic/xsimd_generic_logical.hpp"
#include "./generic/xsimd_generic_math.hpp"
#include "./generic/xsimd_generic_memory.hpp"
#include "./generic/xsimd_generic_rounding.hpp"
#include "./generic/xsimd_generic_trigo.hpp"

#endif
@@ -1,38 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_FWD_HPP
#define XSIMD_GENERIC_FWD_HPP

#include "../types/xsimd_batch_constant.hpp"

#include <type_traits>

namespace xsimd
{
    namespace kernel
    {
        // forward declaration
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> abs(batch<T, A> const& self, requires_arch<generic>) noexcept;
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
        template <class A, class T>
        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;

    }
}

#endif
@@ -1,86 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_ISA_HPP
#define XSIMD_ISA_HPP

#include "../config/xsimd_arch.hpp"

#include "./xsimd_generic_fwd.hpp"

#if XSIMD_WITH_SSE2
#include "./xsimd_sse2.hpp"
#endif

#if XSIMD_WITH_SSE3
#include "./xsimd_sse3.hpp"
#endif

#if XSIMD_WITH_SSSE3
#include "./xsimd_ssse3.hpp"
#endif

#if XSIMD_WITH_SSE4_1
#include "./xsimd_sse4_1.hpp"
#endif

#if XSIMD_WITH_SSE4_2
#include "./xsimd_sse4_2.hpp"
#endif

#if XSIMD_WITH_FMA3_SSE
#include "./xsimd_fma3_sse.hpp"
#endif

#if XSIMD_WITH_FMA4
#include "./xsimd_fma4.hpp"
#endif

#if XSIMD_WITH_AVX
#include "./xsimd_avx.hpp"
#endif

#if XSIMD_WITH_FMA3_AVX
#include "./xsimd_fma3_avx.hpp"
#endif

#if XSIMD_WITH_AVX2
#include "./xsimd_avx2.hpp"
#endif

#if XSIMD_WITH_FMA3_AVX2
#include "./xsimd_fma3_avx2.hpp"
#endif

#if XSIMD_WITH_AVX512F
#include "./xsimd_avx512f.hpp"
#endif

#if XSIMD_WITH_AVX512BW
#include "./xsimd_avx512bw.hpp"
#endif

#if XSIMD_WITH_NEON
#include "./xsimd_neon.hpp"
#endif

#if XSIMD_WITH_NEON64
#include "./xsimd_neon64.hpp"
#endif

#if XSIMD_WITH_SVE
#include "./xsimd_sve.hpp"
#endif

// Must come last to have access to all conversion specializations.
#include "./xsimd_generic.hpp"

#endif
(File diff suppressed because it is too large)
(File diff suppressed because it is too large)
(File diff suppressed because it is too large)
(File diff suppressed because it is too large)
@@ -1,64 +0,0 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_SSE3_HPP
#define XSIMD_SSE3_HPP

#include "../types/xsimd_sse3_register.hpp"
#include <type_traits>

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // haddp
        template <class A>
        inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse3>) noexcept
        {
            return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]),
                               _mm_hadd_ps(row[2], row[3]));
        }
        template <class A>
        inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse3>) noexcept
        {
            return _mm_hadd_pd(row[0], row[1]);
        }

        // load_unaligned
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse3>) noexcept
        {
            return _mm_lddqu_si128((__m128i const*)mem);
        }

        // reduce_add
        template <class A>
        inline float reduce_add(batch<float, A> const& self, requires_arch<sse3>) noexcept
        {
            __m128 tmp0 = _mm_hadd_ps(self, self);
            __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0);
            return _mm_cvtss_f32(tmp1);
        }
        template <class A>
        inline double reduce_add(batch<double, A> const& self, requires_arch<sse3>) noexcept
        {
            __m128d tmp0 = _mm_hadd_pd(self, self);
            return _mm_cvtsd_f64(tmp0);
        }

    }

}

#endif
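For context (not part of the diff): the SSE3 reduce_add kernel above chains two _mm_hadd_ps calls, each of which sums adjacent lane pairs, so two rounds collapse four lanes into one. A standalone sketch using the identical intrinsics (the function name is illustrative):

    #include <pmmintrin.h> // SSE3 intrinsics

    // {a, b, c, d} -> {a+b, c+d, a+b, c+d} -> {a+b+c+d, ...}, then extract lane 0.
    inline float reduce_add_sse3(__m128 v)
    {
        __m128 tmp0 = _mm_hadd_ps(v, v);
        __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0);
        return _mm_cvtss_f32(tmp1);
    }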
@@ -1,350 +0,0 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_SSE4_1_HPP
#define XSIMD_SSE4_1_HPP

#include <type_traits>

#include "../types/xsimd_sse4_1_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;
        // any
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline bool any(batch<T, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return !_mm_testz_si128(self, self);
        }
        // ceil
        template <class A>
        inline batch<float, A> ceil(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_ceil_ps(self);
        }
        template <class A>
        inline batch<double, A> ceil(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_ceil_pd(self);
        }

        // fast_cast
        namespace detail
        {
            template <class A>
            inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
            {
                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
                __m128i xH = _mm_srai_epi32(x, 16);
                xH = _mm_blend_epi16(xH, _mm_setzero_si128(), 0x33);
                xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67
                __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0x88); // 2^52
                __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
                return _mm_add_pd(f, _mm_castsi128_pd(xL));
            }

            template <class A>
            inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
            {
                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
                __m128i xH = _mm_srli_epi64(x, 32);
                xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
                __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
                __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
                return _mm_add_pd(f, _mm_castsi128_pd(xL));
            }

            template <class A>
            inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse4_1>) noexcept
            {
                return _mm_castps_si128(
                    _mm_blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(self)),
                                  _mm_castsi128_ps(_mm_xor_si128(
                                      _mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
                                      _mm_set1_epi32(1u << 31))),
                                  _mm_cmpge_ps(self, _mm_set1_ps(1u << 31))));
            }
        }

        // eq
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm_cmpeq_epi64(self, other);
            }
            else
            {
                return eq(self, other, ssse3 {});
            }
        }

        // floor
        template <class A>
        inline batch<float, A> floor(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_floor_ps(self);
        }
        template <class A>
        inline batch<double, A> floor(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_floor_pd(self);
        }

        // insert
        template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse4_1>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_insert_epi8(self, val, I);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_insert_epi32(self, val, I);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
#if (!defined(_MSC_VER) && __x86_64__) || (_MSC_VER > 1900 && defined(_M_X64))
                return _mm_insert_epi64(self, val, I);
#else
                uint32_t lo, hi;
                memcpy(&lo, (reinterpret_cast<uint32_t*>(&val)), sizeof(lo));
                memcpy(&hi, (reinterpret_cast<uint32_t*>(&val)) + 1, sizeof(hi));
                return _mm_insert_epi32(_mm_insert_epi32(self, lo, 2 * I), hi, 2 * I + 1);
#endif
            }
            else
            {
                return insert(self, val, pos, ssse3 {});
            }
        }

        // max
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_max_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_max_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_max_epi32(self, other);
                }
                else
                {
                    return max(self, other, ssse3 {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_max_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_max_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_max_epu32(self, other);
                }
                else
                {
                    return max(self, other, ssse3 {});
                }
            }
        }

        // min
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            if (std::is_signed<T>::value)
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_min_epi8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_min_epi16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_min_epi32(self, other);
                }
                else
                {
                    return min(self, other, ssse3 {});
                }
            }
            else
            {
                XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
                {
                    return _mm_min_epu8(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
                {
                    return _mm_min_epu16(self, other);
                }
                else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
                {
                    return _mm_min_epu32(self, other);
                }
                else
                {
                    return min(self, other, ssse3 {});
                }
            }
        }

        // mul
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_or_si128(
                    _mm_and_si128(_mm_mullo_epi16(self, other), _mm_srli_epi16(_mm_cmpeq_epi8(self, self), 8)),
                    _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_epi16(self, 8), _mm_srli_epi16(other, 8)), 8));
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_mullo_epi16(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_mullo_epi32(self, other);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm_add_epi64(
                    _mm_mul_epu32(self, other),
                    _mm_slli_epi64(
                        _mm_add_epi64(
                            _mm_mul_epu32(other, _mm_shuffle_epi32(self, _MM_SHUFFLE(2, 3, 0, 1))),
                            _mm_mul_epu32(self, _mm_shuffle_epi32(other, _MM_SHUFFLE(2, 3, 0, 1)))),
                        32));
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }

        // nearbyint
        template <class A>
        inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_ps(self, _MM_FROUND_TO_NEAREST_INT);
        }
        template <class A>
        inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_pd(self, _MM_FROUND_TO_NEAREST_INT);
        }

        // select
        namespace detail
        {
            template <class T>
            inline constexpr T interleave(T const& cond) noexcept
            {
                return (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 49) & 0x5555) | (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 48) & 0xAAAA);
            }
        }

        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            return _mm_blendv_epi8(false_br, true_br, cond);
        }
        template <class A>
        inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            return _mm_blendv_ps(false_br, true_br, cond);
        }
        template <class A>
        inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            return _mm_blendv_pd(false_br, true_br, cond);
        }

        template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_blend_epi16(false_br, true_br, mask);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                constexpr int imask = detail::interleave(mask);
                return _mm_blend_epi16(false_br, true_br, imask);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                constexpr int imask = detail::interleave(mask);
                constexpr int imask2 = detail::interleave(imask);
                return _mm_blend_epi16(false_br, true_br, imask2);
            }
            else
            {
                return select(batch_bool_constant<batch<T, A>, Values...>(), true_br, false_br, ssse3 {});
            }
        }
        template <class A, bool... Values>
        inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            constexpr int mask = batch_bool_constant<batch<float, A>, Values...>::mask();
            return _mm_blend_ps(false_br, true_br, mask);
        }
        template <class A, bool... Values>
        inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
        {
            constexpr int mask = batch_bool_constant<batch<double, A>, Values...>::mask();
            return _mm_blend_pd(false_br, true_br, mask);
        }

        // trunc
        template <class A>
        inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_ps(self, _MM_FROUND_TO_ZERO);
        }
        template <class A>
        inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
        {
            return _mm_round_pd(self, _MM_FROUND_TO_ZERO);
        }

    }

}

#endif
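For context (not part of the diff): the sizeof(T) == 8 branch of the SSE4.1 mul kernel above synthesizes a 64-bit lane product from 32x32->64 multiplies, because SSE has no packed 64-bit multiply. A scalar sketch of the same decomposition (the function name is illustrative):

    #include <cstdint>

    // a*b mod 2^64 == lo(a)*lo(b) + ((hi(a)*lo(b) + lo(a)*hi(b)) << 32),
    // which is what the _mm_mul_epu32 / _mm_slli_epi64 sequence computes per lane.
    inline uint64_t mul64_via_32(uint64_t a, uint64_t b)
    {
        uint64_t a_lo = a & 0xFFFFFFFFu, a_hi = a >> 32;
        uint64_t b_lo = b & 0xFFFFFFFFu, b_hi = b >> 32;
        return a_lo * b_lo + ((a_hi * b_lo + a_lo * b_hi) << 32);
    }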
@@ -1,44 +0,0 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_SSE4_2_HPP
#define XSIMD_SSE4_2_HPP

#include <limits>

#include "../types/xsimd_sse4_2_register.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // lt
        template <class A>
        inline batch_bool<int64_t, A> lt(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<sse4_2>) noexcept
        {
            return _mm_cmpgt_epi64(other, self);
        }
        template <class A>
        inline batch_bool<uint64_t, A> lt(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<sse4_2>) noexcept
        {
            auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
            auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
            return _mm_cmpgt_epi64(xother, xself);
        }

    }

}

#endif
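For context (not part of the diff): SSE4.2 only provides a signed 64-bit compare (_mm_cmpgt_epi64), so the unsigned lt kernel above first XORs both operands with the smallest int64_t value, which flips the sign bit and maps unsigned order onto signed order. A scalar sketch of that trick (the function name is illustrative):

    #include <cstdint>

    // a < b as unsigned iff (a ^ 2^63) < (b ^ 2^63) as signed.
    inline bool ult_via_signed(uint64_t a, uint64_t b)
    {
        const uint64_t bias = 0x8000000000000000ull; // sign bit
        return (int64_t)(a ^ bias) < (int64_t)(b ^ bias);
    }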
@@ -1,142 +0,0 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_SSSE3_HPP
#define XSIMD_SSSE3_HPP

#include <cstddef>
#include <type_traits>

#include "../types/xsimd_ssse3_register.hpp"
#include "../types/xsimd_utils.hpp"

namespace xsimd
{

    namespace kernel
    {
        using namespace types;

        // abs
        template <class A, class T, typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
        inline batch<T, A> abs(batch<T, A> const& self, requires_arch<ssse3>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
            {
                return _mm_abs_epi8(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                return _mm_abs_epi16(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                return _mm_abs_epi32(self);
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                return _mm_abs_epi64(self);
            }
            else
            {
                assert(false && "unsupported arch/op combination");
                return {};
            }
        }

        // extract_pair
        namespace detail
        {

            template <class T, class A>
            inline batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& other, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
            {
                return other;
            }

            template <class T, class A, std::size_t I, std::size_t... Is>
            inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, ::xsimd::detail::index_sequence<I, Is...>) noexcept
            {
                if (i == I)
                {
                    return _mm_alignr_epi8(self, other, sizeof(T) * I);
                }
                else
                    return extract_pair(self, other, i, ::xsimd::detail::index_sequence<Is...>());
            }
        }

        template <class A, class T, class _ = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<ssse3>) noexcept
        {
            constexpr std::size_t size = batch<T, A>::size;
            assert(0 <= i && i < size && "index in bounds");
            return detail::extract_pair(self, other, i, ::xsimd::detail::make_index_sequence<size>());
        }

        // reduce_add
        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
        inline T reduce_add(batch<T, A> const& self, requires_arch<ssse3>) noexcept
        {
            XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
            {
                __m128i tmp1 = _mm_hadd_epi16(self, self);
                __m128i tmp2 = _mm_hadd_epi16(tmp1, tmp1);
                __m128i tmp3 = _mm_hadd_epi16(tmp2, tmp2);
                return _mm_cvtsi128_si32(tmp3) & 0xFFFF;
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                __m128i tmp1 = _mm_hadd_epi32(self, self);
                __m128i tmp2 = _mm_hadd_epi32(tmp1, tmp1);
                return _mm_cvtsi128_si32(tmp2);
            }
            else
            {
                return reduce_add(self, sse3 {});
            }
        }

        // swizzle
        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
        inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<ssse3>) noexcept
        {
            constexpr batch_constant<batch<uint8_t, A>, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1,
                                     2 * V4, 2 * V4 + 1, 2 * V5, 2 * V5 + 1, 2 * V6, 2 * V6 + 1, 2 * V7, 2 * V7 + 1>
                mask8;
            return _mm_shuffle_epi8(self, (batch<uint8_t, A>)mask8);
        }

        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
        inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept
        {
            return bitwise_cast<batch<int16_t, A>>(swizzle(bitwise_cast<batch<uint16_t, A>>(self), mask, ssse3 {}));
        }

        template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
                  uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
        inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
        {
            return _mm_shuffle_epi8(self, (batch<uint8_t, A>)mask);
        }

        template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
                  uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
        inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
        {
            return bitwise_cast<batch<int8_t, A>>(swizzle(bitwise_cast<batch<uint8_t, A>>(self), mask, ssse3 {}));
        }

    }

}

#endif
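For context (not part of the diff): the SSSE3 16-bit swizzle kernel above lowers a lane permutation to _mm_shuffle_epi8 by expanding each 16-bit lane index V into the byte pair {2V, 2V+1}. A runtime sketch of that mask expansion (the kernel does it at compile time via batch_constant; the function name is illustrative):

    #include <tmmintrin.h> // SSSE3 intrinsics
    #include <cstdint>

    inline __m128i swizzle_u16(__m128i v, const uint8_t idx[8])
    {
        alignas(16) uint8_t bytes[16];
        for (int i = 0; i < 8; ++i)
        {
            bytes[2 * i] = (uint8_t)(2 * idx[i]);         // low byte of lane idx[i]
            bytes[2 * i + 1] = (uint8_t)(2 * idx[i] + 1); // high byte of lane idx[i]
        }
        return _mm_shuffle_epi8(v, _mm_load_si128((const __m128i*)bytes));
    }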
(Diff of 1 file not shown because of its large size.)
@@ -1,249 +0,0 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_ARCH_HPP
#define XSIMD_ARCH_HPP

#include <initializer_list>
#include <type_traits>
#include <utility>

#include "../types/xsimd_all_registers.hpp"
#include "./xsimd_config.hpp"
#include "./xsimd_cpuid.hpp"

namespace xsimd
{

    namespace detail
    {
        // Checks whether T appears in Tys.
        template <class T, class... Tys>
        struct contains;

        template <class T>
        struct contains<T> : std::false_type
        {
        };

        template <class T, class Ty, class... Tys>
        struct contains<T, Ty, Tys...>
            : std::conditional<std::is_same<Ty, T>::value, std::true_type,
                               contains<T, Tys...>>::type
        {
        };

        template <class... Archs>
        struct is_sorted;

        template <>
        struct is_sorted<> : std::true_type
        {
        };

        template <class Arch>
        struct is_sorted<Arch> : std::true_type
        {
        };

        template <class A0, class A1, class... Archs>
        struct is_sorted<A0, A1, Archs...>
            : std::conditional<(A0::version() >= A1::version()), is_sorted<Archs...>,
                               std::false_type>::type
        {
        };

        template <typename T>
        inline constexpr T max_of(T value) noexcept
        {
            return value;
        }

        template <typename T, typename... Ts>
        inline constexpr T max_of(T head0, T head1, Ts... tail) noexcept
        {
            return max_of((head0 > head1 ? head0 : head1), tail...);
        }

    } // namespace detail

    // An arch_list is a list of architectures, sorted by version number.
    template <class... Archs>
    struct arch_list
    {
#ifndef NDEBUG
        static_assert(detail::is_sorted<Archs...>::value,
                      "architecture list must be sorted by version");
#endif

        template <class Arch>
        using add = arch_list<Archs..., Arch>;

        template <class... OtherArchs>
        using extend = arch_list<Archs..., OtherArchs...>;

        template <class Arch>
        static constexpr bool contains() noexcept
        {
            return detail::contains<Arch, Archs...>::value;
        }

        template <class F>
        static void for_each(F&& f) noexcept
        {
            (void)std::initializer_list<bool> { (f(Archs {}), true)... };
        }

        static constexpr std::size_t alignment() noexcept
        {
            // all alignments are a power of two
            return detail::max_of(Archs::alignment()..., static_cast<size_t>(0));
        }
    };

    struct unavailable
    {
        static constexpr bool supported() noexcept { return false; }
        static constexpr bool available() noexcept { return false; }
        static constexpr unsigned version() noexcept { return 0; }
        static constexpr std::size_t alignment() noexcept { return 0; }
        static constexpr bool requires_alignment() noexcept { return false; }
        static constexpr char const* name() noexcept { return "<none>"; }
    };

    namespace detail
    {
        // Pick the best architecture in arch_list L, which is the last
        // because architectures are sorted by version.
        template <class L>
        struct best;

        template <>
        struct best<arch_list<>>
        {
            using type = unavailable;
        };

        template <class Arch, class... Archs>
        struct best<arch_list<Arch, Archs...>>
        {
            using type = Arch;
        };

        // Filter archlists Archs, picking only supported archs and adding
        // them to L.
        template <class L, class... Archs>
        struct supported_helper;

        template <class L>
        struct supported_helper<L, arch_list<>>
        {
            using type = L;
        };

        template <class L, class Arch, class... Archs>
        struct supported_helper<L, arch_list<Arch, Archs...>>
            : supported_helper<
                  typename std::conditional<Arch::supported(),
                                            typename L::template add<Arch>, L>::type,
                  arch_list<Archs...>>
        {
        };

        template <class... Archs>
        struct supported : supported_helper<arch_list<>, Archs...>
        {
        };

        // Joins all arch_list Archs in a single arch_list.
        template <class... Archs>
        struct join;

        template <class Arch>
        struct join<Arch>
        {
            using type = Arch;
        };

        template <class Arch, class... Archs, class... Args>
        struct join<Arch, arch_list<Archs...>, Args...>
            : join<typename Arch::template extend<Archs...>, Args...>
        {
        };
    } // namespace detail

    struct unsupported
    {
    };
    using all_x86_architectures = arch_list<avx512bw, avx512dq, avx512cd, avx512f, fma3<avx2>, avx2, fma3<avx>, avx, fma4, fma3<sse4_2>, sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>;
    using all_sve_architectures = arch_list<detail::sve<512>, detail::sve<256>, detail::sve<128>>;
    using all_arm_architectures = typename detail::join<all_sve_architectures, arch_list<neon64, neon>>::type;
    using all_architectures = typename detail::join<all_arm_architectures, all_x86_architectures>::type;

    using supported_architectures = typename detail::supported<all_architectures>::type;

    using x86_arch = typename detail::best<typename detail::supported<all_x86_architectures>::type>::type;
    using arm_arch = typename detail::best<typename detail::supported<all_arm_architectures>::type>::type;
    // using default_arch = typename detail::best<typename detail::supported<arch_list</*arm_arch,*/ x86_arch>>::type>::type;
    using default_arch = typename std::conditional<std::is_same<x86_arch, unavailable>::value,
                                                   arm_arch,
                                                   x86_arch>::type;

    namespace detail
    {
        template <class F, class ArchList>
        class dispatcher
        {

            const unsigned best_arch;
            F functor;

            template <class Arch, class... Tys>
            auto walk_archs(arch_list<Arch>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
            {
                assert(Arch::available() && "At least one arch must be supported during dispatch");
                return functor(Arch {}, std::forward<Tys>(args)...);
            }

            template <class Arch, class ArchNext, class... Archs, class... Tys>
            auto walk_archs(arch_list<Arch, ArchNext, Archs...>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
            {
                if (Arch::version() <= best_arch)
                    return functor(Arch {}, std::forward<Tys>(args)...);
                else
                    return walk_archs(arch_list<ArchNext, Archs...> {}, std::forward<Tys>(args)...);
            }

        public:
            dispatcher(F f) noexcept
                : best_arch(available_architectures().best)
                , functor(f)
            {
            }

            template <class... Tys>
            auto operator()(Tys&&... args) noexcept -> decltype(functor(default_arch {}, std::forward<Tys>(args)...))
            {
                return walk_archs(ArchList {}, std::forward<Tys>(args)...);
            }
        };
    }

    // Generic function dispatch, à la ifunc
    template <class ArchList = supported_architectures, class F>
    inline detail::dispatcher<F, ArchList> dispatch(F&& f) noexcept
    {
        return { std::forward<F>(f) };
    }

} // namespace xsimd

#endif
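For context (not part of the diff): the dispatcher above walks the sorted arch_list and calls the functor with the first architecture tag whose version does not exceed the best one detected at runtime. A usage sketch under that API (the functor and all names are illustrative):

    #include <xsimd/xsimd.hpp>
    #include <cstddef>

    struct sum_functor
    {
        // Called as functor(Arch{}, args...) by the dispatcher above.
        template <class Arch>
        float operator()(Arch, const float* data, std::size_t n) const
        {
            using batch = xsimd::batch<float, Arch>;
            batch acc(0.0f);
            std::size_t vec_end = n - n % batch::size;
            for (std::size_t i = 0; i < vec_end; i += batch::size)
                acc += batch::load_unaligned(data + i);
            float total = xsimd::reduce_add(acc);
            for (std::size_t i = vec_end; i < n; ++i) // scalar tail
                total += data[i];
            return total;
        }
    };

    // auto sum = xsimd::dispatch(sum_functor{});
    // float s = sum(data, n); // runs the best kernel available on this CPU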
@@ -1,341 +0,0 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_CONFIG_HPP
#define XSIMD_CONFIG_HPP

#define XSIMD_VERSION_MAJOR 10
#define XSIMD_VERSION_MINOR 0
#define XSIMD_VERSION_PATCH 0

/**
 * high level free functions
 *
 * @defgroup xsimd_config_macro Instruction Set Detection
 */

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE2 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE2__
#define XSIMD_WITH_SSE2 1
#else
#define XSIMD_WITH_SSE2 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE3 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE3__
#define XSIMD_WITH_SSE3 1
#else
#define XSIMD_WITH_SSE3 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSSE3 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSSE3__
#define XSIMD_WITH_SSSE3 1
#else
#define XSIMD_WITH_SSSE3 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE4.1 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE4_1__
#define XSIMD_WITH_SSE4_1 1
#else
#define XSIMD_WITH_SSE4_1 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SSE4.2 is available at compile-time, to 0 otherwise.
 */
#ifdef __SSE4_2__
#define XSIMD_WITH_SSE4_2 1
#else
#define XSIMD_WITH_SSE4_2 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX__
#define XSIMD_WITH_AVX 1
#else
#define XSIMD_WITH_AVX 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX2 is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX2__
#define XSIMD_WITH_AVX2 1
#else
#define XSIMD_WITH_AVX2 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if FMA3 for SSE is available at compile-time, to 0 otherwise.
 */
#ifdef __FMA__

#if defined(__SSE__)
#ifndef XSIMD_WITH_FMA3_SSE // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_SSE 1
#endif
#else

#if XSIMD_WITH_FMA3_SSE
#error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_SSE 0
#endif

#else

#if XSIMD_WITH_FMA3_SSE
#error "Manually set XSIMD_WITH_FMA3_SSE is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_SSE 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if FMA3 for AVX is available at compile-time, to 0 otherwise.
 */
#ifdef __FMA__

#if defined(__AVX__)
#ifndef XSIMD_WITH_FMA3_AVX // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_AVX 1
#endif
#else

#if XSIMD_WITH_FMA3_AVX
#error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_AVX 0
#endif

#if defined(__AVX2__)
#ifndef XSIMD_WITH_FMA3_AVX2 // Leave the opportunity to manually disable it, see #643
#define XSIMD_WITH_FMA3_AVX2 1
#endif
#else

#if XSIMD_WITH_FMA3_AVX2
#error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_AVX2 0
#endif

#else

#if XSIMD_WITH_FMA3_AVX
#error "Manually set XSIMD_WITH_FMA3_AVX is incompatible with current compiler flags"
#endif

#if XSIMD_WITH_FMA3_AVX2
#error "Manually set XSIMD_WITH_FMA3_AVX2 is incompatible with current compiler flags"
#endif

#define XSIMD_WITH_FMA3_AVX 0
#define XSIMD_WITH_FMA3_AVX2 0

#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if FMA4 is available at compile-time, to 0 otherwise.
 */
#ifdef __FMA4__
#define XSIMD_WITH_FMA4 1
#else
#define XSIMD_WITH_FMA4 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512F is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512F__
// AVX512 instructions are supported starting with gcc 6
// see https://www.gnu.org/software/gcc/gcc-6/changes.html
// check clang first, newer clang always defines __GNUC__ = 4
#if defined(__clang__) && __clang_major__ >= 6
#define XSIMD_WITH_AVX512F 1
#elif defined(__GNUC__) && __GNUC__ < 6
#define XSIMD_WITH_AVX512F 0
#else
#define XSIMD_WITH_AVX512F 1
#if __GNUC__ == 6
#define XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY 1
#endif
#endif
#else
#define XSIMD_WITH_AVX512F 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512CD is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512CD__
// Avoids repeating the GCC workaround over and over
#define XSIMD_WITH_AVX512CD XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512CD 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512DQ is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512DQ__
#define XSIMD_WITH_AVX512DQ XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512DQ 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if AVX512BW is available at compile-time, to 0 otherwise.
 */
#ifdef __AVX512BW__
#define XSIMD_WITH_AVX512BW XSIMD_WITH_AVX512F
#else
#define XSIMD_WITH_AVX512BW 0
#endif

#ifdef __ARM_NEON

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if NEON is available at compile-time, to 0 otherwise.
 */
#if __ARM_ARCH >= 7
#define XSIMD_WITH_NEON 1
#else
#define XSIMD_WITH_NEON 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if NEON64 is available at compile-time, to 0 otherwise.
 */
#ifdef __aarch64__
#define XSIMD_WITH_NEON64 1
#else
#define XSIMD_WITH_NEON64 0
#endif
#else
#define XSIMD_WITH_NEON 0
#define XSIMD_WITH_NEON64 0
#endif

/**
 * @ingroup xsimd_config_macro
 *
 * Set to 1 if SVE is available and bit width is pre-set at compile-time, to 0 otherwise.
 */
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
#define XSIMD_WITH_SVE 1
#define XSIMD_SVE_BITS __ARM_FEATURE_SVE_BITS
#else
#define XSIMD_WITH_SVE 0
#define XSIMD_SVE_BITS 0
#endif

// Workaround for MSVC compiler
#ifdef _MSC_VER

#if XSIMD_WITH_AVX512
#undef XSIMD_WITH_AVX2
#define XSIMD_WITH_AVX2 1
#endif

#if XSIMD_WITH_AVX2
#undef XSIMD_WITH_AVX
#define XSIMD_WITH_AVX 1
#undef XSIMD_WITH_FMA3_AVX
#define XSIMD_WITH_FMA3_AVX 1
#undef XSIMD_WITH_FMA3_AVX2
#define XSIMD_WITH_FMA3_AVX2 1
#endif

#if XSIMD_WITH_AVX
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif

#if !defined(__clang__) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))
#undef XSIMD_WITH_SSE4_2
#define XSIMD_WITH_SSE4_2 1
#endif

#if XSIMD_WITH_SSE4_2
#undef XSIMD_WITH_SSE4_1
#define XSIMD_WITH_SSE4_1 1
#endif

#if XSIMD_WITH_SSE4_1
#undef XSIMD_WITH_SSSE3
#define XSIMD_WITH_SSSE3 1
#endif

#if XSIMD_WITH_SSSE3
#undef XSIMD_WITH_SSE3
#define XSIMD_WITH_SSE3 1
#endif

#if XSIMD_WITH_SSE3 || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#undef XSIMD_WITH_SSE2
#define XSIMD_WITH_SSE2 1
#endif

#endif

#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE
#define XSIMD_NO_SUPPORTED_ARCHITECTURE
#endif

#endif
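For context (not part of the diff): the XSIMD_WITH_* macros above let client code branch at compile time on what the toolchain actually enabled. A minimal sketch (the alias name preferred_arch is illustrative):

    #include <xsimd/xsimd.hpp>

    #if XSIMD_WITH_AVX2
    using preferred_arch = xsimd::avx2;
    #elif XSIMD_WITH_SSE2
    using preferred_arch = xsimd::sse2;
    #elif XSIMD_WITH_NEON64
    using preferred_arch = xsimd::neon64;
    #else
    // XSIMD_NO_SUPPORTED_ARCHITECTURE is defined in this case; fall back to scalar code.
    #endif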
@@ -1,180 +0,0 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_CPUID_HPP
#define XSIMD_CPUID_HPP

#include <algorithm>
#include <cstring>

#if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM))
#include <asm/hwcap.h>
#include <sys/auxv.h>
#endif

#if defined(_MSC_VER)
// Contains the definition of __cpuidex
#include <intrin.h>
#endif

#include "../types/xsimd_all_registers.hpp"

namespace xsimd
{
    namespace detail
    {
        struct supported_arch
        {
            unsigned sse2 : 1;
            unsigned sse3 : 1;
            unsigned ssse3 : 1;
            unsigned sse4_1 : 1;
            unsigned sse4_2 : 1;
            unsigned sse4a : 1;
            unsigned fma3_sse : 1;
            unsigned fma4 : 1;
            unsigned xop : 1;
            unsigned avx : 1;
            unsigned fma3_avx : 1;
            unsigned avx2 : 1;
            unsigned fma3_avx2 : 1;
            unsigned avx512f : 1;
            unsigned avx512cd : 1;
            unsigned avx512dq : 1;
            unsigned avx512bw : 1;
            unsigned neon : 1;
            unsigned neon64 : 1;

            // version number of the best arch available
            unsigned best;

            supported_arch() noexcept
            {
                memset(this, 0, sizeof(supported_arch));

#if defined(__aarch64__) || defined(_M_ARM64)
                neon = 1;
                neon64 = 1;
                best = neon64::version();
#elif defined(__ARM_NEON) || defined(_M_ARM)
#if defined(__linux__)
                neon = bool(getauxval(AT_HWCAP) & HWCAP_NEON);
#else
                // that's very conservative :-/
                neon = 0;
#endif
                neon64 = 0;
                best = neon::version() * neon;

#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)
                auto get_cpuid = [](int reg[4], int func_id) noexcept
                {

#if defined(_MSC_VER)
                    __cpuidex(reg, func_id, 0);

#elif defined(__INTEL_COMPILER)
                    __cpuid(reg, func_id);

#elif defined(__GNUC__) || defined(__clang__)

#if defined(__i386__) && defined(__PIC__)
                    // %ebx may be the PIC register
                    __asm__("xchg{l}\t{%%}ebx, %1\n\t"
                            "cpuid\n\t"
                            "xchg{l}\t{%%}ebx, %1\n\t"
                            : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]),
                              "=d"(reg[3])
                            : "a"(func_id), "c"(0));

#else
                    __asm__("cpuid\n\t"
                            : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]),
                              "=d"(reg[3])
                            : "a"(func_id), "c"(0));
#endif

#else
#error "Unsupported configuration"
#endif
                };

                int regs[4];

                get_cpuid(regs, 0x1);

                sse2 = regs[3] >> 26 & 1;
                best = std::max(best, sse2::version() * sse2);

                sse3 = regs[2] >> 0 & 1;
                best = std::max(best, sse3::version() * sse3);

                ssse3 = regs[2] >> 9 & 1;
                best = std::max(best, ssse3::version() * ssse3);

                sse4_1 = regs[2] >> 19 & 1;
                best = std::max(best, sse4_1::version() * sse4_1);

                sse4_2 = regs[2] >> 20 & 1;
                best = std::max(best, sse4_2::version() * sse4_2);

                fma3_sse = regs[2] >> 12 & 1;
                if (sse4_2)
                    best = std::max(best, fma3<xsimd::sse4_2>::version() * fma3_sse);

                get_cpuid(regs, 0x80000001);
                fma4 = regs[2] >> 16 & 1;
                best = std::max(best, fma4::version() * fma4);

                // sse4a = regs[2] >> 6 & 1;
                // best = std::max(best, XSIMD_X86_AMD_SSE4A_VERSION * sse4a);

                // xop = regs[2] >> 11 & 1;
                // best = std::max(best, XSIMD_X86_AMD_XOP_VERSION * xop);

                avx = regs[2] >> 28 & 1;
                best = std::max(best, avx::version() * avx);

                fma3_avx = avx && fma3_sse;
                best = std::max(best, fma3<xsimd::avx>::version() * fma3_avx);

                get_cpuid(regs, 0x7);
                avx2 = regs[1] >> 5 & 1;
                best = std::max(best, avx2::version() * avx2);

                fma3_avx2 = avx2 && fma3_sse;
                best = std::max(best, fma3<xsimd::avx2>::version() * fma3_avx2);

                avx512f = regs[1] >> 16 & 1;
                best = std::max(best, avx512f::version() * avx512f);

                avx512cd = regs[1] >> 28 & 1;
                best = std::max(best, avx512cd::version() * avx512cd * avx512f);

                avx512dq = regs[1] >> 17 & 1;
                best = std::max(best, avx512dq::version() * avx512dq * avx512cd * avx512f);

                avx512bw = regs[1] >> 30 & 1;
                best = std::max(best, avx512bw::version() * avx512bw * avx512dq * avx512cd * avx512f);

#endif
            }
        };
    }

    inline detail::supported_arch available_architectures() noexcept
    {
        static detail::supported_arch supported;
        return supported;
    }
}

#endif
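For context (not part of the diff): available_architectures() above runs CPUID (or reads Linux hwcaps on ARM) once and caches the result in a function-local static. A sketch that queries it at runtime (the function name is illustrative; the casts are there because the fields are bit-fields):

    #include <xsimd/xsimd.hpp>
    #include <cstdio>

    void print_simd_support()
    {
        auto arch = xsimd::available_architectures();
        std::printf("sse2=%u avx=%u avx2=%u avx512f=%u neon=%u best-version=%u\n",
                    (unsigned)arch.sse2, (unsigned)arch.avx, (unsigned)arch.avx2,
                    (unsigned)arch.avx512f, (unsigned)arch.neon, arch.best);
    }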
@ -1,719 +0,0 @@
|
|||
/***************************************************************************
|
||||
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
|
||||
* Martin Renou *
|
||||
* Copyright (c) QuantStack *
|
||||
* Copyright (c) Serge Guelton *
|
||||
* *
|
||||
* Distributed under the terms of the BSD 3-Clause License. *
|
||||
* *
|
||||
* The full license is in the file LICENSE, distributed with this software. *
|
||||
****************************************************************************/
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
namespace xsimd
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
|
||||
/* origin: boost/simd/arch/common/scalar/function/rem_pio2.hpp */
|
||||
/*
|
||||
* ====================================================
|
||||
* copyright 2016 NumScale SAS
|
||||
*
|
||||
* Distributed under the Boost Software License, Version 1.0.
|
||||
* (See copy at http://boost.org/LICENSE_1_0.txt)
|
||||
* ====================================================
|
||||
*/
|
||||
#if defined(_MSC_VER)
|
||||
#define ONCE0 \
|
||||
__pragma(warning(push)) \
|
||||
__pragma(warning(disable : 4127)) while (0) \
|
||||
__pragma(warning(pop)) /**/
|
||||
#else
|
||||
#define ONCE0 while (0)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* ====================================================
|
||||
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
|
||||
*
|
||||
* Developed at SunPro, a Sun Microsystems, Inc. business.
|
||||
* Permission to use, copy, modify, and distribute this
|
||||
* software is freely granted, provided that this notice
|
||||
* is preserved.
|
||||
* ====================================================
|
||||
*/
|
||||
|
||||
#if defined(__GNUC__) && defined(__BYTE_ORDER__)
|
||||
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
||||
#define XSIMD_LITTLE_ENDIAN
|
||||
#endif
|
||||
#elif defined(_WIN32)
|
||||
// We can safely assume that Windows is always little endian
|
||||
#define XSIMD_LITTLE_ENDIAN
|
||||
#elif defined(i386) || defined(i486) || defined(intel) || defined(x86) || defined(i86pc) || defined(__alpha) || defined(__osf__)
|
||||
#define XSIMD_LITTLE_ENDIAN
|
||||
#endif
|
||||
|
||||
#ifdef XSIMD_LITTLE_ENDIAN
|
||||
#define LOW_WORD_IDX 0
|
||||
#define HIGH_WORD_IDX sizeof(std::uint32_t)
|
||||
#else
|
||||
#define LOW_WORD_IDX sizeof(std::uint32_t)
|
||||
#define HIGH_WORD_IDX 0
|
||||
#endif
|
||||
|
||||
#define GET_HIGH_WORD(i, d) \
|
||||
do \
|
||||
{ \
|
||||
double f = (d); \
|
||||
std::memcpy(&(i), reinterpret_cast<char*>(&f) + HIGH_WORD_IDX, \
|
||||
sizeof(std::uint32_t)); \
|
||||
} \
|
||||
ONCE0 \
|
||||
/**/
|
||||
|
||||
#define GET_LOW_WORD(i, d) \
|
||||
do \
|
||||
{ \
|
||||
double f = (d); \
|
||||
std::memcpy(&(i), reinterpret_cast<char*>(&f) + LOW_WORD_IDX, \
|
||||
sizeof(std::uint32_t)); \
|
||||
} \
|
||||
ONCE0 \
|
||||
/**/
|
||||
|
||||
#define SET_HIGH_WORD(d, v) \
|
||||
do \
|
||||
{ \
|
||||
double f = (d); \
|
||||
std::uint32_t value = (v); \
|
||||
std::memcpy(reinterpret_cast<char*>(&f) + HIGH_WORD_IDX, \
|
||||
&value, sizeof(std::uint32_t)); \
|
||||
(d) = f; \
|
||||
} \
|
||||
ONCE0 \
|
||||
/**/
|
||||
|
||||
#define SET_LOW_WORD(d, v) \
|
||||
do \
|
||||
{ \
|
||||
double f = (d); \
|
||||
std::uint32_t value = (v); \
|
||||
std::memcpy(reinterpret_cast<char*>(&f) + LOW_WORD_IDX, \
|
||||
&value, sizeof(std::uint32_t)); \
|
||||
(d) = f; \
|
||||
} \
|
||||
ONCE0 \
|
||||
/**/
|
||||
|
||||
/*
|
||||
* __kernel_rem_pio2(x,y,e0,nx,prec,ipio2)
|
||||
* double x[],y[]; int e0,nx,prec; int ipio2[];
|
||||
*
|
||||
* __kernel_rem_pio2 return the last three digits of N with
|
||||
* y = x - N*pi/2
|
||||
* so that |y| < pi/2.
|
||||
*
|
||||
* The method is to compute the integer (mod 8) and fraction parts of
|
||||
* (2/pi)*x without doing the full multiplication. In general we
|
||||
* skip the part of the product that are known to be a huge integer (
|
||||
* more accurately, = 0 mod 8 ). Thus the number of operations are
|
||||
* independent of the exponent of the input.
|
||||
*
|
||||
* (2/pi) is represented by an array of 24-bit integers in ipio2[].
|
||||
*
|
||||
* Input parameters:
|
||||
* x[] The input value (must be positive) is broken into nx
|
||||
* pieces of 24-bit integers in double precision format.
|
||||
* x[i] will be the i-th 24 bit of x. The scaled exponent
|
||||
* of x[0] is given in input parameter e0 (i.e., x[0]*2^e0
|
||||
* match x's up to 24 bits.
|
||||
*
|
||||
* Example of breaking a double positive z into x[0]+x[1]+x[2]:
|
||||
* e0 = ilogb(z)-23
|
||||
* z = scalbn(z,-e0)
|
||||
* for i = 0,1,2
|
||||
* x[i] = floor(z)
|
||||
* z = (z-x[i])*2**24
|
||||
*
|
||||
*
|
||||
* y[] ouput result in an array of double precision numbers.
|
||||
* The dimension of y[] is:
|
||||
* 24-bit precision 1
|
||||
* 53-bit precision 2
|
||||
* 64-bit precision 2
|
||||
* 113-bit precision 3
|
||||
* The actual value is the sum of them. Thus for 113-bit
|
||||
* precison, one may have to do something like:
|
||||
*
|
||||
* long double t,w,r_head, r_tail;
|
||||
* t = (long double)y[2] + (long double)y[1];
|
||||
* w = (long double)y[0];
|
||||
* r_head = t+w;
|
||||
* r_tail = w - (r_head - t);
|
||||
*
|
||||
* e0 The exponent of x[0]
|
||||
*
|
||||
* nx dimension of x[]
|
||||
*
|
||||
* prec an integer indicating the precision:
|
||||
* 0 24 bits (single)
|
||||
* 1 53 bits (double)
|
||||
* 2 64 bits (extended)
|
||||
* 3 113 bits (quad)
|
||||
*
|
||||
* ipio2[]
|
||||
* integer array, contains the (24*i)-th to (24*i+23)-th
|
||||
* bit of 2/pi after binary point. The corresponding
|
||||
* floating value is
|
||||
*
|
||||
* ipio2[i] * 2^(-24(i+1)).
|
||||
*
|
||||
* External function:
|
||||
* double scalbn(), floor();
|
||||
*
|
||||
*
|
||||
* Here is the description of some local variables:
|
||||
*
|
||||
* jk jk+1 is the initial number of terms of ipio2[] needed
|
||||
* in the computation. The recommended value is 2,3,4,
|
||||
* 6 for single, double, extended,and quad.
|
||||
*
|
||||
* jz local integer variable indicating the number of
|
||||
* terms of ipio2[] used.
|
||||
*
|
||||
* jx nx - 1
|
||||
*
|
||||
* jv index for pointing to the suitable ipio2[] for the
|
||||
* computation. In general, we want
|
||||
* ( 2^e0*x[0] * ipio2[jv-1]*2^(-24jv) )/8
|
||||
* is an integer. Thus
|
||||
* e0-3-24*jv >= 0 or (e0-3)/24 >= jv
|
||||
* Hence jv = max(0,(e0-3)/24).
|
||||
*
|
||||
* jp jp+1 is the number of terms in PIo2[] needed, jp = jk.
|
||||
*
|
||||
* q[] double array with integral value, representing the
|
||||
* 24-bits chunk of the product of x and 2/pi.
|
||||
*
|
||||
* q0 the corresponding exponent of q[0]. Note that the
|
||||
* exponent for q[i] would be q0-24*i.
|
||||
*
|
||||
* PIo2[] double precision array, obtained by cutting pi/2
|
||||
* into 24 bits chunks.
|
||||
*
|
||||
* f[] ipio2[] in floating point
|
||||
*
|
||||
* iq[] integer array by breaking up q[] in 24-bits chunk.
|
||||
*
|
||||
* fq[] final product of x*(2/pi) in fq[0],..,fq[jk]
|
||||
*
|
||||
* ih integer. If >0 it indicates q[] is >= 0.5, hence
|
||||
* it also indicates the *sign* of the result.
|
||||
*
|
||||
*/
|
||||
|
||||
inline int32_t __kernel_rem_pio2(double* x, double* y, int32_t e0, int32_t nx, int32_t prec, const int32_t* ipio2) noexcept
|
||||
{
|
||||
static const int32_t init_jk[] = { 2, 3, 4, 6 }; /* initial value for jk */
|
||||
|
||||
static const double PIo2[] = {
|
||||
1.57079625129699707031e+00, /* 0x3FF921FB, 0x40000000 */
|
||||
7.54978941586159635335e-08, /* 0x3E74442D, 0x00000000 */
|
||||
5.39030252995776476554e-15, /* 0x3CF84698, 0x80000000 */
|
||||
3.28200341580791294123e-22, /* 0x3B78CC51, 0x60000000 */
|
||||
1.27065575308067607349e-29, /* 0x39F01B83, 0x80000000 */
|
||||
1.22933308981111328932e-36, /* 0x387A2520, 0x40000000 */
|
||||
2.73370053816464559624e-44, /* 0x36E38222, 0x80000000 */
|
||||
2.16741683877804819444e-51, /* 0x3569F31D, 0x00000000 */
|
||||
};
|
||||
|
||||
static const double
|
||||
zero
|
||||
= 0.0,
|
||||
one = 1.0,
|
||||
two24 = 1.67772160000000000000e+07, /* 0x41700000, 0x00000000 */
|
||||
twon24 = 5.96046447753906250000e-08; /* 0x3E700000, 0x00000000 */
|
||||
|
||||
    int32_t jz, jx, jv, jp, jk, carry, n, iq[20], i, j, k, m, q0, ih;
    double z, fw, f[20], fq[20], q[20];

    /* initialize jk */
    jk = init_jk[prec];
    jp = jk;

    /* determine jx,jv,q0, note that 3>q0 */
    jx = nx - 1;
    jv = (e0 - 3) / 24;
    if (jv < 0)
        jv = 0;
    q0 = e0 - 24 * (jv + 1);

    /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
    j = jv - jx;
    m = jx + jk;
    for (i = 0; i <= m; i++, j++)
        f[i] = (j < 0) ? zero : (double)ipio2[j];

    /* compute q[0],q[1],...q[jk] */
    for (i = 0; i <= jk; i++)
    {
        for (j = 0, fw = 0.0; j <= jx; j++)
            fw += x[j] * f[jx + i - j];
        q[i] = fw;
    }

    jz = jk;

recompute:
    /* distill q[] into iq[] reversingly */
    for (i = 0, j = jz, z = q[jz]; j > 0; i++, j--)
    {
        fw = (double)((int32_t)(twon24 * z));
        iq[i] = (int32_t)(z - two24 * fw);
        z = q[j - 1] + fw;
    }

    /* compute n */
    z = std::scalbn(z, q0);           /* actual value of z */
    z -= 8.0 * std::floor(z * 0.125); /* trim off integer >= 8 */
    n = (int32_t)z;
    z -= (double)n;
    ih = 0;
    if (q0 > 0)
    { /* need iq[jz-1] to determine n */
        i = (iq[jz - 1] >> (24 - q0));
        n += i;
        iq[jz - 1] -= i << (24 - q0);
        ih = iq[jz - 1] >> (23 - q0);
    }
    else if (q0 == 0)
        ih = iq[jz - 1] >> 23;
    else if (z >= 0.5)
        ih = 2;

    if (ih > 0)
    { /* q > 0.5 */
        n += 1;
        carry = 0;
        for (i = 0; i < jz; i++)
        { /* compute 1-q */
            j = iq[i];
            if (carry == 0)
            {
                if (j != 0)
                {
                    carry = 1;
                    iq[i] = 0x1000000 - j;
                }
            }
            else
                iq[i] = 0xffffff - j;
        }
        if (q0 > 0)
        { /* rare case: chance is 1 in 12 */
            switch (q0)
            {
            case 1:
                iq[jz - 1] &= 0x7fffff;
                break;
            case 2:
                iq[jz - 1] &= 0x3fffff;
                break;
            }
        }
        if (ih == 2)
        {
            z = one - z;
            if (carry != 0)
                z -= std::scalbn(one, q0);
        }
    }

    /* check if recomputation is needed */
    if (z == zero)
    {
        j = 0;
        for (i = jz - 1; i >= jk; i--)
            j |= iq[i];
        if (j == 0)
        { /* need recomputation */
            for (k = 1; iq[jk - k] == 0; k++)
                ; /* k = number of terms needed */

            for (i = jz + 1; i <= jz + k; i++)
            { /* add q[jz+1] to q[jz+k] */
                f[jx + i] = (double)ipio2[jv + i];
                for (j = 0, fw = 0.0; j <= jx; j++)
                    fw += x[j] * f[jx + i - j];
                q[i] = fw;
            }
            jz += k;
            goto recompute;
        }
    }

    /* chop off zero terms */
    if (z == 0.0)
    {
        jz -= 1;
        q0 -= 24;
        while (iq[jz] == 0)
        {
            jz--;
            q0 -= 24;
        }
    }
    else
    { /* break z into 24-bit chunks if necessary */
        z = std::scalbn(z, -q0);
        if (z >= two24)
        {
            fw = (double)((int32_t)(twon24 * z));
            iq[jz] = (int32_t)(z - two24 * fw);
            jz += 1;
            q0 += 24;
            iq[jz] = (int32_t)fw;
        }
        else
            iq[jz] = (int32_t)z;
    }

    /* convert integer "bit" chunks to floating-point values */
    fw = std::scalbn(one, q0);
    for (i = jz; i >= 0; i--)
    {
        q[i] = fw * (double)iq[i];
        fw *= twon24;
    }

    /* compute PIo2[0,...,jp]*q[jz,...,0] */
    for (i = jz; i >= 0; i--)
    {
        for (fw = 0.0, k = 0; k <= jp && k <= jz - i; k++)
            fw += PIo2[k] * q[i + k];
        fq[jz - i] = fw;
    }

    /* compress fq[] into y[] */
    switch (prec)
    {
    case 0:
        fw = 0.0;
        for (i = jz; i >= 0; i--)
            fw += fq[i];
        y[0] = (ih == 0) ? fw : -fw;
        break;
    case 1:
    case 2:
        fw = 0.0;
        for (i = jz; i >= 0; i--)
            fw += fq[i];
        y[0] = (ih == 0) ? fw : -fw;
        fw = fq[0] - fw;
        for (i = 1; i <= jz; i++)
            fw += fq[i];
        y[1] = (ih == 0) ? fw : -fw;
        break;
    case 3: /* painful */
        for (i = jz; i > 0; i--)
        {
            fw = fq[i - 1] + fq[i];
            fq[i] += fq[i - 1] - fw;
            fq[i - 1] = fw;
        }
        for (i = jz; i > 1; i--)
        {
            fw = fq[i - 1] + fq[i];
            fq[i] += fq[i - 1] - fw;
            fq[i - 1] = fw;
        }
        for (fw = 0.0, i = jz; i >= 2; i--)
            fw += fq[i];
        if (ih == 0)
        {
            y[0] = fq[0];
            y[1] = fq[1];
            y[2] = fw;
        }
        else
        {
            y[0] = -fq[0];
            y[1] = -fq[1];
            y[2] = -fw;
        }
    }
    return n & 7;
}

inline std::int32_t __ieee754_rem_pio2(double x, double* y) noexcept
{
    static const std::int32_t two_over_pi[] = {
        0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62,
        0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A,
        0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129,
        0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41,
        0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8,
        0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF,
        0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5,
        0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08,
        0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3,
        0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880,
        0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B,
    };

    static const std::int32_t npio2_hw[] = {
        0x3FF921FB, 0x400921FB, 0x4012D97C, 0x401921FB,
        0x401F6A7A, 0x4022D97C, 0x4025FDBB, 0x402921FB,
        0x402C463A, 0x402F6A7A, 0x4031475C, 0x4032D97C,
        0x40346B9C, 0x4035FDBB, 0x40378FDB, 0x403921FB,
        0x403AB41B, 0x403C463A, 0x403DD85A, 0x403F6A7A,
        0x40407E4C, 0x4041475C, 0x4042106C, 0x4042D97C,
        0x4043A28C, 0x40446B9C, 0x404534AC, 0x4045FDBB,
        0x4046C6CB, 0x40478FDB, 0x404858EB, 0x404921FB,
    };

    /*
     * invpio2: 53 bits of 2/pi
     * pio2_1:  first 33 bits of pi/2
     * pio2_1t: pi/2 - pio2_1
     * pio2_2:  second 33 bits of pi/2
     * pio2_2t: pi/2 - (pio2_1+pio2_2)
     * pio2_3:  third 33 bits of pi/2
     * pio2_3t: pi/2 - (pio2_1+pio2_2+pio2_3)
     */

    static const double
        zero = 0.00000000000000000000e+00,    /* 0x00000000, 0x00000000 */
        half = 5.00000000000000000000e-01,    /* 0x3FE00000, 0x00000000 */
        two24 = 1.67772160000000000000e+07,   /* 0x41700000, 0x00000000 */
        invpio2 = 6.36619772367581382433e-01, /* 0x3FE45F30, 0x6DC9C883 */
        pio2_1 = 1.57079632673412561417e+00,  /* 0x3FF921FB, 0x54400000 */
        pio2_1t = 6.07710050650619224932e-11, /* 0x3DD0B461, 0x1A626331 */
        pio2_2 = 6.07710050630396597660e-11,  /* 0x3DD0B461, 0x1A600000 */
        pio2_2t = 2.02226624879595063154e-21, /* 0x3BA3198A, 0x2E037073 */
        pio2_3 = 2.02226624871116645580e-21,  /* 0x3BA3198A, 0x2E000000 */
        pio2_3t = 8.47842766036889956997e-32; /* 0x397B839A, 0x252049C1 */

    double z = 0., w, t, r, fn;
    double tx[3];
    std::int32_t e0, i, j, nx, n, ix, hx;
    std::uint32_t low;

    GET_HIGH_WORD(hx, x); /* high word of x */
    ix = hx & 0x7fffffff;
    if (ix <= 0x3fe921fb) /* |x| ~<= pi/4, no need for reduction */
    {
        y[0] = x;
        y[1] = 0;
        return 0;
    }
    if (ix < 0x4002d97c)
    { /* |x| < 3pi/4, special case with n=+-1 */
        if (hx > 0)
        {
            z = x - pio2_1;
            if (ix != 0x3ff921fb)
            { /* 33+53 bit pi is good enough */
                y[0] = z - pio2_1t;
                y[1] = (z - y[0]) - pio2_1t;
            }
            else
            { /* near pi/2, use 33+33+53 bit pi */
                z -= pio2_2;
                y[0] = z - pio2_2t;
                y[1] = (z - y[0]) - pio2_2t;
            }
            return 1;
        }
        else
        { /* negative x */
            z = x + pio2_1;
            if (ix != 0x3ff921fb)
            { /* 33+53 bit pi is good enough */
                y[0] = z + pio2_1t;
                y[1] = (z - y[0]) + pio2_1t;
            }
            else
            { /* near pi/2, use 33+33+53 bit pi */
                z += pio2_2;
                y[0] = z + pio2_2t;
                y[1] = (z - y[0]) + pio2_2t;
            }

            return -1;
        }
    }
    if (ix <= 0x413921fb)
    { /* |x| ~<= 2^19*(pi/2), medium size */
        t = std::fabs(x);
        n = (std::int32_t)(t * invpio2 + half);
        fn = (double)n;
        r = t - fn * pio2_1;
        w = fn * pio2_1t; /* 1st round good to 85 bits */
        if ((n < 32) && (n > 0) && (ix != npio2_hw[n - 1]))
        {
            y[0] = r - w; /* quick check: no cancellation */
        }
        else
        {
            std::uint32_t high;
            j = ix >> 20;
            y[0] = r - w;
            GET_HIGH_WORD(high, y[0]);
            i = j - static_cast<std::int32_t>((high >> 20) & 0x7ff);
            if (i > 16)
            { /* 2nd iteration needed, good to 118 bits */
                t = r;
                w = fn * pio2_2;
                r = t - w;
                w = fn * pio2_2t - ((t - r) - w);
                y[0] = r - w;
                GET_HIGH_WORD(high, y[0]);
                i = j - static_cast<std::int32_t>((high >> 20) & 0x7ff);
                if (i > 49)
                { /* 3rd iteration needed, 151 bits of accuracy */
                    t = r; /* will cover all possible cases */
                    w = fn * pio2_3;
                    r = t - w;
                    w = fn * pio2_3t - ((t - r) - w);
                    y[0] = r - w;
                }
            }
        }
        y[1] = (r - y[0]) - w;
        if (hx < 0)
        {
            y[0] = -y[0];
            y[1] = -y[1];
            return -n;
        }
        else
            return n;
    }
    /*
     * all other (large) arguments
     */
    if (ix >= 0x7ff00000)
    { /* x is inf or NaN */
        y[0] = y[1] = x - x;
        return 0;
    }
    /* set z = scalbn(|x|, ilogb(x) - 23) */
    GET_LOW_WORD(low, x);
    SET_LOW_WORD(z, low);
    e0 = (ix >> 20) - 1046; /* e0 = ilogb(z) - 23 */
    SET_HIGH_WORD(z, static_cast<uint32_t>(ix - (e0 << 20)));
    for (i = 0; i < 2; i++)
    {
        tx[i] = (double)((std::int32_t)(z));
        z = (z - tx[i]) * two24;
    }
    tx[2] = z;
    nx = 3;
    while (tx[nx - 1] == zero)
        nx--; /* skip zero terms */
    n = __kernel_rem_pio2(tx, y, e0, nx, 2, two_over_pi);
    if (hx < 0)
    {
        y[0] = -y[0];
        y[1] = -y[1];
        return -n;
    }
    return n;
}
}

#undef XSIMD_LITTLE_ENDIAN
#undef SET_LOW_WORD
#undef SET_HIGH_WORD
#undef GET_LOW_WORD
#undef GET_HIGH_WORD
#undef HIGH_WORD_IDX
#undef LOW_WORD_IDX
#undef ONCE0
}
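
For reference, a minimal sketch of how a trig kernel typically consumes this reduction (the caller and the kernel_sin/kernel_cos helpers are hypothetical, not part of this diff): __ieee754_rem_pio2 writes the reduced argument into y[0] + y[1] and returns n, whose low two bits select the quadrant.

    // Hypothetical caller; assumes the usual fdlibm convention
    // x = n * (pi/2) + (y[0] + y[1]) with the tail in y[1].
    double sin_via_reduction(double x)
    {
        double y[2];
        std::int32_t n = __ieee754_rem_pio2(x, y);
        switch (n & 3) // quadrant of the original argument
        {
        case 0: return kernel_sin(y[0], y[1]);  // hypothetical helper
        case 1: return kernel_cos(y[0], y[1]);  // hypothetical helper
        case 2: return -kernel_sin(y[0], y[1]);
        default: return -kernel_cos(y[0], y[1]);
        }
    }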

@@ -1,349 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_ALIGNED_ALLOCATOR_HPP
#define XSIMD_ALIGNED_ALLOCATOR_HPP

#include <algorithm>
#include <cstddef>
#include <utility>
#ifdef _WIN32
#include <malloc.h>
#else
#include <cstdlib>
#endif

#include <cassert>
#include <memory>

#include "../config/xsimd_arch.hpp"

namespace xsimd
{

    /**
     * @class aligned_allocator
     * @brief Allocator for aligned memory
     *
     * The aligned_allocator class template is an allocator that
     * performs memory allocation aligned by the specified value.
     *
     * @tparam T type of objects to allocate.
     * @tparam Align alignment in bytes.
     */
    template <class T, size_t Align = default_arch::alignment()>
    class aligned_allocator
    {
    public:
        using value_type = T;
        using pointer = T*;
        using const_pointer = const T*;
        using reference = T&;
        using const_reference = const T&;
        using size_type = size_t;
        using difference_type = ptrdiff_t;

        static constexpr size_t alignment = Align;

        template <class U>
        struct rebind
        {
            using other = aligned_allocator<U, Align>;
        };

        aligned_allocator() noexcept;
        aligned_allocator(const aligned_allocator& rhs) noexcept;

        template <class U>
        aligned_allocator(const aligned_allocator<U, Align>& rhs) noexcept;

        ~aligned_allocator();

        pointer address(reference) noexcept;
        const_pointer address(const_reference) const noexcept;

        pointer allocate(size_type n, const void* hint = 0);
        void deallocate(pointer p, size_type n);

        size_type max_size() const noexcept;
        size_type size_max() const noexcept;

        template <class U, class... Args>
        void construct(U* p, Args&&... args);

        template <class U>
        void destroy(U* p);
    };

    template <class T1, size_t Align1, class T2, size_t Align2>
    bool operator==(const aligned_allocator<T1, Align1>& lhs,
                    const aligned_allocator<T2, Align2>& rhs) noexcept;

    template <class T1, size_t Align1, class T2, size_t Align2>
    bool operator!=(const aligned_allocator<T1, Align1>& lhs,
                    const aligned_allocator<T2, Align2>& rhs) noexcept;

    void* aligned_malloc(size_t size, size_t alignment);
    void aligned_free(void* ptr);

    template <class T>
    size_t get_alignment_offset(const T* p, size_t size, size_t block_size);

    /************************************
     * aligned_allocator implementation *
     ************************************/

    /**
     * Default constructor.
     */
    template <class T, size_t A>
    inline aligned_allocator<T, A>::aligned_allocator() noexcept
    {
    }

    /**
     * Copy constructor.
     */
    template <class T, size_t A>
    inline aligned_allocator<T, A>::aligned_allocator(const aligned_allocator&) noexcept
    {
    }

    /**
     * Extended copy constructor.
     */
    template <class T, size_t A>
    template <class U>
    inline aligned_allocator<T, A>::aligned_allocator(const aligned_allocator<U, A>&) noexcept
    {
    }

    /**
     * Destructor.
     */
    template <class T, size_t A>
    inline aligned_allocator<T, A>::~aligned_allocator()
    {
    }

    /**
     * Returns the actual address of \c r even in the presence of an overloaded \c operator&.
     * @param r the object to acquire the address of.
     * @return the actual address of \c r.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::address(reference r) noexcept -> pointer
    {
        return &r;
    }

    /**
     * Returns the actual address of \c r even in the presence of an overloaded \c operator&.
     * @param r the object to acquire the address of.
     * @return the actual address of \c r.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::address(const_reference r) const noexcept -> const_pointer
    {
        return &r;
    }

    /**
     * Allocates <tt>n * sizeof(T)</tt> bytes of uninitialized memory, aligned by \c A.
     * The alignment may require some extra memory allocation.
     * @param n the number of objects to allocate storage for.
     * @param hint unused parameter provided for standard compliance.
     * @return a pointer to the first byte of a memory block suitably aligned and sufficient to
     * hold an array of \c n objects of type \c T.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::allocate(size_type n, const void*) -> pointer
    {
        pointer res = reinterpret_cast<pointer>(aligned_malloc(sizeof(T) * n, A));
#if defined(_CPPUNWIND) || defined(__cpp_exceptions)
        if (res == nullptr)
            throw std::bad_alloc();
#endif
        return res;
    }

    /**
     * Deallocates the storage referenced by the pointer p, which must be a pointer obtained by
     * an earlier call to allocate(). The argument \c n must be equal to the first argument of the
     * call to allocate() that originally produced \c p; otherwise, the behavior is undefined.
     * @param p pointer obtained from allocate().
     * @param n number of objects earlier passed to allocate().
     */
    template <class T, size_t A>
    inline void aligned_allocator<T, A>::deallocate(pointer p, size_type)
    {
        aligned_free(p);
    }

    /**
     * Returns the maximum theoretically possible value of \c n, for which the
     * call allocate(n, 0) could succeed.
     * @return the maximum supported allocation size.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::max_size() const noexcept -> size_type
    {
        return size_type(-1) / sizeof(T);
    }

    /**
     * This method is deprecated; use max_size() instead.
     */
    template <class T, size_t A>
    inline auto
    aligned_allocator<T, A>::size_max() const noexcept -> size_type
    {
        return size_type(-1) / sizeof(T);
    }

    /**
     * Constructs an object of type \c T in the allocated uninitialized memory
     * pointed to by \c p, using placement-new.
     * @param p pointer to allocated uninitialized memory.
     * @param args the constructor arguments to use.
     */
    template <class T, size_t A>
    template <class U, class... Args>
    inline void aligned_allocator<T, A>::construct(U* p, Args&&... args)
    {
        new ((void*)p) U(std::forward<Args>(args)...);
    }

    /**
     * Calls the destructor of the object pointed to by \c p.
     * @param p pointer to the object that is going to be destroyed.
     */
    template <class T, size_t A>
    template <class U>
    inline void aligned_allocator<T, A>::destroy(U* p)
    {
        p->~U();
    }

    /**
     * @defgroup allocator_comparison Comparison operators
     */

    /**
     * @ingroup allocator_comparison
     * Compares two aligned memory allocators for equality. Since allocators
     * are stateless, returns \c true iff <tt>A1 == A2</tt>.
     * @param lhs aligned_allocator to compare.
     * @param rhs aligned_allocator to compare.
     * @return true if the allocators have the same alignment.
     */
    template <class T1, size_t A1, class T2, size_t A2>
    inline bool operator==(const aligned_allocator<T1, A1>& lhs,
                           const aligned_allocator<T2, A2>& rhs) noexcept
    {
        return lhs.alignment == rhs.alignment;
    }

    /**
     * @ingroup allocator_comparison
     * Compares two aligned memory allocators for inequality. Since allocators
     * are stateless, returns \c true iff <tt>A1 != A2</tt>.
     * @param lhs aligned_allocator to compare.
     * @param rhs aligned_allocator to compare.
     * @return true if the allocators have different alignments.
     */
    template <class T1, size_t A1, class T2, size_t A2>
    inline bool operator!=(const aligned_allocator<T1, A1>& lhs,
                           const aligned_allocator<T2, A2>& rhs) noexcept
    {
        return !(lhs == rhs);
    }

    /****************************************
     * aligned malloc / free implementation *
     ****************************************/

    namespace detail
    {
        inline void* xaligned_malloc(size_t size, size_t alignment)
        {
            assert(((alignment & (alignment - 1)) == 0) && "alignment must be a power of two");
            assert((alignment >= sizeof(void*)) && "alignment must be at least the size of a pointer");
            void* res = nullptr;
#ifdef _WIN32
            res = _aligned_malloc(size, alignment);
#else
            if (posix_memalign(&res, alignment, size) != 0)
            {
                res = nullptr;
            }
#endif
            return res;
        }

        inline void xaligned_free(void* ptr)
        {
#ifdef _WIN32
            _aligned_free(ptr);
#else
            free(ptr);
#endif
        }
    }

    inline void* aligned_malloc(size_t size, size_t alignment)
    {
        return detail::xaligned_malloc(size, alignment);
    }

    inline void aligned_free(void* ptr)
    {
        detail::xaligned_free(ptr);
    }

    template <class T>
    inline size_t get_alignment_offset(const T* p, size_t size, size_t block_size)
    {
        // size_t block_size = simd_traits<T>::size;
        if (block_size == 1)
        {
            // The simd_block consists of exactly one scalar, so all
            // elements of the array are "well" aligned.
            return 0;
        }
        else if (size_t(p) & (sizeof(T) - 1))
        {
            // The array is not aligned to the size of a single element,
            // so no element of the array is well aligned.
            return size;
        }
        else
        {
            size_t block_mask = block_size - 1;
            return std::min<size_t>(
                (block_size - ((size_t(p) / sizeof(T)) & block_mask)) & block_mask,
                size);
        }
    }

    template <class T, class A = default_arch>
    using default_allocator = typename std::conditional<A::requires_alignment(),
                                                        aligned_allocator<T, A::alignment()>,
                                                        std::allocator<T>>::type;
}

#endif
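
A minimal usage sketch for the allocator above (the include path and the exact alignment value are assumptions, not part of this diff): pairing it with std::vector makes the buffer start suitably aligned for SIMD loads.

    #include <vector>
    // assumed location of the header inside xsimd
    #include "xsimd/memory/xsimd_aligned_allocator.hpp"

    std::vector<float, xsimd::aligned_allocator<float>> v(1024);
    // v.data() is aligned to xsimd::default_arch::alignment() bytes,
    // so aligned loads/stores over the whole vector are valid.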

@@ -1,76 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_ALIGNMENT_HPP
#define XSIMD_ALIGNMENT_HPP

#include "../types/xsimd_utils.hpp"
#include "xsimd_aligned_allocator.hpp"

namespace xsimd
{
    /**
     * @struct aligned_mode
     * @brief tag for load and store of aligned memory.
     */
    struct aligned_mode
    {
    };

    /**
     * @struct unaligned_mode
     * @brief tag for load and store of unaligned memory.
     */
    struct unaligned_mode
    {
    };

    /***********************
     * Allocator alignment *
     ***********************/

    template <class A>
    struct allocator_alignment
    {
        using type = unaligned_mode;
    };

    template <class T>
    struct allocator_alignment<aligned_allocator<T>>
    {
        using type = aligned_mode;
    };

    template <class A>
    using allocator_alignment_t = typename allocator_alignment<A>::type;

    /***********************
     * Container alignment *
     ***********************/

    template <class C, class = void>
    struct container_alignment
    {
        using type = unaligned_mode;
    };

    template <class C>
    struct container_alignment<C, detail::void_t<typename C::allocator_type>>
    {
        using type = allocator_alignment_t<typename C::allocator_type>;
    };

    template <class C>
    using container_alignment_t = typename container_alignment<C>::type;

}

#endif
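
The two tags feed compile-time dispatch: a container whose allocator_type is the aligned allocator advertises aligned_mode, everything else falls back to unaligned_mode. A small sketch of the resulting mapping (assumed usage, not from this diff):

    #include <type_traits>
    #include <vector>

    using aligned_vec = std::vector<float, xsimd::aligned_allocator<float>>;
    static_assert(std::is_same<xsimd::container_alignment_t<aligned_vec>,
                               xsimd::aligned_mode>::value,
                  "the aligned allocator yields aligned_mode");
    static_assert(std::is_same<xsimd::container_alignment_t<std::vector<float>>,
                               xsimd::unaligned_mode>::value,
                  "plain std::allocator falls back to unaligned_mode");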

@@ -1,32 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#include "xsimd_fma3_sse_register.hpp"
#include "xsimd_fma4_register.hpp"
#include "xsimd_sse2_register.hpp"
#include "xsimd_sse3_register.hpp"
#include "xsimd_sse4_1_register.hpp"
#include "xsimd_sse4_2_register.hpp"

#include "xsimd_avx2_register.hpp"
#include "xsimd_avx_register.hpp"
#include "xsimd_fma3_avx2_register.hpp"
#include "xsimd_fma3_avx_register.hpp"

#include "xsimd_avx512bw_register.hpp"
#include "xsimd_avx512cd_register.hpp"
#include "xsimd_avx512dq_register.hpp"
#include "xsimd_avx512f_register.hpp"

#include "xsimd_neon64_register.hpp"
#include "xsimd_neon_register.hpp"

#include "xsimd_sve_register.hpp"

[The diff for this file is not shown because of its large size.]

@@ -1,40 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX2_REGISTER_HPP
#define XSIMD_AVX2_REGISTER_HPP

#include "./xsimd_avx_register.hpp"

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * AVX2 instructions
     */
    struct avx2 : avx
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX2; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(2, 2, 0); }
        static constexpr char const* name() noexcept { return "avx2"; }
    };

#if XSIMD_WITH_AVX2
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx2, avx);
    }
#endif
}

#endif

@@ -1,48 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX512BW_REGISTER_HPP
#define XSIMD_AVX512BW_REGISTER_HPP

#include "./xsimd_avx512dq_register.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX512BW instructions
     */
    struct avx512bw : avx512dq
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512BW; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(3, 4, 0); }
        static constexpr char const* name() noexcept { return "avx512bw"; }
    };

#if XSIMD_WITH_AVX512BW

    namespace types
    {
        template <class T>
        struct get_bool_simd_register<T, avx512bw>
        {
            using type = simd_avx512_bool_register<T>;
        };

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512bw, avx512dq);

    }
#endif
}

#endif

@@ -1,48 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX512CD_REGISTER_HPP
#define XSIMD_AVX512CD_REGISTER_HPP

#include "./xsimd_avx512f_register.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX512CD instructions
     */
    struct avx512cd : avx512f
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512CD; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(3, 2, 0); }
        static constexpr char const* name() noexcept { return "avx512cd"; }
    };

#if XSIMD_WITH_AVX512CD

    namespace types
    {
        template <class T>
        struct get_bool_simd_register<T, avx512cd>
        {
            using type = simd_avx512_bool_register<T>;
        };

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512cd, avx512f);

    }
#endif
}

#endif

@@ -1,48 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX512DQ_REGISTER_HPP
#define XSIMD_AVX512DQ_REGISTER_HPP

#include "./xsimd_avx512cd_register.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX512DQ instructions
     */
    struct avx512dq : avx512cd
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512DQ; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(3, 3, 0); }
        static constexpr char const* name() noexcept { return "avx512dq"; }
    };

#if XSIMD_WITH_AVX512DQ

    namespace types
    {
        template <class T>
        struct get_bool_simd_register<T, avx512dq>
        {
            using type = simd_avx512_bool_register<T>;
        };

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512dq, avx512cd);

    }
#endif
}

#endif

@@ -1,75 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX512F_REGISTER_HPP
#define XSIMD_AVX512F_REGISTER_HPP

#include "./xsimd_generic_arch.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX512F instructions
     */
    struct avx512f : generic
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512F; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(3, 1, 0); }
        static constexpr std::size_t alignment() noexcept { return 64; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr char const* name() noexcept { return "avx512f"; }
    };

#if XSIMD_WITH_AVX512F

    namespace types
    {
        template <class T>
        struct simd_avx512_bool_register
        {
            using register_type = typename std::conditional<
                (sizeof(T) < 4), std::conditional<(sizeof(T) == 1), __mmask64, __mmask32>,
                std::conditional<(sizeof(T) == 4), __mmask16, __mmask8>>::type::type;
            register_type data;
            simd_avx512_bool_register() = default;
            simd_avx512_bool_register(register_type r) { data = r; }
            operator register_type() const noexcept { return data; }
        };
        template <class T>
        struct get_bool_simd_register<T, avx512f>
        {
            using type = simd_avx512_bool_register<T>;
        };

        XSIMD_DECLARE_SIMD_REGISTER(bool, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(signed char, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(char, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(short, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(long int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, avx512f, __m512i);
        XSIMD_DECLARE_SIMD_REGISTER(float, avx512f, __m512);
        XSIMD_DECLARE_SIMD_REGISTER(double, avx512f, __m512d);

    }
#endif
}

#endif
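
The nested std::conditional above sizes the mask register from the lane width: 64 one-byte lanes need __mmask64, down to 8 eight-byte lanes needing only __mmask8. A sketch of the resulting mapping (assumes an AVX512F target so the mask types exist):

    #if XSIMD_WITH_AVX512F
    #include <type_traits>
    static_assert(std::is_same<
                      xsimd::types::simd_avx512_bool_register<float>::register_type,
                      __mmask16>::value,
                  "16 float lanes in a 512-bit register use a 16-bit mask");
    #endif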

@@ -1,62 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_AVX_REGISTER_HPP
#define XSIMD_AVX_REGISTER_HPP

#include "./xsimd_generic_arch.hpp"

namespace xsimd
{

    /**
     * @ingroup arch
     *
     * AVX instructions
     */
    struct avx : generic
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(2, 1, 0); }
        static constexpr std::size_t alignment() noexcept { return 32; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr char const* name() noexcept { return "avx"; }
    };
}

#if XSIMD_WITH_AVX

#include <immintrin.h>

namespace xsimd
{
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER(bool, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(signed char, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(char, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(short, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(long int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, avx, __m256i);
        XSIMD_DECLARE_SIMD_REGISTER(float, avx, __m256);
        XSIMD_DECLARE_SIMD_REGISTER(double, avx, __m256d);
    }
}
#endif
#endif

[The diff for this file is not shown because of its large size.]
@@ -1,147 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_BATCH_CONSTANT_HPP
#define XSIMD_BATCH_CONSTANT_HPP

#include "./xsimd_batch.hpp"
#include "./xsimd_utils.hpp"

namespace xsimd
{
    /**
     * @brief batch of boolean constants
     *
     * Abstract representation of a batch of boolean constants.
     *
     * @tparam batch_type the type of the associated batch values.
     * @tparam Values boolean constants represented by this batch
     **/
    template <class batch_type, bool... Values>
    struct batch_bool_constant
    {
        static constexpr std::size_t size = sizeof...(Values);
        using arch_type = typename batch_type::arch_type;
        using value_type = bool;
        static_assert(sizeof...(Values) == batch_type::size, "consistent batch size");

        operator batch_bool<typename batch_type::value_type, arch_type>() const noexcept { return { Values... }; }

        bool get(size_t i) const noexcept
        {
            return std::array<value_type, size> { { Values... } }[i];
        }

        static constexpr int mask() noexcept
        {
            return mask_helper(0, static_cast<int>(Values)...);
        }

    private:
        static constexpr int mask_helper(int acc) noexcept { return acc; }
        template <class... Tys>
        static constexpr int mask_helper(int acc, int mask, Tys... masks) noexcept
        {
            return mask_helper(acc | mask, (masks << 1)...);
        }
    };

    /**
     * @brief batch of integral constants
     *
     * Abstract representation of a batch of integral constants.
     *
     * @tparam batch_type the type of the associated batch values.
     * @tparam Values constants represented by this batch
     **/
    template <class batch_type, typename batch_type::value_type... Values>
    struct batch_constant
    {
        static constexpr std::size_t size = sizeof...(Values);
        using arch_type = typename batch_type::arch_type;
        using value_type = typename batch_type::value_type;
        static_assert(sizeof...(Values) == batch_type::size, "consistent batch size");

        /**
         * @brief Generate a batch of @p batch_type from this @p batch_constant
         */
        operator batch_type() const noexcept { return { Values... }; }

        /**
         * @brief Get the @p i th element of this @p batch_constant
         */
        constexpr value_type get(size_t i) const noexcept
        {
            return get(i, std::array<value_type, size> { Values... });
        }

    private:
        constexpr value_type get(size_t i, std::array<value_type, size> const& values) const noexcept
        {
            return values[i];
        }
    };

    namespace detail
    {
        template <class batch_type, class G, std::size_t... Is>
        inline constexpr auto make_batch_constant(detail::index_sequence<Is...>) noexcept
            -> batch_constant<batch_type, (typename batch_type::value_type)G::get(Is, sizeof...(Is))...>
        {
            return {};
        }
        template <class batch_type, class G, std::size_t... Is>
        inline constexpr auto make_batch_bool_constant(detail::index_sequence<Is...>) noexcept
            -> batch_bool_constant<batch_type, G::get(Is, sizeof...(Is))...>
        {
            return {};
        }

    } // namespace detail

    /**
     * @brief Build a @c batch_constant out of a generator function
     *
     * @tparam batch_type type of the (non-constant) batch to build
     * @tparam G type used to generate that batch. That type must have a static
     * member @c get that's used to generate the batch constant. Concretely, the
     * generated batch_constant has value `{G::get(0, batch_size), ... , G::get(batch_size - 1, batch_size)}`
     *
     * The following generator produces a batch of `(n - 1, 0, 1, ..., n - 2)`:
     *
     * @code
     * struct Rot
     * {
     *     static constexpr unsigned get(unsigned i, unsigned n)
     *     {
     *         return (i + n - 1) % n;
     *     }
     * };
     * @endcode
     */
    template <class batch_type, class G>
    inline constexpr auto make_batch_constant() noexcept -> decltype(detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>()))
    {
        return detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>());
    }

    template <class batch_type, class G>
    inline constexpr auto make_batch_bool_constant() noexcept
        -> decltype(detail::make_batch_bool_constant<batch_type, G>(
            detail::make_index_sequence<batch_type::size>()))
    {
        return detail::make_batch_bool_constant<batch_type, G>(
            detail::make_index_sequence<batch_type::size>());
    }

} // namespace xsimd

#endif
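
A usage sketch for the Rot generator documented above (the batch type, the sse2 architecture tag, and an included xsimd.hpp are assumptions): with a four-lane batch the generated constant holds {3, 0, 1, 2}.

    #include <cstdint>

    struct Rot
    {
        static constexpr unsigned get(unsigned i, unsigned n)
        {
            return (i + n - 1) % n;
        }
    };

    #if XSIMD_WITH_SSE2
    // four uint32_t lanes in a 128-bit register => values {3, 0, 1, 2}
    auto rot = xsimd::make_batch_constant<xsimd::batch<std::uint32_t, xsimd::sse2>, Rot>();
    static_assert(decltype(rot)::size == 4, "an sse2 batch of uint32_t has 4 lanes");
    #endif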

@@ -1,46 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_FMA3_AVX2_REGISTER_HPP
#define XSIMD_FMA3_AVX2_REGISTER_HPP

#include "./xsimd_avx2_register.hpp"

namespace xsimd
{
    template <typename arch>
    struct fma3;

    /**
     * @ingroup arch
     *
     * AVX2 + FMA instructions
     */
    template <>
    struct fma3<avx2> : avx2
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_AVX2; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(2, 2, 1); }
        static constexpr char const* name() noexcept { return "fma3+avx2"; }
    };

#if XSIMD_WITH_FMA3_AVX2
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<avx2>, avx2);

    }
#endif

}
#endif

@@ -1,46 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_FMA3_AVX_REGISTER_HPP
#define XSIMD_FMA3_AVX_REGISTER_HPP

#include "./xsimd_avx_register.hpp"

namespace xsimd
{
    template <typename arch>
    struct fma3;

    /**
     * @ingroup arch
     *
     * AVX + FMA instructions
     */
    template <>
    struct fma3<avx> : avx
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_AVX; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(2, 1, 1); }
        static constexpr char const* name() noexcept { return "fma3+avx"; }
    };

#if XSIMD_WITH_FMA3_AVX
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<avx>, avx);

    }
#endif

}
#endif

@@ -1,46 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_FMA3_SSE_REGISTER_HPP
#define XSIMD_FMA3_SSE_REGISTER_HPP

#include "./xsimd_sse4_2_register.hpp"

namespace xsimd
{
    template <typename arch>
    struct fma3;

    /**
     * @ingroup arch
     *
     * SSE4.2 + FMA instructions
     */
    template <>
    struct fma3<sse4_2> : sse4_2
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_SSE; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 4, 3); }
        static constexpr char const* name() noexcept { return "fma3+sse4.2"; }
    };

#if XSIMD_WITH_FMA3_SSE
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<sse4_2>, sse4_2);

    }
#endif

}
#endif
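
Because fma3 is a class template over its baseline, the same tag composes with several architectures (fma3<sse4_2>, fma3<avx>, fma3<avx2>), and each variant ranks strictly above its baseline in the numeric version ordering used for dispatch. A sketch (assumes the FMA3+SSE build configuration):

    using fast_sse = xsimd::fma3<xsimd::sse4_2>; // SSE4.2 baseline + FMA3
    static_assert(fast_sse::version() > xsimd::sse2::version(),
                  "the FMA3 variant outranks the plain SSE tiers");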

@@ -1,42 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_FMA4_REGISTER_HPP
#define XSIMD_FMA4_REGISTER_HPP

#include "./xsimd_sse4_2_register.hpp"

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * FMA4 instructions
     */
    struct fma4 : sse4_2
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_FMA4; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 4, 4); }
        static constexpr char const* name() noexcept { return "fma4"; }
    };

#if XSIMD_WITH_FMA4
    namespace types
    {

        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma4, sse4_2);

    }
#endif

}
#endif

@@ -1,35 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_ARCH_HPP
#define XSIMD_GENERIC_ARCH_HPP

#include "../config/xsimd_config.hpp"

/**
 * @defgroup arch Architecture description
 */
namespace xsimd
{
    struct generic
    {
        static constexpr bool supported() noexcept { return true; }
        static constexpr bool available() noexcept { return true; }
        static constexpr std::size_t alignment() noexcept { return 0; }
        static constexpr bool requires_alignment() noexcept { return false; }
        static constexpr unsigned version() noexcept { return generic::version(0, 0, 0); }

    protected:
        static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch) noexcept { return major * 10000u + minor * 100u + patch; }
    };
}

#endif
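
version() packs major/minor/patch into a single comparable integer (major * 10000 + minor * 100 + patch), so architecture tags can be ordered numerically. A sketch using tags declared elsewhere in this same diff:

    static_assert(xsimd::avx::version() == 20100, "avx encodes 2.1.0");
    static_assert(xsimd::avx2::version() == 20200, "avx2 encodes 2.2.0");
    static_assert(xsimd::avx::version() < xsimd::avx2::version(),
                  "a higher number means a more capable extension");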

@@ -1,52 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_NEON64_REGISTER_HPP
#define XSIMD_NEON64_REGISTER_HPP

#include "xsimd_neon_register.hpp"

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * NEON instructions for arm64
     */
    struct neon64 : neon
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_NEON64; }
        static constexpr bool available() noexcept { return true; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr std::size_t alignment() noexcept { return 16; }
        static constexpr unsigned version() noexcept { return generic::version(8, 1, 0); }
        static constexpr char const* name() noexcept { return "arm64+neon"; }
    };

#if XSIMD_WITH_NEON64

    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(neon64, neon);
        XSIMD_DECLARE_SIMD_REGISTER(double, neon64, float64x2_t);

        template <class T>
        struct get_bool_simd_register<T, neon64>
            : detail::neon_bool_simd_register<T, neon64>
        {
        };
    }

#endif

}

#endif

@@ -1,155 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_NEON_REGISTER_HPP
#define XSIMD_NEON_REGISTER_HPP

#include "xsimd_generic_arch.hpp"
#include "xsimd_register.hpp"

#if XSIMD_WITH_NEON
#include <arm_neon.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * NEON instructions for arm32
     */
    struct neon : generic
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_NEON; }
        static constexpr bool available() noexcept { return true; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr std::size_t alignment() noexcept { return 16; }
        static constexpr unsigned version() noexcept { return generic::version(7, 0, 0); }
        static constexpr char const* name() noexcept { return "arm32+neon"; }
    };

#if XSIMD_WITH_NEON
    namespace types
    {
        namespace detail
        {
            template <size_t S>
            struct neon_vector_type_impl;

            template <>
            struct neon_vector_type_impl<8>
            {
                using signed_type = int8x16_t;
                using unsigned_type = uint8x16_t;
            };

            template <>
            struct neon_vector_type_impl<16>
            {
                using signed_type = int16x8_t;
                using unsigned_type = uint16x8_t;
            };

            template <>
            struct neon_vector_type_impl<32>
            {
                using signed_type = int32x4_t;
                using unsigned_type = uint32x4_t;
            };

            template <>
            struct neon_vector_type_impl<64>
            {
                using signed_type = int64x2_t;
                using unsigned_type = uint64x2_t;
            };

            template <class T>
            using signed_neon_vector_type = typename neon_vector_type_impl<8 * sizeof(T)>::signed_type;

            template <class T>
            using unsigned_neon_vector_type = typename neon_vector_type_impl<8 * sizeof(T)>::unsigned_type;

            template <class T>
            using neon_vector_type = typename std::conditional<std::is_signed<T>::value,
                                                               signed_neon_vector_type<T>,
                                                               unsigned_neon_vector_type<T>>::type;

            using char_neon_vector_type = typename std::conditional<std::is_signed<char>::value,
                                                                    signed_neon_vector_type<char>,
                                                                    unsigned_neon_vector_type<char>>::type;
        }

        XSIMD_DECLARE_SIMD_REGISTER(signed char, neon, detail::neon_vector_type<signed char>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, neon, detail::neon_vector_type<unsigned char>);
        XSIMD_DECLARE_SIMD_REGISTER(char, neon, detail::char_neon_vector_type);
        XSIMD_DECLARE_SIMD_REGISTER(short, neon, detail::neon_vector_type<short>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, neon, detail::neon_vector_type<unsigned short>);
        XSIMD_DECLARE_SIMD_REGISTER(int, neon, detail::neon_vector_type<int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, neon, detail::neon_vector_type<unsigned int>);
        XSIMD_DECLARE_SIMD_REGISTER(long int, neon, detail::neon_vector_type<long int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, neon, detail::neon_vector_type<unsigned long int>);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, neon, detail::neon_vector_type<long long int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, neon, detail::neon_vector_type<unsigned long long int>);
        XSIMD_DECLARE_SIMD_REGISTER(float, neon, float32x4_t);
        XSIMD_DECLARE_INVALID_SIMD_REGISTER(double, neon);

        namespace detail
        {
            template <size_t S>
            struct get_unsigned_type;

            template <>
            struct get_unsigned_type<1>
            {
                using type = uint8_t;
            };

            template <>
            struct get_unsigned_type<2>
            {
                using type = uint16_t;
            };

            template <>
            struct get_unsigned_type<4>
            {
                using type = uint32_t;
            };

            template <>
            struct get_unsigned_type<8>
            {
                using type = uint64_t;
            };

            template <size_t S>
            using get_unsigned_type_t = typename get_unsigned_type<S>::type;

            template <class T, class A>
            struct neon_bool_simd_register
            {
                using type = simd_register<get_unsigned_type_t<sizeof(T)>, A>;
            };
        }

        template <class T>
        struct get_bool_simd_register<T, neon>
            : detail::neon_bool_simd_register<T, neon>
        {
        };

    }
#endif

}

#endif
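
The trait ladder above resolves each scalar type to the NEON vector of matching lane width and signedness, for example (assumes a NEON target so the vector types exist):

    #if XSIMD_WITH_NEON
    #include <type_traits>
    static_assert(std::is_same<xsimd::types::detail::neon_vector_type<int>,
                               int32x4_t>::value,
                  "4-byte signed lanes map to int32x4_t");
    static_assert(std::is_same<xsimd::types::detail::neon_vector_type<unsigned short>,
                               uint16x8_t>::value,
                  "2-byte unsigned lanes map to uint16x8_t");
    #endif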

@@ -1,94 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_REGISTER_HPP
#define XSIMD_REGISTER_HPP

#include <type_traits>

namespace xsimd
{
    namespace types
    {
        template <class T, class A>
        struct has_simd_register : std::false_type
        {
        };

        template <class T, class Arch>
        struct simd_register
        {
            struct register_type
            {
            };
        };

#define XSIMD_DECLARE_SIMD_REGISTER(SCALAR_TYPE, ISA, VECTOR_TYPE) \
    template <>                                                    \
    struct simd_register<SCALAR_TYPE, ISA>                         \
    {                                                              \
        using register_type = VECTOR_TYPE;                         \
        register_type data;                                        \
        operator register_type() const noexcept                    \
        {                                                          \
            return data;                                           \
        }                                                          \
    };                                                             \
    template <>                                                    \
    struct has_simd_register<SCALAR_TYPE, ISA> : std::true_type    \
    {                                                              \
    }

#define XSIMD_DECLARE_INVALID_SIMD_REGISTER(SCALAR_TYPE, ISA)    \
    template <>                                                  \
    struct has_simd_register<SCALAR_TYPE, ISA> : std::false_type \
    {                                                            \
    }

#define XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ISA, ISA_BASE)                           \
    template <class T>                                                             \
    struct simd_register<T, ISA> : simd_register<T, ISA_BASE>                      \
    {                                                                              \
        using register_type = typename simd_register<T, ISA_BASE>::register_type; \
        simd_register(register_type reg) noexcept                                  \
            : simd_register<T, ISA_BASE> { reg }                                   \
        {                                                                          \
        }                                                                          \
        simd_register() = default;                                                 \
    };                                                                             \
    template <class T>                                                             \
    struct has_simd_register<T, ISA> : has_simd_register<T, ISA_BASE>              \
    {                                                                              \
    }

        template <class T, class Arch>
        struct get_bool_simd_register
        {
            using type = simd_register<T, Arch>;
        };

        template <class T, class Arch>
        using get_bool_simd_register_t = typename get_bool_simd_register<T, Arch>::type;
    }

    namespace kernel
    {
        // requires_arch<A> is simply A const&: a tag parameter used to select overloads
        template <class A>
        using requires_arch = typename std::add_lvalue_reference<typename std::add_const<A>::type>::type;
        template <class T>
        struct convert
        {
        };
    }
}

#endif
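
What XSIMD_DECLARE_SIMD_REGISTER expands to, written out for one (scalar, arch, vector) triple as an illustration; the actual sse2/__m128 pairing is declared in xsimd_sse2_register.hpp below.

    template <>
    struct simd_register<float, sse2>
    {
        using register_type = __m128;
        register_type data;
        operator register_type() const noexcept { return data; }
    };
    template <>
    struct has_simd_register<float, sse2> : std::true_type
    {
    };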
|
@@ -1,61 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSE2_REGISTER_HPP
#define XSIMD_SSE2_REGISTER_HPP

#include "./xsimd_generic_arch.hpp"
#include "./xsimd_register.hpp"

#if XSIMD_WITH_SSE2
#include <emmintrin.h>
#include <xmmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSE2 instructions
     */
    struct sse2 : generic
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE2; }
        static constexpr bool available() noexcept { return true; }
        static constexpr bool requires_alignment() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 2, 0); }
        static constexpr std::size_t alignment() noexcept { return 16; }
        static constexpr char const* name() noexcept { return "sse2"; }
    };

#if XSIMD_WITH_SSE2
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER(bool, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(signed char, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(char, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(short, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(long int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, sse2, __m128i);
        XSIMD_DECLARE_SIMD_REGISTER(float, sse2, __m128);
        XSIMD_DECLARE_SIMD_REGISTER(double, sse2, __m128d);
    }
#endif
}

#endif
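The arch struct above is a pure compile-time descriptor: every member is a static constexpr function, so querying it costs nothing at run time. A small hedged sketch of how such a descriptor is typically consumed (the struct is a local stand-in mirroring the shape shown above, re-declared so it compiles without the library):

#include <cstddef>
#include <cstdio>

// local stand-in mirroring xsimd::sse2's interface, for illustration only
struct sse2_like
{
    static constexpr bool supported() noexcept { return true; }
    static constexpr bool requires_alignment() noexcept { return true; }
    static constexpr std::size_t alignment() noexcept { return 16; }
    static constexpr char const* name() noexcept { return "sse2"; }
};

int main()
{
    // all of these fold to constants at compile time
    static_assert(sse2_like::alignment() == 16, "SSE2 registers are 16 bytes wide");
    if (sse2_like::supported() && sse2_like::requires_alignment())
        std::printf("%s: align loads to %zu bytes\n",
                    sse2_like::name(), sse2_like::alignment());
    return 0;
}

The supported()/available() split matters: supported() is a build-time fact (was the compiler allowed to emit SSE2?), while available() answers whether the running CPU can execute it.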
@@ -1,45 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSE3_REGISTER_HPP
#define XSIMD_SSE3_REGISTER_HPP

#include "./xsimd_sse2_register.hpp"

#if XSIMD_WITH_SSE3
#include <pmmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSE3 instructions
     */
    struct sse3 : sse2
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE3; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 3, 0); }
        static constexpr char const* name() noexcept { return "sse3"; }
    };

#if XSIMD_WITH_SSE3
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse3, sse2);
    }
#endif
}

#endif
@@ -1,44 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSE4_1_REGISTER_HPP
#define XSIMD_SSE4_1_REGISTER_HPP

#include "./xsimd_ssse3_register.hpp"

#if XSIMD_WITH_SSE4_1
#include <smmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSE4.1 instructions
     */
    struct sse4_1 : ssse3
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_1; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 4, 1); }
        static constexpr char const* name() noexcept { return "sse4.1"; }
    };

#if XSIMD_WITH_SSE4_1
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse4_1, ssse3);
    }
#endif
}

#endif
@@ -1,44 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSE4_2_REGISTER_HPP
#define XSIMD_SSE4_2_REGISTER_HPP

#include "./xsimd_sse4_1_register.hpp"

#if XSIMD_WITH_SSE4_2
#include <nmmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSE4.2 instructions
     */
    struct sse4_2 : sse4_1
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_2; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 4, 2); }
        static constexpr char const* name() noexcept { return "sse4.2"; }
    };

#if XSIMD_WITH_SSE4_2
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse4_2, sse4_1);
    }
#endif
}

#endif
@@ -1,44 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SSSE3_REGISTER_HPP
#define XSIMD_SSSE3_REGISTER_HPP

#include "./xsimd_sse3_register.hpp"

#if XSIMD_WITH_SSSE3
#include <tmmintrin.h>
#endif

namespace xsimd
{
    /**
     * @ingroup arch
     *
     * SSSE3 instructions
     */
    struct ssse3 : sse3
    {
        static constexpr bool supported() noexcept { return XSIMD_WITH_SSSE3; }
        static constexpr bool available() noexcept { return true; }
        static constexpr unsigned version() noexcept { return generic::version(1, 3, 1); }
        static constexpr char const* name() noexcept { return "ssse3"; }
    };

#if XSIMD_WITH_SSSE3
    namespace types
    {
        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ssse3, sse3);
    }
#endif
}

#endif
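Taken together, the five SSE headers above form an inheritance chain sse2 < sse3 < ssse3 < sse4.1 < sse4.2, and each level reports a strictly larger version(), which is what lets the dispatcher pick "the best available arch" by a simple max. A hedged sketch of that ordering idea, with local stand-ins (the version encoding is an assumption made for the sketch, not the actual generic::version formula):

// assumed encoding for this sketch: major * 10000 + minor * 100 + patch
constexpr unsigned version(unsigned major, unsigned minor, unsigned patch)
{
    return major * 10000u + minor * 100u + patch;
}

struct sse2_v  { static constexpr unsigned version() { return ::version(1, 2, 0); } };
struct sse3_v  { static constexpr unsigned version() { return ::version(1, 3, 0); } };
struct ssse3_v { static constexpr unsigned version() { return ::version(1, 3, 1); } };
struct sse41_v { static constexpr unsigned version() { return ::version(1, 4, 1); } };
struct sse42_v { static constexpr unsigned version() { return ::version(1, 4, 2); } };

// the chain is totally ordered, so arch selection reduces to a max over constants
static_assert(sse2_v::version() < sse3_v::version(), "sse3 extends sse2");
static_assert(sse3_v::version() < ssse3_v::version(), "ssse3 extends sse3");
static_assert(ssse3_v::version() < sse41_v::version(), "sse4.1 extends ssse3");
static_assert(sse41_v::version() < sse42_v::version(), "sse4.2 extends sse4.1");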
@@ -1,155 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 * Copyright (c) Yibo Cai                                                  *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_SVE_REGISTER_HPP
#define XSIMD_SVE_REGISTER_HPP

#include "xsimd_generic_arch.hpp"
#include "xsimd_register.hpp"

#if XSIMD_WITH_SVE
#include <arm_sve.h>
#endif

namespace xsimd
{
    namespace detail
    {
        /**
         * @ingroup arch
         *
         * SVE instructions (fixed vector size) for arm64
         */
        template <size_t Width>
        struct sve : xsimd::generic
        {
            static constexpr bool supported() noexcept { return Width == XSIMD_SVE_BITS; }
            static constexpr bool available() noexcept { return true; }
            static constexpr bool requires_alignment() noexcept { return true; }
            static constexpr std::size_t alignment() noexcept { return 16; }
            static constexpr unsigned version() noexcept { return generic::version(9, 0, 0); }
            static constexpr char const* name() noexcept { return "arm64+sve"; }
        };
    }

#if XSIMD_WITH_SVE

    using sve = detail::sve<__ARM_FEATURE_SVE_BITS>;

    namespace types
    {
        namespace detail
        {
// define fixed size alias per SVE sizeless type
#define SVE_TO_FIXED_SIZE(ty) ty __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS)))
            using sve_int8_t = SVE_TO_FIXED_SIZE(svint8_t);
            using sve_uint8_t = SVE_TO_FIXED_SIZE(svuint8_t);
            using sve_int16_t = SVE_TO_FIXED_SIZE(svint16_t);
            using sve_uint16_t = SVE_TO_FIXED_SIZE(svuint16_t);
            using sve_int32_t = SVE_TO_FIXED_SIZE(svint32_t);
            using sve_uint32_t = SVE_TO_FIXED_SIZE(svuint32_t);
            using sve_int64_t = SVE_TO_FIXED_SIZE(svint64_t);
            using sve_uint64_t = SVE_TO_FIXED_SIZE(svuint64_t);
            using sve_float32_t = SVE_TO_FIXED_SIZE(svfloat32_t);
            using sve_float64_t = SVE_TO_FIXED_SIZE(svfloat64_t);
            using sve_bool_t = SVE_TO_FIXED_SIZE(svbool_t);
#undef SVE_TO_FIXED_SIZE

            template <size_t S>
            struct sve_vector_type_impl;

            template <>
            struct sve_vector_type_impl<8>
            {
                using signed_type = sve_int8_t;
                using unsigned_type = sve_uint8_t;
                using floating_point_type = void;
            };

            template <>
            struct sve_vector_type_impl<16>
            {
                using signed_type = sve_int16_t;
                using unsigned_type = sve_uint16_t;
                using floating_point_type = void;
            };

            template <>
            struct sve_vector_type_impl<32>
            {
                using signed_type = sve_int32_t;
                using unsigned_type = sve_uint32_t;
                using floating_point_type = sve_float32_t;
            };

            template <>
            struct sve_vector_type_impl<64>
            {
                using signed_type = sve_int64_t;
                using unsigned_type = sve_uint64_t;
                using floating_point_type = sve_float64_t;
            };

            template <class T>
            using signed_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::signed_type;

            template <class T>
            using unsigned_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::unsigned_type;

            template <class T>
            using floating_point_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::floating_point_type;

            template <class T>
            using signed_int_or_floating_point_sve_vector_type = typename std::conditional<std::is_floating_point<T>::value,
                                                                                           floating_point_sve_vector_type<T>,
                                                                                           signed_int_sve_vector_type<T>>::type;

            template <class T>
            using sve_vector_type = typename std::conditional<std::is_signed<T>::value,
                                                              signed_int_or_floating_point_sve_vector_type<T>,
                                                              unsigned_int_sve_vector_type<T>>::type;
        } // namespace detail

        XSIMD_DECLARE_SIMD_REGISTER(signed char, sve, detail::sve_vector_type<signed char>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, sve, detail::sve_vector_type<unsigned char>);
        XSIMD_DECLARE_SIMD_REGISTER(char, sve, detail::sve_vector_type<char>);
        XSIMD_DECLARE_SIMD_REGISTER(short, sve, detail::sve_vector_type<short>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, sve, detail::sve_vector_type<unsigned short>);
        XSIMD_DECLARE_SIMD_REGISTER(int, sve, detail::sve_vector_type<int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, sve, detail::sve_vector_type<unsigned int>);
        XSIMD_DECLARE_SIMD_REGISTER(long int, sve, detail::sve_vector_type<long int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, sve, detail::sve_vector_type<unsigned long int>);
        XSIMD_DECLARE_SIMD_REGISTER(long long int, sve, detail::sve_vector_type<long long int>);
        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, sve, detail::sve_vector_type<unsigned long long int>);
        XSIMD_DECLARE_SIMD_REGISTER(float, sve, detail::sve_vector_type<float>);
        XSIMD_DECLARE_SIMD_REGISTER(double, sve, detail::sve_vector_type<double>);

        namespace detail
        {
            struct sve_bool_simd_register
            {
                using register_type = sve_bool_t;
                register_type data;
                operator register_type() const noexcept { return data; }
            };
        } // namespace detail

        template <class T>
        struct get_bool_simd_register<T, sve>
        {
            using type = detail::sve_bool_simd_register;
        };
    } // namespace types
#endif
} // namespace xsimd

#endif
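The detail::sve_vector_type_impl machinery above is a width-indexed type map: the element size in bits selects the matching fixed-size SVE vector alias, and std::conditional then picks the signed, unsigned, or floating-point flavor. A compiler-agnostic sketch of the same dispatch pattern, with plain integer types substituted for the SVE ones so it builds anywhere (the names here are invented for the sketch):

#include <cstddef>
#include <cstdint>
#include <type_traits>

template <std::size_t Bits>
struct width_map; // primary left undefined: unsupported widths fail to compile

template <> struct width_map<8>  { using signed_type = std::int8_t;  using unsigned_type = std::uint8_t; };
template <> struct width_map<16> { using signed_type = std::int16_t; using unsigned_type = std::uint16_t; };
template <> struct width_map<32> { using signed_type = std::int32_t; using unsigned_type = std::uint32_t; };
template <> struct width_map<64> { using signed_type = std::int64_t; using unsigned_type = std::uint64_t; };

// mirrors sve_vector_type: pick the signed/unsigned flavor by the input type's signedness
template <class T>
using vector_type = typename std::conditional<std::is_signed<T>::value,
                                              typename width_map<8 * sizeof(T)>::signed_type,
                                              typename width_map<8 * sizeof(T)>::unsigned_type>::type;

// assumes the usual 16-bit short / 32-bit int found on mainstream platforms
static_assert(std::is_same<vector_type<short>, std::int16_t>::value, "16-bit signed maps to int16");
static_assert(std::is_same<vector_type<unsigned int>, std::uint32_t>::value, "32-bit unsigned maps to uint32");

The arm_sve_vector_bits attribute in the real header is what turns the sizeless svint32_t and friends into fixed-width types the register machinery can store by value.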
@@ -1,251 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_TRAITS_HPP
#define XSIMD_TRAITS_HPP

#include <type_traits>

#include "xsimd_batch.hpp"

namespace xsimd
{

    /**************************************
     * simd_traits and revert_simd_traits *
     **************************************/

    template <class T, class A = default_arch>
    struct has_simd_register : types::has_simd_register<T, A>
    {
    };

    namespace detail
    {
        template <class T, bool>
        struct simd_traits_impl;

        template <class T>
        struct simd_traits_impl<T, false>
        {
            using type = T;
            using bool_type = bool;
            static constexpr size_t size = 1;
        };

        template <class T>
        constexpr size_t simd_traits_impl<T, false>::size;

        template <class T>
        struct simd_traits_impl<T, true>
        {
            using type = batch<T>;
            using bool_type = typename type::batch_bool_type;
            static constexpr size_t size = type::size;
        };

        template <class T>
        constexpr size_t simd_traits_impl<T, true>::size;

        template <class T, class A>
        struct static_check_supported_config_emitter
        {
            static_assert(A::supported(),
                          "usage of batch type with unsupported architecture");
            static_assert(!A::supported() || xsimd::has_simd_register<T, A>::value,
                          "usage of batch type with unsupported type");
        };

        template <class T, class A>
        struct static_check_supported_config_emitter<std::complex<T>, A> : static_check_supported_config_emitter<T, A>
        {
        };

#ifdef XSIMD_ENABLE_XTL_COMPLEX
        template <class T, class A, bool i3ec>
        struct static_check_supported_config_emitter<xtl::xcomplex<T, T, i3ec>, A> : static_check_supported_config_emitter<T, A>
        {
        };
#endif

        // consistency checker
        template <class T, class A>
        void static_check_supported_config()
        {
            (void)static_check_supported_config_emitter<T, A>();
        }
    }

    template <class T>
    struct simd_traits : detail::simd_traits_impl<T, xsimd::has_simd_register<T>::value>
    {
    };

    template <class T>
    struct simd_traits<std::complex<T>>
        : detail::simd_traits_impl<std::complex<T>, xsimd::has_simd_register<T>::value>
    {
    };

#ifdef XSIMD_ENABLE_XTL_COMPLEX
    template <class T, bool i3ec>
    struct simd_traits<xtl::xcomplex<T, T, i3ec>>
        : detail::simd_traits_impl<std::complex<T>, xsimd::has_simd_register<T>::value>
    {
    };
#endif

    template <class T>
    struct revert_simd_traits
    {
        using type = T;
        static constexpr size_t size = simd_traits<type>::size;
    };

    template <class T>
    constexpr size_t revert_simd_traits<T>::size;

    template <class T>
    struct revert_simd_traits<batch<T>>
    {
        using type = T;
        static constexpr size_t size = batch<T>::size;
    };

    template <class T>
    constexpr size_t revert_simd_traits<batch<T>>::size;

    template <class T>
    using simd_type = typename simd_traits<T>::type;

    template <class T>
    using simd_bool_type = typename simd_traits<T>::bool_type;

    template <class T>
    using revert_simd_type = typename revert_simd_traits<T>::type;

    /********************
     * simd_return_type *
     ********************/

    namespace detail
    {
        template <class T1, class T2>
        struct simd_condition
        {
            static constexpr bool value = (std::is_same<T1, T2>::value && !std::is_same<T1, bool>::value)
                || (std::is_same<T1, bool>::value && !std::is_same<T2, bool>::value)
                || std::is_same<T1, float>::value
                || std::is_same<T1, double>::value
                || std::is_same<T1, int8_t>::value
                || std::is_same<T1, uint8_t>::value
                || std::is_same<T1, int16_t>::value
                || std::is_same<T1, uint16_t>::value
                || std::is_same<T1, int32_t>::value
                || std::is_same<T1, uint32_t>::value
                || std::is_same<T1, int64_t>::value
                || std::is_same<T1, uint64_t>::value
                || std::is_same<T1, char>::value
                || detail::is_complex<T1>::value;
        };

        template <class T1, class T2, class A>
        struct simd_return_type_impl
            : std::enable_if<simd_condition<T1, T2>::value, batch<T2, A>>
        {
        };

        template <class T2, class A>
        struct simd_return_type_impl<bool, T2, A>
            : std::enable_if<simd_condition<bool, T2>::value, batch_bool<T2, A>>
        {
        };

        template <class T2, class A>
        struct simd_return_type_impl<bool, std::complex<T2>, A>
            : std::enable_if<simd_condition<bool, T2>::value, batch_bool<T2, A>>
        {
        };

        template <class T1, class T2, class A>
        struct simd_return_type_impl<std::complex<T1>, T2, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

        template <class T1, class T2, class A>
        struct simd_return_type_impl<std::complex<T1>, std::complex<T2>, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

#ifdef XSIMD_ENABLE_XTL_COMPLEX
        template <class T1, class T2, bool I3EC, class A>
        struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, T2, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

        template <class T1, class T2, bool I3EC, class A>
        struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, std::complex<T2>, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

        template <class T1, class T2, bool I3EC, class A>
        struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, xtl::xcomplex<T2, T2, I3EC>, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };

        template <class T1, class T2, bool I3EC, class A>
        struct simd_return_type_impl<std::complex<T1>, xtl::xcomplex<T2, T2, I3EC>, A>
            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
        {
        };
#endif
    }

    template <class T1, class T2, class A = default_arch>
    using simd_return_type = typename detail::simd_return_type_impl<T1, T2, A>::type;

    /************
     * is_batch *
     ************/

    template <class V>
    struct is_batch : std::false_type
    {
    };

    template <class T, class A>
    struct is_batch<batch<T, A>> : std::true_type
    {
    };

    /*****************
     * is_batch_bool *
     *****************/

    template <class V>
    struct is_batch_bool : std::false_type
    {
    };

    template <class T, class A>
    struct is_batch_bool<batch_bool<T, A>> : std::true_type
    {
    };

    /********************
     * is_batch_complex *
     ********************/

    template <class V>
    struct is_batch_complex : std::false_type
    {
    };

    template <class T, class A>
    struct is_batch_complex<batch<std::complex<T>, A>> : std::true_type
    {
    };
}

#endif
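simd_traits above degrades gracefully: when no SIMD register exists for T, `type` is just T and `size` is 1, so the same generic code runs on vectorizable and non-vectorizable types alike. A hedged standalone sketch of that fallback dispatch, with fake_batch standing in for xsimd::batch and a contrived "only float has a register" predicate (all names invented for the sketch):

#include <cstddef>
#include <type_traits>

template <class T>
struct fake_batch // stand-in for xsimd::batch<T>, illustration only
{
    static constexpr std::size_t size = 4;
};

// primary template is the scalar fallback (HasRegister == false)
template <class T, bool HasRegister>
struct traits_impl
{
    using type = T;
    static constexpr std::size_t size = 1;
};

// partial specialization is the vector path
template <class T>
struct traits_impl<T, true>
{
    using type = fake_batch<T>;
    static constexpr std::size_t size = fake_batch<T>::size;
};

// pretend only float has a register, as has_simd_register would report
template <class T>
struct my_traits : traits_impl<T, std::is_same<T, float>::value>
{
};

static_assert(my_traits<float>::size == 4, "float vectorizes");
static_assert(my_traits<long double>::size == 1, "long double falls back to scalar");
static_assert(std::is_same<my_traits<long double>::type, long double>::value, "scalar type preserved");

The static_check_supported_config_emitter in the real header plays a complementary role: it turns "batch of an unsupported type/arch" into a readable static_assert failure instead of a wall of substitution errors.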
@@ -1,530 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_UTILS_HPP
#define XSIMD_UTILS_HPP

#include <complex>
#include <cstdint>
#include <cstring>
#include <tuple>
#include <type_traits>

#ifdef XSIMD_ENABLE_XTL_COMPLEX
#include "xtl/xcomplex.hpp"
#endif

namespace xsimd
{

    template <class T, class A>
    class batch;

    template <class T, class A>
    class batch_bool;

    /**************
     * index *
     **************/

    template <size_t I>
    using index = std::integral_constant<size_t, I>;

    /**************
     * as_integer *
     **************/

    template <class T>
    struct as_integer : std::make_signed<T>
    {
    };

    template <>
    struct as_integer<float>
    {
        using type = int32_t;
    };

    template <>
    struct as_integer<double>
    {
        using type = int64_t;
    };

    template <class T, class A>
    struct as_integer<batch<T, A>>
    {
        using type = batch<typename as_integer<T>::type, A>;
    };

    template <class B>
    using as_integer_t = typename as_integer<B>::type;

    /***********************
     * as_unsigned_integer *
     ***********************/

    template <class T>
    struct as_unsigned_integer : std::make_unsigned<T>
    {
    };

    template <>
    struct as_unsigned_integer<float>
    {
        using type = uint32_t;
    };

    template <>
    struct as_unsigned_integer<double>
    {
        using type = uint64_t;
    };

    template <class T, class A>
    struct as_unsigned_integer<batch<T, A>>
    {
        using type = batch<typename as_unsigned_integer<T>::type, A>;
    };

    template <class T>
    using as_unsigned_integer_t = typename as_unsigned_integer<T>::type;

    /*********************
     * as_signed_integer *
     *********************/

    template <class T>
    struct as_signed_integer : std::make_signed<T>
    {
    };

    template <class T>
    using as_signed_integer_t = typename as_signed_integer<T>::type;

    /******************
     * flip_sign_type *
     ******************/

    namespace detail
    {
        template <class T, bool is_signed>
        struct flipped_sign_type_impl : std::make_signed<T>
        {
        };

        template <class T>
        struct flipped_sign_type_impl<T, true> : std::make_unsigned<T>
        {
        };
    }

    template <class T>
    struct flipped_sign_type
        : detail::flipped_sign_type_impl<T, std::is_signed<T>::value>
    {
    };

    template <class T>
    using flipped_sign_type_t = typename flipped_sign_type<T>::type;

    /***********
     * as_float *
     ************/

    template <class T>
    struct as_float;

    template <>
    struct as_float<int32_t>
    {
        using type = float;
    };

    template <>
    struct as_float<int64_t>
    {
        using type = double;
    };

    template <class T, class A>
    struct as_float<batch<T, A>>
    {
        using type = batch<typename as_float<T>::type, A>;
    };

    template <class T>
    using as_float_t = typename as_float<T>::type;

    /**************
     * as_logical *
     **************/

    template <class T>
    struct as_logical;

    template <class T, class A>
    struct as_logical<batch<T, A>>
    {
        using type = batch_bool<T, A>;
    };

    template <class T>
    using as_logical_t = typename as_logical<T>::type;

    /********************
     * bit_cast *
     ********************/

    template <class To, class From>
    inline To bit_cast(From val) noexcept
    {
        static_assert(sizeof(From) == sizeof(To), "casting between compatible layout");
        // FIXME: Some old version of GCC don't support that trait
        // static_assert(std::is_trivially_copyable<From>::value, "input type is trivially copyable");
        // static_assert(std::is_trivially_copyable<To>::value, "output type is trivially copyable");
        To res;
        std::memcpy(&res, &val, sizeof(val));
        return res;
    }

    namespace kernel
    {
        namespace detail
        {
            /**************************************
             * enabling / disabling metafunctions *
             **************************************/

            template <class T>
            using enable_integral_t = typename std::enable_if<std::is_integral<T>::value, int>::type;

            template <class T, size_t S>
            using enable_sized_signed_t = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value && sizeof(T) == S, int>::type;

            template <class T, size_t S>
            using enable_sized_unsigned_t = typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value && sizeof(T) == S, int>::type;

            template <class T, size_t S>
            using enable_sized_integral_t = typename std::enable_if<std::is_integral<T>::value && sizeof(T) == S, int>::type;

            template <class T, size_t S>
            using enable_sized_t = typename std::enable_if<sizeof(T) == S, int>::type;

            template <class T, size_t S>
            using enable_max_sized_integral_t = typename std::enable_if<std::is_integral<T>::value && sizeof(T) <= S, int>::type;

            /********************************
             * Matching & mismatching sizes *
             ********************************/

            template <class T, class U, class B = int>
            using sizes_match_t = typename std::enable_if<sizeof(T) == sizeof(U), B>::type;

            template <class T, class U, class B = int>
            using sizes_mismatch_t = typename std::enable_if<sizeof(T) != sizeof(U), B>::type;

            template <class T, class U, class B = int>
            using stride_match_t = typename std::enable_if<!std::is_same<T, U>::value && sizeof(T) == sizeof(U), B>::type;
        } // namespace detail
    } // namespace kernel

    /*****************************************
     * Backport of index_sequence from c++14 *
     *****************************************/

    // TODO: Remove this once we drop C++11 support
    namespace detail
    {
        template <typename T>
        struct identity
        {
            using type = T;
        };

#ifdef __cpp_lib_integer_sequence
        using std::index_sequence;
        using std::integer_sequence;
        using std::make_index_sequence;
        using std::make_integer_sequence;

        using std::index_sequence_for;
#else
        template <typename T, T... Is>
        struct integer_sequence
        {
            using value_type = T;
            static constexpr std::size_t size() noexcept { return sizeof...(Is); }
        };

        template <typename Lhs, typename Rhs>
        struct make_integer_sequence_concat;

        template <typename T, T... Lhs, T... Rhs>
        struct make_integer_sequence_concat<integer_sequence<T, Lhs...>,
                                            integer_sequence<T, Rhs...>>
            : identity<integer_sequence<T, Lhs..., (sizeof...(Lhs) + Rhs)...>>
        {
        };

        template <typename T>
        struct make_integer_sequence_impl;

        template <typename T>
        struct make_integer_sequence_impl<std::integral_constant<T, (T)0>> : identity<integer_sequence<T>>
        {
        };

        template <typename T>
        struct make_integer_sequence_impl<std::integral_constant<T, (T)1>> : identity<integer_sequence<T, 0>>
        {
        };

        template <typename T, T N>
        struct make_integer_sequence_impl<std::integral_constant<T, N>>
            : make_integer_sequence_concat<typename make_integer_sequence_impl<std::integral_constant<T, N / 2>>::type,
                                           typename make_integer_sequence_impl<std::integral_constant<T, N - (N / 2)>>::type>
        {
        };

        template <typename T, T N>
        using make_integer_sequence = typename make_integer_sequence_impl<std::integral_constant<T, N>>::type;

        template <std::size_t... Is>
        using index_sequence = integer_sequence<std::size_t, Is...>;

        template <std::size_t N>
        using make_index_sequence = make_integer_sequence<std::size_t, N>;

        template <typename... Ts>
        using index_sequence_for = make_index_sequence<sizeof...(Ts)>;

#endif

        template <int... Is>
        using int_sequence = integer_sequence<int, Is...>;

        template <int N>
        using make_int_sequence = make_integer_sequence<int, N>;

        template <typename... Ts>
        using int_sequence_for = make_int_sequence<(int)sizeof...(Ts)>;

        // Type-casted index sequence.
        template <class P, size_t... Is>
        inline P indexes_from(index_sequence<Is...>) noexcept
        {
            return { static_cast<typename P::value_type>(Is)... };
        }

        template <class P>
        inline P make_sequence_as_batch() noexcept
        {
            return indexes_from<P>(make_index_sequence<P::size>());
        }
    }

    /***********************************
     * Backport of std::get from C++14 *
     ***********************************/

    namespace detail
    {
        template <class T, class... Types, size_t I, size_t... Is>
        inline const T& get_impl(const std::tuple<Types...>& t, std::is_same<T, T>, index_sequence<I, Is...>) noexcept
        {
            return std::get<I>(t);
        }

        template <class T, class U, class... Types, size_t I, size_t... Is>
        inline const T& get_impl(const std::tuple<Types...>& t, std::is_same<T, U>, index_sequence<I, Is...>) noexcept
        {
            using tuple_elem = typename std::tuple_element<I + 1, std::tuple<Types...>>::type;
            return get_impl<T>(t, std::is_same<T, tuple_elem>(), index_sequence<Is...>());
        }

        template <class T, class... Types>
        inline const T& get(const std::tuple<Types...>& t) noexcept
        {
            using tuple_elem = typename std::tuple_element<0, std::tuple<Types...>>::type;
            return get_impl<T>(t, std::is_same<T, tuple_elem>(), make_index_sequence<sizeof...(Types)>());
        }
    }

    /*********************************
     * Backport of void_t from C++17 *
     *********************************/

    namespace detail
    {
        template <class... T>
        struct make_void
        {
            using type = void;
        };

        template <class... T>
        using void_t = typename make_void<T...>::type;
    }

    /**************************************************
     * Equivalent of void_t but with size_t parameter *
     **************************************************/

    namespace detail
    {
        template <std::size_t>
        struct check_size
        {
            using type = void;
        };

        template <std::size_t S>
        using check_size_t = typename check_size<S>::type;
    }

    /*****************************************
     * Supplementary std::array constructors *
     *****************************************/

    namespace detail
    {
        // std::array constructor from scalar value ("broadcast")
        template <typename T, std::size_t... Is>
        inline constexpr std::array<T, sizeof...(Is)>
        array_from_scalar_impl(const T& scalar, index_sequence<Is...>) noexcept
        {
            // You can safely ignore this silly ternary, the "scalar" is all
            // that matters. The rest is just a dirty workaround...
            return std::array<T, sizeof...(Is)> { (Is + 1) ? scalar : T()... };
        }

        template <typename T, std::size_t N>
        inline constexpr std::array<T, N>
        array_from_scalar(const T& scalar) noexcept
        {
            return array_from_scalar_impl(scalar, make_index_sequence<N>());
        }

        // std::array constructor from C-style pointer (handled as an array)
        template <typename T, std::size_t... Is>
        inline constexpr std::array<T, sizeof...(Is)>
        array_from_pointer_impl(const T* c_array, index_sequence<Is...>) noexcept
        {
            return std::array<T, sizeof...(Is)> { c_array[Is]... };
        }

        template <typename T, std::size_t N>
        inline constexpr std::array<T, N>
        array_from_pointer(const T* c_array) noexcept
        {
            return array_from_pointer_impl(c_array, make_index_sequence<N>());
        }
    }

    /************************
     * is_array_initializer *
     ************************/

    namespace detail
    {
        template <bool...>
        struct bool_pack;

        template <bool... bs>
        using all_true = std::is_same<
            bool_pack<bs..., true>, bool_pack<true, bs...>>;

        template <typename T, typename... Args>
        using is_all_convertible = all_true<std::is_convertible<Args, T>::value...>;

        template <typename T, std::size_t N, typename... Args>
        using is_array_initializer = std::enable_if<
            (sizeof...(Args) == N) && is_all_convertible<T, Args...>::value>;

        // Check that a variadic argument pack is a list of N values of type T,
        // as usable for instantiating a value of type std::array<T, N>.
        template <typename T, std::size_t N, typename... Args>
        using is_array_initializer_t = typename is_array_initializer<T, N, Args...>::type;
    }

    /**************
     * is_complex *
     **************/

    // This is used in both xsimd_complex_base.hpp and xsimd_traits.hpp
    // However xsimd_traits.hpp indirectly includes xsimd_complex_base.hpp
    // so we cannot define is_complex in xsimd_traits.hpp. Besides, if
    // no file defining batches is included, we still need this definition
    // in xsimd_traits.hpp, so let's define it here.

    namespace detail
    {
        template <class T>
        struct is_complex : std::false_type
        {
        };

        template <class T>
        struct is_complex<std::complex<T>> : std::true_type
        {
        };

#ifdef XSIMD_ENABLE_XTL_COMPLEX
        template <class T, bool i3ec>
        struct is_complex<xtl::xcomplex<T, T, i3ec>> : std::true_type
        {
        };
#endif
    }

    /*******************
     * real_batch_type *
     *******************/

    template <class B>
    struct real_batch_type
    {
        using type = B;
    };

    template <class T, class A>
    struct real_batch_type<batch<std::complex<T>, A>>
    {
        using type = batch<T, A>;
    };

    template <class B>
    using real_batch_type_t = typename real_batch_type<B>::type;

    /**********************
     * complex_batch_type *
     **********************/

    template <class B>
    struct complex_batch_type
    {
        using real_value_type = typename B::value_type;
        using arch_type = typename B::arch_type;
        using type = batch<std::complex<real_value_type>, arch_type>;
    };

    template <class T, class A>
    struct complex_batch_type<batch<std::complex<T>, A>>
    {
        using type = batch<std::complex<T>, A>;
    };

    template <class B>
    using complex_batch_type_t = typename complex_batch_type<B>::type;
}

#endif
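The bit_cast helper in the file above is the standard pre-C++20 memcpy idiom for type punning: copying the bytes is well-defined where reading through a reinterpret_cast pointer is not. A usage sketch, with the helper reproduced from the header (the float constant and the expected bit pattern are just example values):

#include <cstdint>
#include <cstdio>
#include <cstring>

template <class To, class From>
inline To bit_cast(From val) noexcept
{
    static_assert(sizeof(From) == sizeof(To), "casting between compatible layout");
    To res;
    std::memcpy(&res, &val, sizeof(val)); // well-defined, unlike a reinterpret_cast read
    return res;
}

int main()
{
    // inspect the IEEE-754 bit pattern of 1.0f: sign 0, biased exponent 127, mantissa 0
    std::uint32_t bits = bit_cast<std::uint32_t>(1.0f);
    std::printf("0x%08X\n", bits); // prints 0x3F800000
    return 0;
}

Compilers recognize the memcpy and compile it down to a plain register move, which is why SIMD code can afford to use it on hot paths.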
@@ -1,68 +0,0 @@
/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
 * Martin Renou                                                            *
 * Copyright (c) QuantStack                                                *
 * Copyright (c) Serge Guelton                                             *
 *                                                                         *
 * Distributed under the terms of the BSD 3-Clause License.                *
 *                                                                         *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_HPP
#define XSIMD_HPP

#if defined(__has_cpp_attribute)
// if this check passes, then the compiler supports feature test macros
#if __has_cpp_attribute(nodiscard) >= 201603L
// if this check passes, then the compiler supports [[nodiscard]] without a message
#define XSIMD_NO_DISCARD [[nodiscard]]
#endif
#endif

#if !defined(XSIMD_NO_DISCARD) && __cplusplus >= 201703L
// this means that the previous tests failed, but we are using C++17 or higher
#define XSIMD_NO_DISCARD [[nodiscard]]
#endif

#if !defined(XSIMD_NO_DISCARD) && (defined(__GNUC__) || defined(__clang__))
// this means that the previous checks failed, but we are using GCC or Clang
#define XSIMD_NO_DISCARD __attribute__((warn_unused_result))
#endif

#if !defined(XSIMD_NO_DISCARD)
// this means that all the previous checks failed, so we fall back to doing nothing
#define XSIMD_NO_DISCARD
#endif

#ifdef __cpp_if_constexpr
// this means that the compiler supports the `if constexpr` construct
#define XSIMD_IF_CONSTEXPR if constexpr
#endif

#if !defined(XSIMD_IF_CONSTEXPR) && __cplusplus >= 201703L
// this means that the previous test failed, but we are using C++17 or higher
#define XSIMD_IF_CONSTEXPR if constexpr
#endif

#if !defined(XSIMD_IF_CONSTEXPR)
// this means that all the previous checks failed, so we fall back to a normal `if`
#define XSIMD_IF_CONSTEXPR if
#endif

#include "config/xsimd_config.hpp"

#include "arch/xsimd_scalar.hpp"
#include "memory/xsimd_aligned_allocator.hpp"

#if defined(XSIMD_NO_SUPPORTED_ARCHITECTURE)
// no type definitions or anything apart from the scalar definitions and the aligned allocator
#else
#include "types/xsimd_batch.hpp"
#include "types/xsimd_batch_constant.hpp"
#include "types/xsimd_traits.hpp"

// This include must come last
#include "types/xsimd_api.hpp"
#endif
#endif
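The cascade above defines each macro at the first check that passes, and every later block is disabled by its !defined(...) guard, so the fallback chain degrades from the exact feature test down to a harmless no-op. A minimal sketch of the same pattern reduced to two steps, plus a use of the resulting macro (the SKETCH_IF_CONSTEXPR name and the load function are hypothetical, not part of xsimd):

#include <cstdio>

// same fallback idea as in xsimd.hpp, reduced to two steps for the sketch
#ifdef __cpp_if_constexpr
#define SKETCH_IF_CONSTEXPR if constexpr
#else
#define SKETCH_IF_CONSTEXPR if
#endif

template <bool Aligned>
void load(const float* p)
{
    // under C++17 the dead branch is discarded at compile time;
    // under C++11/14 this degrades to an ordinary runtime if
    SKETCH_IF_CONSTEXPR (Aligned)
        std::printf("aligned load at %p\n", static_cast<const void*>(p));
    else
        std::printf("unaligned load at %p\n", static_cast<const void*>(p));
}

int main()
{
    float buf[4] = { 0 };
    load<true>(buf);
    load<false>(buf);
    return 0;
}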
@@ -1,37 +0,0 @@
schema: 1

bugzilla:
  product: Toolkit
  component: "General"

origin:
  name: xsimd
  description: C++ wrappers for SIMD intrinsics

  url: https://github.com/QuantStack/xsimd

  release: 5186173c33515769d49bae8cb8bc8469770427b8 (2022-12-06T11:35:51Z).
  revision: 5186173c33515769d49bae8cb8bc8469770427b8

  license: BSD-3-Clause

vendoring:
  url: https://github.com/QuantStack/xsimd
  source-hosting: github
  tracking: commit

  exclude:
    - ".*"
    - "*.md"
    - "*.yml"
    - "*.txt"
    - "*.in"
    - "*.sh"
    - benchmark
    - cmake
    - docs
    - examples
    - test

  keep:
    - include/