diff --git a/third_party/xsimd/Changelog.rst b/third_party/xsimd/Changelog.rst
index 9e6299c3b4cc..2c569a8bd3fc 100644
--- a/third_party/xsimd/Changelog.rst
+++ b/third_party/xsimd/Changelog.rst
@@ -9,6 +9,41 @@ Changelog
 =========
+12.1.1
+------
+
+    * Update readme with a section on adoption, and a section on the history of the project
+
+    * Fix/avx512vnni implementation
+
+    * Fix regression on XSIMD_NO_SUPPORTED_ARCHITECTURE
+
+12.1.0
+------
+
+    * Fix various problems with architecture version handling
+
+    * Specialize xsimd::compress for riscv
+
+    * Provide stubs for various avx512xx architectures
+
+12.0.0
+------
+
+    * Fix sincos implementation to cope with Emscripten
+
+    * Upgraded minimal version of cmake to remove deprecation warning
+
+    * Fixed constants::signmask for GCC when using ffast-math
+
+    * Add RISC-V Vector support
+
+    * Generic, simple implementation for xsimd::compress
+
+    * Disable batch of bools, and suggest using batch_bool instead
+
+    * Add an option to skip installation
+
 11.2.0
 ------
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp
index 10bf2abffbdf..05d27b3d4704 100644
--- a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp
@@ -95,12 +95,12 @@ namespace xsimd
         template
         inline batch bitofsign(batch const& self, requires_arch) noexcept
         {
-            return self & constants::minuszero>();
+            return self & constants::signmask>();
         }
         template
         inline batch bitofsign(batch const& self, requires_arch) noexcept
         {
-            return self & constants::minuszero>();
+            return self & constants::signmask>();
         }
 
         // bitwise_cast
@@ -974,12 +974,8 @@ namespace xsimd
         template
         inline batch, A> polar(const batch& r, const batch& theta, requires_arch) noexcept
         {
-#ifndef EMSCRIPTEN
             auto sincosTheta = sincos(theta);
             return { r * sincosTheta.second, r * sincosTheta.first };
-#else
-            return { r * cos(theta), r * sin(theta) };
-#endif
         }
 
         // fdim
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp
index 90e58c9d7be1..e9e9065832a1 100644
--- a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp
@@ -32,6 +32,60 @@ namespace xsimd
 
         using namespace types;
 
+        // compress
+        namespace detail
+        {
+            template
+            inline batch create_compress_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence)
+            {
+                batch swizzle_mask(IT(0));
+                alignas(A::alignment()) IT mask_buffer[batch::size] = { Is...
}; + size_t inserted = 0; + for (size_t i = 0; i < sizeof...(Is); ++i) + if ((bitmask >> i) & 1u) + std::swap(mask_buffer[inserted++], mask_buffer[i]); + return batch::load_aligned(&mask_buffer[0]); + } + } + + template + inline batch + compress(batch const& x, batch_bool const& mask, + kernel::requires_arch) noexcept + { + using IT = as_unsigned_integer_t; + constexpr std::size_t size = batch_bool::size; + auto bitmask = mask.mask(); + auto z = select(mask, x, batch((T)0)); + auto compress_mask = detail::create_compress_swizzle_mask(bitmask, ::xsimd::detail::make_index_sequence()); + return swizzle(z, compress_mask); + } + + // expand + namespace detail + { + template + inline batch create_expand_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence) + { + batch swizzle_mask(IT(0)); + IT j = 0; + (void)std::initializer_list { ((swizzle_mask = insert(swizzle_mask, j, index())), (j += ((bitmask >> Is) & 1u)), true)... }; + return swizzle_mask; + } + } + + template + inline batch + expand(batch const& x, batch_bool const& mask, + kernel::requires_arch) noexcept + { + constexpr std::size_t size = batch_bool::size; + auto bitmask = mask.mask(); + auto swizzle_mask = detail::create_expand_swizzle_mask, A>(bitmask, ::xsimd::detail::make_index_sequence()); + auto z = swizzle(x, swizzle_mask); + return select(mask, z, batch(T(0))); + } + // extract_pair template inline batch extract_pair(batch const& self, batch const& other, std::size_t i, requires_arch) noexcept diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512er.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512er.hpp new file mode 100644 index 000000000000..be02f9850b11 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512er.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_AVX512ER_HPP +#define XSIMD_AVX512ER_HPP + +#include +#include + +#include "../types/xsimd_avx512er_register.hpp" + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp index 572aba3b068e..7ee46101356a 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp @@ -661,6 +661,38 @@ namespace xsimd return _mm512_roundscale_pd(self, _MM_FROUND_TO_POS_INF); } + // compress + template + inline batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_compress_ps(mask.mask(), self); + } + template + inline batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_compress_pd(mask.mask(), self); + } + template + inline batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_compress_epi32(mask.mask(), self); + } + template + inline batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_compress_epi32(mask.mask(), self); + } + template + inline batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_compress_epi64(mask.mask(), self); + } + template + inline batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_compress_epi64(mask.mask(), self); + } + // convert namespace detail { @@ -756,6 +788,38 @@ namespace xsimd return register_type(~self.data ^ other.data); } + // expand + template + inline batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_expand_ps(mask.mask(), self); + } + template + inline batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_expand_pd(mask.mask(), self); + } + template + inline batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_expand_epi32(mask.mask(), self); + } + template + inline batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_expand_epi32(mask.mask(), self); + } + template + inline batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_expand_epi64(mask.mask(), self); + } + template + inline batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_expand_epi64(mask.mask(), self); + } + // floor template inline batch floor(batch const& self, requires_arch) noexcept @@ -1969,10 +2033,12 @@ namespace xsimd XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { assert(false && "not implemented yet"); + return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { assert(false && "not implemented yet"); + return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { @@ -2035,10 +2101,12 @@ namespace xsimd XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { assert(false && "not implemented yet"); + return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { assert(false && "not implemented yet"); + return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp new file mode 100644 index 000000000000..df382881b0b2 --- /dev/null +++ 
b/third_party/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512VBMI_HPP +#define XSIMD_AVX512VBMI_HPP + +#include +#include + +#include "../types/xsimd_avx512vbmi_register.hpp" + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp new file mode 100644 index 000000000000..6265c91718fb --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512PF_HPP +#define XSIMD_AVX512PF_HPP + +#include +#include + +#include "../types/xsimd_avx512pf_register.hpp" + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp new file mode 100644 index 000000000000..df382881b0b2 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512VBMI_HPP +#define XSIMD_AVX512VBMI_HPP + +#include +#include + +#include "../types/xsimd_avx512vbmi_register.hpp" + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp new file mode 100644 index 000000000000..b285623d02f6 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_AVX512VNNI_AVX512_BW_HPP +#define XSIMD_AVX512VNNI_AVX512_BW_HPP + +#include +#include + +#include "../types/xsimd_avx512vnni_avx512bw_register.hpp" + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi.hpp new file mode 100644 index 000000000000..a70d30fad598 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512VNNI_AVX512VBMI_HPP +#define XSIMD_AVX512VNNI_AVX512VBMI_HPP + +#include +#include + +#include "../types/xsimd_avx512vnni_avx512vbmi_register.hpp" + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp new file mode 100644 index 000000000000..a97ba9296c51 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVXVNNI_HPP +#define XSIMD_AVXVNNI_HPP + +#include +#include + +#include "../types/xsimd_avxvnni_register.hpp" + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp index a9cf971db5ad..22dd5d3e3030 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp @@ -56,6 +56,11 @@ namespace xsimd return bit_cast((uint64_t)DOUBLE); \ } +// Under fast-math, GCC might replace signmask (minus zero) by zero +#if defined(__FAST_MATH__) && defined(__GNUC__) && !defined(__clang__) +#pragma GCC push_options +#pragma GCC optimize("signed-zeros") +#endif XSIMD_DEFINE_CONSTANT(infinity, (std::numeric_limits::infinity()), (std::numeric_limits::infinity())) XSIMD_DEFINE_CONSTANT(invlog_2, 1.442695040888963407359924681001892137426645954152986f, 1.442695040888963407359924681001892137426645954152986) XSIMD_DEFINE_CONSTANT_HEX(invlog_2hi, 0x3fb8b000, 0x3ff7154765200000) @@ -79,7 +84,6 @@ namespace xsimd XSIMD_DEFINE_CONSTANT(minlog2, -127.0f, -1023.) 
XSIMD_DEFINE_CONSTANT(minlog10, -37.89999771118164f, -308.2547155599167) XSIMD_DEFINE_CONSTANT(minusinfinity, (-infinity()), (-infinity())) - XSIMD_DEFINE_CONSTANT(minuszero, -0.0f, -0.0) XSIMD_DEFINE_CONSTANT_HEX(nan, 0xffffffff, 0xffffffffffffffff) XSIMD_DEFINE_CONSTANT_HEX(oneosqrteps, 0x453504f3, 0x4190000000000000) XSIMD_DEFINE_CONSTANT_HEX(oneotwoeps, 0x4a800000, 0x4320000000000000) @@ -104,6 +108,9 @@ namespace xsimd XSIMD_DEFINE_CONSTANT_HEX(twoopi, 0x3f22f983, 0x3fe45f306dc9c883) XSIMD_DEFINE_CONSTANT(twotonmb, 8388608.0f, 4503599627370496.0) XSIMD_DEFINE_CONSTANT_HEX(twotonmbo3, 0x3ba14518, 0x3ed428a2f98d7286) +#if defined(__FAST_MATH__) && defined(__GNUC__) && !defined(__clang__) +#pragma GCC pop_options +#endif #undef XSIMD_DEFINE_CONSTANT #undef XSIMD_DEFINE_CONSTANT_HEX diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp index 8f05a5dab2c0..0edd77674178 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp @@ -52,6 +52,10 @@ #include "./xsimd_fma3_avx.hpp" #endif +#if XSIMD_WITH_AVXVNNI +#include "./xsimd_avxvnni.hpp" +#endif + #if XSIMD_WITH_AVX2 #include "./xsimd_avx2.hpp" #endif @@ -68,6 +72,30 @@ #include "./xsimd_avx512bw.hpp" #endif +#if XSIMD_WITH_AVX512ER +#include "./xsimd_avx512er.hpp" +#endif + +#if XSIMD_WITH_AVX512PF +#include "./xsimd_avx512pf.hpp" +#endif + +#if XSIMD_WITH_AVX512IFMA +#include "./xsimd_avx512ifma.hpp" +#endif + +#if XSIMD_WITH_AVX512VBMI +#include "./xsimd_avx512vbmi.hpp" +#endif + +#if XSIMD_WITH_AVX512VNNI_AVX512BW +#include "./xsimd_avx512vnni_avx512bw.hpp" +#endif + +#if XSIMD_WITH_AVX512VNNI_AVX512VBMI +#include "./xsimd_avx512vnni_avx512vbmi.hpp" +#endif + #if XSIMD_WITH_NEON #include "./xsimd_neon.hpp" #endif @@ -80,6 +108,10 @@ #include "./xsimd_sve.hpp" #endif +#if XSIMD_WITH_RVV +#include "./xsimd_rvv.hpp" +#endif + #if XSIMD_WITH_WASM #include "./xsimd_wasm.hpp" #endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_rvv.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_rvv.hpp new file mode 100644 index 000000000000..98d1de9ce341 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_rvv.hpp @@ -0,0 +1,1499 @@ +/*************************************************************************** + + * Copyright (c) Rivos Inc. * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_RVV_HPP +#define XSIMD_RVV_HPP + +#include +#include +#include + +#include "../types/xsimd_rvv_register.hpp" +#include "xsimd_constants.hpp" + +// This set of macros allows the synthesis of identifiers using a template and +// variable macro arguments. A single template can then be used by multiple +// macros, or multiple instances of a macro to define the same logic for +// different data types. +// +// First some logic to paste text together... +// +#define XSIMD_RVV_JOIN_(x, y) x##y +#define XSIMD_RVV_JOIN(x, y) XSIMD_RVV_JOIN_(x, y) +#define XSIMD_RVV_PREFIX_T(T, S, then) XSIMD_RVV_JOIN(T, then) +#define XSIMD_RVV_PREFIX_S(T, S, then) XSIMD_RVV_JOIN(S, then) +#define XSIMD_RVV_PREFIX_M(T, S, then) XSIMD_RVV_JOIN(m1, then) +#define XSIMD_RVV_PREFIX(T, S, then) then +// +// XSIMD_RVV_IDENTIFIER accepts type and size parameters, and a template for +// the identifier. 
The template is a comma-separated list of alternating +// literal and parameter segments. Each parameter is appended to XSIMD_RVV_PREFIX to +// form a new macro name which decides which parameter should be inserted. +// Then a literal segment is inserted after that. Empty literals are used to +// join two or more variables together. +// +#define XSIMD_RVV_IDENTIFIER9(T, S, t, ...) t +#define XSIMD_RVV_IDENTIFIER8(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER9(T, S, __VA_ARGS__))) +#define XSIMD_RVV_IDENTIFIER7(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER8(T, S, __VA_ARGS__))) +#define XSIMD_RVV_IDENTIFIER6(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER7(T, S, __VA_ARGS__))) +#define XSIMD_RVV_IDENTIFIER5(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER6(T, S, __VA_ARGS__))) +#define XSIMD_RVV_IDENTIFIER4(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER5(T, S, __VA_ARGS__))) +#define XSIMD_RVV_IDENTIFIER3(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER4(T, S, __VA_ARGS__))) +#define XSIMD_RVV_IDENTIFIER2(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER3(T, S, __VA_ARGS__))) +#define XSIMD_RVV_IDENTIFIER1(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER2(T, S, __VA_ARGS__))) +#define XSIMD_RVV_IDENTIFIER0(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER1(T, S, __VA_ARGS__))) +// +// UNBRACKET and REPARSE force the preprocessor to handle expansion in a +// specific order. XSIMD_RVV_UNBRACKET strips the parentheses from the template +// (which were necessary to keep the template as a single, named macro +// parameter up to this point). XSIMD_RVV_ARG_LIST then forms the new parameter list +// to pass to XSIMD_RVV_IDENTIFIER0, with trailing commas to ensure the unrolled +// XSIMD_RVV_IDENTIFIER loop runs to completion adding empty strings. +// +// However XSIMD_RVV_IDENTIFIER0 is not expanded immediately because it does not +// match a function-like macro in this pass. XSIMD_RVV_REPARSE forces another +// evaluation after the expansion of XSIMD_RVV_ARG_LIST, where XSIMD_RVV_IDENTIFIER0 will +// now match as a function-like macro, and the cycle of substitutions and +// insertions can begin. +// +#define XSIMD_RVV_REPARSE(v) (v) +#define XSIMD_RVV_UNBRACKET(...) __VA_ARGS__ +#define XSIMD_RVV_ARG_LIST(T, S, name) (T, S, XSIMD_RVV_UNBRACKET name, , , , , , , , , , , , , , , , , , , , , ) +#define XSIMD_RVV_IDENTIFIER(T, S, name) XSIMD_RVV_REPARSE(XSIMD_RVV_IDENTIFIER0 XSIMD_RVV_ARG_LIST(T, S, name)) +// +// To avoid comma-counting bugs, replace the variable references with macros +// which include enough commas to keep proper phase, and then use no commas at +// all in the templates. +// +#define XSIMD_RVV_T , _T, +#define XSIMD_RVV_S , _S, +#define XSIMD_RVV_M , _M, +#define XSIMD_RVV_TSM XSIMD_RVV_T XSIMD_RVV_S XSIMD_RVV_M + +// XSIMD_RVV_OVERLOAD, below, expands to a head section, a number of body sections +// (depending on which types are supported), and a tail section. Different +// variants of these sections are implemented with different suffixes on the +// three macro names XSIMD_RVV_WRAPPER_HEAD, XSIMD_RVV_WRAPPER, and XSIMD_RVV_WRAPPER_TAIL and +// specified as an argument to XSIMD_RVV_OVERLOAD (the empty string is the default, +// but still needs an extra comma to hold its place). 
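// For orientation only, here is a stand-alone, simplified sketch of the shape
// these head/body/tail sections ultimately produce: a functor template
// specialised per call signature, then folded into a single constexpr instance
// whose operator() overloads cover each supported element type. Every name
// below is invented for illustration; it is not part of xsimd or of the RVV
// intrinsics.
template <class Sig>
struct demo_impl
{
    // primary template: inert fallback, like the default impl the real wrappers provide
    void operator()() const noexcept {}
};
template <>
struct demo_impl<int(int)>
{
    // stands in for an integer intrinsic wrapper
    constexpr int operator()(int x) const noexcept { return x + 1; }
};
template <>
struct demo_impl<float(float)>
{
    // stands in for the differently-named floating-point intrinsic wrapper
    constexpr float operator()(float x) const noexcept { return x + 1.0f; }
};
// the "tail": one object, many overloads, usable from templates under a single name
static constexpr struct : demo_impl<int(int)>, demo_impl<float(float)>
{
    using demo_impl<int(int)>::operator();
    using demo_impl<float(float)>::operator();
} demo_op {};
// demo_op(2) resolves to the int overload; demo_op(2.0f) to the float one.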
+// +// The default XSIMD_RVV_WRAPPER_HEAD provides a class containing convenient names +// for the function signature argument(s) to XSIMD_RVV_OVERLOAD. That signature can +// also reference the template argument T, because it's a text substitution +// into the template. +#define XSIMD_RVV_WRAPPER_HEAD(NAME, SIGNATURE, ...) \ + namespace NAME##_cruft \ + { \ + template \ + struct ctx \ + { \ + static constexpr size_t width = XSIMD_RVV_BITS; \ + static constexpr size_t vl = width / (sizeof(T) * 8); \ + using vec = rvv_reg_t; \ + using uvec = rvv_reg_t, width>; \ + using svec = rvv_reg_t, width>; \ + using fvec = rvv_reg_t, width>; \ + using bvec = rvv_bool_t; \ + using scalar_vec = rvv_reg_t; \ + using wide_vec = rvv_reg_t; \ + using narrow_vec = rvv_reg_t; \ + using type = SIGNATURE; \ + }; \ + template \ + using sig_t = typename ctx::type; \ + template \ + struct impl \ + { \ + void operator()() const noexcept {}; \ + }; \ + template \ + using impl_t = impl>; + +#define XSIMD_RVV_WRAPPER_HEAD_NOVL(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__) +#define XSIMD_RVV_WRAPPER_HEAD_DROP_1ST(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__) +#define XSIMD_RVV_WRAPPER_HEAD_DROP_1ST_CUSTOM_ARGS(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__) +#define XSIMD_RVV_WRAPPER_HEAD_DROP_1ST_CUSTOM_ARGS_NOVL(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__) + +// The body of the wrapper defines a functor (because partial specialisation of +// functions is not legal) which forwards its arguments to the named intrinsic +// with a few manipulations. In general, vector types are handled as +// rvv_reg_t<> and rely on the conversion operators in that class for +// compatibility with the intrinsics. +// +// The function signature is not mentioned here. Instead it's provided in the +// tail code as the template argument for which this is a specialisation, which +// overcomes the problem of converting a function signature type to an argument +// list to pass to another function. +// +#define XSIMD_RVV_WRAPPER(KEY, CALLEE, ...) \ + template \ + struct impl \ + { \ + using ctx = ctx; \ + constexpr Ret operator()(Args... args) const noexcept \ + { \ + return CALLEE(args..., ctx::vl); \ + }; \ + }; +#define XSIMD_RVV_WRAPPER_NOVL(KEY, CALLEE, ...) \ + template \ + struct impl \ + { \ + constexpr Ret operator()(Args... args) const noexcept \ + { \ + return CALLEE(args...); \ + }; \ + }; +#define XSIMD_RVV_WRAPPER_DROP_1ST(KEY, CALLEE, ...) \ + template \ + struct impl \ + { \ + using ctx = ctx; \ + constexpr Ret operator()(First, Args... args) const noexcept \ + { \ + return CALLEE(args..., ctx::vl); \ + }; \ + }; +#define XSIMD_RVV_WRAPPER_DROP_1ST_CUSTOM_ARGS(KEY, CALLEE, SIGNATURE, ...) \ + template \ + struct impl \ + { \ + using ctx = ctx; \ + constexpr Ret operator()(First, Args... args) const noexcept \ + { \ + return CALLEE(__VA_ARGS__, ctx::vl); \ + }; \ + }; +#define XSIMD_RVV_WRAPPER_DROP_1ST_CUSTOM_ARGS_NOVL(KEY, CALLEE, SIGNATURE, ...) \ + template \ + struct impl \ + { \ + constexpr Ret operator()(First, Args... args) const noexcept \ + { \ + return CALLEE(__VA_ARGS__); \ + }; \ + }; + +// This part folds all the above templates down into a single functor instance +// with all the different function signatures available under the one name. +// Not all of the base classes necessarily contain useful code, but there's a +// default implementation so that filtering them out isn't really necessary. +#define XSIMD_RVV_WRAPPER_TAIL(NAME, ...) 
\ + } /* namespace NAME##_cruft */ \ + static constexpr struct : NAME##_cruft::impl_t, \ + NAME##_cruft::impl_t, \ + NAME##_cruft::impl_t, \ + NAME##_cruft::impl_t, \ + NAME##_cruft::impl_t, \ + NAME##_cruft::impl_t, \ + NAME##_cruft::impl_t, \ + NAME##_cruft::impl_t, \ + NAME##_cruft::impl_t, \ + NAME##_cruft::impl_t \ + { \ + using NAME##_cruft::impl_t::operator(); \ + using NAME##_cruft::impl_t::operator(); \ + using NAME##_cruft::impl_t::operator(); \ + using NAME##_cruft::impl_t::operator(); \ + using NAME##_cruft::impl_t::operator(); \ + using NAME##_cruft::impl_t::operator(); \ + using NAME##_cruft::impl_t::operator(); \ + using NAME##_cruft::impl_t::operator(); \ + using NAME##_cruft::impl_t::operator(); \ + using NAME##_cruft::impl_t::operator(); \ + } NAME {}; +#define XSIMD_RVV_WRAPPER_TAIL_NOVL(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__) +#define XSIMD_RVV_WRAPPER_TAIL_DROP_1ST(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__) +#define XSIMD_RVV_WRAPPER_TAIL_DROP_1ST_CUSTOM_ARGS(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__) +#define XSIMD_RVV_WRAPPER_TAIL_DROP_1ST_CUSTOM_ARGS_NOVL(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__) + +// clang-format off + +#define XSIMD_RVV_OVERLOAD_head(my_name, variant, ...) \ + XSIMD_RVV_WRAPPER_HEAD##variant(my_name, __VA_ARGS__) +#define XSIMD_RVV_OVERLOAD_i(name, variant, ...) \ + XSIMD_RVV_WRAPPER##variant(int8_t, XSIMD_RVV_IDENTIFIER(i, 8, name), __VA_ARGS__) \ + XSIMD_RVV_WRAPPER##variant(int16_t, XSIMD_RVV_IDENTIFIER(i, 16, name), __VA_ARGS__) \ + XSIMD_RVV_WRAPPER##variant(int32_t, XSIMD_RVV_IDENTIFIER(i, 32, name), __VA_ARGS__) \ + XSIMD_RVV_WRAPPER##variant(int64_t, XSIMD_RVV_IDENTIFIER(i, 64, name), __VA_ARGS__) +#define XSIMD_RVV_OVERLOAD_u(name, variant, ...) \ + XSIMD_RVV_WRAPPER##variant(uint8_t, XSIMD_RVV_IDENTIFIER(u, 8, name), __VA_ARGS__) \ + XSIMD_RVV_WRAPPER##variant(uint16_t, XSIMD_RVV_IDENTIFIER(u, 16, name), __VA_ARGS__) \ + XSIMD_RVV_WRAPPER##variant(uint32_t, XSIMD_RVV_IDENTIFIER(u, 32, name), __VA_ARGS__) \ + XSIMD_RVV_WRAPPER##variant(uint64_t, XSIMD_RVV_IDENTIFIER(u, 64, name), __VA_ARGS__) +#define XSIMD_RVV_OVERLOAD_f(name, variant, ...) \ + XSIMD_RVV_WRAPPER##variant(float, XSIMD_RVV_IDENTIFIER(f, 32, name), __VA_ARGS__) \ + XSIMD_RVV_WRAPPER##variant(double, XSIMD_RVV_IDENTIFIER(f, 64, name), __VA_ARGS__) +#define XSIMD_RVV_OVERLOAD_tail(my_name, variant, ...) \ + XSIMD_RVV_WRAPPER_TAIL##variant(my_name, __VA_ARGS__) + +// Use these to create function (actually functor, sorry) wrappers overloaded +// for whichever types are supported. Being functors means they can't take a +// template argument (until C++14), so if a type can't be deduced then a junk +// value can be passed as the first argument and discarded by using the +// _DROP_1ST variant, instead. +// +// The wrappers use the rvv_reg_t<> types for template accessibility, and +// because some types (eg., vfloat64mf2_t) don't exist and need extra +// abstraction to emulate. +// +// In many cases the intrinsic names are different for signed, unsigned, or +// float variants, the macros OVERLOAD2 and OVERLOAD3 (depending on whether or +// not a float variant exists) take multiple intrinsic names and bring them +// together under a single overloaded identifier where they can be used within +// templates. +// +#define XSIMD_RVV_OVERLOAD2(my_name, name_i, name_u, variant, ...) 
\ + XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_i(name_i, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_u(name_u, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__) + +#define XSIMD_RVV_OVERLOAD3(my_name, name_i, name_u, name_f, variant, ...) \ + XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_i(name_i, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_u(name_u, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_f(name_f, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__) + +#define XSIMD_RVV_OVERLOAD(my_name, name, ...) XSIMD_RVV_OVERLOAD3(my_name, name, name, name, __VA_ARGS__) +#define XSIMD_RVV_OVERLOAD_INTS(my_name, name, ...) XSIMD_RVV_OVERLOAD2(my_name, name, name, __VA_ARGS__) + +#define XSIMD_RVV_OVERLOAD_SINTS(my_name, name, variant, ...) \ + XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_i(name, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__) + +#define XSIMD_RVV_OVERLOAD_UINTS(my_name, name, variant, ...) \ + XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_u(name, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__) + +#define XSIMD_RVV_OVERLOAD_FLOATS(my_name, name, variant, ...) \ + XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_f(name, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__) + +// clang-format on + +namespace xsimd +{ + template + struct batch_constant; + + namespace kernel + { + namespace detail + { + template + using rvv_fix_char_t = types::detail::rvv_fix_char_t; + template + using rvv_reg_t = types::detail::rvv_reg_t; + template + using rvv_bool_t = types::detail::rvv_bool_t; + + template + struct as_signed_relaxed; + template <> + struct as_signed_relaxed<1> + { + using type = int8_t; + }; + template <> + struct as_signed_relaxed<2> + { + using type = int16_t; + }; + template <> + struct as_signed_relaxed<4> + { + using type = int32_t; + }; + template <> + struct as_signed_relaxed<8> + { + using type = int64_t; + }; + template + using as_signed_relaxed_t = typename as_signed_relaxed::type; + template + struct as_unsigned_relaxed; + template <> + struct as_unsigned_relaxed<1> + { + using type = uint8_t; + }; + template <> + struct as_unsigned_relaxed<2> + { + using type = uint16_t; + }; + template <> + struct as_unsigned_relaxed<4> + { + using type = uint32_t; + }; + template <> + struct as_unsigned_relaxed<8> + { + using type = uint64_t; + }; + template + using as_unsigned_relaxed_t = typename as_unsigned_relaxed::type; + template + struct as_float_relaxed; + template <> + struct as_float_relaxed<1> + { + using type = int8_t; + }; + template <> + struct as_float_relaxed<2> + { + using type = int16_t; + }; + template <> + struct as_float_relaxed<4> + { + using type = float; + }; + template <> + struct as_float_relaxed<8> + { + using type = double; + }; + template + using as_float_relaxed_t = typename as_float_relaxed::type; + + template + rvv_reg_t rvvreinterpret(U const& arg) noexcept + { + return rvv_reg_t(arg, types::detail::XSIMD_RVV_BITCAST); + } + template + rvv_reg_t rvvreinterpret(batch const& arg) noexcept + { + typename batch::register_type r = arg; + return rvvreinterpret(r); + } + + template > + inline batch rvv_to_unsigned_batch(batch const& arg) noexcept + { + return rvvreinterpret(arg.data); + } + + XSIMD_RVV_OVERLOAD(rvvid, + (__riscv_vid_v_u XSIMD_RVV_S 
XSIMD_RVV_M), _DROP_1ST, uvec(T)) + + XSIMD_RVV_OVERLOAD3(rvvmv_splat, + (__riscv_vmv_v_x_ XSIMD_RVV_TSM), + (__riscv_vmv_v_x_ XSIMD_RVV_TSM), + (__riscv_vfmv_v_f_ XSIMD_RVV_TSM), , vec(T)) + + XSIMD_RVV_OVERLOAD3(rvvmv_lane0, + (__riscv_vmv_x), + (__riscv_vmv_x), + (__riscv_vfmv_f), _NOVL, T(vec)) + + XSIMD_RVV_OVERLOAD(rvvmerge, (__riscv_vmerge), , vec(vec, vec, bvec)) + XSIMD_RVV_OVERLOAD3(rvvmerge_splat, + (__riscv_vmerge), + (__riscv_vmerge), + (__riscv_vfmerge), , vec(vec, T, bvec)) + + // count active lanes in a predicate + XSIMD_RVV_OVERLOAD(rvvcpop, (__riscv_vcpop), + , size_t(bvec)); + + template + inline rvv_bool_t pmask8(uint8_t mask) noexcept + { + return rvv_bool_t(mask); + } + template + inline rvv_bool_t pmask(uint64_t mask) noexcept + { + return rvv_bool_t(mask); + } + + template + inline rvv_reg_t vindex() noexcept + { + auto index = rvvid(T {}); + if (shift < 0) + index = __riscv_vsrl(index, -shift, batch::size); + else + index = __riscv_vsll(index, shift, batch::size); + return __riscv_vadd(index, T(offset), batch::size); + } + + // enable for signed integers + template + using rvv_enable_signed_int_t = typename std::enable_if::value && std::is_signed::value, int>::type; + + // enable for unsigned integers + template + using rvv_enable_unsigned_int_t = typename std::enable_if::value && std::is_unsigned::value, int>::type; + + // enable for floating points + template + using rvv_enable_floating_point_t = typename std::enable_if::value, int>::type; + + // enable for signed integers or floating points + template + using rvv_enable_signed_int_or_floating_point_t = typename std::enable_if::value, int>::type; + + // enable for all RVE supported types + template + using rvv_enable_all_t = typename std::enable_if::value, int>::type; + } // namespace detail + + /******************** + * Scalar to vector * + ********************/ + + namespace detail + { + template + inline detail::rvv_reg_t broadcast(T arg) noexcept + { + // A bit of a dance, here, because rvvmv_splat has no other + // argument from which to deduce type, and T=char is not + // supported. 
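                // (Illustrative aside, inferred from the usage below rather than
                // from rvv_fix_char_t's definition: the trait presumably maps
                // plain `char` onto the equivalent int8_t/uint8_t element type so
                // the splat sees a type the intrinsics recognise; the value's bit
                // pattern is unchanged.)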
+ detail::rvv_fix_char_t arg_not_char(arg); + const auto splat = detail::rvvmv_splat(arg_not_char); + return detail::rvv_reg_t(splat.get_bytes(), types::detail::XSIMD_RVV_BITCAST); + } + } + + // broadcast + template + inline batch broadcast(T arg, requires_arch) noexcept + { + return detail::broadcast(arg); + } + + /********* + * Load * + *********/ + + namespace detail + { + XSIMD_RVV_OVERLOAD(rvvle, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , vec(T const*)) + XSIMD_RVV_OVERLOAD(rvvse, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , void(T*, vec)) + } + + template = 0> + inline batch load_aligned(T const* src, convert, requires_arch) noexcept + { + return detail::rvvle(reinterpret_cast const*>(src)); + } + + template = 0> + inline batch load_unaligned(T const* src, convert, requires_arch) noexcept + { + return load_aligned(src, convert(), rvv {}); + } + + // load_complex + namespace detail + { + template = types::detail::rvv_width_m1, int>::type = 0> + inline rvv_reg_t rvvabut(rvv_reg_t const& lo, rvv_reg_t const& hi) noexcept + { + typename rvv_reg_t::register_type tmp; + tmp = __riscv_vset(tmp, 0, lo); + return __riscv_vset(tmp, 1, hi); + } + + template ::type = 0> inline rvv_reg_t rvvabut(rvv_reg_t const& lo, rvv_reg_t const& hi) noexcept + { + return __riscv_vslideup(lo, hi, lo.vl, lo.vl * 2); + } + + XSIMD_RVV_OVERLOAD(rvvget_lo_, (__riscv_vget_ XSIMD_RVV_TSM), _DROP_1ST_CUSTOM_ARGS_NOVL, vec(T, wide_vec), args..., 0) + XSIMD_RVV_OVERLOAD(rvvget_hi_, (__riscv_vget_ XSIMD_RVV_TSM), _DROP_1ST_CUSTOM_ARGS_NOVL, vec(T, wide_vec), args..., 1) + + template = types::detail::rvv_width_m1, int>::type = 0> + rvv_reg_t rvvget_lo(rvv_reg_t const& vv) noexcept + { + typename rvv_reg_t::register_type tmp = rvvget_lo_(T {}, vv); + return tmp; + } + template = types::detail::rvv_width_m1, int>::type = 0> + rvv_reg_t rvvget_hi(rvv_reg_t const& vv) noexcept + { + typename rvv_reg_t::register_type tmp = rvvget_hi_(T {}, vv); + return tmp; + } + template ::type = 0> rvv_reg_t rvvget_lo(rvv_reg_t const& vv) noexcept + { + typename rvv_reg_t::register_type tmp = vv; + return tmp; + } + template ::type = 0> rvv_reg_t rvvget_hi(rvv_reg_t const& vv) noexcept + { + return __riscv_vslidedown(vv, vv.vl / 2, vv.vl); + } + + template = 0> + inline batch, A> load_complex(batch const& lo, batch const& hi, requires_arch) noexcept + { + const auto real_index = vindex, 0, 1>(); + const auto imag_index = vindex, 1, 1>(); + const auto index = rvvabut, A::width>(real_index, imag_index); + const auto input = rvvabut(lo.data, hi.data); + const rvv_reg_t result = __riscv_vrgather(input, index, index.vl); + + return { rvvget_lo(result), rvvget_hi(result) }; + } + } + + /********* + * Store * + *********/ + + template = 0> + inline void store_aligned(T* dst, batch const& src, requires_arch) noexcept + { + detail::rvvse(reinterpret_cast*>(dst), src); + } + + template = 0> + inline void store_unaligned(T* dst, batch const& src, requires_arch) noexcept + { + store_aligned(dst, src, rvv {}); + } + + /****************** + * scatter/gather * + ******************/ + + namespace detail + { + template + using rvv_enable_sg_t = typename std::enable_if<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>::type; + XSIMD_RVV_OVERLOAD(rvvloxei, (__riscv_vloxei XSIMD_RVV_S), , vec(T const*, uvec)) + XSIMD_RVV_OVERLOAD(rvvsoxei, (__riscv_vsoxei XSIMD_RVV_S), , void(T*, uvec, vec)) + XSIMD_RVV_OVERLOAD3(rvvmul_splat, + (__riscv_vmul), + (__riscv_vmul), + (__riscv_vfmul), , vec(vec, T)) + } + + // scatter + template = 0> + 
inline void scatter(batch const& vals, T* dst, batch const& index, kernel::requires_arch) noexcept + { + using UU = as_unsigned_integer_t; + const auto uindex = detail::rvv_to_unsigned_batch(index); + auto* base = reinterpret_cast*>(dst); + // or rvvsuxei + const auto bi = detail::rvvmul_splat(uindex, sizeof(T)); + detail::rvvsoxei(base, bi, vals); + } + + // gather + template = 0> + inline batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept + { + using UU = as_unsigned_integer_t; + const auto uindex = detail::rvv_to_unsigned_batch(index); + auto const* base = reinterpret_cast const*>(src); + // or rvvluxei + const auto bi = detail::rvvmul_splat(uindex, sizeof(T)); + return detail::rvvloxei(base, bi); + } + + /************** + * Arithmetic * + **************/ + + namespace detail + { + XSIMD_RVV_OVERLOAD3(rvvadd, + (__riscv_vadd), + (__riscv_vadd), + (__riscv_vfadd), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD2(rvvsadd, + (__riscv_vsadd), + (__riscv_vsaddu), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvsub, + (__riscv_vsub), + (__riscv_vsub), + (__riscv_vfsub), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD2(rvvssub, + (__riscv_vssub), + (__riscv_vssubu), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD2(rvvaadd, + (__riscv_vaadd), + (__riscv_vaaddu), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvmul, + (__riscv_vmul), + (__riscv_vmul), + (__riscv_vfmul), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvdiv, + (__riscv_vdiv), + (__riscv_vdivu), + (__riscv_vfdiv), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvmax, + (__riscv_vmax), + (__riscv_vmaxu), + (__riscv_vfmax), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvmin, + (__riscv_vmin), + (__riscv_vminu), + (__riscv_vfmin), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvneg, + (__riscv_vneg), + (abort), + (__riscv_vfneg), , vec(vec)) + XSIMD_RVV_OVERLOAD_FLOATS(rvvabs, + (__riscv_vfabs), , vec(vec)) + XSIMD_RVV_OVERLOAD3(rvvmacc, + (__riscv_vmacc), + (__riscv_vmacc), + (__riscv_vfmacc), , vec(vec, vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvnmsac, + (__riscv_vnmsac), + (__riscv_vnmsac), + (__riscv_vfnmsac), , vec(vec, vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvmadd, + (__riscv_vmadd), + (__riscv_vmadd), + (__riscv_vfmadd), , vec(vec, vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvnmsub, + (__riscv_vnmsub), + (__riscv_vnmsub), + (__riscv_vfnmsub), , vec(vec, vec, vec)) + +#define RISCV_VMSXX(XX) \ + XSIMD_RVV_OVERLOAD3(rvvms##XX, \ + (__riscv_vms##XX), \ + (__riscv_vms##XX##u), \ + (__riscv_vmf##XX), , bvec(vec, vec)) \ + XSIMD_RVV_OVERLOAD3(rvvms##XX##_splat, \ + (__riscv_vms##XX), \ + (__riscv_vms##XX##u), \ + (__riscv_vmf##XX), , bvec(vec, T)) +#define __riscv_vmsequ __riscv_vmseq +#define __riscv_vmsneu __riscv_vmsne + RISCV_VMSXX(eq) + RISCV_VMSXX(ne) + RISCV_VMSXX(lt) + RISCV_VMSXX(le) + RISCV_VMSXX(gt) + RISCV_VMSXX(ge) +#undef __riscv_vmsequ +#undef __riscv_vmsneu +#undef RISCV_VMSXX + } // namespace detail + + // add + template = 0> + inline batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvadd(lhs, rhs); + } + + // sadd + template = 0> + inline batch sadd(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvsadd(lhs, rhs); + } + + // sub + template = 0> + inline batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvsub(lhs, rhs); + } + + // ssub + template = 0> + inline batch ssub(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvssub(lhs, rhs); + } + + // mul + template = 0> + inline batch mul(batch const& lhs, batch 
const& rhs, requires_arch) noexcept + { + return detail::rvvmul(lhs, rhs); + } + + // div + template = 0> + inline batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvdiv(lhs, rhs); + } + + // max + template = 0> + inline batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvmax(lhs, rhs); + } + + // min + template = 0> + inline batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvmin(lhs, rhs); + } + + // neg + template = 0> + inline batch neg(batch const& arg, requires_arch) noexcept + { + using S = as_signed_integer_t; + const auto as_signed = detail::rvvreinterpret(arg); + const auto result = detail::rvvneg(as_signed); + return detail::rvvreinterpret(result); + } + + template = 0> + inline batch neg(batch const& arg, requires_arch) noexcept + { + return detail::rvvneg(arg); + } + + // abs + template = 0> + inline batch abs(batch const& arg, requires_arch) noexcept + { + return arg; + } + + template = 0> + inline batch abs(batch const& arg, requires_arch) noexcept + { + return detail::rvvabs(arg); + } + + // fma: x * y + z + template = 0> + inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + // also detail::rvvmadd(x, y, z); + return detail::rvvmacc(z, x, y); + } + + // fnma: z - x * y + template = 0> + inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + // also detail::rvvnmsub(x, y, z); + return detail::rvvnmsac(z, x, y); + } + + // fms: x * y - z + template = 0> + inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + // also vfmsac(z, x, y), but lacking integer version + // also vfmsub(x, y, z), but lacking integer version + return -fnma(x, y, z); + } + + // fnms: - x * y - z + template = 0> + inline batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + // also vfnmacc(z, x, y), but lacking integer version + // also vfnmadd(x, y, z), but lacking integer version + return -fma(z, x, y); + } + + /********************** + * Logical operations * + **********************/ + + namespace detail + { + XSIMD_RVV_OVERLOAD_INTS(rvvand, (__riscv_vand), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD_INTS(rvvor, (__riscv_vor), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD_INTS(rvvor_splat, (__riscv_vor), , vec(vec, T)) + XSIMD_RVV_OVERLOAD_INTS(rvvxor, (__riscv_vxor), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD_INTS(rvvnot, (__riscv_vnot), , vec(vec)) + XSIMD_RVV_OVERLOAD(rvvmand, (__riscv_vmand_mm_b XSIMD_RVV_S), , bvec(bvec, bvec)) + XSIMD_RVV_OVERLOAD(rvvmor, (__riscv_vmor_mm_b XSIMD_RVV_S), , bvec(bvec, bvec)) + XSIMD_RVV_OVERLOAD(rvvmxor, (__riscv_vmxor_mm_b XSIMD_RVV_S), , bvec(bvec, bvec)) + XSIMD_RVV_OVERLOAD(rvvmandn, (__riscv_vmandn_mm_b XSIMD_RVV_S), , bvec(bvec, bvec)) + XSIMD_RVV_OVERLOAD(rvvmnot, (__riscv_vmnot), , bvec(bvec)) + } + + // bitwise_and + template = 0> + inline batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvand(lhs, rhs); + } + + template = 0> + inline batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs); + const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs); + const auto result_bits = detail::rvvand(lhs_bits, rhs_bits); + return detail::rvvreinterpret(result_bits); + } + + template = 0> + inline batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, 
requires_arch) noexcept + { + return detail::rvvmand(lhs, rhs); + } + + // bitwise_andnot + template = 0> + inline batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto not_rhs = detail::rvvnot(rhs); + return detail::rvvand(lhs, not_rhs); + } + + template = 0> + inline batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs); + const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs); + const auto not_rhs = detail::rvvnot(rhs_bits); + const auto result_bits = detail::rvvand(lhs_bits, not_rhs); + return detail::rvvreinterpret(result_bits); + } + + template = 0> + inline batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return detail::rvvmandn(lhs, rhs); + } + + // bitwise_or + template = 0> + inline batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvor(lhs, rhs); + } + + template = 0> + inline batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs); + const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs); + const auto result_bits = detail::rvvor(lhs_bits, rhs_bits); + return detail::rvvreinterpret(result_bits); + } + + template = 0> + inline batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return detail::rvvmor(lhs, rhs); + } + + // bitwise_xor + template = 0> + inline batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvxor(lhs, rhs); + } + + template = 0> + inline batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs); + const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs); + const auto result_bits = detail::rvvxor(lhs_bits, rhs_bits); + return detail::rvvreinterpret(result_bits); + } + + template = 0> + inline batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return detail::rvvmxor(lhs, rhs); + } + + // bitwise_not + template = 0> + inline batch bitwise_not(batch const& arg, requires_arch) noexcept + { + return detail::rvvnot(arg); + } + + template = 0> + inline batch bitwise_not(batch const& arg, requires_arch) noexcept + { + const auto arg_bits = detail::rvv_to_unsigned_batch(arg); + const auto result_bits = detail::rvvnot(arg_bits); + return detail::rvvreinterpret(result_bits); + } + + template = 0> + inline batch_bool bitwise_not(batch_bool const& arg, requires_arch) noexcept + { + return detail::rvvmnot(arg); + } + + /********** + * Shifts * + **********/ + + namespace detail + { + XSIMD_RVV_OVERLOAD_INTS(rvvsll_splat, (__riscv_vsll), , vec(vec, size_t)) + XSIMD_RVV_OVERLOAD_INTS(rvvsll, (__riscv_vsll), , vec(vec, uvec)) + XSIMD_RVV_OVERLOAD2(rvvsr_splat, + (__riscv_vsra), + (__riscv_vsrl), , vec(vec, size_t)) + XSIMD_RVV_OVERLOAD2(rvvsr, + (__riscv_vsra), + (__riscv_vsrl), , vec(vec, uvec)) + } // namespace detail + + // bitwise_lshift + template = 0> + inline batch bitwise_lshift(batch const& arg, int n, requires_arch) noexcept + { + constexpr size_t size = sizeof(typename batch::value_type) * 8; + assert(0 <= n && static_cast(n) < size && "index in bounds"); + return detail::rvvsll_splat(arg, n); + } + + template = 0> + inline batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return 
detail::rvvsll(lhs, detail::rvv_to_unsigned_batch(rhs)); + } + + // bitwise_rshift + template = 0> + inline batch bitwise_rshift(batch const& arg, int n, requires_arch) noexcept + { + constexpr size_t size = sizeof(typename batch::value_type) * 8; + assert(0 <= n && static_cast(n) < size && "index in bounds"); + return detail::rvvsr_splat(arg, n); + } + + template = 0> + inline batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvsr(lhs, detail::rvv_to_unsigned_batch(rhs)); + } + + /************** + * Reductions * + **************/ + + namespace detail + { + XSIMD_RVV_OVERLOAD3(rvvredsum, + (__riscv_vredsum), + (__riscv_vredsum), + (__riscv_vfredosum), // or __riscv_vfredusum + , scalar_vec(vec, scalar_vec)) + XSIMD_RVV_OVERLOAD3(rvvredmax, + (__riscv_vredmax), + (__riscv_vredmaxu), + (__riscv_vfredmax), , scalar_vec(vec, scalar_vec)) + XSIMD_RVV_OVERLOAD3(rvvredmin, + (__riscv_vredmin), + (__riscv_vredminu), + (__riscv_vfredmin), , scalar_vec(vec, scalar_vec)) + XSIMD_RVV_OVERLOAD3(rvvslide1up, + (__riscv_vslide1up), + (__riscv_vslide1up), + (__riscv_vfslide1up), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvslide1down, + (__riscv_vslide1down), + (__riscv_vslide1down), + (__riscv_vfslide1down), , vec(vec, T)) + + template + inline T reduce_scalar(rvv_reg_t const& arg) + { + return detail::rvvmv_lane0(rvv_reg_t(arg.get_bytes(), types::detail::XSIMD_RVV_BITCAST)); + } + } + // reduce_add + template ::value_type, detail::rvv_enable_all_t = 0> + inline V reduce_add(batch const& arg, requires_arch) noexcept + { + const auto zero = detail::broadcast(T(0)); + const auto r = detail::rvvredsum(arg, zero); + return detail::reduce_scalar(r); + } + + // reduce_max + template = 0> + inline T reduce_max(batch const& arg, requires_arch) noexcept + { + const auto lowest = detail::broadcast(std::numeric_limits::lowest()); + const auto r = detail::rvvredmax(arg, lowest); + return detail::reduce_scalar(r); + } + + // reduce_min + template = 0> + inline T reduce_min(batch const& arg, requires_arch) noexcept + { + const auto max = detail::broadcast(std::numeric_limits::max()); + const auto r = detail::rvvredmin(arg, max); + return detail::reduce_scalar(r); + } + + // haddp + template = 0> + inline batch haddp(const batch* row, requires_arch) noexcept + { + constexpr std::size_t size = batch::size; + T sums[size]; +#pragma unroll size + for (std::size_t i = 0; i < size; ++i) + { + sums[i] = reduce_add(row[i], rvv {}); + } + return load_aligned(sums, convert(), rvv {}); + } + + /*************** + * Comparisons * + ***************/ + + // eq + template = 0> + inline batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvmseq(lhs, rhs); + } + + template = 0> + inline batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + const auto neq_result = detail::rvvmxor(lhs, rhs); + return detail::rvvmnot(neq_result); + } + + // neq + template = 0> + inline batch_bool neq(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvmsne(lhs, rhs); + } + + template = 0> + inline batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return detail::rvvmxor(lhs, rhs); + } + + // lt + template = 0> + inline batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvmslt(lhs, rhs); + } + + // le + template = 0> + inline batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return 
detail::rvvmsle(lhs, rhs); + } + + // gt + template = 0> + inline batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvmsgt(lhs, rhs); + } + + // ge + template = 0> + inline batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvmsge(lhs, rhs); + } + + /************* + * Selection * + *************/ + namespace detail + { + XSIMD_RVV_OVERLOAD(rvvcompress, (__riscv_vcompress), , vec(vec, bvec)) + } + // compress + template + inline batch compress(batch const& x, batch_bool const& mask, requires_arch) noexcept + { + return detail::rvvcompress(x, mask); + } + + /*************** + * Permutation * + ***************/ + namespace detail + { + XSIMD_RVV_OVERLOAD(rvvrgather, (__riscv_vrgather), , vec(vec, uvec)) + XSIMD_RVV_OVERLOAD(rvvslideup, (__riscv_vslideup), , vec(vec, vec, size_t)) + XSIMD_RVV_OVERLOAD(rvvslidedown, (__riscv_vslidedown), , vec(vec, size_t)) + } + + // swizzle + template + inline batch swizzle(batch const& arg, batch_constant, idx...>, requires_arch) noexcept + { + static_assert(batch::size == sizeof...(idx), "invalid swizzle indices"); + const batch indices { idx... }; + return detail::rvvrgather(arg, indices); + } + + template + inline batch, A> swizzle(batch, A> const& self, + batch_constant, idx...>, + requires_arch) noexcept + { + const auto real = swizzle(self.real(), batch_constant, idx...> {}, rvv {}); + const auto imag = swizzle(self.imag(), batch_constant, idx...> {}, rvv {}); + return batch>(real, imag); + } + + /************* + * Selection * + *************/ + + // extract_pair + + template = 0> + inline batch extract_pair(batch const& lhs, batch const& rhs, size_t n, requires_arch) noexcept + { + const auto tmp = detail::rvvslidedown(rhs, n); + return detail::rvvslideup(tmp, lhs, lhs.size - n); + } + + // select + template = 0> + inline batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept + { + return detail::rvvmerge(b, a, cond); + } + + template + inline batch select(batch_bool_constant, b...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { b... 
}, true_br, false_br, rvv {}); + } + + // zip_lo + template = 0> + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto index = detail::vindex, 0, -1>(); + const auto mask = detail::pmask8(0xaa); + return detail::rvvmerge(detail::rvvrgather(lhs, index), + detail::rvvrgather(rhs, index), + mask); + } + + // zip_hi + template = 0> + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto index = detail::vindex, batch::size / 2, -1>(); + const auto mask = detail::pmask8(0xaa); + return detail::rvvmerge(detail::rvvrgather(lhs, index), + detail::rvvrgather(rhs, index), + mask); + } + + // store_complex + template = 0> + inline void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept + { + const auto lo = zip_lo(src.real(), src.imag()); + const auto hi = zip_hi(src.real(), src.imag()); + T* buf = reinterpret_cast(dst); + store_aligned(buf, lo, rvv {}); + store_aligned(buf + lo.size, hi, rvv {}); + } + + template = 0> + inline void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept + { + store_complex_aligned(dst, src, rvv {}); + } + + /***************************** + * Floating-point arithmetic * + *****************************/ + + namespace detail + { + XSIMD_RVV_OVERLOAD_FLOATS(rvvfsqrt, (__riscv_vfsqrt), , vec(vec)) + XSIMD_RVV_OVERLOAD_FLOATS(rvvfrec7, (__riscv_vfrec7), , vec(vec)) + XSIMD_RVV_OVERLOAD_FLOATS(rvvfrsqrt7, (__riscv_vfrsqrt7), , vec(vec)) + } + + // rsqrt + template = 0> + inline batch rsqrt(batch const& arg, requires_arch) noexcept + { + auto approx = detail::rvvfrsqrt7(arg); + approx = approx * (1.5 - (0.5 * arg * approx * approx)); + return approx; + } + + // sqrt + template = 0> + inline batch sqrt(batch const& arg, requires_arch) noexcept + { + return detail::rvvfsqrt(arg); + } + + // reciprocal + template = 0> + inline batch reciprocal(const batch& arg, requires_arch) noexcept + { + return detail::rvvfrec7(arg); + } + + /****************************** + * Floating-point conversions * + ******************************/ + + // fast_cast + namespace detail + { + XSIMD_RVV_OVERLOAD2(rvvfcvt_rtz, // truncating conversion, like C. + (__riscv_vfcvt_rtz_x), + (__riscv_vfcvt_rtz_xu), _DROP_1ST, vec(T, fvec)) + XSIMD_RVV_OVERLOAD2(rvvfcvt_rne, // round to nearest, ties to even + (__riscv_vfcvt_x), + (__riscv_vfcvt_xu), _DROP_1ST_CUSTOM_ARGS, vec(T, fvec), args..., __RISCV_FRM_RNE) + XSIMD_RVV_OVERLOAD2(rvvfcvt_rmm, // round to nearest, ties to max magnitude + (__riscv_vfcvt_x), + (__riscv_vfcvt_xu), _DROP_1ST_CUSTOM_ARGS, vec(T, fvec), args..., __RISCV_FRM_RMM) + XSIMD_RVV_OVERLOAD2(rvvfcvt, // round to current rounding mode. 
+ (__riscv_vfcvt_x), + (__riscv_vfcvt_xu), _DROP_1ST, vec(T, fvec)) + XSIMD_RVV_OVERLOAD_INTS(rvvfcvt_f, (__riscv_vfcvt_f), , fvec(vec)) + + template + using rvv_enable_ftoi_t = typename std::enable_if<(sizeof(T) == sizeof(U) && std::is_floating_point::value && !std::is_floating_point::value), int>::type; + template + using rvv_enable_itof_t = typename std::enable_if<(sizeof(T) == sizeof(U) && !std::is_floating_point::value && std::is_floating_point::value), int>::type; + + template = 0> + inline batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return rvvfcvt_rtz(U {}, arg); + } + template = 0> + inline batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return rvvfcvt_f(arg); + } + } + + /********* + * Miscs * + *********/ + + // set + template + inline batch set(batch const&, requires_arch, Args... args) noexcept + { + const std::array::size> tmp { args... }; + return load_unaligned(tmp.data(), convert(), rvv {}); + } + + template + inline batch, A> set(batch, A> const&, requires_arch, + Args... args_complex) noexcept + { + return batch>(set(batch {}, rvv {}, args_complex.real()...), + set(batch {}, rvv {}, args_complex.imag()...)); + } + + template + inline batch_bool set(batch_bool const&, requires_arch, Args... args) noexcept + { + using U = as_unsigned_integer_t; + const auto values = set(batch {}, rvv {}, static_cast(args)...); + const auto zero = broadcast(U(0), rvv {}); + detail::rvv_bool_t result = detail::rvvmsne(values, zero); + return result; + } + + // insert + template = 0> + inline batch insert(batch const& arg, T val, index, requires_arch) noexcept + { + const auto mask = detail::pmask(uint64_t(1) << I); + return detail::rvvmerge_splat(arg, val, mask); + } + + // get + template = 0> + inline T get(batch const& arg, size_t i, requires_arch) noexcept + { + const auto tmp = detail::rvvslidedown(arg, i); + return detail::rvvmv_lane0(tmp); + } + + template = 0> + inline std::complex get(batch, A> const& arg, size_t i, requires_arch) noexcept + { + const auto tmpr = detail::rvvslidedown(arg.real(), i); + const auto tmpi = detail::rvvslidedown(arg.imag(), i); + return std::complex { detail::rvvmv_lane0(tmpr), detail::rvvmv_lane0(tmpi) }; + } + + // all + template = 0> + inline bool all(batch_bool const& arg, requires_arch) noexcept + { + return detail::rvvcpop(arg) == batch_bool::size; + } + + // any + template = 0> + inline bool any(batch_bool const& arg, requires_arch) noexcept + { + return detail::rvvcpop(arg) > 0; + } + + // bitwise_cast + template = 0, detail::rvv_enable_all_t = 0> + inline batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return detail::rvv_reg_t(arg.data.get_bytes(), types::detail::XSIMD_RVV_BITCAST); + } + + // batch_bool_cast + template = 0> + inline batch_bool batch_bool_cast(batch_bool const& arg, batch_bool const&, requires_arch) noexcept + { + using intermediate_t = typename detail::rvv_bool_t; + return intermediate_t(arg.data); + } + + // from_bool + template = 0> + inline batch from_bool(batch_bool const& arg, requires_arch) noexcept + { + const auto zero = broadcast(T(0), rvv {}); + return detail::rvvmerge_splat(zero, T(1), arg); + } + + namespace detail + { + template + inline vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i) + { + return __riscv_vslidedown(arg, i, types::detail::rvv_width_m1 / 8); + } + template <> + inline vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i) + { + const auto bytes = __riscv_vlmul_trunc_u8mf2(arg); + const auto result = 
__riscv_vslidedown(bytes, i, types::detail::rvv_width_mf2 / 8); + return __riscv_vlmul_ext_u8m1(result); + } + template <> + inline vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i) + { + const auto bytes = __riscv_vlmul_trunc_u8mf4(arg); + const auto result = __riscv_vslidedown(bytes, i, types::detail::rvv_width_mf4 / 8); + return __riscv_vlmul_ext_u8m1(result); + } + template <> + inline vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i) + { + const auto bytes = __riscv_vlmul_trunc_u8mf8(arg); + const auto result = __riscv_vslidedown(bytes, i, types::detail::rvv_width_mf8 / 8); + return __riscv_vlmul_ext_u8m1(result); + } + } + + // slide_left + template = 0> + inline batch slide_left(batch const& arg, requires_arch) noexcept + { + const auto zero = broadcast(uint8_t(0), rvv {}); + const auto bytes = arg.data.get_bytes(); + return detail::rvvreinterpret(detail::rvvslideup(zero, bytes, N)); + } + + // slide_right + template = 0> + inline batch slide_right(batch const& arg, requires_arch) noexcept + { + using reg_t = detail::rvv_reg_t; + const auto bytes = arg.data.get_bytes(); + return reg_t(detail::rvvslidedownbytes(bytes, N), types::detail::XSIMD_RVV_BITCAST); + } + + // isnan + template = 0> + inline batch_bool isnan(batch const& arg, requires_arch) noexcept + { + return !(arg == arg); + } + + namespace detail + { + template + using rvv_as_signed_integer_t = as_signed_integer_t>; + + template > + inline batch rvvfcvt_default(batch const& arg) noexcept + { + return rvvfcvt_rne(U {}, arg); + } + + template > + inline batch rvvfcvt_afz(batch const& arg) noexcept + { + return rvvfcvt_rmm(U {}, arg); + } + } + + // nearbyint_as_int + template > + inline batch nearbyint_as_int(batch const& arg, requires_arch) noexcept + { + // Reference rounds ties to nearest even + return detail::rvvfcvt_default(arg); + } + + // round + template = 0> + inline batch round(batch const& arg, requires_arch) noexcept + { + // Round ties away from zero. + const auto mask = abs(arg) < constants::maxflint>(); + return select(mask, to_float(detail::rvvfcvt_afz(arg)), arg, rvv {}); + } + + // nearbyint + template = 0> + inline batch nearbyint(batch const& arg, requires_arch) noexcept + { + // Round according to current rounding mode. 
+ const auto mask = abs(arg) < constants::maxflint>(); + return select(mask, to_float(detail::rvvfcvt_default(arg)), arg, rvv {}); + } + } // namespace kernel +} // namespace xsimd + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp index 32a5d67c8eec..8160b2423bb7 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp @@ -380,7 +380,7 @@ namespace xsimd template inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return wasm_f32x4_eq(self, other); + return wasm_i32x4_eq(self, other); } template ::value, void>::type> inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept @@ -440,7 +440,7 @@ namespace xsimd template inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return wasm_f64x2_eq(self, other); + return wasm_i64x2_eq(self, other); } // fast_cast @@ -579,6 +579,30 @@ namespace xsimd 0xFFFFFF00, 0xFFFFFFFF, }; + alignas(A::alignment()) static const uint32_t lut16[][4] = { + { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }, + { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }, + { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }, + { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }, + { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, + { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, + { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + }; + alignas(A::alignment()) static const uint64_t lut8[][4] = { + { 0x0000000000000000ul, 0x0000000000000000ul }, + { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, + { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, + { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, + }; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { assert(!(mask & ~0xFFFF) && "inbound mask"); @@ -587,15 +611,17 @@ namespace xsimd else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { assert(!(mask & ~0xFF) && "inbound mask"); - return wasm_i64x2_make(lut64[mask >> 4], lut64[mask & 0xF]); + return wasm_i64x2_make(lut64[mask & 0xF], lut64[mask >> 4]); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { - return batch_bool_cast(from_mask(batch_bool {}, mask, wasm {})); + assert(!(mask & ~0xFul) && "inbound mask"); + return wasm_v128_load((const v128_t*)lut16[mask]); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { - return batch_bool_cast(from_mask(batch_bool {}, mask, wasm {})); + assert(!(mask & ~0x3ul) && "inbound mask"); + return wasm_v128_load((const v128_t*)lut8[mask]); } } @@ -1114,44 +1140,6 @@ namespace xsimd return wasm_f64x2_extract_lane(tmp2, 0); } - // reduce_max - template ::type> - inline T reduce_max(batch const& self, requires_arch) noexcept - { - batch step0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0); - batch acc0 = max(self, step0); - - batch step1 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 1, 0, 0, 0); - batch acc1 = max(acc0, step1); - - batch step2 = wasm_i16x8_shuffle(acc1, wasm_i16x8_splat(0), 1, 0, 0, 0, 4, 5, 6, 7); - batch acc2 = 
max(acc1, step2); - if (sizeof(T) == 2) - return acc2.get(0); - batch step3 = bitwise_cast(bitwise_cast(acc2) >> 8); - batch acc3 = max(acc2, step3); - return acc3.get(0); - } - - // reduce_min - template ::type> - inline T reduce_min(batch const& self, requires_arch) noexcept - { - batch step0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0); - batch acc0 = min(self, step0); - - batch step1 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 1, 0, 0, 0); - batch acc1 = min(acc0, step1); - - batch step2 = wasm_i16x8_shuffle(acc1, wasm_i16x8_splat(0), 1, 0, 0, 0, 4, 5, 6, 7); - batch acc2 = min(acc1, step2); - if (sizeof(T) == 2) - return acc2.get(0); - batch step3 = bitwise_cast(bitwise_cast(acc2) >> 8); - batch acc3 = min(acc2, step3); - return acc3.get(0); - } - // rsqrt template inline batch rsqrt(batch const& self, requires_arch) noexcept @@ -1171,15 +1159,15 @@ namespace xsimd inline batch slide_left(batch const& x, requires_arch) noexcept { return wasm_i8x16_shuffle( - wasm_i64x2_const(0, 0), x, ((N)&0xF0) ? 0 : 16 - ((N)&0xF), - ((N)&0xF0) ? 0 : 17 - ((N)&0xF), ((N)&0xF0) ? 0 : 18 - ((N)&0xF), - ((N)&0xF0) ? 0 : 19 - ((N)&0xF), ((N)&0xF0) ? 0 : 20 - ((N)&0xF), - ((N)&0xF0) ? 0 : 21 - ((N)&0xF), ((N)&0xF0) ? 0 : 22 - ((N)&0xF), - ((N)&0xF0) ? 0 : 23 - ((N)&0xF), ((N)&0xF0) ? 0 : 24 - ((N)&0xF), - ((N)&0xF0) ? 0 : 25 - ((N)&0xF), ((N)&0xF0) ? 0 : 26 - ((N)&0xF), - ((N)&0xF0) ? 0 : 27 - ((N)&0xF), ((N)&0xF0) ? 0 : 28 - ((N)&0xF), - ((N)&0xF0) ? 0 : 29 - ((N)&0xF), ((N)&0xF0) ? 0 : 30 - ((N)&0xF), - ((N)&0xF0) ? 0 : 31 - ((N)&0xF)); + wasm_i64x2_const(0, 0), x, ((N) & 0xF0) ? 0 : 16 - ((N) & 0xF), + ((N) & 0xF0) ? 0 : 17 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 18 - ((N) & 0xF), + ((N) & 0xF0) ? 0 : 19 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 20 - ((N) & 0xF), + ((N) & 0xF0) ? 0 : 21 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 22 - ((N) & 0xF), + ((N) & 0xF0) ? 0 : 23 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 24 - ((N) & 0xF), + ((N) & 0xF0) ? 0 : 25 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 26 - ((N) & 0xF), + ((N) & 0xF0) ? 0 : 27 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 28 - ((N) & 0xF), + ((N) & 0xF0) ? 0 : 29 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 30 - ((N) & 0xF), + ((N) & 0xF0) ? 0 : 31 - ((N) & 0xF)); } // slide_right @@ -1187,15 +1175,15 @@ namespace xsimd inline batch slide_right(batch const& x, requires_arch) noexcept { return wasm_i8x16_shuffle( - x, wasm_i64x2_const(0, 0), ((N)&0xF0) ? 16 : ((N)&0xF) + 0, - ((N)&0xF0) ? 16 : ((N)&0xF) + 1, ((N)&0xF0) ? 16 : ((N)&0xF) + 2, - ((N)&0xF0) ? 16 : ((N)&0xF) + 3, ((N)&0xF0) ? 16 : ((N)&0xF) + 4, - ((N)&0xF0) ? 16 : ((N)&0xF) + 5, ((N)&0xF0) ? 16 : ((N)&0xF) + 6, - ((N)&0xF0) ? 16 : ((N)&0xF) + 7, ((N)&0xF0) ? 16 : ((N)&0xF) + 8, - ((N)&0xF0) ? 16 : ((N)&0xF) + 9, ((N)&0xF0) ? 16 : ((N)&0xF) + 10, - ((N)&0xF0) ? 16 : ((N)&0xF) + 11, ((N)&0xF0) ? 16 : ((N)&0xF) + 12, - ((N)&0xF0) ? 16 : ((N)&0xF) + 13, ((N)&0xF0) ? 16 : ((N)&0xF) + 14, - ((N)&0xF0) ? 16 : ((N)&0xF) + 15); + x, wasm_i64x2_const(0, 0), ((N) & 0xF0) ? 16 : ((N) & 0xF) + 0, + ((N) & 0xF0) ? 16 : ((N) & 0xF) + 1, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 2, + ((N) & 0xF0) ? 16 : ((N) & 0xF) + 3, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 4, + ((N) & 0xF0) ? 16 : ((N) & 0xF) + 5, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 6, + ((N) & 0xF0) ? 16 : ((N) & 0xF) + 7, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 8, + ((N) & 0xF0) ? 16 : ((N) & 0xF) + 9, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 10, + ((N) & 0xF0) ? 16 : ((N) & 0xF) + 11, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 12, + ((N) & 0xF0) ? 16 : ((N) & 0xF) + 13, ((N) & 0xF0) ? 
16 : ((N) & 0xF) + 14, + ((N) & 0xF0) ? 16 : ((N) & 0xF) + 15); } // sadd @@ -1259,29 +1247,15 @@ namespace xsimd // shuffle template - inline batch shuffle(batch const& x, batch const& y, batch_constant, I0, I1, I2, I3> mask, requires_arch) noexcept + inline batch shuffle(batch const& x, batch const& y, batch_constant, I0, I1, I2, I3>, requires_arch) noexcept { - // shuffle within lane - if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4) - return wasm_i32x4_shuffle(x, y, I0, I1, I2, I3); - - // shuffle within opposite lane - if (I0 >= 4 && I1 >= 4 && I2 < 4 && I3 < 4) - return wasm_i32x4_shuffle(y, x, I0, I1, I2, I3); - return shuffle(x, y, mask, generic {}); + return wasm_i32x4_shuffle(x, y, I0, I1, I2, I3); } template - inline batch shuffle(batch const& x, batch const& y, batch_constant, I0, I1> mask, requires_arch) noexcept + inline batch shuffle(batch const& x, batch const& y, batch_constant, I0, I1>, requires_arch) noexcept { - // shuffle within lane - if (I0 < 2 && I1 >= 2) - return wasm_i64x2_shuffle(x, y, I0, I1); - - // shuffle within opposite lane - if (I0 >= 2 && I1 < 2) - return wasm_i64x2_shuffle(y, x, I0, I1); - return shuffle(x, y, mask, generic {}); + return wasm_i64x2_shuffle(x, y, I0, I1); } // set @@ -1500,7 +1474,6 @@ namespace xsimd } // swizzle - template inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3>, requires_arch) noexcept { @@ -1516,7 +1489,7 @@ namespace xsimd template inline batch swizzle(batch const& self, batch_constant, V0, V1>, requires_arch) noexcept { - return wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1); + return wasm_i64x2_shuffle(self, self, V0, V1); } template @@ -1528,7 +1501,7 @@ namespace xsimd template inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3>, requires_arch) noexcept { - return wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), V0, V1, V2, V3); + return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3); } template @@ -1537,6 +1510,32 @@ namespace xsimd return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {})); } + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch) noexcept + { + return wasm_i16x8_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {})); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15>, requires_arch) noexcept + { + return wasm_i8x16_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {})); + } + // trunc template inline batch trunc(batch const& self, requires_arch) noexcept @@ -1625,4 +1624,4 @@ namespace xsimd } } -#endif \ No newline at end of file +#endif diff --git a/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp b/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp index ab9ecbc298e8..fe8c54166923 100644 --- a/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp +++ b/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp @@ -57,7 +57,7 @@ namespace xsimd { }; - template + template struct is_sorted; 
template <> @@ -65,14 +65,14 @@ namespace xsimd { }; - template - struct is_sorted : std::true_type + template + struct is_sorted : std::true_type { }; - template - struct is_sorted - : std::conditional<(A0::version() >= A1::version()), is_sorted, + template + struct is_sorted + : std::conditional<(V0 >= V1), is_sorted, std::false_type>::type { }; @@ -111,7 +111,7 @@ namespace xsimd struct arch_list { #ifndef NDEBUG - static_assert(detail::is_sorted::value, + static_assert(detail::is_sorted::value, "architecture list must be sorted by version"); #endif @@ -190,16 +190,23 @@ namespace xsimd struct unsupported { }; - using all_x86_architectures = arch_list, avx2, fma3, avx, fma4, fma3, sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>; + using all_x86_architectures = arch_list< + avx512vnni, avx512vbmi, avx512ifma, avx512pf, avx512vnni, avx512bw, avx512er, avx512dq, avx512cd, avx512f, + avxvnni, fma3, avx2, fma3, avx, fma4, fma3, + sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>; + using all_sve_architectures = arch_list, detail::sve<256>, detail::sve<128>>; + using all_rvv_architectures = arch_list, detail::rvv<256>, detail::rvv<128>>; using all_arm_architectures = typename detail::join>::type; + using all_riscv_architectures = all_rvv_architectures; using all_wasm_architectures = arch_list; - using all_architectures = typename detail::join::type; + using all_architectures = typename detail::join::type; using supported_architectures = typename detail::supported::type; using x86_arch = typename detail::supported::type::best; using arm_arch = typename detail::supported::type::best; + using riscv_arch = typename detail::supported::type::best; using best_arch = typename supported_architectures::best; #ifdef XSIMD_DEFAULT_ARCH diff --git a/third_party/xsimd/include/xsimd/config/xsimd_config.hpp b/third_party/xsimd/include/xsimd/config/xsimd_config.hpp index 161f123fea2a..cf5163c37efa 100644 --- a/third_party/xsimd/include/xsimd/config/xsimd_config.hpp +++ b/third_party/xsimd/include/xsimd/config/xsimd_config.hpp @@ -12,9 +12,9 @@ #ifndef XSIMD_CONFIG_HPP #define XSIMD_CONFIG_HPP -#define XSIMD_VERSION_MAJOR 11 -#define XSIMD_VERSION_MINOR 2 -#define XSIMD_VERSION_PATCH 0 +#define XSIMD_VERSION_MAJOR 12 +#define XSIMD_VERSION_MINOR 1 +#define XSIMD_VERSION_PATCH 1 /** * high level free functions @@ -99,6 +99,17 @@ #define XSIMD_WITH_AVX2 0 #endif +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if AVXVNNI is available at compile-time, to 0 otherwise. + */ +#ifdef __AVXVNNI__ +#define XSIMD_WITH_AVXVNNI 1 +#else +#define XSIMD_WITH_AVXVNNI 0 +#endif + /** * @ingroup xsimd_config_macro * @@ -244,6 +255,72 @@ #define XSIMD_WITH_AVX512BW 0 #endif +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if AVX512ER is available at compile-time, to 0 otherwise. + */ +#ifdef __AVX512ER__ +#define XSIMD_WITH_AVX512ER XSIMD_WITH_AVX512F +#else +#define XSIMD_WITH_AVX512ER 0 +#endif + +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if AVX512PF is available at compile-time, to 0 otherwise. + */ +#ifdef __AVX512PF__ +#define XSIMD_WITH_AVX512PF XSIMD_WITH_AVX512F +#else +#define XSIMD_WITH_AVX512PF 0 +#endif + +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if AVX512IFMA is available at compile-time, to 0 otherwise. + */ +#ifdef __AVX512IFMA__ +#define XSIMD_WITH_AVX512IFMA XSIMD_WITH_AVX512F +#else +#define XSIMD_WITH_AVX512IFMA 0 +#endif + +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if AVX512VBMI is available at compile-time, to 0 otherwise. 
+ */ +#ifdef __AVX512VBMI__ +#define XSIMD_WITH_AVX512VBMI XSIMD_WITH_AVX512F +#else +#define XSIMD_WITH_AVX512VBMI 0 +#endif + +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if AVX512VNNI is available at compile-time, to 0 otherwise. + */ +#ifdef __AVX512VNNI__ + +#if XSIMD_WITH_AVX512_VBMI +#define XSIMD_WITH_AVX512VNNI_AVX512VBMI XSIMD_WITH_AVX512F +#define XSIMD_WITH_AVX512VNNI_AVX512BW XSIMD_WITH_AVX512F +#else +#define XSIMD_WITH_AVX512VNNI_AVX512VBMI 0 +#define XSIMD_WITH_AVX512VNNI_AVX512BW XSIMD_WITH_AVX512F +#endif + +#else + +#define XSIMD_WITH_AVX512VNNI_AVX512VBMI 0 +#define XSIMD_WITH_AVX512VNNI_AVX512BW 0 + +#endif + #ifdef __ARM_NEON /** @@ -285,6 +362,19 @@ #define XSIMD_SVE_BITS 0 #endif +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if RVV is available and bit width is pre-set at compile-time, to 0 otherwise. + */ +#if defined(__riscv_vector) && defined(__riscv_v_fixed_vlen) && __riscv_v_fixed_vlen > 0 +#define XSIMD_WITH_RVV 1 +#define XSIMD_RVV_BITS __riscv_v_fixed_vlen +#else +#define XSIMD_WITH_RVV 0 +#define XSIMD_RVV_BITS 0 +#endif + /** * @ingroup xsimd_config_macro * @@ -354,7 +444,8 @@ #endif -#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_WASM +#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_AVXVNNI && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_AVX512ER && !XSIMD_WITH_AVX512PF && !XSIMD_WITH_AVX512IFMA && !XSIMD_WITH_AVX512VBMI && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_RVV && !XSIMD_WITH_WASM +#define XSIMD_NO_SUPPORTED_ARCHITECTURE #endif #endif diff --git a/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp b/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp index 76fd26c2b52c..62aca6132fdd 100644 --- a/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp +++ b/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp @@ -15,7 +15,7 @@ #include #include -#if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM)) +#if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM) || defined(__riscv_vector)) #include #include #endif @@ -45,14 +45,22 @@ namespace xsimd unsigned avx : 1; unsigned fma3_avx : 1; unsigned avx2 : 1; + unsigned avxvnni : 1; unsigned fma3_avx2 : 1; unsigned avx512f : 1; unsigned avx512cd : 1; unsigned avx512dq : 1; unsigned avx512bw : 1; + unsigned avx512er : 1; + unsigned avx512pf : 1; + unsigned avx512ifma : 1; + unsigned avx512vbmi : 1; + unsigned avx512vnni_bw : 1; + unsigned avx512vnni_vbmi : 1; unsigned neon : 1; unsigned neon64 : 1; unsigned sve : 1; + unsigned rvv : 1; // version number of the best arch available unsigned best; @@ -85,15 +93,27 @@ namespace xsimd #endif best = sve::version() * sve; +#elif defined(__riscv_vector) && defined(__riscv_v_fixed_vlen) && __riscv_v_fixed_vlen > 0 + +#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18) +#ifndef HWCAP_V +#define HWCAP_V (1 << ('V' - 'A')) +#endif + rvv = 
bool(getauxval(AT_HWCAP) & HWCAP_V); +#else + rvv = 0; +#endif + + best = ::xsimd::rvv::version() * rvv; #elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86) - auto get_cpuid = [](int reg[4], int func_id) noexcept + auto get_cpuid = [](int reg[4], int level, int count = 0) noexcept { #if defined(_MSC_VER) - __cpuidex(reg, func_id, 0); + __cpuidex(reg, level, count); #elif defined(__INTEL_COMPILER) - __cpuid(reg, func_id); + __cpuid(reg, level); #elif defined(__GNUC__) || defined(__clang__) @@ -104,13 +124,13 @@ namespace xsimd "xchg{l}\t{%%}ebx, %1\n\t" : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) - : "a"(func_id), "c"(0)); + : "0"(level), "2"(count)); #else __asm__("cpuid\n\t" : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) - : "a"(func_id), "c"(0)); + : "0"(level), "2"(count)); #endif #else @@ -163,6 +183,11 @@ namespace xsimd avx2 = regs7[1] >> 5 & 1; best = std::max(best, avx2::version() * avx2); + int regs7a[4]; + get_cpuid(regs7a, 0x7, 0x1); + avxvnni = regs7a[0] >> 4 & 1; + best = std::max(best, avxvnni::version() * avxvnni * avx2); + fma3_avx2 = avx2 && fma3_sse; best = std::max(best, fma3::version() * fma3_avx2); @@ -178,6 +203,23 @@ namespace xsimd avx512bw = regs7[1] >> 30 & 1; best = std::max(best, avx512bw::version() * avx512bw * avx512dq * avx512cd * avx512f); + avx512er = regs7[1] >> 27 & 1; + best = std::max(best, avx512er::version() * avx512er * avx512cd * avx512f); + + avx512pf = regs7[1] >> 26 & 1; + best = std::max(best, avx512pf::version() * avx512pf * avx512er * avx512cd * avx512f); + + avx512ifma = regs7[1] >> 21 & 1; + best = std::max(best, avx512ifma::version() * avx512ifma * avx512bw * avx512dq * avx512cd * avx512f); + + avx512vbmi = regs7[2] >> 1 & 1; + best = std::max(best, avx512vbmi::version() * avx512vbmi * avx512ifma * avx512bw * avx512dq * avx512cd * avx512f); + + avx512vnni_bw = regs7[2] >> 11 & 1; + best = std::max(best, avx512vnni::version() * avx512vnni_bw * avx512bw * avx512dq * avx512cd * avx512f); + + avx512vnni_vbmi = avx512vbmi && avx512vnni_bw; + best = std::max(best, avx512vnni::version() * avx512vnni_vbmi); #endif } }; diff --git a/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp b/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp index ec20ce5fba3b..4350ca0a281a 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp @@ -18,9 +18,19 @@ #include "xsimd_avx2_register.hpp" #include "xsimd_avx_register.hpp" +#include "xsimd_avxvnni_register.hpp" #include "xsimd_fma3_avx2_register.hpp" #include "xsimd_fma3_avx_register.hpp" +#include "xsimd_avx512vnni_avx512bw_register.hpp" +#include "xsimd_avx512vnni_avx512vbmi_register.hpp" + +#include "xsimd_avx512ifma_register.hpp" +#include "xsimd_avx512vbmi_register.hpp" + +#include "xsimd_avx512er_register.hpp" +#include "xsimd_avx512pf_register.hpp" + #include "xsimd_avx512bw_register.hpp" #include "xsimd_avx512cd_register.hpp" #include "xsimd_avx512dq_register.hpp" @@ -31,4 +41,6 @@ #include "xsimd_sve_register.hpp" +#include "xsimd_rvv_register.hpp" + #include "xsimd_wasm_register.hpp" diff --git a/third_party/xsimd/include/xsimd/types/xsimd_api.hpp b/third_party/xsimd/include/xsimd/types/xsimd_api.hpp index 6a1526d95867..0420f0a09d6e 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_api.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_api.hpp @@ -530,6 +530,19 @@ namespace xsimd return kernel::clip(x, lo, hi, A {}); } + /** + * 
@ingroup batch_data_transfer + * + * Pick elements from \c x selected by \c mask, and append them to the + * resulting vector, zeroing the remaining slots + */ + template + inline batch compress(batch const& x, batch_bool const& mask) noexcept + { + detail::static_check_supported_config(); + return kernel::compress(x, mask, A {}); + } + /** * @ingroup batch_complex * @@ -705,6 +718,19 @@ namespace xsimd return kernel::exp2(x, A {}); } + /** + * @ingroup batch_data_transfer + * + * Load contiguous elements from \c x and place them in slots selected by \c + * mask, zeroing the other slots + */ + template + inline batch expand(batch const& x, batch_bool const& mask) noexcept + { + detail::static_check_supported_config(); + return kernel::expand(x, mask, A {}); + } + /** * @ingroup batch_math * diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp new file mode 100644 index 000000000000..a99157cf3723 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp @@ -0,0 +1,48 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512ER_REGISTER_HPP +#define XSIMD_AVX512ER_REGISTER_HPP + +#include "./xsimd_avx512dq_register.hpp" + +namespace xsimd +{ + + /** + * @ingroup architectures + * + * AVX512ER instructions + */ + struct avx512er : avx512cd + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512ER; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 3, 1); } + static constexpr char const* name() noexcept { return "avx512er"; } + }; + +#if XSIMD_WITH_AVX512ER + + namespace types + { + template + struct get_bool_simd_register + { + using type = simd_avx512_bool_register; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512er, avx512cd); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp index fb8e473e1e97..c1f80a122ddb 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp @@ -53,7 +53,6 @@ namespace xsimd using type = simd_avx512_bool_register; }; - XSIMD_DECLARE_SIMD_REGISTER(bool, avx512f, __m512i); XSIMD_DECLARE_SIMD_REGISTER(signed char, avx512f, __m512i); XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx512f, __m512i); XSIMD_DECLARE_SIMD_REGISTER(char, avx512f, __m512i); diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp new file mode 100644 index 000000000000..ba76ea147bf5 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp @@ -0,0 +1,48 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the 
terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512IFMA_REGISTER_HPP +#define XSIMD_AVX512IFMA_REGISTER_HPP + +#include "./xsimd_avx512bw_register.hpp" + +namespace xsimd +{ + + /** + * @ingroup architectures + * + * AVX512IFMA instructions + */ + struct avx512ifma : avx512bw + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512IFMA; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 5, 0); } + static constexpr char const* name() noexcept { return "avx512ifma"; } + }; + +#if XSIMD_WITH_AVX512IFMA + + namespace types + { + template + struct get_bool_simd_register + { + using type = simd_avx512_bool_register; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512ifma, avx512bw); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp new file mode 100644 index 000000000000..38a10f022737 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp @@ -0,0 +1,48 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512PF_REGISTER_HPP +#define XSIMD_AVX512PF_REGISTER_HPP + +#include "./xsimd_avx512er_register.hpp" + +namespace xsimd +{ + + /** + * @ingroup architectures + * + * AVX512PF instructions + */ + struct avx512pf : avx512er + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512PF; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 4, 1); } + static constexpr char const* name() noexcept { return "avx512pf"; } + }; + +#if XSIMD_WITH_AVX512PF + + namespace types + { + template + struct get_bool_simd_register + { + using type = simd_avx512_bool_register; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512pf, avx512er); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp new file mode 100644 index 000000000000..19ff744d7208 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp @@ -0,0 +1,48 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_AVX512VBMI_REGISTER_HPP +#define XSIMD_AVX512VBMI_REGISTER_HPP + +#include "./xsimd_avx512ifma_register.hpp" + +namespace xsimd +{ + + /** + * @ingroup architectures + * + * AVX512VBMI instructions + */ + struct avx512vbmi : avx512ifma + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VBMI; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 6, 0); } + static constexpr char const* name() noexcept { return "avx512vbmi"; } + }; + +#if XSIMD_WITH_AVX512VBMI + + namespace types + { + template + struct get_bool_simd_register + { + using type = simd_avx512_bool_register; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vbmi, avx512ifma); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp new file mode 100644 index 000000000000..85edbdf230ce --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp @@ -0,0 +1,51 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512VNNI_AVX512BW_REGISTER_HPP +#define XSIMD_AVX512VNNI_AVX512BW_REGISTER_HPP + +#include "./xsimd_avx512bw_register.hpp" + +namespace xsimd +{ + template + struct avx512vnni; + + /** + * @ingroup architectures + * + * AVX512VNNI instructions + */ + template <> + struct avx512vnni : avx512bw + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VNNI_AVX512BW; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 4, 1); } + static constexpr char const* name() noexcept { return "avx512vnni+avx512bw"; } + }; + +#if XSIMD_WITH_AVX512VNNI_AVX512BW + + namespace types + { + template + struct get_bool_simd_register> + { + using type = simd_avx512_bool_register; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vnni, avx512bw); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi_register.hpp new file mode 100644 index 000000000000..232b19a5cb82 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi_register.hpp @@ -0,0 +1,51 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_AVX512VNNI_AVX512VBMI_REGISTER_HPP +#define XSIMD_AVX512VNNI_AVX512VBMI_REGISTER_HPP + +#include "./xsimd_avx512vbmi_register.hpp" + +namespace xsimd +{ + template + struct avx512vnni; + + /** + * @ingroup architectures + * + * AVX512VNNI instructions + */ + template <> + struct avx512vnni : avx512vbmi + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VNNI_AVX512VBMI; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 6, 1); } + static constexpr char const* name() noexcept { return "avx512vnni+avx512vbmi"; } + }; + +#if XSIMD_WITH_AVX512VNNI_AVX512VBMI + + namespace types + { + template + struct get_bool_simd_register> + { + using type = simd_avx512_bool_register; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vnni, avx512vbmi); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_register.hpp new file mode 100644 index 000000000000..c276fb00792a --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_register.hpp @@ -0,0 +1,48 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512VNNI_REGISTER_HPP +#define XSIMD_AVX512VNNI_REGISTER_HPP + +#include "./xsimd_avx512vbmi_register.hpp" + +namespace xsimd +{ + + /** + * @ingroup architectures + * + * AVX512VNNI instructions + */ + struct avx512vnni : avx512vbmi + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VNNI; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 7, 0); } + static constexpr char const* name() noexcept { return "avx512vnni"; } + }; + +#if XSIMD_WITH_AVX512VNNI + + namespace types + { + template + struct get_bool_simd_register + { + using type = simd_avx512_bool_register; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vnni, avx512vbmi); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx_register.hpp index 95f18ebfb6f7..6b1951f964b9 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_avx_register.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx_register.hpp @@ -42,7 +42,6 @@ namespace xsimd namespace types { - XSIMD_DECLARE_SIMD_REGISTER(bool, avx, __m256i); XSIMD_DECLARE_SIMD_REGISTER(signed char, avx, __m256i); XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx, __m256i); XSIMD_DECLARE_SIMD_REGISTER(char, avx, __m256i); diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp new file mode 100644 index 000000000000..f68fe16bad2b --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp @@ -0,0 +1,40 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain 
Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVXVNNI_REGISTER_HPP +#define XSIMD_AVXVNNI_REGISTER_HPP + +#include "./xsimd_avx2_register.hpp" + +namespace xsimd +{ + /** + * @ingroup architectures + * + * AVXVNNI instructions + */ + struct avxvnni : avx2 + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVXVNNI; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(2, 3, 0); } + static constexpr char const* name() noexcept { return "avxvnni"; } + }; + +#if XSIMD_WITH_AVXVNNI + namespace types + { + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avxvnni, avx2); + } +#endif +} + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_batch.hpp b/third_party/xsimd/include/xsimd/types/xsimd_batch.hpp index c8a8239cc3a3..b4989fc88d0d 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_batch.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_batch.hpp @@ -112,6 +112,7 @@ namespace xsimd template class batch : public types::simd_register, public types::integral_only_operators { + static_assert(!std::is_same::value, "use xsimd::batch_bool instead of xsimd::batch"); public: static constexpr std::size_t size = sizeof(types::simd_register) / sizeof(T); ///< Number of scalar elements in this batch. diff --git a/third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp b/third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp index bf2b9569e794..0de9c8ad42c1 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp @@ -88,7 +88,7 @@ namespace xsimd #define MAKE_BINARY_OP(OP, NAME) \ template \ constexpr auto operator OP(batch_bool_constant other) const \ - ->decltype(apply(*this, other)) \ + -> decltype(apply(*this, other)) \ { \ return apply(*this, other); \ } @@ -199,7 +199,7 @@ namespace xsimd #define MAKE_BINARY_OP(OP, NAME) \ template \ constexpr auto operator OP(batch_constant other) const \ - ->decltype(apply(*this, other)) \ + -> decltype(apply(*this, other)) \ { \ return apply(*this, other); \ } diff --git a/third_party/xsimd/include/xsimd/types/xsimd_generic_arch.hpp b/third_party/xsimd/include/xsimd/types/xsimd_generic_arch.hpp index 6aaee93393e0..2aa25419c6f4 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_generic_arch.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_generic_arch.hpp @@ -41,7 +41,7 @@ namespace xsimd static constexpr char const* name() noexcept { return "generic"; } protected: - static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch) noexcept { return major * 10000u + minor * 100u + patch; } + static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch, unsigned multiplier = 100u) noexcept { return major * multiplier * multiplier + minor * multiplier + patch; } }; } diff --git a/third_party/xsimd/include/xsimd/types/xsimd_rvv_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_rvv_register.hpp new file mode 100644 index 000000000000..1b3daf4592d4 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_rvv_register.hpp @@ -0,0 +1,417 @@ 
+/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * Copyright (c) Yibo Cai * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_RVV_REGISTER_HPP +#define XSIMD_RVV_REGISTER_HPP + +#include "xsimd_generic_arch.hpp" +#include "xsimd_register.hpp" + +#if XSIMD_WITH_RVV +#include +#endif + +namespace xsimd +{ + namespace detail + { + /** + * @ingroup architectures + * + * RVV instructions (fixed vector size) for riscv + */ + template + struct rvv : xsimd::generic + { + static constexpr size_t width = Width; + static constexpr bool supported() noexcept { return Width == XSIMD_RVV_BITS; } + static constexpr bool available() noexcept { return true; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr std::size_t alignment() noexcept { return 16; } + static constexpr unsigned version() noexcept { return generic::version(1, 0, 0, /*multiplier=*/1000); } + static constexpr char const* name() noexcept { return "riscv+rvv"; } + }; + } + +#if XSIMD_WITH_RVV + + using rvv = detail::rvv<__riscv_v_fixed_vlen>; + +#define XSIMD_RVV_JOINT_(a, b, c) a##b##c +#define XSIMD_RVV_JOINT(a, b, c) XSIMD_RVV_JOINT_(a, b, c) +#define XSIMD_RVV_JOINT5(a, b, c, d, e) XSIMD_RVV_JOINT(XSIMD_RVV_JOINT(a, b, c), d, e) + +#define XSIMD_RVV_TYPE_i(S, V) XSIMD_RVV_JOINT5(vint, S, m, V, _t) +#define XSIMD_RVV_TYPE_u(S, V) XSIMD_RVV_JOINT5(vuint, S, m, V, _t) +#define XSIMD_RVV_TYPE_f(S, V) XSIMD_RVV_JOINT5(vfloat, S, m, V, _t) +#define XSIMD_RVV_TYPE(T, S, V) XSIMD_RVV_JOINT(XSIMD_RVV_TYPE, _, T)(S, V) + + namespace types + { + namespace detail + { + static constexpr size_t rvv_width_mf8 = XSIMD_RVV_BITS / 8; + static constexpr size_t rvv_width_mf4 = XSIMD_RVV_BITS / 4; + static constexpr size_t rvv_width_mf2 = XSIMD_RVV_BITS / 2; + static constexpr size_t rvv_width_m1 = XSIMD_RVV_BITS; + static constexpr size_t rvv_width_m2 = XSIMD_RVV_BITS * 2; + static constexpr size_t rvv_width_m4 = XSIMD_RVV_BITS * 4; + static constexpr size_t rvv_width_m8 = XSIMD_RVV_BITS * 8; + + // rvv_type_info is a utility class to convert scalar type and + // bitwidth into rvv register types. + // + // * `type` is the unadorned vector type. + // * `fixed_type` is the same type, but with the storage attribute + // applied. + // * `byte_type` is the type which is the same size in unsigned + // bytes, used as an intermediate step for bit-cast operations, + // because only a subset of __riscv_vreinterpret() intrinsics + // exist -- but always enough to get us to bytes and back. 
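// [Editorial sketch, not part of the upstream patch] As an illustration of the
// "via bytes" route taken by the helpers generated below, turning a
// vfloat32m1_t `f` into byte storage and back into a signed lane type goes
// through the unsigned word type, roughly:
//   vuint8m1_t bytes = __riscv_vreinterpret_u8m1(__riscv_vreinterpret_u32m1(f));      // as_bytes(f)
//   vint32m1_t ints  = __riscv_vreinterpret_i32m1(__riscv_vreinterpret_u32m1(bytes)); // bitcast(bytes)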
+ // + template + struct rvv_type_info; +#define XSIMD_RVV_MAKE_TYPE(scalar, t, s, vmul) \ + template <> \ + struct rvv_type_info \ + { \ + static constexpr size_t width = rvv_width_m1 * vmul; \ + using type = XSIMD_RVV_TYPE(t, s, vmul); \ + using byte_type = XSIMD_RVV_TYPE(u, 8, vmul); \ + using fixed_type = type __attribute__((riscv_rvv_vector_bits(width))); \ + template \ + static inline type bitcast(U x) noexcept \ + { \ + const auto words = XSIMD_RVV_JOINT5(__riscv_vreinterpret_, u, s, m, vmul)(x); \ + return XSIMD_RVV_JOINT5(__riscv_vreinterpret_, t, s, m, vmul)(words); \ + } \ + template <> \ + inline type bitcast(type x) noexcept { return x; } \ + static inline byte_type as_bytes(type x) noexcept \ + { \ + const auto words = XSIMD_RVV_JOINT5(__riscv_vreinterpret_, u, s, m, vmul)(x); \ + return XSIMD_RVV_JOINT5(__riscv_vreinterpret_, u, 8, m, vmul)(words); \ + } \ + }; + +#define XSIMD_RVV_MAKE_TYPES(vmul) \ + XSIMD_RVV_MAKE_TYPE(int8_t, i, 8, vmul) \ + XSIMD_RVV_MAKE_TYPE(uint8_t, u, 8, vmul) \ + XSIMD_RVV_MAKE_TYPE(int16_t, i, 16, vmul) \ + XSIMD_RVV_MAKE_TYPE(uint16_t, u, 16, vmul) \ + XSIMD_RVV_MAKE_TYPE(int32_t, i, 32, vmul) \ + XSIMD_RVV_MAKE_TYPE(uint32_t, u, 32, vmul) \ + XSIMD_RVV_MAKE_TYPE(int64_t, i, 64, vmul) \ + XSIMD_RVV_MAKE_TYPE(uint64_t, u, 64, vmul) \ + XSIMD_RVV_MAKE_TYPE(float, f, 32, vmul) \ + XSIMD_RVV_MAKE_TYPE(double, f, 64, vmul) + + XSIMD_RVV_MAKE_TYPES(8) + XSIMD_RVV_MAKE_TYPES(4) + XSIMD_RVV_MAKE_TYPES(2) + XSIMD_RVV_MAKE_TYPES(1) +#undef XSIMD_RVV_TYPE +#undef XSIMD_RVV_TYPE_f +#undef XSIMD_RVV_TYPE_u +#undef XSIMD_RVV_TYPE_i +#undef XSIMD_RVV_MAKE_TYPES +#undef XSIMD_RVV_MAKE_TYPE + + // rvv_blob is storage-type abstraction for a vector register. + template + struct rvv_blob : public rvv_type_info + { + using super = rvv_type_info; + using typename super::fixed_type; + using typename super::type; + + fixed_type value; + type get() const { return value; } + void set(type v) { value = v; } + }; + // + // But sometimes we want our storage type to be less than a whole + // register, while presenting as a whole register to the outside + // world. This is because some partial-register types are not + // defined, but they can (mostly) be emulated using shorter vl on a + // full-width register for arithmetic, and cast back to a partial + // byte register for storage. 
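// [Editorial sketch, not part of the upstream patch] For byte data held in a
// half-width register (vuint8mf2_t storage, with hypothetical variable names),
// the widen/operate/truncate pattern described above looks roughly like:
//   vuint8m1_t a = __riscv_vlmul_ext_v_u8mf2_u8m1(lhs_bytes);
//   vuint8m1_t b = __riscv_vlmul_ext_v_u8mf2_u8m1(rhs_bytes);
//   vuint8m1_t r = __riscv_vadd_vv_u8m1(a, b, rvv_width_mf2 / 8); // vl covers only the live half
//   out_bytes    = __riscv_vlmul_trunc_v_u8m1_u8mf2(r);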
+ // + template + struct rvv_semiblob : public rvv_type_info + { + using super = rvv_type_info; + static constexpr size_t width = rvv_width_m1 / divisor; + using typename super::type; + template + struct semitype; + template <> + struct semitype<2> + { + using type = vuint8mf2_t __attribute__((riscv_rvv_vector_bits(rvv_width_mf2))); + }; + template <> + struct semitype<4> + { + using type = vuint8mf4_t __attribute__((riscv_rvv_vector_bits(rvv_width_mf4))); + }; + template <> + struct semitype<8> + { + using type = vuint8mf8_t __attribute__((riscv_rvv_vector_bits(rvv_width_mf8))); + }; + using fixed_type = typename semitype::type; + using super::as_bytes; + using super::bitcast; + + fixed_type value; + template + vuint8m1_t get_bytes() const; + template <> + vuint8m1_t get_bytes<2>() const { return __riscv_vlmul_ext_v_u8mf2_u8m1(value); } + template <> + vuint8m1_t get_bytes<4>() const { return __riscv_vlmul_ext_v_u8mf4_u8m1(value); } + template <> + vuint8m1_t get_bytes<8>() const { return __riscv_vlmul_ext_v_u8mf8_u8m1(value); } + type get() const noexcept + { + vuint8m1_t bytes = get_bytes(); + return bitcast(bytes); + } + template + void set_bytes(vuint8m1_t); + template <> + void set_bytes<2>(vuint8m1_t v) { value = __riscv_vlmul_trunc_v_u8m1_u8mf2(v); } + template <> + void set_bytes<4>(vuint8m1_t v) { value = __riscv_vlmul_trunc_v_u8m1_u8mf4(v); } + template <> + void set_bytes<8>(vuint8m1_t v) { value = __riscv_vlmul_trunc_v_u8m1_u8mf8(v); } + void set(type v) + { + vuint8m1_t bytes = as_bytes(v); + set_bytes(bytes); + } + }; + template + struct rvv_blob : rvv_semiblob + { + }; + template + struct rvv_blob : rvv_semiblob + { + }; + template + struct rvv_blob : rvv_semiblob + { + }; + + // It's difficult dealing with both char and whichever *int8_t type + // is compatible with char, so just avoid it altogether. + // + using rvv_char_t = typename std::conditional::value, int8_t, uint8_t>::type; + template + using rvv_fix_char_t = typename std::conditional< + std::is_same::type>::value, + rvv_char_t, T>::type; + + // An explicit constructor isn't really explicit enough to allow + // implicit bit-casting operations between incompatible types, so + // we add this vacuous flag argument when we're serious: + // + enum rvv_bitcast_flag + { + XSIMD_RVV_BITCAST + }; + + // the general-purpose vector register type, usable within + // templates, and supporting arithmetic on partial registers for + // which there is no intrinsic type (by casting via a full register + // type). + // + template + struct rvv_reg + { + static constexpr size_t width = Width; + static constexpr size_t vl = Width / (sizeof(T) * 8); + using blob_type = rvv_blob; + using register_type = typename blob_type::type; + using byte_type = typename blob_type::byte_type; + blob_type value; + rvv_reg() noexcept = default; + rvv_reg(register_type x) noexcept { value.set(x); } + explicit rvv_reg(byte_type v, rvv_bitcast_flag) { value.set(value.bitcast(v)); } + template + explicit rvv_reg(rvv_reg v, rvv_bitcast_flag) + : rvv_reg(v.get_bytes(), XSIMD_RVV_BITCAST) + { + } + byte_type get_bytes() const noexcept + { + return blob_type::as_bytes(value.get()); + } + operator register_type() const noexcept { return value.get(); } + }; + template + using rvv_reg_t = typename std::conditional::value, rvv_reg, Width>, void>::type; + + // And some more of the same stuff for bool types, which have + // similar problems and similar workarounds. 
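// [Editorial sketch, not part of the upstream patch] RVV mask registers are
// named by the SEW/LMUL ratio rather than by element type, so a mask over
// 32-bit lanes at LMUL=1 is vbool32_t. Keeping the mask in byte storage and
// reinterpreting on demand, as rvv_bool below does, looks roughly like:
//   vuint8m1_t stored = /* byte-backed storage */;
//   vbool32_t  m      = __riscv_vreinterpret_b32(stored); // view as a 32-bit-lane mask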
+ // + template + struct rvv_bool_info; +#define XSIMD_RVV_MAKE_BOOL_TYPE(i) \ + template <> \ + struct rvv_bool_info \ + { \ + using type = XSIMD_RVV_JOINT(vbool, i, _t); \ + template \ + static inline type bitcast(T value) noexcept \ + { \ + return XSIMD_RVV_JOINT(__riscv_vreinterpret_b, i, )(value); \ + } \ + /*template <> static inline type bitcast(type value) noexcept { return value; }*/ \ + }; + XSIMD_RVV_MAKE_BOOL_TYPE(1); + XSIMD_RVV_MAKE_BOOL_TYPE(2); + XSIMD_RVV_MAKE_BOOL_TYPE(4); + XSIMD_RVV_MAKE_BOOL_TYPE(8); + XSIMD_RVV_MAKE_BOOL_TYPE(16); + XSIMD_RVV_MAKE_BOOL_TYPE(32); + XSIMD_RVV_MAKE_BOOL_TYPE(64); +#undef XSIMD_RVV_MAKE_BOOL_TYPE +#undef XSIMD_RVV_JOINT5 +#undef XSIMD_RVV_JOINT +#undef XSIMD_RVV_JOINT_ + + template + struct rvv_bool + { + using bool_info = rvv_bool_info; + using storage_type = vuint8m1_t __attribute__((riscv_rvv_vector_bits(rvv_width_m1))); + using type = typename bool_info::type; + storage_type value; + rvv_bool() = default; + rvv_bool(type v) noexcept + : value(__riscv_vreinterpret_u8m1(v)) + { + } + template ::type = 0> + rvv_bool(rvv_bool v) + : value(v.value) + { + } + explicit rvv_bool(uint8_t mask) noexcept + : value(__riscv_vmv_v_x_u8m1(mask, rvv_width_m1 / 8)) + { + } + explicit rvv_bool(uint64_t mask) noexcept + : value(__riscv_vreinterpret_v_u64m1_u8m1(__riscv_vmv_v_x_u64m1(mask, rvv_width_m1 / 64))) + { + } + operator type() const noexcept { return bool_info::bitcast(value); } + }; + + template + using rvv_bool_t = typename std::enable_if < !std::is_void::value, + rvv_bool, Width>::type; + + template + struct rvv_vector_type_impl; + + template <> + struct rvv_vector_type_impl<8> + { + using signed_type = rvv_reg_t; + using unsigned_type = rvv_reg_t; + using floating_point_type = void; + }; + + template <> + struct rvv_vector_type_impl<16> + { + using signed_type = rvv_reg_t; + using unsigned_type = rvv_reg_t; + using floating_point_type = rvv_reg_t<_Float16>; + }; + + template <> + struct rvv_vector_type_impl<32> + { + using signed_type = rvv_reg_t; + using unsigned_type = rvv_reg_t; + using floating_point_type = rvv_reg_t; + }; + + template <> + struct rvv_vector_type_impl<64> + { + using signed_type = rvv_reg_t; + using unsigned_type = rvv_reg_t; + using floating_point_type = rvv_reg_t; + }; + + template + using signed_int_rvv_vector_type = typename rvv_vector_type_impl<8 * sizeof(T)>::signed_type; + + template + using unsigned_int_rvv_vector_type = typename rvv_vector_type_impl<8 * sizeof(T)>::unsigned_type; + + template + using floating_point_rvv_vector_type = typename rvv_vector_type_impl<8 * sizeof(T)>::floating_point_type; + + template + using signed_int_or_floating_point_rvv_vector_type = typename std::conditional::value, + floating_point_rvv_vector_type, + signed_int_rvv_vector_type>::type; + + template + using rvv_vector_type = typename std::conditional::value, + signed_int_or_floating_point_rvv_vector_type, + unsigned_int_rvv_vector_type>::type; + } // namespace detail + + XSIMD_DECLARE_SIMD_REGISTER(bool, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(signed char, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(unsigned char, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(char, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(short, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(unsigned short, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(int, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(unsigned int, rvv, detail::rvv_vector_type); + 
XSIMD_DECLARE_SIMD_REGISTER(long int, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(long long int, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(float, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(double, rvv, detail::rvv_vector_type); + + namespace detail + { + template + struct rvv_bool_simd_register + { + using register_type = rvv_bool_t; + register_type data; + operator register_type() const noexcept { return data; } + }; + } // namespace detail + + template + struct get_bool_simd_register + { + using type = detail::rvv_bool_simd_register; + }; + } // namespace types +#endif +} // namespace xsimd + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_sse2_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_sse2_register.hpp index 3855a9d7dc01..a9dc8960b660 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_sse2_register.hpp @@ -40,7 +40,6 @@ namespace xsimd #if XSIMD_WITH_SSE2 namespace types { - XSIMD_DECLARE_SIMD_REGISTER(bool, sse2, __m128i); XSIMD_DECLARE_SIMD_REGISTER(signed char, sse2, __m128i); XSIMD_DECLARE_SIMD_REGISTER(unsigned char, sse2, __m128i); XSIMD_DECLARE_SIMD_REGISTER(char, sse2, __m128i); diff --git a/third_party/xsimd/include/xsimd/types/xsimd_sve_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_sve_register.hpp index 1ae678684138..27b241980cbf 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_sve_register.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_sve_register.hpp @@ -36,7 +36,7 @@ namespace xsimd static constexpr bool available() noexcept { return true; } static constexpr bool requires_alignment() noexcept { return true; } static constexpr std::size_t alignment() noexcept { return 16; } - static constexpr unsigned version() noexcept { return generic::version(9, 0, 0); } + static constexpr unsigned version() noexcept { return generic::version(9, Width / 32, 0); } static constexpr char const* name() noexcept { return "arm64+sve"; } }; } diff --git a/third_party/xsimd/include/xsimd/types/xsimd_wasm_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_wasm_register.hpp index ab8782ac6a84..237db95c6e30 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_wasm_register.hpp @@ -40,7 +40,6 @@ namespace xsimd #if XSIMD_WITH_WASM namespace types { - XSIMD_DECLARE_SIMD_REGISTER(bool, wasm, v128_t); XSIMD_DECLARE_SIMD_REGISTER(signed char, wasm, v128_t); XSIMD_DECLARE_SIMD_REGISTER(unsigned char, wasm, v128_t); XSIMD_DECLARE_SIMD_REGISTER(char, wasm, v128_t); diff --git a/third_party/xsimd/moz.yaml b/third_party/xsimd/moz.yaml index 76a3dc90c272..c99a5991ccf2 100644 --- a/third_party/xsimd/moz.yaml +++ b/third_party/xsimd/moz.yaml @@ -10,8 +10,8 @@ origin: url: https://github.com/QuantStack/xsimd - release: 11.2.0 (2023-11-08T21:37:47+01:00). - revision: 11.2.0 + release: 12.1.1 (2023-12-12T17:17:27+01:00). + revision: 12.1.1 license: BSD-3-Clause
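Editorial addendum (illustration only, not part of the vendored patch): the new xsimd::compress and xsimd::expand entry points added in this update can be exercised with a small standalone program along the following lines, assuming xsimd 12.1.1 is on the include path and at least one SIMD architecture is enabled at compile time.

#include <xsimd/xsimd.hpp>
#include <cstdio>

int main()
{
    using batch = xsimd::batch<float>;   // best architecture enabled at compile time
    float in[batch::size];
    for (std::size_t i = 0; i < batch::size; ++i)
        in[i] = static_cast<float>(i);

    const auto x = batch::load_unaligned(in);
    const auto mask = x > batch(1.0f);   // keep lanes strictly greater than 1

    // compress: kept lanes are packed to the front, the remaining slots are zeroed
    const auto packed = xsimd::compress(x, mask);
    // expand: contiguous lanes are placed back into the masked slots, the others are zeroed
    const auto unpacked = xsimd::expand(packed, mask);

    float out[batch::size];
    packed.store_unaligned(out);
    std::printf("first packed lane: %g\n", static_cast<double>(out[0]));
    unpacked.store_unaligned(out);
    std::printf("first expanded lane: %g\n", static_cast<double>(out[0]));
    return 0;
}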