diff --git a/third_party/xsimd/Changelog.rst b/third_party/xsimd/Changelog.rst
index 9e6299c3b4cc..2c569a8bd3fc 100644
--- a/third_party/xsimd/Changelog.rst
+++ b/third_party/xsimd/Changelog.rst
@@ -9,6 +9,41 @@ Changelog
 =========
+12.1.1
+------
+
+    * Update readme with a section on adoption, and a section on the history of the project
+
+    * Fix/avx512vnni implementation
+
+    * Fix regression on XSIMD_NO_SUPPORTED_ARCHITECTURE
+
+12.1.0
+------
+
+    * Fix various problems with architecture version handling
+
+    * Specialize xsimd::compress for riscv
+
+    * Provide stubs for various avx512xx architectures
+
+12.0.0
+------
+
+    * Fix sincos implementation to cope with Emscripten
+
+    * Upgraded minimal version of cmake to remove deprecation warning
+
+    * Fixed constants::signmask for GCC when using ffast-math
+
+    * Add RISC-V Vector support
+
+    * Generic, simple implementation for xsimd::compress
+
+    * Disable batch of bools, and suggest using batch_bool instead
+
+    * Add an option to skip installation
+
 11.2.0
 ------
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp
index 10bf2abffbdf..05d27b3d4704 100644
--- a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp
@@ -95,12 +95,12 @@ namespace xsimd
         template
         inline batch bitofsign(batch const& self, requires_arch) noexcept
         {
-            return self & constants::minuszero>();
+            return self & constants::signmask>();
         }
         template
         inline batch bitofsign(batch const& self, requires_arch) noexcept
         {
-            return self & constants::minuszero>();
+            return self & constants::signmask>();
         }
 
         // bitwise_cast
@@ -974,12 +974,8 @@ namespace xsimd
         template
         inline batch, A> polar(const batch& r, const batch& theta, requires_arch) noexcept
         {
-#ifndef EMSCRIPTEN
             auto sincosTheta = sincos(theta);
             return { r * sincosTheta.second, r * sincosTheta.first };
-#else
-            return { r * cos(theta), r * sin(theta) };
-#endif
         }
 
         // fdim
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp
index 90e58c9d7be1..e9e9065832a1 100644
--- a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp
@@ -32,6 +32,60 @@ namespace xsimd
 
         using namespace types;
 
+        // compress
+        namespace detail
+        {
+            template
+            inline batch create_compress_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence)
+            {
+                batch swizzle_mask(IT(0));
+                alignas(A::alignment()) IT mask_buffer[batch::size] = { Is...
}; + size_t inserted = 0; + for (size_t i = 0; i < sizeof...(Is); ++i) + if ((bitmask >> i) & 1u) + std::swap(mask_buffer[inserted++], mask_buffer[i]); + return batch::load_aligned(&mask_buffer[0]); + } + } + + template + inline batch + compress(batch const& x, batch_bool const& mask, + kernel::requires_arch) noexcept + { + using IT = as_unsigned_integer_t; + constexpr std::size_t size = batch_bool::size; + auto bitmask = mask.mask(); + auto z = select(mask, x, batch((T)0)); + auto compress_mask = detail::create_compress_swizzle_mask(bitmask, ::xsimd::detail::make_index_sequence()); + return swizzle(z, compress_mask); + } + + // expand + namespace detail + { + template + inline batch create_expand_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence) + { + batch swizzle_mask(IT(0)); + IT j = 0; + (void)std::initializer_list { ((swizzle_mask = insert(swizzle_mask, j, index())), (j += ((bitmask >> Is) & 1u)), true)... }; + return swizzle_mask; + } + } + + template + inline batch + expand(batch const& x, batch_bool const& mask, + kernel::requires_arch) noexcept + { + constexpr std::size_t size = batch_bool::size; + auto bitmask = mask.mask(); + auto swizzle_mask = detail::create_expand_swizzle_mask, A>(bitmask, ::xsimd::detail::make_index_sequence()); + auto z = swizzle(x, swizzle_mask); + return select(mask, z, batch(T(0))); + } + // extract_pair template inline batch extract_pair(batch const& self, batch const& other, std::size_t i, requires_arch) noexcept diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512er.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512er.hpp new file mode 100644 index 000000000000..be02f9850b11 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512er.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_AVX512ER_HPP +#define XSIMD_AVX512ER_HPP + +#include +#include + +#include "../types/xsimd_avx512er_register.hpp" + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp index 572aba3b068e..7ee46101356a 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp @@ -661,6 +661,38 @@ namespace xsimd return _mm512_roundscale_pd(self, _MM_FROUND_TO_POS_INF); } + // compress + template + inline batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_compress_ps(mask.mask(), self); + } + template + inline batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_compress_pd(mask.mask(), self); + } + template + inline batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_compress_epi32(mask.mask(), self); + } + template + inline batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_compress_epi32(mask.mask(), self); + } + template + inline batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_compress_epi64(mask.mask(), self); + } + template + inline batch compress(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_compress_epi64(mask.mask(), self); + } + // convert namespace detail { @@ -756,6 +788,38 @@ namespace xsimd return register_type(~self.data ^ other.data); } + // expand + template + inline batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_expand_ps(mask.mask(), self); + } + template + inline batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_expand_pd(mask.mask(), self); + } + template + inline batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_expand_epi32(mask.mask(), self); + } + template + inline batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_expand_epi32(mask.mask(), self); + } + template + inline batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_expand_epi64(mask.mask(), self); + } + template + inline batch expand(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return _mm512_maskz_expand_epi64(mask.mask(), self); + } + // floor template inline batch floor(batch const& self, requires_arch) noexcept @@ -1969,10 +2033,12 @@ namespace xsimd XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { assert(false && "not implemented yet"); + return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { assert(false && "not implemented yet"); + return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { @@ -2035,10 +2101,12 @@ namespace xsimd XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { assert(false && "not implemented yet"); + return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { assert(false && "not implemented yet"); + return {}; } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp new file mode 100644 index 000000000000..df382881b0b2 --- /dev/null +++ 
b/third_party/xsimd/include/xsimd/arch/xsimd_avx512ifma.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512VBMI_HPP +#define XSIMD_AVX512VBMI_HPP + +#include +#include + +#include "../types/xsimd_avx512vbmi_register.hpp" + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp new file mode 100644 index 000000000000..6265c91718fb --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512pf.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512PF_HPP +#define XSIMD_AVX512PF_HPP + +#include +#include + +#include "../types/xsimd_avx512pf_register.hpp" + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp new file mode 100644 index 000000000000..df382881b0b2 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512vbmi.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512VBMI_HPP +#define XSIMD_AVX512VBMI_HPP + +#include +#include + +#include "../types/xsimd_avx512vbmi_register.hpp" + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp new file mode 100644 index 000000000000..b285623d02f6 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512bw.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_AVX512VNNI_AVX512_BW_HPP +#define XSIMD_AVX512VNNI_AVX512_BW_HPP + +#include +#include + +#include "../types/xsimd_avx512vnni_avx512bw_register.hpp" + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi.hpp new file mode 100644 index 000000000000..a70d30fad598 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512vnni_avx512vbmi.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512VNNI_AVX512VBMI_HPP +#define XSIMD_AVX512VNNI_AVX512VBMI_HPP + +#include +#include + +#include "../types/xsimd_avx512vnni_avx512vbmi_register.hpp" + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp new file mode 100644 index 000000000000..a97ba9296c51 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avxvnni.hpp @@ -0,0 +1,20 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVXVNNI_HPP +#define XSIMD_AVXVNNI_HPP + +#include +#include + +#include "../types/xsimd_avxvnni_register.hpp" + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp index a9cf971db5ad..22dd5d3e3030 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp @@ -56,6 +56,11 @@ namespace xsimd return bit_cast((uint64_t)DOUBLE); \ } +// Under fast-math, GCC might replace signmask (minus zero) by zero +#if defined(__FAST_MATH__) && defined(__GNUC__) && !defined(__clang__) +#pragma GCC push_options +#pragma GCC optimize("signed-zeros") +#endif XSIMD_DEFINE_CONSTANT(infinity, (std::numeric_limits::infinity()), (std::numeric_limits::infinity())) XSIMD_DEFINE_CONSTANT(invlog_2, 1.442695040888963407359924681001892137426645954152986f, 1.442695040888963407359924681001892137426645954152986) XSIMD_DEFINE_CONSTANT_HEX(invlog_2hi, 0x3fb8b000, 0x3ff7154765200000) @@ -79,7 +84,6 @@ namespace xsimd XSIMD_DEFINE_CONSTANT(minlog2, -127.0f, -1023.) 
XSIMD_DEFINE_CONSTANT(minlog10, -37.89999771118164f, -308.2547155599167) XSIMD_DEFINE_CONSTANT(minusinfinity, (-infinity()), (-infinity())) - XSIMD_DEFINE_CONSTANT(minuszero, -0.0f, -0.0) XSIMD_DEFINE_CONSTANT_HEX(nan, 0xffffffff, 0xffffffffffffffff) XSIMD_DEFINE_CONSTANT_HEX(oneosqrteps, 0x453504f3, 0x4190000000000000) XSIMD_DEFINE_CONSTANT_HEX(oneotwoeps, 0x4a800000, 0x4320000000000000) @@ -104,6 +108,9 @@ namespace xsimd XSIMD_DEFINE_CONSTANT_HEX(twoopi, 0x3f22f983, 0x3fe45f306dc9c883) XSIMD_DEFINE_CONSTANT(twotonmb, 8388608.0f, 4503599627370496.0) XSIMD_DEFINE_CONSTANT_HEX(twotonmbo3, 0x3ba14518, 0x3ed428a2f98d7286) +#if defined(__FAST_MATH__) && defined(__GNUC__) && !defined(__clang__) +#pragma GCC pop_options +#endif #undef XSIMD_DEFINE_CONSTANT #undef XSIMD_DEFINE_CONSTANT_HEX diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp index 8f05a5dab2c0..0edd77674178 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp @@ -52,6 +52,10 @@ #include "./xsimd_fma3_avx.hpp" #endif +#if XSIMD_WITH_AVXVNNI +#include "./xsimd_avxvnni.hpp" +#endif + #if XSIMD_WITH_AVX2 #include "./xsimd_avx2.hpp" #endif @@ -68,6 +72,30 @@ #include "./xsimd_avx512bw.hpp" #endif +#if XSIMD_WITH_AVX512ER +#include "./xsimd_avx512er.hpp" +#endif + +#if XSIMD_WITH_AVX512PF +#include "./xsimd_avx512pf.hpp" +#endif + +#if XSIMD_WITH_AVX512IFMA +#include "./xsimd_avx512ifma.hpp" +#endif + +#if XSIMD_WITH_AVX512VBMI +#include "./xsimd_avx512vbmi.hpp" +#endif + +#if XSIMD_WITH_AVX512VNNI_AVX512BW +#include "./xsimd_avx512vnni_avx512bw.hpp" +#endif + +#if XSIMD_WITH_AVX512VNNI_AVX512VBMI +#include "./xsimd_avx512vnni_avx512vbmi.hpp" +#endif + #if XSIMD_WITH_NEON #include "./xsimd_neon.hpp" #endif @@ -80,6 +108,10 @@ #include "./xsimd_sve.hpp" #endif +#if XSIMD_WITH_RVV +#include "./xsimd_rvv.hpp" +#endif + #if XSIMD_WITH_WASM #include "./xsimd_wasm.hpp" #endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_rvv.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_rvv.hpp new file mode 100644 index 000000000000..98d1de9ce341 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_rvv.hpp @@ -0,0 +1,1499 @@ +/*************************************************************************** + + * Copyright (c) Rivos Inc. * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_RVV_HPP +#define XSIMD_RVV_HPP + +#include +#include +#include + +#include "../types/xsimd_rvv_register.hpp" +#include "xsimd_constants.hpp" + +// This set of macros allows the synthesis of identifiers using a template and +// variable macro arguments. A single template can then be used by multiple +// macros, or multiple instances of a macro to define the same logic for +// different data types. +// +// First some logic to paste text together... +// +#define XSIMD_RVV_JOIN_(x, y) x##y +#define XSIMD_RVV_JOIN(x, y) XSIMD_RVV_JOIN_(x, y) +#define XSIMD_RVV_PREFIX_T(T, S, then) XSIMD_RVV_JOIN(T, then) +#define XSIMD_RVV_PREFIX_S(T, S, then) XSIMD_RVV_JOIN(S, then) +#define XSIMD_RVV_PREFIX_M(T, S, then) XSIMD_RVV_JOIN(m1, then) +#define XSIMD_RVV_PREFIX(T, S, then) then +// +// XSIMD_RVV_IDENTIFIER accepts type and size parameters, and a template for +// the identifier. 
The template is a comma-separated list of alternating +// literal and parameter segments. Each parameter is appended to XSIMD_RVV_PREFIX to +// form a new macro name which decides which parameter should be inserted. +// Then a literal segment is inserted after that. Empty literals are used to +// join two or more variables together. +// +#define XSIMD_RVV_IDENTIFIER9(T, S, t, ...) t +#define XSIMD_RVV_IDENTIFIER8(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER9(T, S, __VA_ARGS__))) +#define XSIMD_RVV_IDENTIFIER7(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER8(T, S, __VA_ARGS__))) +#define XSIMD_RVV_IDENTIFIER6(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER7(T, S, __VA_ARGS__))) +#define XSIMD_RVV_IDENTIFIER5(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER6(T, S, __VA_ARGS__))) +#define XSIMD_RVV_IDENTIFIER4(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER5(T, S, __VA_ARGS__))) +#define XSIMD_RVV_IDENTIFIER3(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER4(T, S, __VA_ARGS__))) +#define XSIMD_RVV_IDENTIFIER2(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER3(T, S, __VA_ARGS__))) +#define XSIMD_RVV_IDENTIFIER1(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER2(T, S, __VA_ARGS__))) +#define XSIMD_RVV_IDENTIFIER0(T, S, t, p, ...) XSIMD_RVV_JOIN(t, XSIMD_RVV_PREFIX##p(T, S, XSIMD_RVV_IDENTIFIER1(T, S, __VA_ARGS__))) +// +// UNBRACKET and REPARSE force the preprocessor to handle expansion in a +// specific order. XSIMD_RVV_UNBRACKET strips the parentheses from the template +// (which were necessary to keep the template as a single, named macro +// parameter up to this point). XSIMD_RVV_ARG_LIST then forms the new parameter list +// to pass to XSIMD_RVV_IDENTIFIER0, with trailing commas to ensure the unrolled +// XSIMD_RVV_IDENTIFIER loop runs to completion adding empty strings. +// +// However XSIMD_RVV_IDENTIFIER0 is not expanded immediately because it does not +// match a function-like macro in this pass. XSIMD_RVV_REPARSE forces another +// evaluation after the expansion of XSIMD_RVV_ARG_LIST, where XSIMD_RVV_IDENTIFIER0 will +// now match as a function-like macro, and the cycle of substitutions and +// insertions can begin. +// +#define XSIMD_RVV_REPARSE(v) (v) +#define XSIMD_RVV_UNBRACKET(...) __VA_ARGS__ +#define XSIMD_RVV_ARG_LIST(T, S, name) (T, S, XSIMD_RVV_UNBRACKET name, , , , , , , , , , , , , , , , , , , , , ) +#define XSIMD_RVV_IDENTIFIER(T, S, name) XSIMD_RVV_REPARSE(XSIMD_RVV_IDENTIFIER0 XSIMD_RVV_ARG_LIST(T, S, name)) +// +// To avoid comma-counting bugs, replace the variable references with macros +// which include enough commas to keep proper phase, and then use no commas at +// all in the templates. +// +#define XSIMD_RVV_T , _T, +#define XSIMD_RVV_S , _S, +#define XSIMD_RVV_M , _M, +#define XSIMD_RVV_TSM XSIMD_RVV_T XSIMD_RVV_S XSIMD_RVV_M + +// XSIMD_RVV_OVERLOAD, below, expands to a head section, a number of body sections +// (depending on which types are supported), and a tail section. Different +// variants of these sections are implemented with different suffixes on the +// three macro names XSIMD_RVV_WRAPPER_HEAD, XSIMD_RVV_WRAPPER, and XSIMD_RVV_WRAPPER_TAIL and +// specified as an argument to XSIMD_RVV_OVERLOAD (the empty string is the default, +// but still needs an extra comma to hold its place). 
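// For orientation only, here is a stand-alone, simplified sketch of the shape
// these head/body/tail sections ultimately produce: a functor template
// specialised per call signature, then folded into a single constexpr instance
// whose operator() overloads cover each supported element type. Every name
// below is invented for illustration; it is not part of xsimd or of the RVV
// intrinsics.
template <class Sig>
struct demo_impl
{
    // primary template: inert fallback, like the default impl the real wrappers provide
    void operator()() const noexcept {}
};
template <>
struct demo_impl<int(int)>
{
    // stands in for an integer intrinsic wrapper
    constexpr int operator()(int x) const noexcept { return x + 1; }
};
template <>
struct demo_impl<float(float)>
{
    // stands in for the differently-named floating-point intrinsic wrapper
    constexpr float operator()(float x) const noexcept { return x + 1.0f; }
};
// the "tail": one object, many overloads, usable from templates under a single name
static constexpr struct : demo_impl<int(int)>, demo_impl<float(float)>
{
    using demo_impl<int(int)>::operator();
    using demo_impl<float(float)>::operator();
} demo_op {};
// demo_op(2) resolves to the int overload; demo_op(2.0f) to the float one.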
+// +// The default XSIMD_RVV_WRAPPER_HEAD provides a class containing convenient names +// for the function signature argument(s) to XSIMD_RVV_OVERLOAD. That signature can +// also reference the template argument T, because it's a text substitution +// into the template. +#define XSIMD_RVV_WRAPPER_HEAD(NAME, SIGNATURE, ...) \ + namespace NAME##_cruft \ + { \ + template \ + struct ctx \ + { \ + static constexpr size_t width = XSIMD_RVV_BITS; \ + static constexpr size_t vl = width / (sizeof(T) * 8); \ + using vec = rvv_reg_t; \ + using uvec = rvv_reg_t, width>; \ + using svec = rvv_reg_t, width>; \ + using fvec = rvv_reg_t, width>; \ + using bvec = rvv_bool_t; \ + using scalar_vec = rvv_reg_t; \ + using wide_vec = rvv_reg_t; \ + using narrow_vec = rvv_reg_t; \ + using type = SIGNATURE; \ + }; \ + template \ + using sig_t = typename ctx::type; \ + template \ + struct impl \ + { \ + void operator()() const noexcept {}; \ + }; \ + template \ + using impl_t = impl>; + +#define XSIMD_RVV_WRAPPER_HEAD_NOVL(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__) +#define XSIMD_RVV_WRAPPER_HEAD_DROP_1ST(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__) +#define XSIMD_RVV_WRAPPER_HEAD_DROP_1ST_CUSTOM_ARGS(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__) +#define XSIMD_RVV_WRAPPER_HEAD_DROP_1ST_CUSTOM_ARGS_NOVL(...) XSIMD_RVV_WRAPPER_HEAD(__VA_ARGS__) + +// The body of the wrapper defines a functor (because partial specialisation of +// functions is not legal) which forwards its arguments to the named intrinsic +// with a few manipulations. In general, vector types are handled as +// rvv_reg_t<> and rely on the conversion operators in that class for +// compatibility with the intrinsics. +// +// The function signature is not mentioned here. Instead it's provided in the +// tail code as the template argument for which this is a specialisation, which +// overcomes the problem of converting a function signature type to an argument +// list to pass to another function. +// +#define XSIMD_RVV_WRAPPER(KEY, CALLEE, ...) \ + template \ + struct impl \ + { \ + using ctx = ctx; \ + constexpr Ret operator()(Args... args) const noexcept \ + { \ + return CALLEE(args..., ctx::vl); \ + }; \ + }; +#define XSIMD_RVV_WRAPPER_NOVL(KEY, CALLEE, ...) \ + template \ + struct impl \ + { \ + constexpr Ret operator()(Args... args) const noexcept \ + { \ + return CALLEE(args...); \ + }; \ + }; +#define XSIMD_RVV_WRAPPER_DROP_1ST(KEY, CALLEE, ...) \ + template \ + struct impl \ + { \ + using ctx = ctx; \ + constexpr Ret operator()(First, Args... args) const noexcept \ + { \ + return CALLEE(args..., ctx::vl); \ + }; \ + }; +#define XSIMD_RVV_WRAPPER_DROP_1ST_CUSTOM_ARGS(KEY, CALLEE, SIGNATURE, ...) \ + template \ + struct impl \ + { \ + using ctx = ctx; \ + constexpr Ret operator()(First, Args... args) const noexcept \ + { \ + return CALLEE(__VA_ARGS__, ctx::vl); \ + }; \ + }; +#define XSIMD_RVV_WRAPPER_DROP_1ST_CUSTOM_ARGS_NOVL(KEY, CALLEE, SIGNATURE, ...) \ + template \ + struct impl \ + { \ + constexpr Ret operator()(First, Args... args) const noexcept \ + { \ + return CALLEE(__VA_ARGS__); \ + }; \ + }; + +// This part folds all the above templates down into a single functor instance +// with all the different function signatures available under the one name. +// Not all of the base classes necessarily contain useful code, but there's a +// default implementation so that filtering them out isn't really necessary. +#define XSIMD_RVV_WRAPPER_TAIL(NAME, ...) 
\ + } /* namespace NAME##_cruft */ \ + static constexpr struct : NAME##_cruft::impl_t, \ + NAME##_cruft::impl_t, \ + NAME##_cruft::impl_t, \ + NAME##_cruft::impl_t, \ + NAME##_cruft::impl_t, \ + NAME##_cruft::impl_t, \ + NAME##_cruft::impl_t, \ + NAME##_cruft::impl_t, \ + NAME##_cruft::impl_t, \ + NAME##_cruft::impl_t \ + { \ + using NAME##_cruft::impl_t::operator(); \ + using NAME##_cruft::impl_t::operator(); \ + using NAME##_cruft::impl_t::operator(); \ + using NAME##_cruft::impl_t::operator(); \ + using NAME##_cruft::impl_t::operator(); \ + using NAME##_cruft::impl_t::operator(); \ + using NAME##_cruft::impl_t::operator(); \ + using NAME##_cruft::impl_t::operator(); \ + using NAME##_cruft::impl_t::operator(); \ + using NAME##_cruft::impl_t::operator(); \ + } NAME {}; +#define XSIMD_RVV_WRAPPER_TAIL_NOVL(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__) +#define XSIMD_RVV_WRAPPER_TAIL_DROP_1ST(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__) +#define XSIMD_RVV_WRAPPER_TAIL_DROP_1ST_CUSTOM_ARGS(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__) +#define XSIMD_RVV_WRAPPER_TAIL_DROP_1ST_CUSTOM_ARGS_NOVL(...) XSIMD_RVV_WRAPPER_TAIL(__VA_ARGS__) + +// clang-format off + +#define XSIMD_RVV_OVERLOAD_head(my_name, variant, ...) \ + XSIMD_RVV_WRAPPER_HEAD##variant(my_name, __VA_ARGS__) +#define XSIMD_RVV_OVERLOAD_i(name, variant, ...) \ + XSIMD_RVV_WRAPPER##variant(int8_t, XSIMD_RVV_IDENTIFIER(i, 8, name), __VA_ARGS__) \ + XSIMD_RVV_WRAPPER##variant(int16_t, XSIMD_RVV_IDENTIFIER(i, 16, name), __VA_ARGS__) \ + XSIMD_RVV_WRAPPER##variant(int32_t, XSIMD_RVV_IDENTIFIER(i, 32, name), __VA_ARGS__) \ + XSIMD_RVV_WRAPPER##variant(int64_t, XSIMD_RVV_IDENTIFIER(i, 64, name), __VA_ARGS__) +#define XSIMD_RVV_OVERLOAD_u(name, variant, ...) \ + XSIMD_RVV_WRAPPER##variant(uint8_t, XSIMD_RVV_IDENTIFIER(u, 8, name), __VA_ARGS__) \ + XSIMD_RVV_WRAPPER##variant(uint16_t, XSIMD_RVV_IDENTIFIER(u, 16, name), __VA_ARGS__) \ + XSIMD_RVV_WRAPPER##variant(uint32_t, XSIMD_RVV_IDENTIFIER(u, 32, name), __VA_ARGS__) \ + XSIMD_RVV_WRAPPER##variant(uint64_t, XSIMD_RVV_IDENTIFIER(u, 64, name), __VA_ARGS__) +#define XSIMD_RVV_OVERLOAD_f(name, variant, ...) \ + XSIMD_RVV_WRAPPER##variant(float, XSIMD_RVV_IDENTIFIER(f, 32, name), __VA_ARGS__) \ + XSIMD_RVV_WRAPPER##variant(double, XSIMD_RVV_IDENTIFIER(f, 64, name), __VA_ARGS__) +#define XSIMD_RVV_OVERLOAD_tail(my_name, variant, ...) \ + XSIMD_RVV_WRAPPER_TAIL##variant(my_name, __VA_ARGS__) + +// Use these to create function (actually functor, sorry) wrappers overloaded +// for whichever types are supported. Being functors means they can't take a +// template argument (until C++14), so if a type can't be deduced then a junk +// value can be passed as the first argument and discarded by using the +// _DROP_1ST variant, instead. +// +// The wrappers use the rvv_reg_t<> types for template accessibility, and +// because some types (eg., vfloat64mf2_t) don't exist and need extra +// abstraction to emulate. +// +// In many cases the intrinsic names are different for signed, unsigned, or +// float variants, the macros OVERLOAD2 and OVERLOAD3 (depending on whether or +// not a float variant exists) take multiple intrinsic names and bring them +// together under a single overloaded identifier where they can be used within +// templates. +// +#define XSIMD_RVV_OVERLOAD2(my_name, name_i, name_u, variant, ...) 
\ + XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_i(name_i, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_u(name_u, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__) + +#define XSIMD_RVV_OVERLOAD3(my_name, name_i, name_u, name_f, variant, ...) \ + XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_i(name_i, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_u(name_u, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_f(name_f, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__) + +#define XSIMD_RVV_OVERLOAD(my_name, name, ...) XSIMD_RVV_OVERLOAD3(my_name, name, name, name, __VA_ARGS__) +#define XSIMD_RVV_OVERLOAD_INTS(my_name, name, ...) XSIMD_RVV_OVERLOAD2(my_name, name, name, __VA_ARGS__) + +#define XSIMD_RVV_OVERLOAD_SINTS(my_name, name, variant, ...) \ + XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_i(name, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__) + +#define XSIMD_RVV_OVERLOAD_UINTS(my_name, name, variant, ...) \ + XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_u(name, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__) + +#define XSIMD_RVV_OVERLOAD_FLOATS(my_name, name, variant, ...) \ + XSIMD_RVV_OVERLOAD_head(my_name, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_f(name, variant, __VA_ARGS__) \ + XSIMD_RVV_OVERLOAD_tail(my_name, variant, __VA_ARGS__) + +// clang-format on + +namespace xsimd +{ + template + struct batch_constant; + + namespace kernel + { + namespace detail + { + template + using rvv_fix_char_t = types::detail::rvv_fix_char_t; + template + using rvv_reg_t = types::detail::rvv_reg_t; + template + using rvv_bool_t = types::detail::rvv_bool_t; + + template + struct as_signed_relaxed; + template <> + struct as_signed_relaxed<1> + { + using type = int8_t; + }; + template <> + struct as_signed_relaxed<2> + { + using type = int16_t; + }; + template <> + struct as_signed_relaxed<4> + { + using type = int32_t; + }; + template <> + struct as_signed_relaxed<8> + { + using type = int64_t; + }; + template + using as_signed_relaxed_t = typename as_signed_relaxed::type; + template + struct as_unsigned_relaxed; + template <> + struct as_unsigned_relaxed<1> + { + using type = uint8_t; + }; + template <> + struct as_unsigned_relaxed<2> + { + using type = uint16_t; + }; + template <> + struct as_unsigned_relaxed<4> + { + using type = uint32_t; + }; + template <> + struct as_unsigned_relaxed<8> + { + using type = uint64_t; + }; + template + using as_unsigned_relaxed_t = typename as_unsigned_relaxed::type; + template + struct as_float_relaxed; + template <> + struct as_float_relaxed<1> + { + using type = int8_t; + }; + template <> + struct as_float_relaxed<2> + { + using type = int16_t; + }; + template <> + struct as_float_relaxed<4> + { + using type = float; + }; + template <> + struct as_float_relaxed<8> + { + using type = double; + }; + template + using as_float_relaxed_t = typename as_float_relaxed::type; + + template + rvv_reg_t rvvreinterpret(U const& arg) noexcept + { + return rvv_reg_t(arg, types::detail::XSIMD_RVV_BITCAST); + } + template + rvv_reg_t rvvreinterpret(batch const& arg) noexcept + { + typename batch::register_type r = arg; + return rvvreinterpret(r); + } + + template > + inline batch rvv_to_unsigned_batch(batch const& arg) noexcept + { + return rvvreinterpret(arg.data); + } + + XSIMD_RVV_OVERLOAD(rvvid, + (__riscv_vid_v_u XSIMD_RVV_S 
XSIMD_RVV_M), _DROP_1ST, uvec(T)) + + XSIMD_RVV_OVERLOAD3(rvvmv_splat, + (__riscv_vmv_v_x_ XSIMD_RVV_TSM), + (__riscv_vmv_v_x_ XSIMD_RVV_TSM), + (__riscv_vfmv_v_f_ XSIMD_RVV_TSM), , vec(T)) + + XSIMD_RVV_OVERLOAD3(rvvmv_lane0, + (__riscv_vmv_x), + (__riscv_vmv_x), + (__riscv_vfmv_f), _NOVL, T(vec)) + + XSIMD_RVV_OVERLOAD(rvvmerge, (__riscv_vmerge), , vec(vec, vec, bvec)) + XSIMD_RVV_OVERLOAD3(rvvmerge_splat, + (__riscv_vmerge), + (__riscv_vmerge), + (__riscv_vfmerge), , vec(vec, T, bvec)) + + // count active lanes in a predicate + XSIMD_RVV_OVERLOAD(rvvcpop, (__riscv_vcpop), + , size_t(bvec)); + + template + inline rvv_bool_t pmask8(uint8_t mask) noexcept + { + return rvv_bool_t(mask); + } + template + inline rvv_bool_t pmask(uint64_t mask) noexcept + { + return rvv_bool_t(mask); + } + + template + inline rvv_reg_t vindex() noexcept + { + auto index = rvvid(T {}); + if (shift < 0) + index = __riscv_vsrl(index, -shift, batch::size); + else + index = __riscv_vsll(index, shift, batch::size); + return __riscv_vadd(index, T(offset), batch::size); + } + + // enable for signed integers + template + using rvv_enable_signed_int_t = typename std::enable_if::value && std::is_signed::value, int>::type; + + // enable for unsigned integers + template + using rvv_enable_unsigned_int_t = typename std::enable_if::value && std::is_unsigned::value, int>::type; + + // enable for floating points + template + using rvv_enable_floating_point_t = typename std::enable_if::value, int>::type; + + // enable for signed integers or floating points + template + using rvv_enable_signed_int_or_floating_point_t = typename std::enable_if::value, int>::type; + + // enable for all RVE supported types + template + using rvv_enable_all_t = typename std::enable_if::value, int>::type; + } // namespace detail + + /******************** + * Scalar to vector * + ********************/ + + namespace detail + { + template + inline detail::rvv_reg_t broadcast(T arg) noexcept + { + // A bit of a dance, here, because rvvmv_splat has no other + // argument from which to deduce type, and T=char is not + // supported. 
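                // (Illustrative aside, inferred from the usage below rather than
                // from rvv_fix_char_t's definition: the trait presumably maps
                // plain `char` onto the equivalent int8_t/uint8_t element type so
                // the splat sees a type the intrinsics recognise; the value's bit
                // pattern is unchanged.)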
+ detail::rvv_fix_char_t arg_not_char(arg); + const auto splat = detail::rvvmv_splat(arg_not_char); + return detail::rvv_reg_t(splat.get_bytes(), types::detail::XSIMD_RVV_BITCAST); + } + } + + // broadcast + template + inline batch broadcast(T arg, requires_arch) noexcept + { + return detail::broadcast(arg); + } + + /********* + * Load * + *********/ + + namespace detail + { + XSIMD_RVV_OVERLOAD(rvvle, (__riscv_vle XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , vec(T const*)) + XSIMD_RVV_OVERLOAD(rvvse, (__riscv_vse XSIMD_RVV_S _v_ XSIMD_RVV_TSM), , void(T*, vec)) + } + + template = 0> + inline batch load_aligned(T const* src, convert, requires_arch) noexcept + { + return detail::rvvle(reinterpret_cast const*>(src)); + } + + template = 0> + inline batch load_unaligned(T const* src, convert, requires_arch) noexcept + { + return load_aligned(src, convert(), rvv {}); + } + + // load_complex + namespace detail + { + template = types::detail::rvv_width_m1, int>::type = 0> + inline rvv_reg_t rvvabut(rvv_reg_t const& lo, rvv_reg_t const& hi) noexcept + { + typename rvv_reg_t::register_type tmp; + tmp = __riscv_vset(tmp, 0, lo); + return __riscv_vset(tmp, 1, hi); + } + + template ::type = 0> inline rvv_reg_t rvvabut(rvv_reg_t const& lo, rvv_reg_t const& hi) noexcept + { + return __riscv_vslideup(lo, hi, lo.vl, lo.vl * 2); + } + + XSIMD_RVV_OVERLOAD(rvvget_lo_, (__riscv_vget_ XSIMD_RVV_TSM), _DROP_1ST_CUSTOM_ARGS_NOVL, vec(T, wide_vec), args..., 0) + XSIMD_RVV_OVERLOAD(rvvget_hi_, (__riscv_vget_ XSIMD_RVV_TSM), _DROP_1ST_CUSTOM_ARGS_NOVL, vec(T, wide_vec), args..., 1) + + template = types::detail::rvv_width_m1, int>::type = 0> + rvv_reg_t rvvget_lo(rvv_reg_t const& vv) noexcept + { + typename rvv_reg_t::register_type tmp = rvvget_lo_(T {}, vv); + return tmp; + } + template = types::detail::rvv_width_m1, int>::type = 0> + rvv_reg_t rvvget_hi(rvv_reg_t const& vv) noexcept + { + typename rvv_reg_t::register_type tmp = rvvget_hi_(T {}, vv); + return tmp; + } + template ::type = 0> rvv_reg_t rvvget_lo(rvv_reg_t const& vv) noexcept + { + typename rvv_reg_t::register_type tmp = vv; + return tmp; + } + template ::type = 0> rvv_reg_t rvvget_hi(rvv_reg_t const& vv) noexcept + { + return __riscv_vslidedown(vv, vv.vl / 2, vv.vl); + } + + template = 0> + inline batch, A> load_complex(batch const& lo, batch const& hi, requires_arch) noexcept + { + const auto real_index = vindex, 0, 1>(); + const auto imag_index = vindex, 1, 1>(); + const auto index = rvvabut, A::width>(real_index, imag_index); + const auto input = rvvabut(lo.data, hi.data); + const rvv_reg_t result = __riscv_vrgather(input, index, index.vl); + + return { rvvget_lo(result), rvvget_hi(result) }; + } + } + + /********* + * Store * + *********/ + + template = 0> + inline void store_aligned(T* dst, batch const& src, requires_arch) noexcept + { + detail::rvvse(reinterpret_cast*>(dst), src); + } + + template = 0> + inline void store_unaligned(T* dst, batch const& src, requires_arch) noexcept + { + store_aligned(dst, src, rvv {}); + } + + /****************** + * scatter/gather * + ******************/ + + namespace detail + { + template + using rvv_enable_sg_t = typename std::enable_if<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>::type; + XSIMD_RVV_OVERLOAD(rvvloxei, (__riscv_vloxei XSIMD_RVV_S), , vec(T const*, uvec)) + XSIMD_RVV_OVERLOAD(rvvsoxei, (__riscv_vsoxei XSIMD_RVV_S), , void(T*, uvec, vec)) + XSIMD_RVV_OVERLOAD3(rvvmul_splat, + (__riscv_vmul), + (__riscv_vmul), + (__riscv_vfmul), , vec(vec, T)) + } + + // scatter + template = 0> + 
inline void scatter(batch const& vals, T* dst, batch const& index, kernel::requires_arch) noexcept + { + using UU = as_unsigned_integer_t; + const auto uindex = detail::rvv_to_unsigned_batch(index); + auto* base = reinterpret_cast*>(dst); + // or rvvsuxei + const auto bi = detail::rvvmul_splat(uindex, sizeof(T)); + detail::rvvsoxei(base, bi, vals); + } + + // gather + template = 0> + inline batch gather(batch const&, T const* src, batch const& index, kernel::requires_arch) noexcept + { + using UU = as_unsigned_integer_t; + const auto uindex = detail::rvv_to_unsigned_batch(index); + auto const* base = reinterpret_cast const*>(src); + // or rvvluxei + const auto bi = detail::rvvmul_splat(uindex, sizeof(T)); + return detail::rvvloxei(base, bi); + } + + /************** + * Arithmetic * + **************/ + + namespace detail + { + XSIMD_RVV_OVERLOAD3(rvvadd, + (__riscv_vadd), + (__riscv_vadd), + (__riscv_vfadd), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD2(rvvsadd, + (__riscv_vsadd), + (__riscv_vsaddu), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvsub, + (__riscv_vsub), + (__riscv_vsub), + (__riscv_vfsub), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD2(rvvssub, + (__riscv_vssub), + (__riscv_vssubu), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD2(rvvaadd, + (__riscv_vaadd), + (__riscv_vaaddu), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvmul, + (__riscv_vmul), + (__riscv_vmul), + (__riscv_vfmul), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvdiv, + (__riscv_vdiv), + (__riscv_vdivu), + (__riscv_vfdiv), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvmax, + (__riscv_vmax), + (__riscv_vmaxu), + (__riscv_vfmax), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvmin, + (__riscv_vmin), + (__riscv_vminu), + (__riscv_vfmin), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvneg, + (__riscv_vneg), + (abort), + (__riscv_vfneg), , vec(vec)) + XSIMD_RVV_OVERLOAD_FLOATS(rvvabs, + (__riscv_vfabs), , vec(vec)) + XSIMD_RVV_OVERLOAD3(rvvmacc, + (__riscv_vmacc), + (__riscv_vmacc), + (__riscv_vfmacc), , vec(vec, vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvnmsac, + (__riscv_vnmsac), + (__riscv_vnmsac), + (__riscv_vfnmsac), , vec(vec, vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvmadd, + (__riscv_vmadd), + (__riscv_vmadd), + (__riscv_vfmadd), , vec(vec, vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvnmsub, + (__riscv_vnmsub), + (__riscv_vnmsub), + (__riscv_vfnmsub), , vec(vec, vec, vec)) + +#define RISCV_VMSXX(XX) \ + XSIMD_RVV_OVERLOAD3(rvvms##XX, \ + (__riscv_vms##XX), \ + (__riscv_vms##XX##u), \ + (__riscv_vmf##XX), , bvec(vec, vec)) \ + XSIMD_RVV_OVERLOAD3(rvvms##XX##_splat, \ + (__riscv_vms##XX), \ + (__riscv_vms##XX##u), \ + (__riscv_vmf##XX), , bvec(vec, T)) +#define __riscv_vmsequ __riscv_vmseq +#define __riscv_vmsneu __riscv_vmsne + RISCV_VMSXX(eq) + RISCV_VMSXX(ne) + RISCV_VMSXX(lt) + RISCV_VMSXX(le) + RISCV_VMSXX(gt) + RISCV_VMSXX(ge) +#undef __riscv_vmsequ +#undef __riscv_vmsneu +#undef RISCV_VMSXX + } // namespace detail + + // add + template = 0> + inline batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvadd(lhs, rhs); + } + + // sadd + template = 0> + inline batch sadd(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvsadd(lhs, rhs); + } + + // sub + template = 0> + inline batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvsub(lhs, rhs); + } + + // ssub + template = 0> + inline batch ssub(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvssub(lhs, rhs); + } + + // mul + template = 0> + inline batch mul(batch const& lhs, batch 
const& rhs, requires_arch) noexcept + { + return detail::rvvmul(lhs, rhs); + } + + // div + template = 0> + inline batch div(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvdiv(lhs, rhs); + } + + // max + template = 0> + inline batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvmax(lhs, rhs); + } + + // min + template = 0> + inline batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvmin(lhs, rhs); + } + + // neg + template = 0> + inline batch neg(batch const& arg, requires_arch) noexcept + { + using S = as_signed_integer_t; + const auto as_signed = detail::rvvreinterpret(arg); + const auto result = detail::rvvneg(as_signed); + return detail::rvvreinterpret(result); + } + + template = 0> + inline batch neg(batch const& arg, requires_arch) noexcept + { + return detail::rvvneg(arg); + } + + // abs + template = 0> + inline batch abs(batch const& arg, requires_arch) noexcept + { + return arg; + } + + template = 0> + inline batch abs(batch const& arg, requires_arch) noexcept + { + return detail::rvvabs(arg); + } + + // fma: x * y + z + template = 0> + inline batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + // also detail::rvvmadd(x, y, z); + return detail::rvvmacc(z, x, y); + } + + // fnma: z - x * y + template = 0> + inline batch fnma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + // also detail::rvvnmsub(x, y, z); + return detail::rvvnmsac(z, x, y); + } + + // fms: x * y - z + template = 0> + inline batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + // also vfmsac(z, x, y), but lacking integer version + // also vfmsub(x, y, z), but lacking integer version + return -fnma(x, y, z); + } + + // fnms: - x * y - z + template = 0> + inline batch fnms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + // also vfnmacc(z, x, y), but lacking integer version + // also vfnmadd(x, y, z), but lacking integer version + return -fma(z, x, y); + } + + /********************** + * Logical operations * + **********************/ + + namespace detail + { + XSIMD_RVV_OVERLOAD_INTS(rvvand, (__riscv_vand), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD_INTS(rvvor, (__riscv_vor), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD_INTS(rvvor_splat, (__riscv_vor), , vec(vec, T)) + XSIMD_RVV_OVERLOAD_INTS(rvvxor, (__riscv_vxor), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD_INTS(rvvnot, (__riscv_vnot), , vec(vec)) + XSIMD_RVV_OVERLOAD(rvvmand, (__riscv_vmand_mm_b XSIMD_RVV_S), , bvec(bvec, bvec)) + XSIMD_RVV_OVERLOAD(rvvmor, (__riscv_vmor_mm_b XSIMD_RVV_S), , bvec(bvec, bvec)) + XSIMD_RVV_OVERLOAD(rvvmxor, (__riscv_vmxor_mm_b XSIMD_RVV_S), , bvec(bvec, bvec)) + XSIMD_RVV_OVERLOAD(rvvmandn, (__riscv_vmandn_mm_b XSIMD_RVV_S), , bvec(bvec, bvec)) + XSIMD_RVV_OVERLOAD(rvvmnot, (__riscv_vmnot), , bvec(bvec)) + } + + // bitwise_and + template = 0> + inline batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvand(lhs, rhs); + } + + template = 0> + inline batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs); + const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs); + const auto result_bits = detail::rvvand(lhs_bits, rhs_bits); + return detail::rvvreinterpret(result_bits); + } + + template = 0> + inline batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, 
requires_arch) noexcept + { + return detail::rvvmand(lhs, rhs); + } + + // bitwise_andnot + template = 0> + inline batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto not_rhs = detail::rvvnot(rhs); + return detail::rvvand(lhs, not_rhs); + } + + template = 0> + inline batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs); + const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs); + const auto not_rhs = detail::rvvnot(rhs_bits); + const auto result_bits = detail::rvvand(lhs_bits, not_rhs); + return detail::rvvreinterpret(result_bits); + } + + template = 0> + inline batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return detail::rvvmandn(lhs, rhs); + } + + // bitwise_or + template = 0> + inline batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvor(lhs, rhs); + } + + template = 0> + inline batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs); + const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs); + const auto result_bits = detail::rvvor(lhs_bits, rhs_bits); + return detail::rvvreinterpret(result_bits); + } + + template = 0> + inline batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return detail::rvvmor(lhs, rhs); + } + + // bitwise_xor + template = 0> + inline batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvxor(lhs, rhs); + } + + template = 0> + inline batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto lhs_bits = detail::rvv_to_unsigned_batch(lhs); + const auto rhs_bits = detail::rvv_to_unsigned_batch(rhs); + const auto result_bits = detail::rvvxor(lhs_bits, rhs_bits); + return detail::rvvreinterpret(result_bits); + } + + template = 0> + inline batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return detail::rvvmxor(lhs, rhs); + } + + // bitwise_not + template = 0> + inline batch bitwise_not(batch const& arg, requires_arch) noexcept + { + return detail::rvvnot(arg); + } + + template = 0> + inline batch bitwise_not(batch const& arg, requires_arch) noexcept + { + const auto arg_bits = detail::rvv_to_unsigned_batch(arg); + const auto result_bits = detail::rvvnot(arg_bits); + return detail::rvvreinterpret(result_bits); + } + + template = 0> + inline batch_bool bitwise_not(batch_bool const& arg, requires_arch) noexcept + { + return detail::rvvmnot(arg); + } + + /********** + * Shifts * + **********/ + + namespace detail + { + XSIMD_RVV_OVERLOAD_INTS(rvvsll_splat, (__riscv_vsll), , vec(vec, size_t)) + XSIMD_RVV_OVERLOAD_INTS(rvvsll, (__riscv_vsll), , vec(vec, uvec)) + XSIMD_RVV_OVERLOAD2(rvvsr_splat, + (__riscv_vsra), + (__riscv_vsrl), , vec(vec, size_t)) + XSIMD_RVV_OVERLOAD2(rvvsr, + (__riscv_vsra), + (__riscv_vsrl), , vec(vec, uvec)) + } // namespace detail + + // bitwise_lshift + template = 0> + inline batch bitwise_lshift(batch const& arg, int n, requires_arch) noexcept + { + constexpr size_t size = sizeof(typename batch::value_type) * 8; + assert(0 <= n && static_cast(n) < size && "index in bounds"); + return detail::rvvsll_splat(arg, n); + } + + template = 0> + inline batch bitwise_lshift(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return 
detail::rvvsll(lhs, detail::rvv_to_unsigned_batch(rhs)); + } + + // bitwise_rshift + template = 0> + inline batch bitwise_rshift(batch const& arg, int n, requires_arch) noexcept + { + constexpr size_t size = sizeof(typename batch::value_type) * 8; + assert(0 <= n && static_cast(n) < size && "index in bounds"); + return detail::rvvsr_splat(arg, n); + } + + template = 0> + inline batch bitwise_rshift(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvsr(lhs, detail::rvv_to_unsigned_batch(rhs)); + } + + /************** + * Reductions * + **************/ + + namespace detail + { + XSIMD_RVV_OVERLOAD3(rvvredsum, + (__riscv_vredsum), + (__riscv_vredsum), + (__riscv_vfredosum), // or __riscv_vfredusum + , scalar_vec(vec, scalar_vec)) + XSIMD_RVV_OVERLOAD3(rvvredmax, + (__riscv_vredmax), + (__riscv_vredmaxu), + (__riscv_vfredmax), , scalar_vec(vec, scalar_vec)) + XSIMD_RVV_OVERLOAD3(rvvredmin, + (__riscv_vredmin), + (__riscv_vredminu), + (__riscv_vfredmin), , scalar_vec(vec, scalar_vec)) + XSIMD_RVV_OVERLOAD3(rvvslide1up, + (__riscv_vslide1up), + (__riscv_vslide1up), + (__riscv_vfslide1up), , vec(vec, vec)) + XSIMD_RVV_OVERLOAD3(rvvslide1down, + (__riscv_vslide1down), + (__riscv_vslide1down), + (__riscv_vfslide1down), , vec(vec, T)) + + template + inline T reduce_scalar(rvv_reg_t const& arg) + { + return detail::rvvmv_lane0(rvv_reg_t(arg.get_bytes(), types::detail::XSIMD_RVV_BITCAST)); + } + } + // reduce_add + template ::value_type, detail::rvv_enable_all_t = 0> + inline V reduce_add(batch const& arg, requires_arch) noexcept + { + const auto zero = detail::broadcast(T(0)); + const auto r = detail::rvvredsum(arg, zero); + return detail::reduce_scalar(r); + } + + // reduce_max + template = 0> + inline T reduce_max(batch const& arg, requires_arch) noexcept + { + const auto lowest = detail::broadcast(std::numeric_limits::lowest()); + const auto r = detail::rvvredmax(arg, lowest); + return detail::reduce_scalar(r); + } + + // reduce_min + template = 0> + inline T reduce_min(batch const& arg, requires_arch) noexcept + { + const auto max = detail::broadcast(std::numeric_limits::max()); + const auto r = detail::rvvredmin(arg, max); + return detail::reduce_scalar(r); + } + + // haddp + template = 0> + inline batch haddp(const batch* row, requires_arch) noexcept + { + constexpr std::size_t size = batch::size; + T sums[size]; +#pragma unroll size + for (std::size_t i = 0; i < size; ++i) + { + sums[i] = reduce_add(row[i], rvv {}); + } + return load_aligned(sums, convert(), rvv {}); + } + + /*************** + * Comparisons * + ***************/ + + // eq + template = 0> + inline batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvmseq(lhs, rhs); + } + + template = 0> + inline batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + const auto neq_result = detail::rvvmxor(lhs, rhs); + return detail::rvvmnot(neq_result); + } + + // neq + template = 0> + inline batch_bool neq(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvmsne(lhs, rhs); + } + + template = 0> + inline batch_bool neq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept + { + return detail::rvvmxor(lhs, rhs); + } + + // lt + template = 0> + inline batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvmslt(lhs, rhs); + } + + // le + template = 0> + inline batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return 
detail::rvvmsle(lhs, rhs); + } + + // gt + template = 0> + inline batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvmsgt(lhs, rhs); + } + + // ge + template = 0> + inline batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return detail::rvvmsge(lhs, rhs); + } + + /************* + * Selection * + *************/ + namespace detail + { + XSIMD_RVV_OVERLOAD(rvvcompress, (__riscv_vcompress), , vec(vec, bvec)) + } + // compress + template + inline batch compress(batch const& x, batch_bool const& mask, requires_arch) noexcept + { + return detail::rvvcompress(x, mask); + } + + /*************** + * Permutation * + ***************/ + namespace detail + { + XSIMD_RVV_OVERLOAD(rvvrgather, (__riscv_vrgather), , vec(vec, uvec)) + XSIMD_RVV_OVERLOAD(rvvslideup, (__riscv_vslideup), , vec(vec, vec, size_t)) + XSIMD_RVV_OVERLOAD(rvvslidedown, (__riscv_vslidedown), , vec(vec, size_t)) + } + + // swizzle + template + inline batch swizzle(batch const& arg, batch_constant, idx...>, requires_arch) noexcept + { + static_assert(batch::size == sizeof...(idx), "invalid swizzle indices"); + const batch indices { idx... }; + return detail::rvvrgather(arg, indices); + } + + template + inline batch, A> swizzle(batch, A> const& self, + batch_constant, idx...>, + requires_arch) noexcept + { + const auto real = swizzle(self.real(), batch_constant, idx...> {}, rvv {}); + const auto imag = swizzle(self.imag(), batch_constant, idx...> {}, rvv {}); + return batch>(real, imag); + } + + /************* + * Selection * + *************/ + + // extract_pair + + template = 0> + inline batch extract_pair(batch const& lhs, batch const& rhs, size_t n, requires_arch) noexcept + { + const auto tmp = detail::rvvslidedown(rhs, n); + return detail::rvvslideup(tmp, lhs, lhs.size - n); + } + + // select + template = 0> + inline batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept + { + return detail::rvvmerge(b, a, cond); + } + + template + inline batch select(batch_bool_constant, b...> const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { b... 
}, true_br, false_br, rvv {}); + } + + // zip_lo + template = 0> + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto index = detail::vindex, 0, -1>(); + const auto mask = detail::pmask8(0xaa); + return detail::rvvmerge(detail::rvvrgather(lhs, index), + detail::rvvrgather(rhs, index), + mask); + } + + // zip_hi + template = 0> + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + const auto index = detail::vindex, batch::size / 2, -1>(); + const auto mask = detail::pmask8(0xaa); + return detail::rvvmerge(detail::rvvrgather(lhs, index), + detail::rvvrgather(rhs, index), + mask); + } + + // store_complex + template = 0> + inline void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept + { + const auto lo = zip_lo(src.real(), src.imag()); + const auto hi = zip_hi(src.real(), src.imag()); + T* buf = reinterpret_cast(dst); + store_aligned(buf, lo, rvv {}); + store_aligned(buf + lo.size, hi, rvv {}); + } + + template = 0> + inline void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept + { + store_complex_aligned(dst, src, rvv {}); + } + + /***************************** + * Floating-point arithmetic * + *****************************/ + + namespace detail + { + XSIMD_RVV_OVERLOAD_FLOATS(rvvfsqrt, (__riscv_vfsqrt), , vec(vec)) + XSIMD_RVV_OVERLOAD_FLOATS(rvvfrec7, (__riscv_vfrec7), , vec(vec)) + XSIMD_RVV_OVERLOAD_FLOATS(rvvfrsqrt7, (__riscv_vfrsqrt7), , vec(vec)) + } + + // rsqrt + template = 0> + inline batch rsqrt(batch const& arg, requires_arch) noexcept + { + auto approx = detail::rvvfrsqrt7(arg); + approx = approx * (1.5 - (0.5 * arg * approx * approx)); + return approx; + } + + // sqrt + template = 0> + inline batch sqrt(batch const& arg, requires_arch) noexcept + { + return detail::rvvfsqrt(arg); + } + + // reciprocal + template = 0> + inline batch reciprocal(const batch& arg, requires_arch) noexcept + { + return detail::rvvfrec7(arg); + } + + /****************************** + * Floating-point conversions * + ******************************/ + + // fast_cast + namespace detail + { + XSIMD_RVV_OVERLOAD2(rvvfcvt_rtz, // truncating conversion, like C. + (__riscv_vfcvt_rtz_x), + (__riscv_vfcvt_rtz_xu), _DROP_1ST, vec(T, fvec)) + XSIMD_RVV_OVERLOAD2(rvvfcvt_rne, // round to nearest, ties to even + (__riscv_vfcvt_x), + (__riscv_vfcvt_xu), _DROP_1ST_CUSTOM_ARGS, vec(T, fvec), args..., __RISCV_FRM_RNE) + XSIMD_RVV_OVERLOAD2(rvvfcvt_rmm, // round to nearest, ties to max magnitude + (__riscv_vfcvt_x), + (__riscv_vfcvt_xu), _DROP_1ST_CUSTOM_ARGS, vec(T, fvec), args..., __RISCV_FRM_RMM) + XSIMD_RVV_OVERLOAD2(rvvfcvt, // round to current rounding mode. 
+ (__riscv_vfcvt_x), + (__riscv_vfcvt_xu), _DROP_1ST, vec(T, fvec)) + XSIMD_RVV_OVERLOAD_INTS(rvvfcvt_f, (__riscv_vfcvt_f), , fvec(vec)) + + template + using rvv_enable_ftoi_t = typename std::enable_if<(sizeof(T) == sizeof(U) && std::is_floating_point::value && !std::is_floating_point::value), int>::type; + template + using rvv_enable_itof_t = typename std::enable_if<(sizeof(T) == sizeof(U) && !std::is_floating_point::value && std::is_floating_point::value), int>::type; + + template = 0> + inline batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return rvvfcvt_rtz(U {}, arg); + } + template = 0> + inline batch fast_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return rvvfcvt_f(arg); + } + } + + /********* + * Miscs * + *********/ + + // set + template + inline batch set(batch const&, requires_arch, Args... args) noexcept + { + const std::array::size> tmp { args... }; + return load_unaligned(tmp.data(), convert(), rvv {}); + } + + template + inline batch, A> set(batch, A> const&, requires_arch, + Args... args_complex) noexcept + { + return batch>(set(batch {}, rvv {}, args_complex.real()...), + set(batch {}, rvv {}, args_complex.imag()...)); + } + + template + inline batch_bool set(batch_bool const&, requires_arch, Args... args) noexcept + { + using U = as_unsigned_integer_t; + const auto values = set(batch {}, rvv {}, static_cast(args)...); + const auto zero = broadcast(U(0), rvv {}); + detail::rvv_bool_t result = detail::rvvmsne(values, zero); + return result; + } + + // insert + template = 0> + inline batch insert(batch const& arg, T val, index, requires_arch) noexcept + { + const auto mask = detail::pmask(uint64_t(1) << I); + return detail::rvvmerge_splat(arg, val, mask); + } + + // get + template = 0> + inline T get(batch const& arg, size_t i, requires_arch) noexcept + { + const auto tmp = detail::rvvslidedown(arg, i); + return detail::rvvmv_lane0(tmp); + } + + template = 0> + inline std::complex get(batch, A> const& arg, size_t i, requires_arch) noexcept + { + const auto tmpr = detail::rvvslidedown(arg.real(), i); + const auto tmpi = detail::rvvslidedown(arg.imag(), i); + return std::complex { detail::rvvmv_lane0(tmpr), detail::rvvmv_lane0(tmpi) }; + } + + // all + template = 0> + inline bool all(batch_bool const& arg, requires_arch) noexcept + { + return detail::rvvcpop(arg) == batch_bool::size; + } + + // any + template = 0> + inline bool any(batch_bool const& arg, requires_arch) noexcept + { + return detail::rvvcpop(arg) > 0; + } + + // bitwise_cast + template = 0, detail::rvv_enable_all_t = 0> + inline batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept + { + return detail::rvv_reg_t(arg.data.get_bytes(), types::detail::XSIMD_RVV_BITCAST); + } + + // batch_bool_cast + template = 0> + inline batch_bool batch_bool_cast(batch_bool const& arg, batch_bool const&, requires_arch) noexcept + { + using intermediate_t = typename detail::rvv_bool_t; + return intermediate_t(arg.data); + } + + // from_bool + template = 0> + inline batch from_bool(batch_bool const& arg, requires_arch) noexcept + { + const auto zero = broadcast(T(0), rvv {}); + return detail::rvvmerge_splat(zero, T(1), arg); + } + + namespace detail + { + template + inline vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i) + { + return __riscv_vslidedown(arg, i, types::detail::rvv_width_m1 / 8); + } + template <> + inline vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i) + { + const auto bytes = __riscv_vlmul_trunc_u8mf2(arg); + const auto result = 
__riscv_vslidedown(bytes, i, types::detail::rvv_width_mf2 / 8); + return __riscv_vlmul_ext_u8m1(result); + } + template <> + inline vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i) + { + const auto bytes = __riscv_vlmul_trunc_u8mf4(arg); + const auto result = __riscv_vslidedown(bytes, i, types::detail::rvv_width_mf4 / 8); + return __riscv_vlmul_ext_u8m1(result); + } + template <> + inline vuint8m1_t rvvslidedownbytes(vuint8m1_t arg, size_t i) + { + const auto bytes = __riscv_vlmul_trunc_u8mf8(arg); + const auto result = __riscv_vslidedown(bytes, i, types::detail::rvv_width_mf8 / 8); + return __riscv_vlmul_ext_u8m1(result); + } + } + + // slide_left + template = 0> + inline batch slide_left(batch const& arg, requires_arch) noexcept + { + const auto zero = broadcast(uint8_t(0), rvv {}); + const auto bytes = arg.data.get_bytes(); + return detail::rvvreinterpret(detail::rvvslideup(zero, bytes, N)); + } + + // slide_right + template = 0> + inline batch slide_right(batch const& arg, requires_arch) noexcept + { + using reg_t = detail::rvv_reg_t; + const auto bytes = arg.data.get_bytes(); + return reg_t(detail::rvvslidedownbytes(bytes, N), types::detail::XSIMD_RVV_BITCAST); + } + + // isnan + template = 0> + inline batch_bool isnan(batch const& arg, requires_arch) noexcept + { + return !(arg == arg); + } + + namespace detail + { + template + using rvv_as_signed_integer_t = as_signed_integer_t>; + + template > + inline batch rvvfcvt_default(batch const& arg) noexcept + { + return rvvfcvt_rne(U {}, arg); + } + + template > + inline batch rvvfcvt_afz(batch const& arg) noexcept + { + return rvvfcvt_rmm(U {}, arg); + } + } + + // nearbyint_as_int + template > + inline batch nearbyint_as_int(batch const& arg, requires_arch) noexcept + { + // Reference rounds ties to nearest even + return detail::rvvfcvt_default(arg); + } + + // round + template = 0> + inline batch round(batch const& arg, requires_arch) noexcept + { + // Round ties away from zero. + const auto mask = abs(arg) < constants::maxflint>(); + return select(mask, to_float(detail::rvvfcvt_afz(arg)), arg, rvv {}); + } + + // nearbyint + template = 0> + inline batch nearbyint(batch const& arg, requires_arch) noexcept + { + // Round according to current rounding mode. 
+ const auto mask = abs(arg) < constants::maxflint>(); + return select(mask, to_float(detail::rvvfcvt_default(arg)), arg, rvv {}); + } + } // namespace kernel +} // namespace xsimd + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp index 32a5d67c8eec..8160b2423bb7 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp @@ -380,7 +380,7 @@ namespace xsimd template inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return wasm_f32x4_eq(self, other); + return wasm_i32x4_eq(self, other); } template ::value, void>::type> inline batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept @@ -440,7 +440,7 @@ namespace xsimd template inline batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return wasm_f64x2_eq(self, other); + return wasm_i64x2_eq(self, other); } // fast_cast @@ -579,6 +579,30 @@ namespace xsimd 0xFFFFFF00, 0xFFFFFFFF, }; + alignas(A::alignment()) static const uint32_t lut16[][4] = { + { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }, + { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }, + { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }, + { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }, + { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, + { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, + { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + }; + alignas(A::alignment()) static const uint64_t lut8[][4] = { + { 0x0000000000000000ul, 0x0000000000000000ul }, + { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, + { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, + { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, + }; XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { assert(!(mask & ~0xFFFF) && "inbound mask"); @@ -587,15 +611,17 @@ namespace xsimd else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { assert(!(mask & ~0xFF) && "inbound mask"); - return wasm_i64x2_make(lut64[mask >> 4], lut64[mask & 0xF]); + return wasm_i64x2_make(lut64[mask & 0xF], lut64[mask >> 4]); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { - return batch_bool_cast(from_mask(batch_bool {}, mask, wasm {})); + assert(!(mask & ~0xFul) && "inbound mask"); + return wasm_v128_load((const v128_t*)lut16[mask]); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { - return batch_bool_cast(from_mask(batch_bool {}, mask, wasm {})); + assert(!(mask & ~0x3ul) && "inbound mask"); + return wasm_v128_load((const v128_t*)lut8[mask]); } } @@ -1114,44 +1140,6 @@ namespace xsimd return wasm_f64x2_extract_lane(tmp2, 0); } - // reduce_max - template ::type> - inline T reduce_max(batch const& self, requires_arch) noexcept - { - batch step0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0); - batch acc0 = max(self, step0); - - batch step1 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 1, 0, 0, 0); - batch acc1 = max(acc0, step1); - - batch step2 = wasm_i16x8_shuffle(acc1, wasm_i16x8_splat(0), 1, 0, 0, 0, 4, 5, 6, 7); - batch acc2 = 
max(acc1, step2); - if (sizeof(T) == 2) - return acc2.get(0); - batch step3 = bitwise_cast(bitwise_cast(acc2) >> 8); - batch acc3 = max(acc2, step3); - return acc3.get(0); - } - - // reduce_min - template ::type> - inline T reduce_min(batch const& self, requires_arch) noexcept - { - batch step0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0); - batch acc0 = min(self, step0); - - batch step1 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 1, 0, 0, 0); - batch acc1 = min(acc0, step1); - - batch step2 = wasm_i16x8_shuffle(acc1, wasm_i16x8_splat(0), 1, 0, 0, 0, 4, 5, 6, 7); - batch acc2 = min(acc1, step2); - if (sizeof(T) == 2) - return acc2.get(0); - batch step3 = bitwise_cast(bitwise_cast(acc2) >> 8); - batch acc3 = min(acc2, step3); - return acc3.get(0); - } - // rsqrt template inline batch rsqrt(batch const& self, requires_arch) noexcept @@ -1171,15 +1159,15 @@ namespace xsimd inline batch slide_left(batch const& x, requires_arch) noexcept { return wasm_i8x16_shuffle( - wasm_i64x2_const(0, 0), x, ((N)&0xF0) ? 0 : 16 - ((N)&0xF), - ((N)&0xF0) ? 0 : 17 - ((N)&0xF), ((N)&0xF0) ? 0 : 18 - ((N)&0xF), - ((N)&0xF0) ? 0 : 19 - ((N)&0xF), ((N)&0xF0) ? 0 : 20 - ((N)&0xF), - ((N)&0xF0) ? 0 : 21 - ((N)&0xF), ((N)&0xF0) ? 0 : 22 - ((N)&0xF), - ((N)&0xF0) ? 0 : 23 - ((N)&0xF), ((N)&0xF0) ? 0 : 24 - ((N)&0xF), - ((N)&0xF0) ? 0 : 25 - ((N)&0xF), ((N)&0xF0) ? 0 : 26 - ((N)&0xF), - ((N)&0xF0) ? 0 : 27 - ((N)&0xF), ((N)&0xF0) ? 0 : 28 - ((N)&0xF), - ((N)&0xF0) ? 0 : 29 - ((N)&0xF), ((N)&0xF0) ? 0 : 30 - ((N)&0xF), - ((N)&0xF0) ? 0 : 31 - ((N)&0xF)); + wasm_i64x2_const(0, 0), x, ((N) & 0xF0) ? 0 : 16 - ((N) & 0xF), + ((N) & 0xF0) ? 0 : 17 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 18 - ((N) & 0xF), + ((N) & 0xF0) ? 0 : 19 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 20 - ((N) & 0xF), + ((N) & 0xF0) ? 0 : 21 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 22 - ((N) & 0xF), + ((N) & 0xF0) ? 0 : 23 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 24 - ((N) & 0xF), + ((N) & 0xF0) ? 0 : 25 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 26 - ((N) & 0xF), + ((N) & 0xF0) ? 0 : 27 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 28 - ((N) & 0xF), + ((N) & 0xF0) ? 0 : 29 - ((N) & 0xF), ((N) & 0xF0) ? 0 : 30 - ((N) & 0xF), + ((N) & 0xF0) ? 0 : 31 - ((N) & 0xF)); } // slide_right @@ -1187,15 +1175,15 @@ namespace xsimd inline batch slide_right(batch const& x, requires_arch) noexcept { return wasm_i8x16_shuffle( - x, wasm_i64x2_const(0, 0), ((N)&0xF0) ? 16 : ((N)&0xF) + 0, - ((N)&0xF0) ? 16 : ((N)&0xF) + 1, ((N)&0xF0) ? 16 : ((N)&0xF) + 2, - ((N)&0xF0) ? 16 : ((N)&0xF) + 3, ((N)&0xF0) ? 16 : ((N)&0xF) + 4, - ((N)&0xF0) ? 16 : ((N)&0xF) + 5, ((N)&0xF0) ? 16 : ((N)&0xF) + 6, - ((N)&0xF0) ? 16 : ((N)&0xF) + 7, ((N)&0xF0) ? 16 : ((N)&0xF) + 8, - ((N)&0xF0) ? 16 : ((N)&0xF) + 9, ((N)&0xF0) ? 16 : ((N)&0xF) + 10, - ((N)&0xF0) ? 16 : ((N)&0xF) + 11, ((N)&0xF0) ? 16 : ((N)&0xF) + 12, - ((N)&0xF0) ? 16 : ((N)&0xF) + 13, ((N)&0xF0) ? 16 : ((N)&0xF) + 14, - ((N)&0xF0) ? 16 : ((N)&0xF) + 15); + x, wasm_i64x2_const(0, 0), ((N) & 0xF0) ? 16 : ((N) & 0xF) + 0, + ((N) & 0xF0) ? 16 : ((N) & 0xF) + 1, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 2, + ((N) & 0xF0) ? 16 : ((N) & 0xF) + 3, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 4, + ((N) & 0xF0) ? 16 : ((N) & 0xF) + 5, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 6, + ((N) & 0xF0) ? 16 : ((N) & 0xF) + 7, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 8, + ((N) & 0xF0) ? 16 : ((N) & 0xF) + 9, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 10, + ((N) & 0xF0) ? 16 : ((N) & 0xF) + 11, ((N) & 0xF0) ? 16 : ((N) & 0xF) + 12, + ((N) & 0xF0) ? 16 : ((N) & 0xF) + 13, ((N) & 0xF0) ? 
16 : ((N) & 0xF) + 14, + ((N) & 0xF0) ? 16 : ((N) & 0xF) + 15); } // sadd @@ -1259,29 +1247,15 @@ namespace xsimd // shuffle template - inline batch shuffle(batch const& x, batch const& y, batch_constant, I0, I1, I2, I3> mask, requires_arch) noexcept + inline batch shuffle(batch const& x, batch const& y, batch_constant, I0, I1, I2, I3>, requires_arch) noexcept { - // shuffle within lane - if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4) - return wasm_i32x4_shuffle(x, y, I0, I1, I2, I3); - - // shuffle within opposite lane - if (I0 >= 4 && I1 >= 4 && I2 < 4 && I3 < 4) - return wasm_i32x4_shuffle(y, x, I0, I1, I2, I3); - return shuffle(x, y, mask, generic {}); + return wasm_i32x4_shuffle(x, y, I0, I1, I2, I3); } template - inline batch shuffle(batch const& x, batch const& y, batch_constant, I0, I1> mask, requires_arch) noexcept + inline batch shuffle(batch const& x, batch const& y, batch_constant, I0, I1>, requires_arch) noexcept { - // shuffle within lane - if (I0 < 2 && I1 >= 2) - return wasm_i64x2_shuffle(x, y, I0, I1); - - // shuffle within opposite lane - if (I0 >= 2 && I1 < 2) - return wasm_i64x2_shuffle(y, x, I0, I1); - return shuffle(x, y, mask, generic {}); + return wasm_i64x2_shuffle(x, y, I0, I1); } // set @@ -1500,7 +1474,6 @@ namespace xsimd } // swizzle - template inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3>, requires_arch) noexcept { @@ -1516,7 +1489,7 @@ namespace xsimd template inline batch swizzle(batch const& self, batch_constant, V0, V1>, requires_arch) noexcept { - return wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1); + return wasm_i64x2_shuffle(self, self, V0, V1); } template @@ -1528,7 +1501,7 @@ namespace xsimd template inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3>, requires_arch) noexcept { - return wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), V0, V1, V2, V3); + return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3); } template @@ -1537,6 +1510,32 @@ namespace xsimd return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {})); } + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch) noexcept + { + return wasm_i16x8_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {})); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15>, requires_arch) noexcept + { + return wasm_i8x16_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15); + } + + template + inline batch swizzle(batch const& self, batch_constant, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, wasm {})); + } + // trunc template inline batch trunc(batch const& self, requires_arch) noexcept @@ -1625,4 +1624,4 @@ namespace xsimd } } -#endif \ No newline at end of file +#endif diff --git a/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp b/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp index ab9ecbc298e8..fe8c54166923 100644 --- a/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp +++ b/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp @@ -57,7 +57,7 @@ namespace xsimd { }; - template + template struct is_sorted; 
template <> @@ -65,14 +65,14 @@ namespace xsimd { }; - template - struct is_sorted : std::true_type + template + struct is_sorted : std::true_type { }; - template - struct is_sorted - : std::conditional<(A0::version() >= A1::version()), is_sorted, + template + struct is_sorted + : std::conditional<(V0 >= V1), is_sorted, std::false_type>::type { }; @@ -111,7 +111,7 @@ namespace xsimd struct arch_list { #ifndef NDEBUG - static_assert(detail::is_sorted::value, + static_assert(detail::is_sorted::value, "architecture list must be sorted by version"); #endif @@ -190,16 +190,23 @@ namespace xsimd struct unsupported { }; - using all_x86_architectures = arch_list, avx2, fma3, avx, fma4, fma3, sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>; + using all_x86_architectures = arch_list< + avx512vnni, avx512vbmi, avx512ifma, avx512pf, avx512vnni, avx512bw, avx512er, avx512dq, avx512cd, avx512f, + avxvnni, fma3, avx2, fma3, avx, fma4, fma3, + sse4_2, sse4_1, /*sse4a,*/ ssse3, sse3, sse2>; + using all_sve_architectures = arch_list, detail::sve<256>, detail::sve<128>>; + using all_rvv_architectures = arch_list, detail::rvv<256>, detail::rvv<128>>; using all_arm_architectures = typename detail::join>::type; + using all_riscv_architectures = all_rvv_architectures; using all_wasm_architectures = arch_list; - using all_architectures = typename detail::join::type; + using all_architectures = typename detail::join::type; using supported_architectures = typename detail::supported::type; using x86_arch = typename detail::supported::type::best; using arm_arch = typename detail::supported::type::best; + using riscv_arch = typename detail::supported::type::best; using best_arch = typename supported_architectures::best; #ifdef XSIMD_DEFAULT_ARCH diff --git a/third_party/xsimd/include/xsimd/config/xsimd_config.hpp b/third_party/xsimd/include/xsimd/config/xsimd_config.hpp index 161f123fea2a..cf5163c37efa 100644 --- a/third_party/xsimd/include/xsimd/config/xsimd_config.hpp +++ b/third_party/xsimd/include/xsimd/config/xsimd_config.hpp @@ -12,9 +12,9 @@ #ifndef XSIMD_CONFIG_HPP #define XSIMD_CONFIG_HPP -#define XSIMD_VERSION_MAJOR 11 -#define XSIMD_VERSION_MINOR 2 -#define XSIMD_VERSION_PATCH 0 +#define XSIMD_VERSION_MAJOR 12 +#define XSIMD_VERSION_MINOR 1 +#define XSIMD_VERSION_PATCH 1 /** * high level free functions @@ -99,6 +99,17 @@ #define XSIMD_WITH_AVX2 0 #endif +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if AVXVNNI is available at compile-time, to 0 otherwise. + */ +#ifdef __AVXVNNI__ +#define XSIMD_WITH_AVXVNNI 1 +#else +#define XSIMD_WITH_AVXVNNI 0 +#endif + /** * @ingroup xsimd_config_macro * @@ -244,6 +255,72 @@ #define XSIMD_WITH_AVX512BW 0 #endif +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if AVX512ER is available at compile-time, to 0 otherwise. + */ +#ifdef __AVX512ER__ +#define XSIMD_WITH_AVX512ER XSIMD_WITH_AVX512F +#else +#define XSIMD_WITH_AVX512ER 0 +#endif + +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if AVX512PF is available at compile-time, to 0 otherwise. + */ +#ifdef __AVX512PF__ +#define XSIMD_WITH_AVX512PF XSIMD_WITH_AVX512F +#else +#define XSIMD_WITH_AVX512PF 0 +#endif + +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if AVX512IFMA is available at compile-time, to 0 otherwise. + */ +#ifdef __AVX512IFMA__ +#define XSIMD_WITH_AVX512IFMA XSIMD_WITH_AVX512F +#else +#define XSIMD_WITH_AVX512IFMA 0 +#endif + +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if AVX512VBMI is available at compile-time, to 0 otherwise. 
+ */ +#ifdef __AVX512VBMI__ +#define XSIMD_WITH_AVX512VBMI XSIMD_WITH_AVX512F +#else +#define XSIMD_WITH_AVX512VBMI 0 +#endif + +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if AVX512VNNI is available at compile-time, to 0 otherwise. + */ +#ifdef __AVX512VNNI__ + +#if XSIMD_WITH_AVX512_VBMI +#define XSIMD_WITH_AVX512VNNI_AVX512VBMI XSIMD_WITH_AVX512F +#define XSIMD_WITH_AVX512VNNI_AVX512BW XSIMD_WITH_AVX512F +#else +#define XSIMD_WITH_AVX512VNNI_AVX512VBMI 0 +#define XSIMD_WITH_AVX512VNNI_AVX512BW XSIMD_WITH_AVX512F +#endif + +#else + +#define XSIMD_WITH_AVX512VNNI_AVX512VBMI 0 +#define XSIMD_WITH_AVX512VNNI_AVX512BW 0 + +#endif + #ifdef __ARM_NEON /** @@ -285,6 +362,19 @@ #define XSIMD_SVE_BITS 0 #endif +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if RVV is available and bit width is pre-set at compile-time, to 0 otherwise. + */ +#if defined(__riscv_vector) && defined(__riscv_v_fixed_vlen) && __riscv_v_fixed_vlen > 0 +#define XSIMD_WITH_RVV 1 +#define XSIMD_RVV_BITS __riscv_v_fixed_vlen +#else +#define XSIMD_WITH_RVV 0 +#define XSIMD_RVV_BITS 0 +#endif + /** * @ingroup xsimd_config_macro * @@ -354,7 +444,8 @@ #endif -#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_WASM +#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_AVXVNNI && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_AVX512ER && !XSIMD_WITH_AVX512PF && !XSIMD_WITH_AVX512IFMA && !XSIMD_WITH_AVX512VBMI && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_RVV && !XSIMD_WITH_WASM +#define XSIMD_NO_SUPPORTED_ARCHITECTURE #endif #endif diff --git a/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp b/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp index 76fd26c2b52c..62aca6132fdd 100644 --- a/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp +++ b/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp @@ -15,7 +15,7 @@ #include #include -#if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM)) +#if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM) || defined(__riscv_vector)) #include #include #endif @@ -45,14 +45,22 @@ namespace xsimd unsigned avx : 1; unsigned fma3_avx : 1; unsigned avx2 : 1; + unsigned avxvnni : 1; unsigned fma3_avx2 : 1; unsigned avx512f : 1; unsigned avx512cd : 1; unsigned avx512dq : 1; unsigned avx512bw : 1; + unsigned avx512er : 1; + unsigned avx512pf : 1; + unsigned avx512ifma : 1; + unsigned avx512vbmi : 1; + unsigned avx512vnni_bw : 1; + unsigned avx512vnni_vbmi : 1; unsigned neon : 1; unsigned neon64 : 1; unsigned sve : 1; + unsigned rvv : 1; // version number of the best arch available unsigned best; @@ -85,15 +93,27 @@ namespace xsimd #endif best = sve::version() * sve; +#elif defined(__riscv_vector) && defined(__riscv_v_fixed_vlen) && __riscv_v_fixed_vlen > 0 + +#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18) +#ifndef HWCAP_V +#define HWCAP_V (1 << ('V' - 'A')) +#endif + rvv = 
bool(getauxval(AT_HWCAP) & HWCAP_V); +#else + rvv = 0; +#endif + + best = ::xsimd::rvv::version() * rvv; #elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86) - auto get_cpuid = [](int reg[4], int func_id) noexcept + auto get_cpuid = [](int reg[4], int level, int count = 0) noexcept { #if defined(_MSC_VER) - __cpuidex(reg, func_id, 0); + __cpuidex(reg, level, count); #elif defined(__INTEL_COMPILER) - __cpuid(reg, func_id); + __cpuid(reg, level); #elif defined(__GNUC__) || defined(__clang__) @@ -104,13 +124,13 @@ namespace xsimd "xchg{l}\t{%%}ebx, %1\n\t" : "=a"(reg[0]), "=r"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) - : "a"(func_id), "c"(0)); + : "0"(level), "2"(count)); #else __asm__("cpuid\n\t" : "=a"(reg[0]), "=b"(reg[1]), "=c"(reg[2]), "=d"(reg[3]) - : "a"(func_id), "c"(0)); + : "0"(level), "2"(count)); #endif #else @@ -163,6 +183,11 @@ namespace xsimd avx2 = regs7[1] >> 5 & 1; best = std::max(best, avx2::version() * avx2); + int regs7a[4]; + get_cpuid(regs7a, 0x7, 0x1); + avxvnni = regs7a[0] >> 4 & 1; + best = std::max(best, avxvnni::version() * avxvnni * avx2); + fma3_avx2 = avx2 && fma3_sse; best = std::max(best, fma3::version() * fma3_avx2); @@ -178,6 +203,23 @@ namespace xsimd avx512bw = regs7[1] >> 30 & 1; best = std::max(best, avx512bw::version() * avx512bw * avx512dq * avx512cd * avx512f); + avx512er = regs7[1] >> 27 & 1; + best = std::max(best, avx512er::version() * avx512er * avx512cd * avx512f); + + avx512pf = regs7[1] >> 26 & 1; + best = std::max(best, avx512pf::version() * avx512pf * avx512er * avx512cd * avx512f); + + avx512ifma = regs7[1] >> 21 & 1; + best = std::max(best, avx512ifma::version() * avx512ifma * avx512bw * avx512dq * avx512cd * avx512f); + + avx512vbmi = regs7[2] >> 1 & 1; + best = std::max(best, avx512vbmi::version() * avx512vbmi * avx512ifma * avx512bw * avx512dq * avx512cd * avx512f); + + avx512vnni_bw = regs7[2] >> 11 & 1; + best = std::max(best, avx512vnni::version() * avx512vnni_bw * avx512bw * avx512dq * avx512cd * avx512f); + + avx512vnni_vbmi = avx512vbmi && avx512vnni_bw; + best = std::max(best, avx512vnni::version() * avx512vnni_vbmi); #endif } }; diff --git a/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp b/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp index ec20ce5fba3b..4350ca0a281a 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp @@ -18,9 +18,19 @@ #include "xsimd_avx2_register.hpp" #include "xsimd_avx_register.hpp" +#include "xsimd_avxvnni_register.hpp" #include "xsimd_fma3_avx2_register.hpp" #include "xsimd_fma3_avx_register.hpp" +#include "xsimd_avx512vnni_avx512bw_register.hpp" +#include "xsimd_avx512vnni_avx512vbmi_register.hpp" + +#include "xsimd_avx512ifma_register.hpp" +#include "xsimd_avx512vbmi_register.hpp" + +#include "xsimd_avx512er_register.hpp" +#include "xsimd_avx512pf_register.hpp" + #include "xsimd_avx512bw_register.hpp" #include "xsimd_avx512cd_register.hpp" #include "xsimd_avx512dq_register.hpp" @@ -31,4 +41,6 @@ #include "xsimd_sve_register.hpp" +#include "xsimd_rvv_register.hpp" + #include "xsimd_wasm_register.hpp" diff --git a/third_party/xsimd/include/xsimd/types/xsimd_api.hpp b/third_party/xsimd/include/xsimd/types/xsimd_api.hpp index 6a1526d95867..0420f0a09d6e 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_api.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_api.hpp @@ -530,6 +530,19 @@ namespace xsimd return kernel::clip(x, lo, hi, A {}); } + /** + * 
@ingroup batch_data_transfer + * + * Pick elements from \c x selected by \c mask, and append them to the + * resulting vector, zeroing the remaining slots + */ + template + inline batch compress(batch const& x, batch_bool const& mask) noexcept + { + detail::static_check_supported_config(); + return kernel::compress(x, mask, A {}); + } + /** * @ingroup batch_complex * @@ -705,6 +718,19 @@ namespace xsimd return kernel::exp2(x, A {}); } + /** + * @ingroup batch_data_transfer + * + * Load contiguous elements from \c x and place them in slots selected by \c + * mask, zeroing the other slots + */ + template + inline batch expand(batch const& x, batch_bool const& mask) noexcept + { + detail::static_check_supported_config(); + return kernel::expand(x, mask, A {}); + } + /** * @ingroup batch_math * diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp new file mode 100644 index 000000000000..a99157cf3723 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp @@ -0,0 +1,48 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512ER_REGISTER_HPP +#define XSIMD_AVX512ER_REGISTER_HPP + +#include "./xsimd_avx512dq_register.hpp" + +namespace xsimd +{ + + /** + * @ingroup architectures + * + * AVX512ER instructions + */ + struct avx512er : avx512cd + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512ER; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 3, 1); } + static constexpr char const* name() noexcept { return "avx512er"; } + }; + +#if XSIMD_WITH_AVX512ER + + namespace types + { + template + struct get_bool_simd_register + { + using type = simd_avx512_bool_register; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512er, avx512cd); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp index fb8e473e1e97..c1f80a122ddb 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp @@ -53,7 +53,6 @@ namespace xsimd using type = simd_avx512_bool_register; }; - XSIMD_DECLARE_SIMD_REGISTER(bool, avx512f, __m512i); XSIMD_DECLARE_SIMD_REGISTER(signed char, avx512f, __m512i); XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx512f, __m512i); XSIMD_DECLARE_SIMD_REGISTER(char, avx512f, __m512i); diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp new file mode 100644 index 000000000000..ba76ea147bf5 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp @@ -0,0 +1,48 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the 
terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512IFMA_REGISTER_HPP +#define XSIMD_AVX512IFMA_REGISTER_HPP + +#include "./xsimd_avx512bw_register.hpp" + +namespace xsimd +{ + + /** + * @ingroup architectures + * + * AVX512IFMA instructions + */ + struct avx512ifma : avx512bw + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512IFMA; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 5, 0); } + static constexpr char const* name() noexcept { return "avx512ifma"; } + }; + +#if XSIMD_WITH_AVX512IFMA + + namespace types + { + template + struct get_bool_simd_register + { + using type = simd_avx512_bool_register; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512ifma, avx512bw); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp new file mode 100644 index 000000000000..38a10f022737 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp @@ -0,0 +1,48 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512PF_REGISTER_HPP +#define XSIMD_AVX512PF_REGISTER_HPP + +#include "./xsimd_avx512er_register.hpp" + +namespace xsimd +{ + + /** + * @ingroup architectures + * + * AVX512PF instructions + */ + struct avx512pf : avx512er + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512PF; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 4, 1); } + static constexpr char const* name() noexcept { return "avx512pf"; } + }; + +#if XSIMD_WITH_AVX512PF + + namespace types + { + template + struct get_bool_simd_register + { + using type = simd_avx512_bool_register; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512pf, avx512er); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp new file mode 100644 index 000000000000..19ff744d7208 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp @@ -0,0 +1,48 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_AVX512VBMI_REGISTER_HPP +#define XSIMD_AVX512VBMI_REGISTER_HPP + +#include "./xsimd_avx512ifma_register.hpp" + +namespace xsimd +{ + + /** + * @ingroup architectures + * + * AVX512VBMI instructions + */ + struct avx512vbmi : avx512ifma + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VBMI; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 6, 0); } + static constexpr char const* name() noexcept { return "avx512vbmi"; } + }; + +#if XSIMD_WITH_AVX512VBMI + + namespace types + { + template + struct get_bool_simd_register + { + using type = simd_avx512_bool_register; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vbmi, avx512ifma); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp new file mode 100644 index 000000000000..85edbdf230ce --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp @@ -0,0 +1,51 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512VNNI_AVX512BW_REGISTER_HPP +#define XSIMD_AVX512VNNI_AVX512BW_REGISTER_HPP + +#include "./xsimd_avx512bw_register.hpp" + +namespace xsimd +{ + template + struct avx512vnni; + + /** + * @ingroup architectures + * + * AVX512VNNI instructions + */ + template <> + struct avx512vnni : avx512bw + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VNNI_AVX512BW; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 4, 1); } + static constexpr char const* name() noexcept { return "avx512vnni+avx512bw"; } + }; + +#if XSIMD_WITH_AVX512VNNI_AVX512BW + + namespace types + { + template + struct get_bool_simd_register> + { + using type = simd_avx512_bool_register; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vnni, avx512bw); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi_register.hpp new file mode 100644 index 000000000000..232b19a5cb82 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi_register.hpp @@ -0,0 +1,51 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_AVX512VNNI_AVX512VBMI_REGISTER_HPP +#define XSIMD_AVX512VNNI_AVX512VBMI_REGISTER_HPP + +#include "./xsimd_avx512vbmi_register.hpp" + +namespace xsimd +{ + template + struct avx512vnni; + + /** + * @ingroup architectures + * + * AVX512VNNI instructions + */ + template <> + struct avx512vnni : avx512vbmi + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VNNI_AVX512VBMI; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 6, 1); } + static constexpr char const* name() noexcept { return "avx512vnni+avx512vbmi"; } + }; + +#if XSIMD_WITH_AVX512VNNI_AVX512VBMI + + namespace types + { + template + struct get_bool_simd_register> + { + using type = simd_avx512_bool_register; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vnni, avx512vbmi); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_register.hpp new file mode 100644 index 000000000000..c276fb00792a --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_register.hpp @@ -0,0 +1,48 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512VNNI_REGISTER_HPP +#define XSIMD_AVX512VNNI_REGISTER_HPP + +#include "./xsimd_avx512vbmi_register.hpp" + +namespace xsimd +{ + + /** + * @ingroup architectures + * + * AVX512VNNI instructions + */ + struct avx512vnni : avx512vbmi + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VNNI; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 7, 0); } + static constexpr char const* name() noexcept { return "avx512vnni"; } + }; + +#if XSIMD_WITH_AVX512VNNI + + namespace types + { + template + struct get_bool_simd_register + { + using type = simd_avx512_bool_register; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vnni, avx512vbmi); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx_register.hpp index 95f18ebfb6f7..6b1951f964b9 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_avx_register.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx_register.hpp @@ -42,7 +42,6 @@ namespace xsimd namespace types { - XSIMD_DECLARE_SIMD_REGISTER(bool, avx, __m256i); XSIMD_DECLARE_SIMD_REGISTER(signed char, avx, __m256i); XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx, __m256i); XSIMD_DECLARE_SIMD_REGISTER(char, avx, __m256i); diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp new file mode 100644 index 000000000000..f68fe16bad2b --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp @@ -0,0 +1,40 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain 
Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVXVNNI_REGISTER_HPP +#define XSIMD_AVXVNNI_REGISTER_HPP + +#include "./xsimd_avx2_register.hpp" + +namespace xsimd +{ + /** + * @ingroup architectures + * + * AVXVNNI instructions + */ + struct avxvnni : avx2 + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVXVNNI; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(2, 3, 0); } + static constexpr char const* name() noexcept { return "avxvnni"; } + }; + +#if XSIMD_WITH_AVXVNNI + namespace types + { + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avxvnni, avx2); + } +#endif +} + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_batch.hpp b/third_party/xsimd/include/xsimd/types/xsimd_batch.hpp index c8a8239cc3a3..b4989fc88d0d 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_batch.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_batch.hpp @@ -112,6 +112,7 @@ namespace xsimd template class batch : public types::simd_register, public types::integral_only_operators { + static_assert(!std::is_same::value, "use xsimd::batch_bool instead of xsimd::batch"); public: static constexpr std::size_t size = sizeof(types::simd_register) / sizeof(T); ///< Number of scalar elements in this batch. diff --git a/third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp b/third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp index bf2b9569e794..0de9c8ad42c1 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp @@ -88,7 +88,7 @@ namespace xsimd #define MAKE_BINARY_OP(OP, NAME) \ template \ constexpr auto operator OP(batch_bool_constant other) const \ - ->decltype(apply(*this, other)) \ + -> decltype(apply(*this, other)) \ { \ return apply(*this, other); \ } @@ -199,7 +199,7 @@ namespace xsimd #define MAKE_BINARY_OP(OP, NAME) \ template \ constexpr auto operator OP(batch_constant other) const \ - ->decltype(apply(*this, other)) \ + -> decltype(apply(*this, other)) \ { \ return apply(*this, other); \ } diff --git a/third_party/xsimd/include/xsimd/types/xsimd_generic_arch.hpp b/third_party/xsimd/include/xsimd/types/xsimd_generic_arch.hpp index 6aaee93393e0..2aa25419c6f4 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_generic_arch.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_generic_arch.hpp @@ -41,7 +41,7 @@ namespace xsimd static constexpr char const* name() noexcept { return "generic"; } protected: - static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch) noexcept { return major * 10000u + minor * 100u + patch; } + static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch, unsigned multiplier = 100u) noexcept { return major * multiplier * multiplier + minor * multiplier + patch; } }; } diff --git a/third_party/xsimd/include/xsimd/types/xsimd_rvv_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_rvv_register.hpp new file mode 100644 index 000000000000..1b3daf4592d4 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_rvv_register.hpp @@ -0,0 +1,417 @@ 
+/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * Copyright (c) Yibo Cai * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_RVV_REGISTER_HPP +#define XSIMD_RVV_REGISTER_HPP + +#include "xsimd_generic_arch.hpp" +#include "xsimd_register.hpp" + +#if XSIMD_WITH_RVV +#include +#endif + +namespace xsimd +{ + namespace detail + { + /** + * @ingroup architectures + * + * RVV instructions (fixed vector size) for riscv + */ + template + struct rvv : xsimd::generic + { + static constexpr size_t width = Width; + static constexpr bool supported() noexcept { return Width == XSIMD_RVV_BITS; } + static constexpr bool available() noexcept { return true; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr std::size_t alignment() noexcept { return 16; } + static constexpr unsigned version() noexcept { return generic::version(1, 0, 0, /*multiplier=*/1000); } + static constexpr char const* name() noexcept { return "riscv+rvv"; } + }; + } + +#if XSIMD_WITH_RVV + + using rvv = detail::rvv<__riscv_v_fixed_vlen>; + +#define XSIMD_RVV_JOINT_(a, b, c) a##b##c +#define XSIMD_RVV_JOINT(a, b, c) XSIMD_RVV_JOINT_(a, b, c) +#define XSIMD_RVV_JOINT5(a, b, c, d, e) XSIMD_RVV_JOINT(XSIMD_RVV_JOINT(a, b, c), d, e) + +#define XSIMD_RVV_TYPE_i(S, V) XSIMD_RVV_JOINT5(vint, S, m, V, _t) +#define XSIMD_RVV_TYPE_u(S, V) XSIMD_RVV_JOINT5(vuint, S, m, V, _t) +#define XSIMD_RVV_TYPE_f(S, V) XSIMD_RVV_JOINT5(vfloat, S, m, V, _t) +#define XSIMD_RVV_TYPE(T, S, V) XSIMD_RVV_JOINT(XSIMD_RVV_TYPE, _, T)(S, V) + + namespace types + { + namespace detail + { + static constexpr size_t rvv_width_mf8 = XSIMD_RVV_BITS / 8; + static constexpr size_t rvv_width_mf4 = XSIMD_RVV_BITS / 4; + static constexpr size_t rvv_width_mf2 = XSIMD_RVV_BITS / 2; + static constexpr size_t rvv_width_m1 = XSIMD_RVV_BITS; + static constexpr size_t rvv_width_m2 = XSIMD_RVV_BITS * 2; + static constexpr size_t rvv_width_m4 = XSIMD_RVV_BITS * 4; + static constexpr size_t rvv_width_m8 = XSIMD_RVV_BITS * 8; + + // rvv_type_info is a utility class to convert scalar type and + // bitwidth into rvv register types. + // + // * `type` is the unadorned vector type. + // * `fixed_type` is the same type, but with the storage attribute + // applied. + // * `byte_type` is the type which is the same size in unsigned + // bytes, used as an intermediate step for bit-cast operations, + // because only a subset of __riscv_vreinterpret() intrinsics + // exist -- but always enough to get us to bytes and back. 
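// [Editorial sketch, not part of the upstream patch] As an illustration of the
// "via bytes" route taken by the helpers generated below, turning a
// vfloat32m1_t `f` into byte storage and back into a signed lane type goes
// through the unsigned word type, roughly:
//   vuint8m1_t bytes = __riscv_vreinterpret_u8m1(__riscv_vreinterpret_u32m1(f));      // as_bytes(f)
//   vint32m1_t ints  = __riscv_vreinterpret_i32m1(__riscv_vreinterpret_u32m1(bytes)); // bitcast(bytes)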
+ // + template + struct rvv_type_info; +#define XSIMD_RVV_MAKE_TYPE(scalar, t, s, vmul) \ + template <> \ + struct rvv_type_info \ + { \ + static constexpr size_t width = rvv_width_m1 * vmul; \ + using type = XSIMD_RVV_TYPE(t, s, vmul); \ + using byte_type = XSIMD_RVV_TYPE(u, 8, vmul); \ + using fixed_type = type __attribute__((riscv_rvv_vector_bits(width))); \ + template \ + static inline type bitcast(U x) noexcept \ + { \ + const auto words = XSIMD_RVV_JOINT5(__riscv_vreinterpret_, u, s, m, vmul)(x); \ + return XSIMD_RVV_JOINT5(__riscv_vreinterpret_, t, s, m, vmul)(words); \ + } \ + template <> \ + inline type bitcast(type x) noexcept { return x; } \ + static inline byte_type as_bytes(type x) noexcept \ + { \ + const auto words = XSIMD_RVV_JOINT5(__riscv_vreinterpret_, u, s, m, vmul)(x); \ + return XSIMD_RVV_JOINT5(__riscv_vreinterpret_, u, 8, m, vmul)(words); \ + } \ + }; + +#define XSIMD_RVV_MAKE_TYPES(vmul) \ + XSIMD_RVV_MAKE_TYPE(int8_t, i, 8, vmul) \ + XSIMD_RVV_MAKE_TYPE(uint8_t, u, 8, vmul) \ + XSIMD_RVV_MAKE_TYPE(int16_t, i, 16, vmul) \ + XSIMD_RVV_MAKE_TYPE(uint16_t, u, 16, vmul) \ + XSIMD_RVV_MAKE_TYPE(int32_t, i, 32, vmul) \ + XSIMD_RVV_MAKE_TYPE(uint32_t, u, 32, vmul) \ + XSIMD_RVV_MAKE_TYPE(int64_t, i, 64, vmul) \ + XSIMD_RVV_MAKE_TYPE(uint64_t, u, 64, vmul) \ + XSIMD_RVV_MAKE_TYPE(float, f, 32, vmul) \ + XSIMD_RVV_MAKE_TYPE(double, f, 64, vmul) + + XSIMD_RVV_MAKE_TYPES(8) + XSIMD_RVV_MAKE_TYPES(4) + XSIMD_RVV_MAKE_TYPES(2) + XSIMD_RVV_MAKE_TYPES(1) +#undef XSIMD_RVV_TYPE +#undef XSIMD_RVV_TYPE_f +#undef XSIMD_RVV_TYPE_u +#undef XSIMD_RVV_TYPE_i +#undef XSIMD_RVV_MAKE_TYPES +#undef XSIMD_RVV_MAKE_TYPE + + // rvv_blob is storage-type abstraction for a vector register. + template + struct rvv_blob : public rvv_type_info + { + using super = rvv_type_info; + using typename super::fixed_type; + using typename super::type; + + fixed_type value; + type get() const { return value; } + void set(type v) { value = v; } + }; + // + // But sometimes we want our storage type to be less than a whole + // register, while presenting as a whole register to the outside + // world. This is because some partial-register types are not + // defined, but they can (mostly) be emulated using shorter vl on a + // full-width register for arithmetic, and cast back to a partial + // byte register for storage. 
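// [Editorial sketch, not part of the upstream patch] For byte data held in a
// half-width register (vuint8mf2_t storage, with hypothetical variable names),
// the widen/operate/truncate pattern described above looks roughly like:
//   vuint8m1_t a = __riscv_vlmul_ext_v_u8mf2_u8m1(lhs_bytes);
//   vuint8m1_t b = __riscv_vlmul_ext_v_u8mf2_u8m1(rhs_bytes);
//   vuint8m1_t r = __riscv_vadd_vv_u8m1(a, b, rvv_width_mf2 / 8); // vl covers only the live half
//   out_bytes    = __riscv_vlmul_trunc_v_u8m1_u8mf2(r);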
+ // + template + struct rvv_semiblob : public rvv_type_info + { + using super = rvv_type_info; + static constexpr size_t width = rvv_width_m1 / divisor; + using typename super::type; + template + struct semitype; + template <> + struct semitype<2> + { + using type = vuint8mf2_t __attribute__((riscv_rvv_vector_bits(rvv_width_mf2))); + }; + template <> + struct semitype<4> + { + using type = vuint8mf4_t __attribute__((riscv_rvv_vector_bits(rvv_width_mf4))); + }; + template <> + struct semitype<8> + { + using type = vuint8mf8_t __attribute__((riscv_rvv_vector_bits(rvv_width_mf8))); + }; + using fixed_type = typename semitype::type; + using super::as_bytes; + using super::bitcast; + + fixed_type value; + template + vuint8m1_t get_bytes() const; + template <> + vuint8m1_t get_bytes<2>() const { return __riscv_vlmul_ext_v_u8mf2_u8m1(value); } + template <> + vuint8m1_t get_bytes<4>() const { return __riscv_vlmul_ext_v_u8mf4_u8m1(value); } + template <> + vuint8m1_t get_bytes<8>() const { return __riscv_vlmul_ext_v_u8mf8_u8m1(value); } + type get() const noexcept + { + vuint8m1_t bytes = get_bytes(); + return bitcast(bytes); + } + template + void set_bytes(vuint8m1_t); + template <> + void set_bytes<2>(vuint8m1_t v) { value = __riscv_vlmul_trunc_v_u8m1_u8mf2(v); } + template <> + void set_bytes<4>(vuint8m1_t v) { value = __riscv_vlmul_trunc_v_u8m1_u8mf4(v); } + template <> + void set_bytes<8>(vuint8m1_t v) { value = __riscv_vlmul_trunc_v_u8m1_u8mf8(v); } + void set(type v) + { + vuint8m1_t bytes = as_bytes(v); + set_bytes(bytes); + } + }; + template + struct rvv_blob : rvv_semiblob + { + }; + template + struct rvv_blob : rvv_semiblob + { + }; + template + struct rvv_blob : rvv_semiblob + { + }; + + // It's difficult dealing with both char and whichever *int8_t type + // is compatible with char, so just avoid it altogether. + // + using rvv_char_t = typename std::conditional::value, int8_t, uint8_t>::type; + template + using rvv_fix_char_t = typename std::conditional< + std::is_same::type>::value, + rvv_char_t, T>::type; + + // An explicit constructor isn't really explicit enough to allow + // implicit bit-casting operations between incompatible types, so + // we add this vacuous flag argument when we're serious: + // + enum rvv_bitcast_flag + { + XSIMD_RVV_BITCAST + }; + + // the general-purpose vector register type, usable within + // templates, and supporting arithmetic on partial registers for + // which there is no intrinsic type (by casting via a full register + // type). + // + template + struct rvv_reg + { + static constexpr size_t width = Width; + static constexpr size_t vl = Width / (sizeof(T) * 8); + using blob_type = rvv_blob; + using register_type = typename blob_type::type; + using byte_type = typename blob_type::byte_type; + blob_type value; + rvv_reg() noexcept = default; + rvv_reg(register_type x) noexcept { value.set(x); } + explicit rvv_reg(byte_type v, rvv_bitcast_flag) { value.set(value.bitcast(v)); } + template + explicit rvv_reg(rvv_reg v, rvv_bitcast_flag) + : rvv_reg(v.get_bytes(), XSIMD_RVV_BITCAST) + { + } + byte_type get_bytes() const noexcept + { + return blob_type::as_bytes(value.get()); + } + operator register_type() const noexcept { return value.get(); } + }; + template + using rvv_reg_t = typename std::conditional::value, rvv_reg, Width>, void>::type; + + // And some more of the same stuff for bool types, which have + // similar problems and similar workarounds. 
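// [Editorial sketch, not part of the upstream patch] RVV mask registers are
// named by the SEW/LMUL ratio rather than by element type, so a mask over
// 32-bit lanes at LMUL=1 is vbool32_t. Keeping the mask in byte storage and
// reinterpreting on demand, as rvv_bool below does, looks roughly like:
//   vuint8m1_t stored = /* byte-backed storage */;
//   vbool32_t  m      = __riscv_vreinterpret_b32(stored); // view as a 32-bit-lane mask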
+ // + template + struct rvv_bool_info; +#define XSIMD_RVV_MAKE_BOOL_TYPE(i) \ + template <> \ + struct rvv_bool_info \ + { \ + using type = XSIMD_RVV_JOINT(vbool, i, _t); \ + template \ + static inline type bitcast(T value) noexcept \ + { \ + return XSIMD_RVV_JOINT(__riscv_vreinterpret_b, i, )(value); \ + } \ + /*template <> static inline type bitcast(type value) noexcept { return value; }*/ \ + }; + XSIMD_RVV_MAKE_BOOL_TYPE(1); + XSIMD_RVV_MAKE_BOOL_TYPE(2); + XSIMD_RVV_MAKE_BOOL_TYPE(4); + XSIMD_RVV_MAKE_BOOL_TYPE(8); + XSIMD_RVV_MAKE_BOOL_TYPE(16); + XSIMD_RVV_MAKE_BOOL_TYPE(32); + XSIMD_RVV_MAKE_BOOL_TYPE(64); +#undef XSIMD_RVV_MAKE_BOOL_TYPE +#undef XSIMD_RVV_JOINT5 +#undef XSIMD_RVV_JOINT +#undef XSIMD_RVV_JOINT_ + + template + struct rvv_bool + { + using bool_info = rvv_bool_info; + using storage_type = vuint8m1_t __attribute__((riscv_rvv_vector_bits(rvv_width_m1))); + using type = typename bool_info::type; + storage_type value; + rvv_bool() = default; + rvv_bool(type v) noexcept + : value(__riscv_vreinterpret_u8m1(v)) + { + } + template ::type = 0> + rvv_bool(rvv_bool v) + : value(v.value) + { + } + explicit rvv_bool(uint8_t mask) noexcept + : value(__riscv_vmv_v_x_u8m1(mask, rvv_width_m1 / 8)) + { + } + explicit rvv_bool(uint64_t mask) noexcept + : value(__riscv_vreinterpret_v_u64m1_u8m1(__riscv_vmv_v_x_u64m1(mask, rvv_width_m1 / 64))) + { + } + operator type() const noexcept { return bool_info::bitcast(value); } + }; + + template + using rvv_bool_t = typename std::enable_if < !std::is_void::value, + rvv_bool, Width>::type; + + template + struct rvv_vector_type_impl; + + template <> + struct rvv_vector_type_impl<8> + { + using signed_type = rvv_reg_t; + using unsigned_type = rvv_reg_t; + using floating_point_type = void; + }; + + template <> + struct rvv_vector_type_impl<16> + { + using signed_type = rvv_reg_t; + using unsigned_type = rvv_reg_t; + using floating_point_type = rvv_reg_t<_Float16>; + }; + + template <> + struct rvv_vector_type_impl<32> + { + using signed_type = rvv_reg_t; + using unsigned_type = rvv_reg_t; + using floating_point_type = rvv_reg_t; + }; + + template <> + struct rvv_vector_type_impl<64> + { + using signed_type = rvv_reg_t; + using unsigned_type = rvv_reg_t; + using floating_point_type = rvv_reg_t; + }; + + template + using signed_int_rvv_vector_type = typename rvv_vector_type_impl<8 * sizeof(T)>::signed_type; + + template + using unsigned_int_rvv_vector_type = typename rvv_vector_type_impl<8 * sizeof(T)>::unsigned_type; + + template + using floating_point_rvv_vector_type = typename rvv_vector_type_impl<8 * sizeof(T)>::floating_point_type; + + template + using signed_int_or_floating_point_rvv_vector_type = typename std::conditional::value, + floating_point_rvv_vector_type, + signed_int_rvv_vector_type>::type; + + template + using rvv_vector_type = typename std::conditional::value, + signed_int_or_floating_point_rvv_vector_type, + unsigned_int_rvv_vector_type>::type; + } // namespace detail + + XSIMD_DECLARE_SIMD_REGISTER(bool, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(signed char, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(unsigned char, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(char, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(short, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(unsigned short, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(int, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(unsigned int, rvv, detail::rvv_vector_type); + 
XSIMD_DECLARE_SIMD_REGISTER(long int, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(long long int, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(float, rvv, detail::rvv_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(double, rvv, detail::rvv_vector_type); + + namespace detail + { + template + struct rvv_bool_simd_register + { + using register_type = rvv_bool_t; + register_type data; + operator register_type() const noexcept { return data; } + }; + } // namespace detail + + template + struct get_bool_simd_register + { + using type = detail::rvv_bool_simd_register; + }; + } // namespace types +#endif +} // namespace xsimd + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_sse2_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_sse2_register.hpp index 3855a9d7dc01..a9dc8960b660 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_sse2_register.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_sse2_register.hpp @@ -40,7 +40,6 @@ namespace xsimd #if XSIMD_WITH_SSE2 namespace types { - XSIMD_DECLARE_SIMD_REGISTER(bool, sse2, __m128i); XSIMD_DECLARE_SIMD_REGISTER(signed char, sse2, __m128i); XSIMD_DECLARE_SIMD_REGISTER(unsigned char, sse2, __m128i); XSIMD_DECLARE_SIMD_REGISTER(char, sse2, __m128i); diff --git a/third_party/xsimd/include/xsimd/types/xsimd_sve_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_sve_register.hpp index 1ae678684138..27b241980cbf 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_sve_register.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_sve_register.hpp @@ -36,7 +36,7 @@ namespace xsimd static constexpr bool available() noexcept { return true; } static constexpr bool requires_alignment() noexcept { return true; } static constexpr std::size_t alignment() noexcept { return 16; } - static constexpr unsigned version() noexcept { return generic::version(9, 0, 0); } + static constexpr unsigned version() noexcept { return generic::version(9, Width / 32, 0); } static constexpr char const* name() noexcept { return "arm64+sve"; } }; } diff --git a/third_party/xsimd/include/xsimd/types/xsimd_wasm_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_wasm_register.hpp index ab8782ac6a84..237db95c6e30 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_wasm_register.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_wasm_register.hpp @@ -40,7 +40,6 @@ namespace xsimd #if XSIMD_WITH_WASM namespace types { - XSIMD_DECLARE_SIMD_REGISTER(bool, wasm, v128_t); XSIMD_DECLARE_SIMD_REGISTER(signed char, wasm, v128_t); XSIMD_DECLARE_SIMD_REGISTER(unsigned char, wasm, v128_t); XSIMD_DECLARE_SIMD_REGISTER(char, wasm, v128_t); diff --git a/third_party/xsimd/moz.yaml b/third_party/xsimd/moz.yaml index 76a3dc90c272..c99a5991ccf2 100644 --- a/third_party/xsimd/moz.yaml +++ b/third_party/xsimd/moz.yaml @@ -10,8 +10,8 @@ origin: url: https://github.com/QuantStack/xsimd - release: 11.2.0 (2023-11-08T21:37:47+01:00). - revision: 11.2.0 + release: 12.1.1 (2023-12-12T17:17:27+01:00). + revision: 12.1.1 license: BSD-3-Clause
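Editorial addendum (illustration only, not part of the vendored patch): the new xsimd::compress and xsimd::expand entry points added in this update can be exercised with a small standalone program along the following lines, assuming xsimd 12.1.1 is on the include path and at least one SIMD architecture is enabled at compile time.

#include <xsimd/xsimd.hpp>
#include <cstdio>

int main()
{
    using batch = xsimd::batch<float>;   // best architecture enabled at compile time
    float in[batch::size];
    for (std::size_t i = 0; i < batch::size; ++i)
        in[i] = static_cast<float>(i);

    const auto x = batch::load_unaligned(in);
    const auto mask = x > batch(1.0f);   // keep lanes strictly greater than 1

    // compress: kept lanes are packed to the front, the remaining slots are zeroed
    const auto packed = xsimd::compress(x, mask);
    // expand: contiguous lanes are placed back into the masked slots, the others are zeroed
    const auto unpacked = xsimd::expand(packed, mask);

    float out[batch::size];
    packed.store_unaligned(out);
    std::printf("first packed lane: %g\n", static_cast<double>(out[0]));
    unpacked.store_unaligned(out);
    std::printf("first expanded lane: %g\n", static_cast<double>(out[0]));
    return 0;
}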