Bug 1859085 - Update xsimd to 11.1.0. r=padenot,sergesanspaille

Changelog:

11.1.0
------

* Introduce XSIMD_DEFAULT_ARCH to force default architecture (if any); see the sketch below
* Remove C++ requirement on xsimd::exp10 scalar implementation
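
A minimal usage sketch of the XSIMD_DEFAULT_ARCH override (the choice of
xsimd::avx2 here is only an assumption; any xsimd architecture type works).
The macro must name an architecture type and be defined before the first
xsimd include:

  #define XSIMD_DEFAULT_ARCH xsimd::avx2  // assumption: AVX2 is the desired default
  #include <type_traits>
  #include <xsimd/xsimd.hpp>
  static_assert(std::is_same<xsimd::default_arch, xsimd::avx2>::value,
                "default_arch now follows XSIMD_DEFAULT_ARCH");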

11.0.0
------

* Provide a generic reducer
* Fix ``find_package(xsimd)`` for xtl-enabled xsimd, reloaded
* Provide avx512f implementation of FMA and variant
* Hexadecimal floating points are not a C++11 feature
* Back to slow implementation of exp10 on Windows
* Changed bitwise_cast API
* Provide generic signed/unsigned type conversion
* Fixed sde location
* Feature/incr decr

Depends on D191042

Differential Revision: https://phabricator.services.mozilla.com/D191043
Chris Peterson 2023-10-17 05:59:03 +00:00
parent 52ca4b99e8
commit 727ac25d91
33 changed files with 446 additions and 154 deletions

View File

@@ -9,6 +9,61 @@
Changelog
=========
11.1.0
------
* Introduce XSIMD_DEFAULT_ARCH to force default architecture (if any)
* Remove C++ requirement on xsimd::exp10 scalar implementation
* Improve and test documentation
11.0.0
------
* Provide a generic reducer
* Fix ``find_package(xsimd)`` for xtl-enabled xsimd, reloaded
* Cleanup benchmark code
* Provide avx512f implementation of FMA and variant
* Hexadecimal floating points are not a C++11 feature
* Back to slow implementation of exp10 on Windows
* Changed bitwise_cast API
* Provide generic signed/unsigned type conversion
* Fixed sde location
* Feature/incr decr
* Cleanup documentation
10.0.0
------
* Fix potential ABI issue in SVE support
* Disable fast exp10 on OSX
* Assert on unaligned memory when calling aligned load/store
* Fix warning about uninitialized storage
* Always forward arch parameter
* Do not specialize the behavior of ``simd_return_type`` for char
* Support broadcasting of complex batches
* Make xsimd compatible with -fno-exceptions
* Provide and test comparison operators overloads that accept scalars
9.0.1
-----

View File

@@ -43,6 +43,20 @@ namespace xsimd
self, other);
}
// decr
template <class A, class T>
inline batch<T, A> decr(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return self - T(1);
}
// decr_if
template <class A, class T, class Mask>
inline batch<T, A> decr_if(batch<T, A> const& self, Mask const& mask, requires_arch<generic>) noexcept
{
return select(mask, decr(self), self);
}
// div
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> div(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
@@ -112,6 +126,20 @@ namespace xsimd
return { res_r, res_i };
}
// incr
template <class A, class T>
inline batch<T, A> incr(batch<T, A> const& self, requires_arch<generic>) noexcept
{
return self + T(1);
}
// incr_if
template <class A, class T, class Mask>
inline batch<T, A> incr_if(batch<T, A> const& self, Mask const& mask, requires_arch<generic>) noexcept
{
return select(mask, incr(self), self);
}
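// Illustration: with x = {1, 2, 3, 4} and mask = {true, false, true, false},
// incr_if(x, mask) yields {2, 2, 4, 4} and decr_if(x, mask) yields {0, 2, 2, 4}.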
// mul
template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept

View File

@@ -135,6 +135,51 @@ namespace xsimd
}
}
// some generic fast_cast conversion
namespace detail
{
template <class A>
inline batch<uint8_t, A> fast_cast(batch<int8_t, A> const& self, batch<uint8_t, A> const&, requires_arch<generic>) noexcept
{
return bitwise_cast<uint8_t>(self);
}
template <class A>
inline batch<uint16_t, A> fast_cast(batch<int16_t, A> const& self, batch<uint16_t, A> const&, requires_arch<generic>) noexcept
{
return bitwise_cast<uint16_t>(self);
}
template <class A>
inline batch<uint32_t, A> fast_cast(batch<int32_t, A> const& self, batch<uint32_t, A> const&, requires_arch<generic>) noexcept
{
return bitwise_cast<uint32_t>(self);
}
template <class A>
inline batch<uint64_t, A> fast_cast(batch<int64_t, A> const& self, batch<uint64_t, A> const&, requires_arch<generic>) noexcept
{
return bitwise_cast<uint64_t>(self);
}
template <class A>
inline batch<int8_t, A> fast_cast(batch<uint8_t, A> const& self, batch<int8_t, A> const&, requires_arch<generic>) noexcept
{
return bitwise_cast<int8_t>(self);
}
template <class A>
inline batch<int16_t, A> fast_cast(batch<uint16_t, A> const& self, batch<int16_t, A> const&, requires_arch<generic>) noexcept
{
return bitwise_cast<int16_t>(self);
}
template <class A>
inline batch<int32_t, A> fast_cast(batch<uint32_t, A> const& self, batch<int32_t, A> const&, requires_arch<generic>) noexcept
{
return bitwise_cast<int32_t>(self);
}
template <class A>
inline batch<int64_t, A> fast_cast(batch<uint64_t, A> const& self, batch<int64_t, A> const&, requires_arch<generic>) noexcept
{
return bitwise_cast<int64_t>(self);
}
}
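// Note: these casts only reinterpret bits, so fast_cast of int8_t(-1) to
// uint8_t yields 255, matching scalar static_cast behaviour on
// two's-complement targets.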
namespace detail
{
// Generic conversion handling machinery. Each architecture must define

View File

@@ -549,6 +549,13 @@ namespace xsimd
}
}
// decr_if
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<avx>) noexcept
{
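// A set lane of an integer batch_bool is all-ones, i.e. -1, so adding the
// mask register decrements exactly the lanes where the mask is true.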
return self + batch<T, A>(mask.data);
}
// div
template <class A>
inline batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
@@ -749,6 +756,13 @@ namespace xsimd
return _mm256_add_pd(tmp1, tmp2);
}
// incr_if
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<avx>) noexcept
{
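// Conversely, subtracting the all-ones (-1) mask lanes increments the
// selected lanes by 1.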
return self - batch<T, A>(mask.data);
}
// insert
template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx>) noexcept

View File

@@ -378,7 +378,7 @@ namespace xsimd
std::complex<float> c0, std::complex<float> c1,
std::complex<float> c2, std::complex<float> c3) noexcept
{
return batch<std::complex<float>>(float32x4_t { c0.real(), c1.real(), c2.real(), c3.real() },
return batch<std::complex<float>, A>(float32x4_t { c0.real(), c1.real(), c2.real(), c3.real() },
float32x4_t { c0.imag(), c1.imag(), c2.imag(), c3.imag() });
}

View File

@@ -138,6 +138,33 @@ namespace xsimd
return x + y;
}
template <class T>
inline T incr(T const& x) noexcept
{
return x + T(1);
}
template <class T>
inline T incr_if(T const& x, bool mask) noexcept
{
return x + T(mask ? 1 : 0);
}
inline bool all(bool mask)
{
return mask;
}
inline bool any(bool mask)
{
return mask;
}
inline bool none(bool mask)
{
return !mask;
}
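// For scalar "batches" the mask is a single bool, so all/any return it
// unchanged and none is its negation.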
template <class T>
inline typename std::enable_if<std::is_integral<T>::value, T>::type
bitwise_and(T x, T y) noexcept
@@ -470,11 +497,13 @@ namespace xsimd
#else
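// Fallback using the identity 10^x == exp(log(10) * x). The log(10)
// constant used to be spelled as a hexadecimal float literal, which is
// not available before C++17, hence this run-time computation.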
inline float exp10(const float& x) noexcept
{
return std::exp(0x1.26bb1cp+1f * x);
const float ln10 = std::log(10.f);
return std::exp(ln10 * x);
}
inline double exp10(const double& x) noexcept
{
return std::exp(0x1.26bb1bbb55516p+1 * x);
const double ln10 = std::log(10.);
return std::exp(ln10 * x);
}
#endif
@@ -738,6 +767,18 @@ namespace xsimd
return x - y;
}
template <class T>
inline T decr(T const& x) noexcept
{
return x - T(1);
}
template <class T>
inline T decr_if(T const& x, bool mask) noexcept
{
return x - T(mask ? 1 : 0);
}
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class T, bool i3ec>
inline xtl::xcomplex<T, T, i3ec> log2(const xtl::xcomplex<T, T, i3ec>& val) noexcept

View File

@@ -501,6 +501,13 @@ namespace xsimd
}
}
// decr_if
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<sse2>) noexcept
{
return self + batch<T, A>(mask.data);
}
// div
template <class A>
inline batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
@@ -808,6 +815,13 @@ namespace xsimd
_mm_unpackhi_pd(row[0], row[1]));
}
// incr_if
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<sse2>) noexcept
{
return self - batch<T, A>(mask.data);
}
// insert
template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse2>) noexcept

View File

@@ -23,6 +23,22 @@
namespace xsimd
{
/**
* @ingroup architectures
*
* Dummy architecture that only appears in a list of architectures when no
* other architecture has been detected.
*/
struct unavailable
{
static constexpr bool supported() noexcept { return false; }
static constexpr bool available() noexcept { return false; }
static constexpr unsigned version() noexcept { return 0; }
static constexpr std::size_t alignment() noexcept { return 0; }
static constexpr bool requires_alignment() noexcept { return false; }
static constexpr char const* name() noexcept { return "<none>"; }
};
namespace detail
{
// Checks whether T appears in Tys.
@@ -73,6 +89,21 @@ namespace xsimd
return max_of((head0 > head1 ? head0 : head1), tail...);
}
template <typename... Ts>
struct head;
template <typename T, typename... Ts>
struct head<T, Ts...>
{
using type = T;
};
template <>
struct head<>
{
using type = unavailable;
};
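// head<T, Ts...> yields T, and the empty specialization falls back to
// unavailable; arch_list<...>::best below is therefore the first (i.e.
// best) architecture of the list, or unavailable for an empty list.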
} // namespace detail
// An arch_list is a list of architectures, sorted by version number.
@@ -84,6 +115,8 @@ namespace xsimd
"architecture list must be sorted by version");
#endif
using best = typename detail::head<Archs...>::type;
template <class Arch>
using add = arch_list<Archs..., Arch>;
@@ -109,34 +142,8 @@ namespace xsimd
}
};
struct unavailable
{
static constexpr bool supported() noexcept { return false; }
static constexpr bool available() noexcept { return false; }
static constexpr unsigned version() noexcept { return 0; }
static constexpr std::size_t alignment() noexcept { return 0; }
static constexpr bool requires_alignment() noexcept { return false; }
static constexpr char const* name() noexcept { return "<none>"; }
};
namespace detail
{
// Pick the best architecture in arch_list L, which is the last
// because architectures are sorted by version.
template <class L>
struct best;
template <>
struct best<arch_list<>>
{
using type = unavailable;
};
template <class Arch, class... Archs>
struct best<arch_list<Arch, Archs...>>
{
using type = Arch;
};
// Filter archlists Archs, picking only supported archs and adding
// them to L.
@@ -190,12 +197,15 @@ namespace xsimd
using supported_architectures = typename detail::supported<all_architectures>::type;
using x86_arch = typename detail::best<typename detail::supported<all_x86_architectures>::type>::type;
using arm_arch = typename detail::best<typename detail::supported<all_arm_architectures>::type>::type;
// using default_arch = typename detail::best<typename detail::supported<arch_list</*arm_arch,*/ x86_arch>>::type>::type;
using default_arch = typename std::conditional<std::is_same<x86_arch, unavailable>::value,
arm_arch,
x86_arch>::type;
using x86_arch = typename detail::supported<all_x86_architectures>::type::best;
using arm_arch = typename detail::supported<all_arm_architectures>::type::best;
using best_arch = typename supported_architectures::best;
#ifdef XSIMD_DEFAULT_ARCH
using default_arch = XSIMD_DEFAULT_ARCH;
#else
using default_arch = best_arch;
#endif
namespace detail
{
@@ -203,7 +213,7 @@ namespace xsimd
class dispatcher
{
const unsigned best_arch;
const unsigned best_arch_found;
F functor;
template <class Arch, class... Tys>
@@ -216,7 +226,7 @@
template <class Arch, class ArchNext, class... Archs, class... Tys>
auto walk_archs(arch_list<Arch, ArchNext, Archs...>, Tys&&... args) noexcept -> decltype(functor(Arch {}, std::forward<Tys>(args)...))
{
if (Arch::version() <= best_arch)
if (Arch::version() <= best_arch_found)
return functor(Arch {}, std::forward<Tys>(args)...);
else
return walk_archs(arch_list<ArchNext, Archs...> {}, std::forward<Tys>(args)...);
@@ -224,7 +234,7 @@
public:
dispatcher(F f) noexcept
: best_arch(available_architectures().best)
: best_arch_found(available_architectures().best)
, functor(f)
{
}
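// Usage sketch of xsimd's dynamic dispatch; the functor receives the
// architecture tag as its first argument:
//   struct sum { template <class Arch> float operator()(Arch, float a, float b) { return a + b; } };
//   auto dispatched = xsimd::dispatch(sum {});  // runs the best_arch_found variant
//   float r = dispatched(1.f, 2.f);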

View File

@@ -12,9 +12,9 @@
#ifndef XSIMD_CONFIG_HPP
#define XSIMD_CONFIG_HPP
#define XSIMD_VERSION_MAJOR 10
#define XSIMD_VERSION_MAJOR 11
#define XSIMD_VERSION_MINOR 0
#define XSIMD_VERSION_PATCH 0
#define XSIMD_VERSION_PATCH 1
/**
* high level free functions

View File

@@ -52,6 +52,7 @@ namespace xsimd
unsigned avx512bw : 1;
unsigned neon : 1;
unsigned neon64 : 1;
unsigned sve : 1;
// version number of the best arch available
unsigned best;
@@ -75,6 +76,15 @@ namespace xsimd
neon64 = 0;
best = neon::version() * neon;
#elif defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
sve = bool(getauxval(AT_HWCAP) & HWCAP_SVE);
#else
sve = 0;
#endif
best = sve::version() * sve;
#elif defined(__x86_64__) || defined(__i386__) || defined(_M_AMD64) || defined(_M_IX86)
auto get_cpuid = [](int reg[4], int func_id) noexcept
{
@@ -108,31 +118,38 @@ namespace xsimd
#endif
};
int regs[4];
int regs1[4];
get_cpuid(regs, 0x1);
get_cpuid(regs1, 0x1);
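// Dedicated buffers per CPUID leaf (regs1 for 0x1, regs7 for 0x7, regs8 for
// 0x80000001 below) ensure one query no longer clobbers flags read later;
// previously the AVX bit was read from regs after leaf 0x80000001 had
// overwritten it.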
sse2 = regs[3] >> 26 & 1;
sse2 = regs1[3] >> 26 & 1;
best = std::max(best, sse2::version() * sse2);
sse3 = regs[2] >> 0 & 1;
sse3 = regs1[2] >> 0 & 1;
best = std::max(best, sse3::version() * sse3);
ssse3 = regs[2] >> 9 & 1;
ssse3 = regs1[2] >> 9 & 1;
best = std::max(best, ssse3::version() * ssse3);
sse4_1 = regs[2] >> 19 & 1;
sse4_1 = regs1[2] >> 19 & 1;
best = std::max(best, sse4_1::version() * sse4_1);
sse4_2 = regs[2] >> 20 & 1;
sse4_2 = regs1[2] >> 20 & 1;
best = std::max(best, sse4_2::version() * sse4_2);
fma3_sse = regs[2] >> 12 & 1;
fma3_sse = regs1[2] >> 12 & 1;
if (sse4_2)
best = std::max(best, fma3<xsimd::sse4_2>::version() * fma3_sse);
get_cpuid(regs, 0x80000001);
fma4 = regs[2] >> 16 & 1;
avx = regs1[2] >> 28 & 1;
best = std::max(best, avx::version() * avx);
fma3_avx = avx && fma3_sse;
best = std::max(best, fma3<xsimd::avx>::version() * fma3_avx);
int regs8[4];
get_cpuid(regs8, 0x80000001);
fma4 = regs8[2] >> 16 & 1;
best = std::max(best, fma4::version() * fma4);
// sse4a = regs[2] >> 6 & 1;
@@ -141,29 +158,24 @@ namespace xsimd
// xop = regs[2] >> 11 & 1;
// best = std::max(best, XSIMD_X86_AMD_XOP_VERSION * xop);
avx = regs[2] >> 28 & 1;
best = std::max(best, avx::version() * avx);
fma3_avx = avx && fma3_sse;
best = std::max(best, fma3<xsimd::avx>::version() * fma3_avx);
get_cpuid(regs, 0x7);
avx2 = regs[1] >> 5 & 1;
int regs7[4];
get_cpuid(regs7, 0x7);
avx2 = regs7[1] >> 5 & 1;
best = std::max(best, avx2::version() * avx2);
fma3_avx2 = avx2 && fma3_sse;
best = std::max(best, fma3<xsimd::avx2>::version() * fma3_avx2);
avx512f = regs[1] >> 16 & 1;
avx512f = regs7[1] >> 16 & 1;
best = std::max(best, avx512f::version() * avx512f);
avx512cd = regs[1] >> 28 & 1;
avx512cd = regs7[1] >> 28 & 1;
best = std::max(best, avx512cd::version() * avx512cd * avx512f);
avx512dq = regs[1] >> 17 & 1;
avx512dq = regs7[1] >> 17 & 1;
best = std::max(best, avx512dq::version() * avx512dq * avx512cd * avx512f);
avx512bw = regs[1] >> 30 & 1;
avx512bw = regs7[1] >> 30 & 1;
best = std::max(best, avx512bw::version() * avx512bw * avx512dq * avx512cd * avx512f);
#endif

View File

@@ -39,7 +39,7 @@ namespace xsimd
* @tparam T type of objects to allocate.
* @tparam Align alignment in bytes.
*/
template <class T, size_t Align = default_arch::alignment()>
template <class T, size_t Align>
class aligned_allocator
{
public:

View File

@@ -43,8 +43,8 @@ namespace xsimd
using type = unaligned_mode;
};
template <class T>
struct allocator_alignment<aligned_allocator<T>>
template <class T, size_t N>
struct allocator_alignment<aligned_allocator<T, N>>
{
using type = aligned_mode;
};
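// With the default argument gone, callers spell the alignment out, e.g.
// xsimd::aligned_allocator<float, xsimd::default_arch::alignment()>.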

View File

@@ -82,7 +82,7 @@ namespace xsimd
* @return the sum of \c x and \c y
*/
template <class T, class A>
inline auto add(batch<T> const& x, batch<T, A> const& y) noexcept -> decltype(x + y)
inline auto add(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x + y)
{
detail::static_check_supported_config<T, A>();
return x + y;
@@ -546,6 +546,36 @@ namespace xsimd
return kernel::cosh<A>(x, A {});
}
/**
* @ingroup batch_arithmetic
*
* Subtract 1 from batch \c x.
* @param x batch involved in the decrement.
* @return the difference of \c x and 1.
*/
template <class T, class A>
inline batch<T, A> decr(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::decr<A>(x, A {});
}
/**
* @ingroup batch_arithmetic
*
* Subtract 1 from batch \c x for each element where \c mask is true.
* @param x batch involved in the decrement.
* @param mask whether to perform the decrement or not. Can be a \c
* batch_bool or a \c batch_bool_constant.
* @return the difference of \c x and 1 when \c mask is true.
*/
template <class T, class A, class Mask>
inline batch<T, A> decr_if(batch<T, A> const& x, Mask const& mask) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::decr_if<A>(x, mask, A {});
}
/**
* @ingroup batch_arithmetic
*
@@ -878,63 +908,6 @@ namespace xsimd
return x > y;
}
/**
* @ingroup batch_reducers
*
* Generic reducer using only batch operations
* @param f reducing function, accepting `batch ()(batch, batch)`
* @param x batch involved in the reduction
* @return the result of the reduction, as a scalar.
*/
template <class T, class A, class F>
inline T reduce(F&& f, batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::detail::reduce(std::forward<F>(f), x, std::integral_constant<unsigned, batch<T, A>::size>());
}
/**
* @ingroup batch_reducers
*
* Adds all the scalars of the batch \c x.
* @param x batch involved in the reduction
* @return the result of the reduction.
*/
template <class T, class A>
inline T reduce_add(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::reduce_add<A>(x, A {});
}
/**
* @ingroup batch_reducers
*
* Max of all the scalars of the batch \c x.
* @param x batch involved in the reduction
* @return the result of the reduction.
*/
template <class T, class A>
inline T reduce_max(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::reduce_max<A>(x, A {});
}
/**
* @ingroup batch_reducers
*
* Min of all the scalars of the batch \c x.
* @param x batch involved in the reduction
* @return the result of the reduction.
*/
template <class T, class A>
inline T reduce_min(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::reduce_min<A>(x, A {});
}
/**
* @ingroup batch_reducers
*
@@ -981,6 +954,36 @@ namespace xsimd
return kernel::imag<A>(x, A {});
}
/**
* @ingroup batch_arithmetic
*
* Add 1 to batch \c x.
* @param x batch involved in the increment.
* @return the sum of \c x and 1.
*/
template <class T, class A>
inline batch<T, A> incr(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::incr<A>(x, A {});
}
/**
* @ingroup batch_arithmetic
*
* Add 1 to batch \c x for each element where \c mask is true.
* @param x batch involved in the increment.
* @param mask whether to perform the increment or not. Can be a \c
* batch_bool or a \c batch_bool_constant.
* @return the sum of \c x and 1 when \c mask is true.
*/
template <class T, class A, class Mask>
inline batch<T, A> incr_if(batch<T, A> const& x, Mask const& mask) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::incr_if<A>(x, mask, A {});
}
/**
* @ingroup batch_constant
*
@@ -1595,6 +1598,20 @@ namespace xsimd
return kernel::proj(z, A {});
}
/**
* @ingroup batch_complex
*
* Computes the real part of the batch \c z.
* @param z batch of complex or real values.
* @return the real part of \c z.
*/
template <class T, class A>
inline real_batch_type_t<batch<T, A>> real(batch<T, A> const& z) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::real<A>(z, A {});
}
/**
* @ingroup batch_arithmetic
*
@@ -1612,17 +1629,60 @@ namespace xsimd
}
/**
* @ingroup batch_complex
* @ingroup batch_reducers
*
* Computes the real part of the batch \c z.
* @param z batch of complex or real values.
* @return the argument of \c z.
* Generic reducer using only batch operations
* @param f reducing function, accepting `batch ()(batch, batch)`
* @param x batch involved in the reduction
* @return the result of the reduction, as a scalar.
*/
template <class T, class A>
inline real_batch_type_t<batch<T, A>> real(batch<T, A> const& z) noexcept
template <class T, class A, class F>
inline T reduce(F&& f, batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::real<A>(z, A {});
return kernel::detail::reduce(std::forward<F>(f), x, std::integral_constant<unsigned, batch<T, A>::size>());
}
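// Illustrative (C++14 lambda):
//   float m = xsimd::reduce([](auto a, auto b) { return xsimd::max(a, b); }, x);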
/**
* @ingroup batch_reducers
*
* Adds all the scalars of the batch \c x.
* @param x batch involved in the reduction
* @return the result of the reduction.
*/
template <class T, class A>
inline T reduce_add(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::reduce_add<A>(x, A {});
}
/**
* @ingroup batch_reducers
*
* Max of all the scalars of the batch \c x.
* @param x batch involved in the reduction
* @return the result of the reduction.
*/
template <class T, class A>
inline T reduce_max(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::reduce_max<A>(x, A {});
}
/**
* @ingroup batch_reducers
*
* Min of all the scalars of the batch \c x.
* @param x batch involved in the reduction
* @return the result of the reduction.
*/
template <class T, class A>
inline T reduce_min(batch<T, A> const& x) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::reduce_min<A>(x, A {});
}
/**

View File

@@ -17,7 +17,7 @@
namespace xsimd
{
/**
* @ingroup arch
* @ingroup architectures
*
* AVX2 instructions
*/

View File

@@ -18,7 +18,7 @@ namespace xsimd
{
/**
* @ingroup arch
* @ingroup architectures
*
* AVX512BW instructions
*/

View File

@@ -18,9 +18,9 @@ namespace xsimd
{
/**
* @ingroup arch
* @ingroup architectures
*
* AVX512CD instrutions
* AVX512CD instructions
*/
struct avx512cd : avx512f
{

View File

@@ -18,7 +18,7 @@ namespace xsimd
{
/**
* @ingroup arch
* @ingroup architectures
*
* AVX512DQ instructions
*/

View File

@@ -18,7 +18,7 @@ namespace xsimd
{
/**
* @ingroup arch
* @ingroup architectures
*
* AVX512F instructions
*/

View File

@@ -18,7 +18,7 @@ namespace xsimd
{
/**
* @ingroup arch
* @ingroup architectures
*
* AVX instructions
*/

View File

@@ -20,7 +20,7 @@ namespace xsimd
struct fma3;
/**
* @ingroup arch
* @ingroup architectures
*
* AVX2 + FMA instructions
*/

View File

@@ -20,7 +20,7 @@ namespace xsimd
struct fma3;
/**
* @ingroup arch
* @ingroup architectures
*
* AVX + FMA instructions
*/

View File

@@ -20,7 +20,7 @@ namespace xsimd
struct fma3;
/**
* @ingroup arch
* @ingroup architectures
*
* SSE4.2 + FMA instructions
*/

View File

@@ -17,9 +17,9 @@
namespace xsimd
{
/**
* @ingroup arch
* @ingroup architectures
*
* FMA4 instructions
* SSE4.2 + FMA4 instructions
*/
struct fma4 : sse4_2
{

View File

@@ -15,17 +15,30 @@
#include "../config/xsimd_config.hpp"
/**
* @defgroup arch Architecture description
* @defgroup architectures Architecture description
* */
namespace xsimd
{
/**
* @ingroup architectures
*
* Base class for all architectures.
*/
struct generic
{
/// Whether this architecture is supported at compile-time.
static constexpr bool supported() noexcept { return true; }
/// Whether this architecture is available at run-time.
static constexpr bool available() noexcept { return true; }
/// If this architecture supports aligned memory accesses, the required
/// alignment.
static constexpr std::size_t alignment() noexcept { return 0; }
/// Whether this architecture requires aligned memory access.
static constexpr bool requires_alignment() noexcept { return false; }
/// Unique identifier for this architecture.
static constexpr unsigned version() noexcept { return generic::version(0, 0, 0); }
/// Name of the architecture.
static constexpr char const* name() noexcept { return "generic"; }
protected:
static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch) noexcept { return major * 10000u + minor * 100u + patch; }

View File

@@ -17,7 +17,7 @@
namespace xsimd
{
/**
* @ingroup arch
* @ingroup architectures
*
* NEON instructions for arm64
*/

View File

@@ -22,7 +22,7 @@
namespace xsimd
{
/**
* @ingroup arch
* @ingroup architectures
*
* NEON instructions for arm32
*/

View File

@@ -23,7 +23,7 @@
namespace xsimd
{
/**
* @ingroup arch
* @ingroup architectures
*
* SSE2 instructions
*/

View File

@@ -21,7 +21,7 @@
namespace xsimd
{
/**
* @ingroup arch
* @ingroup architectures
*
* SSE3 instructions
*/

View File

@@ -21,7 +21,7 @@
namespace xsimd
{
/**
* @ingroup arch
* @ingroup architectures
*
* SSE4.1 instructions
*/

View File

@@ -21,7 +21,7 @@
namespace xsimd
{
/**
* @ingroup arch
* @ingroup architectures
*
* SSE4.2 instructions
*/

View File

@@ -21,7 +21,7 @@
namespace xsimd
{
/**
* @ingroup arch
* @ingroup architectures
*
* SSSE3 instructions
*/

View File

@@ -25,7 +25,7 @@ namespace xsimd
namespace detail
{
/**
* @ingroup arch
* @ingroup architectures
*
* SVE instructions (fixed vector size) for arm64
*/

View File

@@ -10,8 +10,8 @@ origin:
url: https://github.com/QuantStack/xsimd
release: e8f209c3397c8a866be2312682689a04e4abfd66 (2023-02-27T06:32:46Z).
revision: e8f209c3397c8a866be2312682689a04e4abfd66
release: 11.1.0 (2023-05-13T15:49:21+00:00).
revision: 11.1.0
license: BSD-3-Clause