Bug 1891462 - Upgrade xsimd to be9dcb5df413a893fb6646fa950eeb4aeac70ffc r=padenot

WIP

Differential Revision: https://phabricator.services.mozilla.com/D207722
serge-sans-paille 2024-04-20 16:38:48 +00:00
parent d59a006a47
commit e02cc14f5a
22 changed files with 1227 additions and 310 deletions
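Most of the churn in this upgrade tracks one upstream API change: batch_constant and batch_bool_constant now take the scalar type and the architecture as separate template parameters instead of a wrapping batch type, and constant masks are materialized with as_batch() rather than a C-style cast. A minimal before/after sketch distilled from the hunks below (the avx2 instantiation and the variable names are illustrative):

// Before: the constant is parameterized on a batch type.
batch_constant<batch<uint32_t, avx2>, 0, 1, 2, 3, 4, 5, 6, 7> old_mask;
auto r0 = _mm256_permutevar8x32_ps(x, (batch<uint32_t, avx2>)old_mask);
// After: scalar type and architecture are separate parameters,
// and as_batch() replaces the cast.
batch_constant<uint32_t, avx2, 0, 1, 2, 3, 4, 5, 6, 7> new_mask;
auto r1 = _mm256_permutevar8x32_ps(x, new_mask.as_batch());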

View File

@ -2064,7 +2064,7 @@ namespace xsimd
inline T reduce(Op op, batch<T, A> const& self, std::integral_constant<unsigned, Lvl>) noexcept
{
using index_type = as_unsigned_integer_t<T>;
-batch<T, A> split = swizzle(self, make_batch_constant<batch<index_type, A>, split_high<index_type, Lvl / 2>>());
+batch<T, A> split = swizzle(self, make_batch_constant<index_type, A, split_high<index_type, Lvl / 2>>());
return reduce(op, op(split, self), std::integral_constant<unsigned, Lvl / 2>());
}
}
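The generic reduce above proceeds by successive halving: swizzle with split_high pulls the upper Lvl / 2 lanes down into the low positions, op combines the two halves, and the recursion bottoms out at the single-lane specialization. An illustrative trace for a hypothetical 4-lane batch with an add-like Op (assuming split_high moves the high half to the low lanes):

// Lvl = 4: self = {a, b, c, d}; split holds {c, d, ...} in its low lanes,
//          so op(split, self) = {a+c, b+d, ...}
// Lvl = 2: the low halves combine again: {a+b+c+d, ...}
// Lvl = 1: lane 0 holds the full reduction.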

View File

@ -21,10 +21,10 @@
namespace xsimd
{
-template <class batch_type, typename batch_type::value_type... Values>
+template <typename T, class A, T... Values>
struct batch_constant;
-template <class batch_type, bool... Values>
+template <typename T, class A, bool... Values>
struct batch_bool_constant;
namespace kernel
@ -180,7 +180,7 @@ namespace xsimd
}
};
batch<T, A> tmp(val);
-return select(make_batch_bool_constant<batch<T, A>, index_mask>(), self, tmp);
+return select(make_batch_bool_constant<T, A, index_mask>(), self, tmp);
}
// get
@ -295,7 +295,7 @@ namespace xsimd
}
};
-return swizzle(self, make_batch_constant<batch<as_unsigned_integer_t<T>, A>, rotate_generator>(), A {});
+return swizzle(self, make_batch_constant<as_unsigned_integer_t<T>, A, rotate_generator>(), A {});
}
template <size_t N, class A, class T>
@ -316,7 +316,7 @@ namespace xsimd
}
};
-return swizzle(self, make_batch_constant<batch<as_unsigned_integer_t<T>, A>, rotate_generator>(), A {});
+return swizzle(self, make_batch_constant<as_unsigned_integer_t<T>, A, rotate_generator>(), A {});
}
template <size_t N, class A, class T>
@ -412,6 +412,12 @@ namespace xsimd
return true;
}
+template <typename ITy>
+constexpr bool is_zip_lo(size_t, ITy)
+{
+return false;
+}
template <typename ITy0, typename ITy1, typename... ITys>
constexpr bool is_zip_lo(size_t bsize, ITy0 index0, ITy1 index1, ITys... indices)
{
@ -423,6 +429,12 @@ namespace xsimd
return true;
}
+template <typename ITy>
+constexpr bool is_zip_hi(size_t, ITy)
+{
+return false;
+}
template <typename ITy0, typename ITy1, typename... ITys>
constexpr bool is_zip_hi(size_t bsize, ITy0 index0, ITy1 index1, ITys... indices)
{
@ -443,19 +455,19 @@ namespace xsimd
}
template <class A, typename T, typename ITy, ITy... Indices>
-inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept
+inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept
{
constexpr size_t bsize = sizeof...(Indices);
// Detect common patterns
XSIMD_IF_CONSTEXPR(detail::is_swizzle_fst(bsize, Indices...))
{
-return swizzle(x, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? 0 /* never happens */ : Indices)...>());
+return swizzle(x, batch_constant<ITy, A, ((Indices >= bsize) ? 0 /* never happens */ : Indices)...>());
}
XSIMD_IF_CONSTEXPR(detail::is_swizzle_snd(bsize, Indices...))
{
-return swizzle(y, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : 0 /* never happens */)...>());
+return swizzle(y, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : 0 /* never happens */)...>());
}
XSIMD_IF_CONSTEXPR(detail::is_zip_lo(bsize, Indices...))
@ -470,7 +482,7 @@ namespace xsimd
XSIMD_IF_CONSTEXPR(detail::is_select(bsize, Indices...))
{
-return select(batch_bool_constant<batch<T, A>, (Indices < bsize)...>(), x, y);
+return select(batch_bool_constant<T, A, (Indices < bsize)...>(), x, y);
}
#if defined(__has_builtin)
@ -491,9 +503,9 @@ namespace xsimd
#else
// Use a generic_pattern. It is suboptimal but clang optimizes this
// pretty well.
-batch<T, A> x_lane = swizzle(x, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
-batch<T, A> y_lane = swizzle(y, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
-batch_bool_constant<batch<T, A>, (Indices < bsize)...> select_x_lane;
+batch<T, A> x_lane = swizzle(x, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
+batch<T, A> y_lane = swizzle(y, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
+batch_bool_constant<T, A, (Indices < bsize)...> select_x_lane;
return select(select_x_lane, x_lane, y_lane);
#endif
}
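// Illustrative reference (not part of the diff): with bsize == 4, indices
// 0-3 select from x and 4-7 from y, and the detectors above lower these
// constant masks to cheaper primitives:
//   {0, 1, 2, 3} -> is_swizzle_fst: only x referenced -> swizzle(x, ...)
//   {4, 5, 6, 7} -> is_swizzle_snd: only y referenced -> swizzle(y, ...)
//   {0, 4, 1, 5} -> is_zip_lo: interleaved low halves -> zip_lo(x, y)
//   {2, 6, 3, 7} -> is_zip_hi: interleaved high halves -> zip_hi(x, y)
//   {0, 5, 2, 7} -> is_select: lane i taken from lane i of x or y -> select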
@ -530,7 +542,7 @@ namespace xsimd
// swizzle
template <class A, class T, class ITy, ITy... Vs>
-inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<batch<ITy, A>, Vs...> mask, requires_arch<generic>) noexcept
+inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<ITy, A, Vs...> mask, requires_arch<generic>) noexcept
{
return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
}
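// Illustrative (not part of the diff): the same constant mask is applied to
// the real and imaginary planes independently, so for lanes {(a,b), (c,d)}
// a mask of {1, 0} produces {(c,d), (a,b)}.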

View File

@ -1161,22 +1161,22 @@ namespace xsimd
return detail::merge_sse(res_low, res_hi);
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
+inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
{
return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
}
template <class A, bool... Values>
-inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
+inline batch<float, A> select(batch_bool_constant<float, A, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
{
-constexpr auto mask = batch_bool_constant<batch<float, A>, Values...>::mask();
+constexpr auto mask = batch_bool_constant<float, A, Values...>::mask();
return _mm256_blend_ps(false_br, true_br, mask);
}
template <class A, bool... Values>
-inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
+inline batch<double, A> select(batch_bool_constant<double, A, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
{
-constexpr auto mask = batch_bool_constant<batch<double, A>, Values...>::mask();
+constexpr auto mask = batch_bool_constant<double, A, Values...>::mask();
return _mm256_blend_pd(false_br, true_br, mask);
}
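// Illustrative (not part of the diff): mask() folds the Values... pack into
// an immediate with bit i set when lane i picks true_br. For instance
// batch_bool_constant<float, avx, 1, 0, 0, 1, 1, 0, 0, 1> yields 0x99, and
// _mm256_blend_ps(false_br, true_br, 0x99) takes lanes 0, 3, 4 and 7 from
// true_br and the rest from false_br.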
@ -1238,7 +1238,7 @@ namespace xsimd
// shuffle
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7>
-inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx>) noexcept
+inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx>) noexcept
{
constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
// shuffle within lane
@ -1253,7 +1253,7 @@ namespace xsimd
}
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
-inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3> mask, requires_arch<avx>) noexcept
+inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<avx>) noexcept
{
constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3);
// shuffle within lane
@ -1504,7 +1504,7 @@ namespace xsimd
// swizzle (constant mask)
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
+inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
{
// duplicate low and high part of input
__m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
@ -1514,14 +1514,14 @@ namespace xsimd
__m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);
// normalize mask
-batch_constant<batch<uint32_t, A>, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
+batch_constant<uint32_t, A, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
// permute within each lane
-__m256 r0 = _mm256_permutevar_ps(low_low, (batch<uint32_t, A>)half_mask);
-__m256 r1 = _mm256_permutevar_ps(hi_hi, (batch<uint32_t, A>)half_mask);
+__m256 r0 = _mm256_permutevar_ps(low_low, half_mask.as_batch());
+__m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask.as_batch());
// mask to choose the right lane
-batch_bool_constant<batch<uint32_t, A>, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;
+batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;
// blend the two permutes
constexpr auto mask = blend_mask.mask();
@ -1529,7 +1529,7 @@ namespace xsimd
}
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx>) noexcept
+inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx>) noexcept
{
// duplicate low and high part of input
__m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
@ -1539,14 +1539,14 @@ namespace xsimd
__m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
// normalize mask
-batch_constant<batch<uint64_t, A>, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask;
+batch_constant<uint64_t, A, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask;
// permute within each lane
-__m256d r0 = _mm256_permutevar_pd(low_low, (batch<uint64_t, A>)half_mask);
-__m256d r1 = _mm256_permutevar_pd(hi_hi, (batch<uint64_t, A>)half_mask);
+__m256d r0 = _mm256_permutevar_pd(low_low, half_mask.as_batch());
+__m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask.as_batch());
// mask to choose the right lane
-batch_bool_constant<batch<uint64_t, A>, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
+batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
// blend the two permutes
constexpr auto mask = blend_mask.mask();
@ -1564,7 +1564,7 @@ namespace xsimd
uint32_t V7,
detail::enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> swizzle(batch<T, A> const& self,
-batch_constant<batch<uint32_t, A>,
+batch_constant<uint32_t, A,
V0,
V1,
V2,
@ -1588,7 +1588,7 @@ namespace xsimd
detail::enable_sized_integral_t<T, 8> = 0>
inline batch<T, A>
swizzle(batch<T, A> const& self,
-batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> const& mask,
+batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask,
requires_arch<avx>) noexcept
{
return bitwise_cast<T>(

View File

@ -729,9 +729,9 @@ namespace xsimd
}
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
+inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
{
-constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
+constexpr int mask = batch_bool_constant<T, A, Values...>::mask();
// FIXME: for some reason mask here is not considered as an immediate,
// but it's okay for _mm256_blend_epi32
// case 2: return _mm256_blend_epi16(false_br, true_br, mask);
@ -912,36 +912,36 @@ namespace xsimd
// swizzle (constant mask)
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
{
-return _mm256_permutevar8x32_ps(self, (batch<uint32_t, A>)mask);
+return _mm256_permutevar8x32_ps(self, mask.as_batch());
}
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
{
constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
return _mm256_permute4x64_pd(self, mask);
}
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
{
constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
return _mm256_permute4x64_epi64(self, mask);
}
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
+inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
{
return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx2 {}));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
{
-return _mm256_permutevar8x32_epi32(self, (batch<uint32_t, A>)mask);
+return _mm256_permutevar8x32_epi32(self, mask.as_batch());
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
{
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
}

View File

@ -617,27 +617,27 @@ namespace xsimd
// swizzle (static version)
template <class A, uint16_t... Vs>
-inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
{
-return swizzle(self, (batch<uint16_t, A>)mask, avx512bw {});
+return swizzle(self, mask.as_batch(), avx512bw {});
}
template <class A, uint16_t... Vs>
-inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
{
-return swizzle(self, (batch<uint16_t, A>)mask, avx512bw {});
+return swizzle(self, mask.as_batch(), avx512bw {});
}
template <class A, uint8_t... Vs>
-inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
{
-return swizzle(self, (batch<uint8_t, A>)mask, avx512bw {});
+return swizzle(self, mask.as_batch(), avx512bw {});
}
template <class A, uint8_t... Vs>
-inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
{
-return swizzle(self, (batch<uint8_t, A>)mask, avx512bw {});
+return swizzle(self, mask.as_batch(), avx512bw {});
}
// zip_hi

View File

@ -1422,8 +1422,8 @@ namespace xsimd
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type>
inline T reduce_max(batch<T, A> const& self, requires_arch<avx512f>) noexcept
{
-constexpr batch_constant<batch<uint64_t, A>, 5, 6, 7, 8, 0, 0, 0, 0> mask;
-batch<T, A> step = _mm512_permutexvar_epi64((batch<uint64_t, A>)mask, self);
+constexpr batch_constant<uint64_t, A, 5, 6, 7, 8, 0, 0, 0, 0> mask;
+batch<T, A> step = _mm512_permutexvar_epi64(mask.as_batch(), self);
batch<T, A> acc = max(self, step);
__m256i low = _mm512_castsi512_si256(acc);
return reduce_max(batch<T, avx2>(low));
@ -1433,8 +1433,8 @@ namespace xsimd
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type>
inline T reduce_min(batch<T, A> const& self, requires_arch<avx512f>) noexcept
{
-constexpr batch_constant<batch<uint64_t, A>, 5, 6, 7, 8, 0, 0, 0, 0> mask;
-batch<T, A> step = _mm512_permutexvar_epi64((batch<uint64_t, A>)mask, self);
+constexpr batch_constant<uint64_t, A, 5, 6, 7, 8, 0, 0, 0, 0> mask;
+batch<T, A> step = _mm512_permutexvar_epi64(mask.as_batch(), self);
batch<T, A> acc = min(self, step);
__m256i low = _mm512_castsi512_si256(acc);
return reduce_min(batch<T, avx2>(low));
@ -1571,7 +1571,7 @@ namespace xsimd
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept
+inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept
{
return select(batch_bool<T, A> { Values... }, true_br, false_br, avx512f {});
}
@ -1709,7 +1709,7 @@ namespace xsimd
// shuffle
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7, ITy I8, ITy I9, ITy I10, ITy I11, ITy I12, ITy I13, ITy I14, ITy I15>
inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y,
-batch_constant<batch<ITy, A>, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15> mask,
+batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15> mask,
requires_arch<avx512f>) noexcept
{
constexpr uint32_t smask = (I0 & 0x3) | ((I1 & 0x3) << 2) | ((I2 & 0x3) << 4) | ((I3 & 0x3) << 6);
@ -1726,7 +1726,7 @@ namespace xsimd
}
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7>
-inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx512f>) noexcept
+inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx512f>) noexcept
{
constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3) | ((I4 & 0x1) << 4) | ((I5 & 0x1) << 5) | ((I6 & 0x1) << 6) | ((I7 & 0x1) << 7);
// shuffle within lane
@ -1917,39 +1917,39 @@ namespace xsimd
// swizzle (constant version)
template <class A, uint32_t... Vs>
-inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
-return swizzle(self, (batch<uint32_t, A>)mask, avx512f {});
+return swizzle(self, mask.as_batch(), avx512f {});
}
template <class A, uint64_t... Vs>
-inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
-return swizzle(self, (batch<uint64_t, A>)mask, avx512f {});
+return swizzle(self, mask.as_batch(), avx512f {});
}
template <class A, uint64_t... Vs>
-inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
-return swizzle(self, (batch<uint64_t, A>)mask, avx512f {});
+return swizzle(self, mask.as_batch(), avx512f {});
}
template <class A, uint64_t... Vs>
-inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
-return swizzle(self, (batch<uint64_t, A>)mask, avx512f {});
+return swizzle(self, mask.as_batch(), avx512f {});
}
template <class A, uint32_t... Vs>
-inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
-return swizzle(self, (batch<uint32_t, A>)mask, avx512f {});
+return swizzle(self, mask.as_batch(), avx512f {});
}
template <class A, uint32_t... Vs>
-inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
-return swizzle(self, (batch<uint32_t, A>)mask, avx512f {});
+return swizzle(self, mask.as_batch(), avx512f {});
}
namespace detail
@ -1973,14 +1973,14 @@ namespace xsimd
uint16_t I24, uint16_t I25, uint16_t I26, uint16_t I27, uint16_t I28, uint16_t I29, uint16_t I30, uint16_t I31>
struct fold_batch_constant
{
-using type = batch_constant<batch<uint32_t, A>, I0 / 2, I2 / 2, I4 / 2, I6 / 2, I8 / 2, I10 / 2, I12 / 2, I14 / 2,
+using type = batch_constant<uint32_t, A, I0 / 2, I2 / 2, I4 / 2, I6 / 2, I8 / 2, I10 / 2, I12 / 2, I14 / 2,
I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>;
};
}
template <class A, uint16_t... Idx, class _ = typename std::enable_if<detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value, void>::type>
-inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, Idx...>, requires_arch<avx512f>) noexcept
+inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...>, requires_arch<avx512f>) noexcept
{
constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
@ -1988,13 +1988,13 @@ namespace xsimd
template <class A>
inline batch<uint16_t, A>
-swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept
+swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept
{
// FIXME: this sequence is very inefficient, but it's here to catch
// a pattern generated by detail::reduce from xsimd_generic_math.hpp.
// The whole pattern is actually decently folded by GCC and Clang,
// so bear with it.
-constexpr batch_constant<batch<uint32_t, A>, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
+constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
alignas(A::alignment()) uint16_t buffer[32];
@ -2005,7 +2005,7 @@ namespace xsimd
template <class A, uint16_t... Vs>
inline batch<int16_t, A>
-swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, avx512f {}));
}

View File

@ -0,0 +1,757 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_EMULATED_HPP
#define XSIMD_EMULATED_HPP
#include <complex>
#include <limits>
#include <numeric>
#include <type_traits>
#include "../arch/xsimd_scalar.hpp"
#include "../types/xsimd_emulated_register.hpp"
#include "../types/xsimd_utils.hpp"
namespace xsimd
{
template <typename T, class A, bool... Values>
struct batch_bool_constant;
template <class T_out, class T_in, class A>
inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
{
using namespace types;
// fwd
template <class A, class T, size_t I>
inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
template <class A, typename T, typename ITy, ITy... Indices>
inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept;
namespace detail
{
template <size_t I, class F, class... Bs>
auto emulated_apply(F func, Bs const&... bs) -> decltype(func(bs.data[I]...))
{
return func(bs.data[I]...);
}
template <class F, class B, class... Bs, size_t... Is>
auto emulated_apply(F func, ::xsimd::detail::index_sequence<Is...>, B const& b, Bs const&... bs) -> std::array<decltype(func(b.data[0], bs.data[0]...)), B::size>
{
return { emulated_apply<Is>(func, b, bs...)... };
}
template <class B, class F, class... Bs>
auto emulated_apply(F func, B const& b, Bs const&... bs) -> std::array<decltype(func(b.data[0], bs.data[0]...)), B::size>
{
return emulated_apply(func, ::xsimd::detail::make_index_sequence<B::size>(), b, bs...);
}
}
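// Illustrative (not part of the diff): for a 4-lane batch,
// emulated_apply(f, a, b) expands index_sequence<0, 1, 2, 3> into
// { f(a.data[0], b.data[0]), ..., f(a.data[3], b.data[3]) };
// every lane-wise kernel below is thus one scalar call per lane.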
// abs
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> abs(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v)
{ return xsimd::abs(v); },
self);
}
// add
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::add(v0, v1); },
self, other);
}
// all
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline bool all(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return std::all_of(self.data.begin(), self.data.end(), [](T v)
{ return bool(v); });
}
// any
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline bool any(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return std::any_of(self.data.begin(), self.data.end(), [](T v)
{ return bool(v); });
}
// batch_bool_cast
template <class A, class T_out, class T_in, size_t N = 8 * sizeof(T_in) * batch<T_in, A>::size>
inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<emulated<N>>) noexcept
{
return { self.data };
}
// bitwise_and
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::bitwise_and(v0, v1); },
self, other);
}
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](bool v0, bool v1)
{ return xsimd::bitwise_and(v0, v1); },
self, other);
}
// bitwise_andnot
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::bitwise_andnot(v0, v1); },
self, other);
}
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](bool v0, bool v1)
{ return xsimd::bitwise_andnot(v0, v1); },
self, other);
}
// bitwise_lshift
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([other](T v)
{ return xsimd::bitwise_lshift(v, other); },
self);
}
// bitwise_not
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v)
{ return xsimd::bitwise_not(v); },
self);
}
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](bool v)
{ return xsimd::bitwise_not(v); },
self);
}
// bitwise_or
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::bitwise_or(v0, v1); },
self, other);
}
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](bool v0, bool v1)
{ return xsimd::bitwise_or(v0, v1); },
self, other);
}
// bitwise_rshift
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([other](T v)
{ return xsimd::bitwise_rshift(v, other); },
self);
}
// bitwise_xor
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::bitwise_xor(v0, v1); },
self, other);
}
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](bool v0, bool v1)
{ return xsimd::bitwise_xor(v0, v1); },
self, other);
}
// bitwise_cast
template <class A, class T_in, class T_out, size_t N = 8 * sizeof(T_in) * batch<T_in, A>::size>
inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& self, batch<T_out, A> const&, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T_out, A>::size;
std::array<T_out, size> result;
char* raw_data = reinterpret_cast<char*>(result.data());
const char* raw_input = reinterpret_cast<const char*>(self.data.data());
memcpy(raw_data, raw_input, size * sizeof(T_out));
return result;
}
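// Illustrative (not part of the diff): the memcpy is the usual well-defined
// type punning, so e.g. bitwise_cast<uint32_t>(some_float_batch)
// reinterprets each lane's bits without any value conversion.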
// broadcast
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
batch<T, A> inline broadcast(T val, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> r;
std::fill(r.begin(), r.end(), val);
return r;
}
// store_complex
namespace detail
{
// complex_low
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> complex_low(batch<std::complex<T>, A> const& self, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> result;
for (size_t i = 0; i < size / 2; ++i)
{
result[2 * i] = self.real().data[i];
result[1 + 2 * i] = self.imag().data[i];
}
return result;
}
// complex_high
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> complex_high(batch<std::complex<T>, A> const& self, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> result;
for (size_t i = 0; i < size / 2; ++i)
{
result[2 * i] = self.real().data[i + size / 2];
result[1 + 2 * i] = self.imag().data[i + size / 2];
}
return result;
}
}
// decr_if
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<emulated<N>>) noexcept
{
return self - batch<T, A>(mask.data);
}
// div
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> div(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::div(v0, v1); },
self, other);
}
// fast_cast
namespace detail
{
template <class A, size_t N = 8 * sizeof(float) * batch<float, A>::size>
inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](int32_t v)
{ return float(v); },
self);
}
template <class A, size_t N = 8 * sizeof(float) * batch<float, A>::size>
inline batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](uint32_t v)
{ return float(v); },
self);
}
template <class A, size_t N = 8 * sizeof(double) * batch<double, A>::size>
inline batch<double, A> fast_cast(batch<int64_t, A> const& self, batch<double, A> const&, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](int64_t v)
{ return double(v); },
self);
}
template <class A, size_t N = 8 * sizeof(double) * batch<double, A>::size>
inline batch<double, A> fast_cast(batch<uint64_t, A> const& self, batch<double, A> const&, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](uint64_t v)
{ return double(v); },
self);
}
template <class A, size_t N = 8 * sizeof(int32_t) * batch<int32_t, A>::size>
inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](float v)
{ return int32_t(v); },
self);
}
template <class A, size_t N = 8 * sizeof(double) * batch<double, A>::size>
inline batch<int64_t, A> fast_cast(batch<double, A> const& self, batch<int64_t, A> const&, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](double v)
{ return int64_t(v); },
self);
}
}
// eq
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, emulated<N>> eq(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::eq(v0, v1); },
self, other);
}
template <class A, class T, size_t N = 8 * sizeof(T) * batch_bool<T, A>::size>
inline batch_bool<T, emulated<N>> eq(batch_bool<T, emulated<N>> const& self, batch_bool<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](bool v0, bool v1)
{ return xsimd::eq(v0, v1); },
self, other);
}
// from_bool
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](bool v)
{ return T(v); },
self);
}
// from_mask
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<bool, size> vmask;
for (size_t i = 0; i < size; ++i)
vmask[i] = (mask >> i) & 1u;
return vmask;
}
// ge
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, emulated<N>> ge(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::ge(v0, v1); },
self, other);
}
// gt
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, emulated<N>> gt(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::gt(v0, v1); },
self, other);
}
// haddp
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> haddp(batch<T, A> const* row, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> r;
for (size_t i = 0; i < size; ++i)
r[i] = std::accumulate(row[i].data.begin() + 1, row[i].data.end(), row[i].data.front());
return r;
}
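// Illustrative (not part of the diff): with 4 lanes,
// haddp({r0, r1, r2, r3}) returns { sum(r0), sum(r1), sum(r2), sum(r3) },
// matching the horizontal-add-then-pack contract of the SIMD kernels.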
// incr_if
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<emulated<N>>) noexcept
{
return self + batch<T, A>(mask.data);
}
// insert
template <class A, class T, size_t I, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<emulated<N>>) noexcept
{
batch<T, A> other = self;
other.data[I] = val;
return other;
}
// isnan
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size, class = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
inline batch_bool<T, A> isnan(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v)
{ return xsimd::isnan(v); },
self);
}
// load_aligned
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> res;
std::copy(mem, mem + size, res.begin());
return res;
}
// load_unaligned
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> res;
std::copy(mem, mem + size, res.begin());
return res;
}
// load_complex
namespace detail
{
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<std::complex<T>, A> load_complex(batch<T, A> const& hi, batch<T, A> const& lo, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> real, imag;
for (size_t i = 0; i < size / 2; ++i)
{
real[i] = hi.data[2 * i];
imag[i] = hi.data[1 + 2 * i];
}
for (size_t i = 0; i < size / 2; ++i)
{
real[size / 2 + i] = lo.data[2 * i];
imag[size / 2 + i] = lo.data[1 + 2 * i];
}
return { real, imag };
}
}
// le
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, emulated<N>> le(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::le(v0, v1); },
self, other);
}
// lt
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, emulated<N>> lt(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::lt(v0, v1); },
self, other);
}
// mask
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
uint64_t res = 0;
for (size_t i = 0; i < size; ++i)
res |= (self.data[i] ? 1u : 0u) << i;
return res;
}
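// Illustrative (not part of the diff): mask above is the inverse of
// from_mask. Lanes {true, false, true, true} produce 0b1101, and
// from_mask(..., 0b1101, ...) rebuilds the same lane pattern.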
// max
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::max(v0, v1); },
self, other);
}
// min
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::min(v0, v1); },
self, other);
}
// mul
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::mul(v0, v1); },
self, other);
}
// nearbyint_as_int
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<as_integer_t<T>, A> nearbyint_as_int(batch<T, A> const& self,
requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v)
{ return xsimd::nearbyint_as_int(v); },
self);
}
// neg
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> neg(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v)
{ return xsimd::neg(v); },
self);
}
// neq
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::neq(v0, v1); },
self, other);
}
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](bool v0, bool v1)
{ return xsimd::neq(v0, v1); },
self, other);
}
// reduce_add
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline T reduce_add(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> buffer;
self.store_unaligned(buffer.data());
return std::accumulate(buffer.begin() + 1, buffer.end(), *buffer.begin());
}
// reduce_max
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline T reduce_max(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return std::accumulate(self.data.begin() + 1, self.data.end(), *self.data.begin(), [](T const& x, T const& y)
{ return xsimd::max(x, y); });
}
// reduce_min
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline T reduce_min(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return std::accumulate(self.data.begin() + 1, self.data.end(), *self.data.begin(), [](T const& x, T const& y)
{ return xsimd::min(x, y); });
}
// rsqrt
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> rsqrt(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v)
{ return xsimd::rsqrt(v); },
self);
}
// select
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](bool c, T t, T f)
{ return xsimd::select(c, t, f); },
cond, true_br, false_br);
}
template <class A, class T, bool... Values>
inline batch<T, A> select(batch_bool_constant<T, A, Values...> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<emulated<8 * sizeof(T) * batch<T, A>::size>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
static_assert(sizeof...(Values) == size, "consistent init");
return select((batch_bool<T, A>)cond, true_br, false_br, emulated<8 * sizeof(T) * size> {});
}
// shuffle
template <class A, typename T, class ITy, ITy... Is>
inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Is...> mask, requires_arch<emulated<8 * sizeof(T) * batch<T, A>::size>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
batch<ITy, A> bmask = mask;
std::array<T, size> res;
for (size_t i = 0; i < size; ++i)
res[i] = bmask.data[i] < size ? x.data[bmask.data[i]] : y.data[bmask.data[i] - size];
return res;
}
// sqrt
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> sqrt(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v)
{ return xsimd::sqrt(v); },
self);
}
// slide_left
template <size_t M, class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> result;
char* raw_data = reinterpret_cast<char*>(result.data());
memset(raw_data, 0, M);
memcpy(raw_data + M, reinterpret_cast<const char*>(x.data.data()), sizeof(T) * result.size() - M);
return result;
}
// slide_right
template <size_t M, class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> result;
char* raw_data = reinterpret_cast<char*>(result.data());
memcpy(raw_data, reinterpret_cast<const char*>(x.data.data()) + M, sizeof(T) * result.size() - M);
memset(raw_data + sizeof(T) * result.size() - M, 0, M);
return result;
}
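// Illustrative (not part of the diff): M counts bytes, not lanes. For
// batch<uint32_t> x = {1, 2, 3, 4}, slide_left<4>(x) == {0, 1, 2, 3} and
// slide_right<4>(x) == {2, 3, 4, 0}: one 4-byte lane shifted, zero-filled
// on the vacated end.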
// sadd
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::sadd(v0, v1); },
self, other);
}
// set
template <class A, class T, size_t N, class... Values>
inline batch<T, emulated<N>> set(batch<T, emulated<N>> const&, requires_arch<emulated<N>>, Values... values) noexcept
{
static_assert(sizeof...(Values) == batch<T, emulated<N>>::size, "consistent init");
return { typename batch<T, emulated<N>>::register_type { static_cast<T>(values)... } };
}
template <class A, class T, size_t N, class... Values>
inline batch_bool<T, emulated<N>> set(batch_bool<T, emulated<N>> const&, requires_arch<emulated<N>>, Values... values) noexcept
{
static_assert(sizeof...(Values) == batch<T, emulated<N>>::size, "consistent init");
return { std::array<bool, sizeof...(Values)> { static_cast<bool>(values)... } };
}
// ssub
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::ssub(v0, v1); },
self, other);
}
// store_aligned
template <class A, class T, size_t N>
inline void store_aligned(T* mem, batch<T, emulated<N>> const& self, requires_arch<emulated<N>>) noexcept
{
std::copy(self.data.begin(), self.data.end(), mem);
}
// store_unaligned
template <class A, class T, size_t N>
inline void store_unaligned(T* mem, batch<T, emulated<N>> const& self, requires_arch<emulated<N>>) noexcept
{
std::copy(self.data.begin(), self.data.end(), mem);
}
// sub
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::sub(v0, v1); },
self, other);
}
// swizzle
template <class A, typename T, class ITy, ITy... Is>
inline batch<T, A> swizzle(batch<T, A> const& self, batch_constant<ITy, A, Is...> mask, requires_arch<emulated<8 * sizeof(T) * batch<T, A>::size>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
batch<ITy, A> bmask = mask;
std::array<T, size> res;
for (size_t i = 0; i < size; ++i)
res[i] = self.data[bmask.data[i]];
return res;
}
// zip_hi
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
// Note: irregular behavior for odd numbers.
std::array<T, size> res;
if (size % 2)
{
for (size_t i = 0; i < size; ++i)
res[i] = (i % 2 ? self : other).data[size / 2 + i / 2];
}
else
{
for (size_t i = 0; i < size; ++i)
res[i] = (i % 2 ? other : self).data[size / 2 + i / 2];
}
return res;
}
// zip_lo
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
// Note: irregular behavior for odd numbers.
std::array<T, size> res;
for (size_t i = 0; i < size; ++i)
res[i] = (i % 2 ? other : self).data[i / 2];
return res;
}
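// Illustrative (not part of the diff): for 4 lanes,
// zip_lo({a0, a1, a2, a3}, {b0, b1, b2, b3}) == {a0, b0, a1, b1} and
// zip_hi of the same operands == {a2, b2, a3, b3}.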
}
}
#endif

View File

@ -16,6 +16,10 @@
#include "./xsimd_generic_fwd.hpp"
+#if XSIMD_WITH_EMULATED
+#include "./xsimd_emulated.hpp"
+#endif
#if XSIMD_WITH_SSE2
#include "./xsimd_sse2.hpp"
#endif

View File

@ -146,7 +146,7 @@ inline float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg;
namespace xsimd
{
-template <class batch_type, bool... Values>
+template <typename T, class A, bool... Values>
struct batch_bool_constant;
namespace kernel
@ -1743,7 +1743,7 @@ namespace xsimd
}
template <class A, class T, bool... b, detail::enable_neon_type_t<T> = 0>
-inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<neon>) noexcept
+inline batch<T, A> select(batch_bool_constant<T, A, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<neon>) noexcept
{
return select(batch_bool<T, A> { b... }, true_br, false_br, neon {});
}
@ -2717,7 +2717,7 @@ namespace xsimd
}
}
-template <class batch_type, typename batch_type::value_type... Values>
+template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
@ -2728,7 +2728,7 @@ namespace xsimd
template <class A, class T, class I, I... idx>
inline batch<T, A> swizzle(batch<T, A> const& self,
-batch_constant<batch<I, A>, idx...>,
+batch_constant<I, A, idx...>,
requires_arch<neon>) noexcept
{
static_assert(batch<T, A>::size == sizeof...(idx), "valid swizzle indices");

View File

@ -21,7 +21,7 @@
namespace xsimd
{
-template <class batch_type, bool... Values>
+template <typename T, class A, bool... Values>
struct batch_bool_constant;
namespace kernel
@ -942,7 +942,7 @@ namespace xsimd
}
template <class A, bool... b>
-inline batch<double, A> select(batch_bool_constant<batch<double, A>, b...> const&,
+inline batch<double, A> select(batch_bool_constant<double, A, b...> const&,
batch<double, A> const& true_br,
batch<double, A> const& false_br,
requires_arch<neon64>) noexcept
@ -1243,7 +1243,7 @@ namespace xsimd
}
}
-template <class batch_type, typename batch_type::value_type... Values>
+template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
@ -1354,42 +1354,40 @@ namespace xsimd
template <class CB1, class CB2, class IS>
struct index_burst_impl;
-template <class B1, class B2, typename B2::value_type... V,
-typename B2::value_type... incr>
-struct index_burst_impl<batch_constant<B1>, batch_constant<B2, V...>,
-integer_sequence<typename B2::value_type, incr...>>
+template <typename T1, class A, typename T2, T2... V,
+T2... incr>
+struct index_burst_impl<batch_constant<T1, A>, batch_constant<T2, A, V...>,
+integer_sequence<T2, incr...>>
{
-using type = batch_constant<B2, V...>;
+using type = batch_constant<T2, A, V...>;
};
-template <class B1, typename B1::value_type V0, typename B1::value_type... V1,
-class B2, typename B2::value_type... V2,
-typename B2::value_type... incr>
-struct index_burst_impl<batch_constant<B1, V0, V1...>, batch_constant<B2, V2...>,
-integer_sequence<typename B2::value_type, incr...>>
+template <typename T1, class A, T1 V0, T1... V1,
+typename T2, T2... V2, T2... incr>
+struct index_burst_impl<batch_constant<T1, A, V0, V1...>, batch_constant<T2, A, V2...>,
+integer_sequence<T2, incr...>>
{
-using value_type = typename B2::value_type;
-using next_input = batch_constant<B1, V1...>;
-using next_output = batch_constant<B2, V2..., (V0 + incr)...>;
-using type = typename index_burst_impl<next_input, next_output, integer_sequence<value_type, incr...>>::type;
+using next_input = batch_constant<T1, A, V1...>;
+using next_output = batch_constant<T2, A, V2..., (V0 + incr)...>;
+using type = typename index_burst_impl<next_input, next_output, integer_sequence<T2, incr...>>::type;
};
template <class B, class T>
struct index_burst;
-template <class B, typename B::value_type... V, class T>
-struct index_burst<batch_constant<B, V...>, T>
+template <typename Tp, class A, Tp... V, typename T>
+struct index_burst<batch_constant<Tp, A, V...>, T>
{
-static constexpr size_t mul = sizeof(typename B::value_type) / sizeof(T);
-using input = batch_constant<B, (mul * V)...>;
-using output = batch_constant<batch<T, typename B::arch_type>>;
+static constexpr size_t mul = sizeof(Tp) / sizeof(T);
+using input = batch_constant<Tp, A, (mul * V)...>;
+using output = batch_constant<T, A>;
using type = typename index_burst_impl<input, output, make_integer_sequence<T, mul>>::type;
};
-template <class B, class T>
+template <class B, typename T>
using index_burst_t = typename index_burst<B, T>::type;
-template <class T, class B>
+template <typename T, class B>
inline index_burst_t<B, T> burst_index(B)
{
return index_burst_t<B, T>();
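// Illustrative (not part of the diff): burst_index widens a coarse index to
// byte granularity for the vqtbl1q-based swizzles below. With T = uint8_t,
// mul = sizeof(uint32_t) / sizeof(uint8_t) = 4, so
// batch_constant<uint32_t, A, 1, 0, 3, 2> bursts to
// batch_constant<uint8_t, A, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11>.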
@ -1399,7 +1397,7 @@ namespace xsimd
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self,
-batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
+batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
requires_arch<neon64>) noexcept
{
return vqtbl1q_u8(self, batch<uint8_t, A>(idx));
@ -1408,7 +1406,7 @@ namespace xsimd
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self,
batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
requires_arch<neon64>) noexcept
{
return vqtbl1q_s8(self, batch<uint8_t, A>(idx));
@ -1416,7 +1414,7 @@ namespace xsimd
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self,
batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx,
batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> idx,
requires_arch<neon64>) noexcept
{
using batch_type = batch<uint8_t, A>;
@ -1425,7 +1423,7 @@ namespace xsimd
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self,
batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx,
batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> idx,
requires_arch<neon64>) noexcept
{
using batch_type = batch<int8_t, A>;
@ -1434,7 +1432,7 @@ namespace xsimd
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self,
batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
requires_arch<neon64>) noexcept
{
using batch_type = batch<uint8_t, A>;
@ -1443,7 +1441,7 @@ namespace xsimd
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self,
batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
requires_arch<neon64>) noexcept
{
using batch_type = batch<int8_t, A>;
@ -1452,7 +1450,7 @@ namespace xsimd
template <class A, uint64_t V0, uint64_t V1>
inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self,
batch_constant<batch<uint64_t, A>, V0, V1> idx,
batch_constant<uint64_t, A, V0, V1> idx,
requires_arch<neon64>) noexcept
{
using batch_type = batch<uint8_t, A>;
@ -1461,7 +1459,7 @@ namespace xsimd
template <class A, uint64_t V0, uint64_t V1>
inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self,
batch_constant<batch<uint64_t, A>, V0, V1> idx,
batch_constant<uint64_t, A, V0, V1> idx,
requires_arch<neon64>) noexcept
{
using batch_type = batch<int8_t, A>;
@ -1470,7 +1468,7 @@ namespace xsimd
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<float, A> swizzle(batch<float, A> const& self,
batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
requires_arch<neon64>) noexcept
{
using batch_type = batch<uint8_t, A>;
@ -1479,7 +1477,7 @@ namespace xsimd
template <class A, uint64_t V0, uint64_t V1>
inline batch<double, A> swizzle(batch<double, A> const& self,
batch_constant<batch<uint64_t, A>, V0, V1> idx,
batch_constant<uint64_t, A, V0, V1> idx,
requires_arch<neon64>) noexcept
{
using batch_type = batch<uint8_t, A>;
@ -1488,7 +1486,7 @@ namespace xsimd
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<std::complex<float>, A> swizzle(batch<std::complex<float>, A> const& self,
batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
requires_arch<neon64>) noexcept
{
return batch<std::complex<float>>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A()));
@ -1496,7 +1494,7 @@ namespace xsimd
template <class A, uint64_t V0, uint64_t V1>
inline batch<std::complex<double>, A> swizzle(batch<std::complex<double>, A> const& self,
batch_constant<batch<uint64_t, A>, V0, V1> idx,
batch_constant<uint64_t, A, V0, V1> idx,
requires_arch<neon64>) noexcept
{
return batch<std::complex<double>>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A()));
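
The pattern repeated throughout this file, and through the rest of the patch, is that index masks now spell out the scalar type and architecture directly instead of wrapping them in a batch. A minimal call-site sketch under that new spelling, assuming a NEON64 target with xsimd on the include path (the function name is illustrative, not part of the patch):

#include <cstdint>
#include "xsimd/xsimd.hpp"

// Reverse the four float lanes with a compile-time mask.
inline xsimd::batch<float, xsimd::neon64> reverse4(xsimd::batch<float, xsimd::neon64> v)
{
    // Old spelling: batch_constant<batch<uint32_t, neon64>, 3, 2, 1, 0>.
    return xsimd::swizzle(v, xsimd::batch_constant<uint32_t, xsimd::neon64, 3, 2, 1, 0> {});
}
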

View File

@ -284,7 +284,7 @@
namespace xsimd
{
template <class batch_type, typename batch_type::value_type... Values>
template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
@ -1150,7 +1150,7 @@ namespace xsimd
// swizzle
template <class A, class T, class I, I... idx>
inline batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<batch<I, A>, idx...>, requires_arch<rvv>) noexcept
inline batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<I, A, idx...>, requires_arch<rvv>) noexcept
{
static_assert(batch<T, A>::size == sizeof...(idx), "invalid swizzle indices");
const batch<I, A> indices { idx... };
@ -1159,11 +1159,11 @@ namespace xsimd
template <class A, class T, class I, I... idx>
inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self,
batch_constant<batch<I, A>, idx...>,
batch_constant<I, A, idx...>,
requires_arch<rvv>) noexcept
{
const auto real = swizzle(self.real(), batch_constant<batch<I, A>, idx...> {}, rvv {});
const auto imag = swizzle(self.imag(), batch_constant<batch<I, A>, idx...> {}, rvv {});
const auto real = swizzle(self.real(), batch_constant<I, A, idx...> {}, rvv {});
const auto imag = swizzle(self.imag(), batch_constant<I, A, idx...> {}, rvv {});
return batch<std::complex<T>>(real, imag);
}
@ -1188,7 +1188,7 @@ namespace xsimd
}
template <class A, class T, bool... b>
inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<rvv>) noexcept
inline batch<T, A> select(batch_bool_constant<T, A, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<rvv>) noexcept
{
return select(batch_bool<T, A> { b... }, true_br, false_br, rvv {});
}

View File

@ -86,6 +86,57 @@ namespace xsimd
using std::tgamma;
using std::trunc;
inline signed char abs(signed char v)
{
return v < 0 ? -v : v;
}
namespace detail
{
// Use a templated type here to prevent automatic instantiation that may
// end up in a warning
template <typename char_type>
inline char abs(char_type v, std::true_type)
{
return v;
}
template <typename char_type>
inline char abs(char_type v, std::false_type)
{
return v < 0 ? -v : v;
}
}
inline char abs(char v)
{
return detail::abs(v, std::is_unsigned<char>::type {});
}
inline short abs(short v)
{
return v < 0 ? -v : v;
}
inline unsigned char abs(unsigned char v)
{
return v;
}
inline unsigned short abs(unsigned short v)
{
return v;
}
inline unsigned int abs(unsigned int v)
{
return v;
}
inline unsigned long abs(unsigned long v)
{
return v;
}
inline unsigned long long abs(unsigned long long v)
{
return v;
}
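
The dedicated `char` overload above exists because the signedness of plain `char` is implementation-defined; the tag dispatch picks an overload at compile time so `-v` is never evaluated when `char` is unsigned. A standalone sketch of the same trick (names are illustrative):

#include <cstdio>
#include <type_traits>

template <typename C>
char abs_impl(C v, std::true_type) { return v; } // char is unsigned: already non-negative
template <typename C>
char abs_impl(C v, std::false_type) { return v < 0 ? -v : v; } // char is signed

int main()
{
    // Resolves to one overload or the other depending on the target ABI.
    std::printf("%d\n", abs_impl(char(-3), std::is_unsigned<char>::type {}));
}
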
#ifndef _WIN32
using std::isfinite;
using std::isinf;
@ -137,7 +188,7 @@ namespace xsimd
#endif
template <class T, class Tp>
inline auto add(T const& x, Tp const& y) noexcept -> decltype(x + y)
inline typename std::common_type<T, Tp>::type add(T const& x, Tp const& y) noexcept
{
return x + y;
}
@ -209,6 +260,15 @@ namespace xsimd
return x & y;
}
template <class T_out, class T_in>
inline T_out bitwise_cast(T_in x) noexcept
{
static_assert(sizeof(T_in) == sizeof(T_out), "bitwise_cast between types of the same size");
T_out r;
std::memcpy((void*)&r, (void*)&x, sizeof(T_in));
return r;
}
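
The scalar bitwise_cast above is the strict-aliasing-safe way to reinterpret an object representation, and it replaces the ad-hoc memcpy pairs in the overloads that follow. A hypothetical usage sketch:

#include <cstdint>
#include <cstdio>
#include <cstring>

template <class T_out, class T_in>
T_out bit_cast_sketch(T_in x) // same contract as the bitwise_cast above
{
    static_assert(sizeof(T_in) == sizeof(T_out), "types must have the same size");
    T_out r;
    std::memcpy(&r, &x, sizeof(T_in));
    return r;
}

int main()
{
    std::printf("%08x\n", (unsigned)bit_cast_sketch<uint32_t>(1.0f)); // IEEE-754: 3f800000
}
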
inline float bitwise_and(float x, float y) noexcept
{
uint32_t ix, iy;
@ -231,35 +291,6 @@ namespace xsimd
return r;
}
template <class T>
inline typename std::enable_if<std::is_integral<T>::value, T>::type
bitwise_andnot(T x, T y) noexcept
{
return x & ~y;
}
inline float bitwise_andnot(float x, float y) noexcept
{
uint32_t ix, iy;
std::memcpy((void*)&ix, (void*)&x, sizeof(float));
std::memcpy((void*)&iy, (void*)&y, sizeof(float));
uint32_t ir = bitwise_andnot(ix, iy);
float r;
std::memcpy((void*)&r, (void*)&ir, sizeof(float));
return r;
}
inline double bitwise_andnot(double x, double y) noexcept
{
uint64_t ix, iy;
std::memcpy((void*)&ix, (void*)&x, sizeof(double));
std::memcpy((void*)&iy, (void*)&y, sizeof(double));
uint64_t ir = bitwise_andnot(ix, iy);
double r;
std::memcpy((void*)&r, (void*)&ir, sizeof(double));
return r;
}
template <class T0, class T1>
inline typename std::enable_if<std::is_integral<T0>::value && std::is_integral<T1>::value, T0>::type
bitwise_lshift(T0 x, T1 shift) noexcept
@ -281,6 +312,11 @@ namespace xsimd
return ~x;
}
inline bool bitwise_not(bool x) noexcept
{
return !x;
}
inline float bitwise_not(float x) noexcept
{
uint32_t ix;
@ -301,6 +337,12 @@ namespace xsimd
return r;
}
template <class T>
inline typename std::enable_if<std::is_scalar<T>::value, T>::type bitwise_andnot(T x, T y) noexcept
{
return bitwise_and(x, bitwise_not(y));
}
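
This composition is what allows the per-type bitwise_andnot overloads above to be deleted: once bitwise_and and bitwise_not each have integer and floating-point versions, andnot follows for every scalar type. A quick sanity sketch of the identity:

#include <cassert>
#include <cstdint>

int main()
{
    uint32_t x = 0xC, y = 0xA; // binary 1100 and 1010
    assert((x & ~y) == 0x4);   // bitwise_andnot(x, y) == bitwise_and(x, bitwise_not(y))
    return 0;
}
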
template <class T>
inline typename std::enable_if<std::is_integral<T>::value, T>::type
bitwise_or(T x, T y) noexcept
@ -360,7 +402,7 @@ namespace xsimd
}
template <class T, class Tp>
inline auto div(T const& x, Tp const& y) noexcept -> decltype(x / y)
inline typename std::common_type<T, Tp>::type div(T const& x, Tp const& y) noexcept
{
return x / y;
}
@ -372,13 +414,13 @@ namespace xsimd
}
template <class T, class Tp>
inline auto mul(T const& x, Tp const& y) noexcept -> decltype(x * y)
inline typename std::common_type<T, Tp>::type mul(T const& x, Tp const& y) noexcept
{
return x * y;
}
template <class T>
inline auto neg(T const& x) noexcept -> decltype(-x)
inline T neg(T const& x) noexcept
{
return -x;
}
@ -776,9 +818,9 @@ namespace xsimd
}
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
inline bool bitofsign(T const& x) noexcept
inline T bitofsign(T const& x) noexcept
{
return x < T(0);
return T(x < T(0));
}
template <class T>
@ -842,7 +884,7 @@ namespace xsimd
}
template <class T, class Tp>
inline auto sub(T const& x, Tp const& y) noexcept -> decltype(x - y)
inline typename std::common_type<T, Tp>::type sub(T const& x, Tp const& y) noexcept
{
return x - y;
}

View File

@ -20,13 +20,13 @@
namespace xsimd
{
template <class batch_type, bool... Values>
template <typename T, class A, bool... Values>
struct batch_bool_constant;
template <class T_out, class T_in, class A>
inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
template <class batch_type, typename batch_type::value_type... Values>
template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
@ -59,7 +59,7 @@ namespace xsimd
template <class A, class T, size_t I>
inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
template <class A, typename T, typename ITy, ITy... Indices>
inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept;
inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept;
template <class A, class T>
inline batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept;
template <class A, class T>
@ -1216,6 +1216,43 @@ namespace xsimd
return _mm_cvtss_f32(tmp1);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline T reduce_add(batch<T, A> const& self, requires_arch<sse2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
__m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
__m128i tmp2 = _mm_add_epi32(self, tmp1);
__m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01);
__m128i tmp4 = _mm_add_epi32(tmp2, tmp3);
return _mm_cvtsi128_si32(tmp4);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
__m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
__m128i tmp2 = _mm_add_epi64(self, tmp1);
#if defined(__x86_64__)
return _mm_cvtsi128_si64(tmp2);
#else
__m128i m;
_mm_storel_epi64(&m, tmp2);
int64_t i;
std::memcpy(&i, &m, sizeof(i));
return i;
#endif
}
else
{
return hadd(self, generic {});
}
}
template <class A>
inline double reduce_add(batch<double, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self)));
}
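
The 32-bit branch above folds the vector in half twice: _mm_shuffle_epi32 with 0x0E brings lanes 2 and 3 down to positions 0 and 1, and 0x01 then brings lane 1 down to position 0. A scalar model of the same dataflow:

#include <cstdint>
#include <cstdio>

int main()
{
    int32_t v[4] = { 1, 2, 3, 4 };
    int32_t fold0 = v[0] + v[2]; // lane 0 after the 0x0E shuffle + add
    int32_t fold1 = v[1] + v[3]; // lane 1 after the 0x0E shuffle + add
    std::printf("%d\n", fold0 + fold1); // 0x01 shuffle + add -> 10
}
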
// reduce_max
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
inline T reduce_max(batch<T, A> const& self, requires_arch<sse2>) noexcept
@ -1260,42 +1297,6 @@ namespace xsimd
return acc3.get(0);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline T reduce_add(batch<T, A> const& self, requires_arch<sse2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
__m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
__m128i tmp2 = _mm_add_epi32(self, tmp1);
__m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01);
__m128i tmp4 = _mm_add_epi32(tmp2, tmp3);
return _mm_cvtsi128_si32(tmp4);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
__m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
__m128i tmp2 = _mm_add_epi64(self, tmp1);
#if defined(__x86_64__)
return _mm_cvtsi128_si64(tmp2);
#else
__m128i m;
_mm_storel_epi64(&m, tmp2);
int64_t i;
std::memcpy(&i, &m, sizeof(i));
return i;
#endif
}
else
{
return hadd(self, generic {});
}
}
template <class A>
inline double reduce_add(batch<double, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self)));
}
// rsqrt
template <class A>
inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
@ -1321,7 +1322,7 @@ namespace xsimd
return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br));
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
{
return select(batch_bool<T, A> { Values... }, true_br, false_br, sse2 {});
}
@ -1333,7 +1334,7 @@ namespace xsimd
// shuffle
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3> mask, requires_arch<sse2>) noexcept
inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<sse2>) noexcept
{
constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
// shuffle within lane
@ -1347,7 +1348,7 @@ namespace xsimd
}
template <class A, class ITy, ITy I0, ITy I1>
inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1> mask, requires_arch<sse2>) noexcept
inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1> mask, requires_arch<sse2>) noexcept
{
constexpr uint32_t smask = detail::mod_shuffle(I0, I1);
// shuffle within lane
@ -1600,41 +1601,41 @@ namespace xsimd
// swizzle
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
{
constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
return _mm_shuffle_ps(self, self, index);
}
template <class A, uint64_t V0, uint64_t V1>
inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<sse2>) noexcept
inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<sse2>) noexcept
{
constexpr uint32_t index = detail::shuffle(V0, V1);
return _mm_shuffle_pd(self, self, index);
}
template <class A, uint64_t V0, uint64_t V1>
inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<sse2>) noexcept
inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<sse2>) noexcept
{
constexpr uint32_t index = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1);
return _mm_shuffle_epi32(self, index);
}
template <class A, uint64_t V0, uint64_t V1>
inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1> mask, requires_arch<sse2>) noexcept
inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1> mask, requires_arch<sse2>) noexcept
{
return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, sse2 {}));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
{
constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
return _mm_shuffle_epi32(self, index);
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> mask, requires_arch<sse2>) noexcept
inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3> mask, requires_arch<sse2>) noexcept
{
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {}));
}

View File

@ -284,9 +284,9 @@ namespace xsimd
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
{
constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
constexpr int mask = batch_bool_constant<T, A, Values...>::mask();
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm_blend_epi16(false_br, true_br, mask);
@ -304,19 +304,19 @@ namespace xsimd
}
else
{
return select(batch_bool_constant<batch<T, A>, Values...>(), true_br, false_br, ssse3 {});
return select(batch_bool_constant<T, A, Values...>(), true_br, false_br, ssse3 {});
}
}
template <class A, bool... Values>
inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
inline batch<float, A> select(batch_bool_constant<float, A, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
{
constexpr int mask = batch_bool_constant<batch<float, A>, Values...>::mask();
constexpr int mask = batch_bool_constant<float, A, Values...>::mask();
return _mm_blend_ps(false_br, true_br, mask);
}
template <class A, bool... Values>
inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
inline batch<double, A> select(batch_bool_constant<double, A, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
{
constexpr int mask = batch_bool_constant<batch<double, A>, Values...>::mask();
constexpr int mask = batch_bool_constant<double, A, Values...>::mask();
return _mm_blend_pd(false_br, true_br, mask);
}

View File

@ -140,32 +140,32 @@ namespace xsimd
// swizzle (constant mask)
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<ssse3>) noexcept
inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<ssse3>) noexcept
{
constexpr batch_constant<batch<uint8_t, A>, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1,
constexpr batch_constant<uint8_t, A, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1,
2 * V4, 2 * V4 + 1, 2 * V5, 2 * V5 + 1, 2 * V6, 2 * V6 + 1, 2 * V7, 2 * V7 + 1>
mask8;
return _mm_shuffle_epi8(self, (batch<uint8_t, A>)mask8);
return _mm_shuffle_epi8(self, mask8.as_batch());
}
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept
{
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, ssse3 {}));
}
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
{
return swizzle(self, (batch<uint8_t, A>)mask, ssse3 {});
return swizzle(self, mask.as_batch(), ssse3 {});
}
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
{
return swizzle(self, (batch<uint8_t, A>)mask, ssse3 {});
return swizzle(self, mask.as_batch(), ssse3 {});
}
}

View File

@ -20,7 +20,7 @@
namespace xsimd
{
template <class batch_type, typename batch_type::value_type... Values>
template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
@ -739,19 +739,19 @@ namespace xsimd
// swizzle (static)
template <class A, class T, class I, I... idx>
inline batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<batch<I, A>, idx...> indices, requires_arch<sve>) noexcept
inline batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<I, A, idx...> indices, requires_arch<sve>) noexcept
{
static_assert(batch<T, A>::size == sizeof...(idx), "invalid swizzle indices");
return swizzle(arg, (batch<I, A>)indices, sve {});
return swizzle(arg, indices.as_batch(), sve {});
}
template <class A, class T, class I, I... idx>
inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& arg,
batch_constant<batch<I, A>, idx...> indices,
batch_constant<I, A, idx...> indices,
requires_arch<sve>) noexcept
{
static_assert(batch<std::complex<T>, A>::size == sizeof...(idx), "invalid swizzle indices");
return swizzle(arg, (batch<I, A>)indices, sve {});
return swizzle(arg, indices.as_batch(), sve {});
}
/*************
@ -811,7 +811,7 @@ namespace xsimd
}
template <class A, class T, bool... b>
inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sve>) noexcept
inline batch<T, A> select(batch_bool_constant<T, A, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sve>) noexcept
{
return select(batch_bool<T, A> { b... }, true_br, false_br, sve {});
}

View File

@ -19,13 +19,13 @@
namespace xsimd
{
template <class batch_type, bool... Values>
template <typename T, class A, bool... Values>
struct batch_bool_constant;
template <class T_out, class T_in, class A>
inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
template <class batch_type, typename batch_type::value_type... Values>
template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
@ -36,7 +36,7 @@ namespace xsimd
template <class A, class T, size_t I>
inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
template <class A, typename T, typename ITy, ITy... Indices>
inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept;
inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept;
template <class A, class T>
inline batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept;
@ -1275,7 +1275,7 @@ namespace xsimd
return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond));
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<wasm>) noexcept
inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<wasm>) noexcept
{
return select(batch_bool<T, A> { Values... }, true_br, false_br, wasm {});
}
@ -1287,13 +1287,13 @@ namespace xsimd
// shuffle
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3>, requires_arch<wasm>) noexcept
inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3>, requires_arch<wasm>) noexcept
{
return wasm_i32x4_shuffle(x, y, I0, I1, I2, I3);
}
template <class A, class ITy, ITy I0, ITy I1>
inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1>, requires_arch<wasm>) noexcept
inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1>, requires_arch<wasm>) noexcept
{
return wasm_i64x2_shuffle(x, y, I0, I1);
}
@ -1515,63 +1515,63 @@ namespace xsimd
// swizzle
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
{
return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3);
}
template <class A, uint64_t V0, uint64_t V1>
inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<wasm>) noexcept
inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<wasm>) noexcept
{
return wasm_i64x2_shuffle(self, self, V0, V1);
}
template <class A, uint64_t V0, uint64_t V1>
inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<wasm>) noexcept
inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<wasm>) noexcept
{
return wasm_i64x2_shuffle(self, self, V0, V1);
}
template <class A, uint64_t V0, uint64_t V1>
inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1> mask, requires_arch<wasm>) noexcept
inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1> mask, requires_arch<wasm>) noexcept
{
return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, wasm {}));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
{
return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3);
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> mask, requires_arch<wasm>) noexcept
inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3> mask, requires_arch<wasm>) noexcept
{
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, wasm {}));
}
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<wasm>) noexcept
inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<wasm>) noexcept
{
return wasm_i16x8_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7);
}
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<wasm>) noexcept
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<wasm>) noexcept
{
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, wasm {}));
}
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15>, requires_arch<wasm>) noexcept
inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15>, requires_arch<wasm>) noexcept
{
return wasm_i8x16_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15);
}
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<wasm>) noexcept
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<wasm>) noexcept
{
return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, wasm {}));
}

View File

@ -46,3 +46,7 @@
#include "xsimd_rvv_register.hpp"
#include "xsimd_wasm_register.hpp"
#if XSIMD_WITH_EMULATED
#include "xsimd_emulated_register.hpp"
#endif

View File

@ -2031,7 +2031,7 @@ namespace xsimd
* @return the result of the selection.
*/
template <class T, class A, bool... Values>
inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br) noexcept
inline batch<T, A> select(batch_bool_constant<T, A, Values...> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::select<A>(cond, true_br, false_br, A {});
@ -2047,7 +2047,7 @@ namespace xsimd
* element of \c x and \c y. Each element of the mask indexes the vector that
* would be formed by the concatenation of \c x and \c y. For instance
* \code{.cpp}
* batch_constant<batch<uint32_t, sse2>, 0, 4, 3, 7>
* batch_constant<uint32_t, sse2, 0, 4, 3, 7>
* \endcode
* Picks \c x[0], \c y[0], \c x[3], \c y[3]
*
@ -2055,7 +2055,7 @@ namespace xsimd
*/
template <class T, class A, class Vt, Vt... Values>
inline typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<Vt, A>, Values...> mask) noexcept
shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<Vt, A, Values...> mask) noexcept
{
static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
detail::static_check_supported_config<T, A>();
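
A usage sketch matching the documented mask above, assuming an SSE2 build (the lane values are illustrative):

#include <cstddef>
#include <cstdio>
#include "xsimd/xsimd.hpp"

int main()
{
    using batch = xsimd::batch<float, xsimd::sse2>;
    batch x { 10.f, 11.f, 12.f, 13.f };
    batch y { 20.f, 21.f, 22.f, 23.f };
    auto r = xsimd::shuffle(x, y, xsimd::batch_constant<uint32_t, xsimd::sse2, 0, 4, 3, 7> {});
    for (std::size_t i = 0; i < batch::size; ++i)
        std::printf("%g ", r.get(i)); // 10 20 13 23
}
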
@ -2210,19 +2210,22 @@ namespace xsimd
template <class To, class A = default_arch, class From>
inline void store_as(To* dst, batch<From, A> const& src, aligned_mode) noexcept
{
kernel::store_aligned(dst, src, A {});
detail::static_check_supported_config<From, A>();
kernel::store_aligned<A>(dst, src, A {});
}
template <class A = default_arch, class From>
inline void store_as(bool* dst, batch_bool<From, A> const& src, aligned_mode) noexcept
{
kernel::store(src, dst, A {});
detail::static_check_supported_config<From, A>();
kernel::store<A>(src, dst, A {});
}
template <class To, class A = default_arch, class From>
inline void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
{
kernel::store_complex_aligned(dst, src, A {});
detail::static_check_supported_config<std::complex<From>, A>();
kernel::store_complex_aligned<A>(dst, src, A {});
}
#ifdef XSIMD_ENABLE_XTL_COMPLEX
@ -2244,25 +2247,29 @@ namespace xsimd
template <class To, class A = default_arch, class From>
inline void store_as(To* dst, batch<From, A> const& src, unaligned_mode) noexcept
{
kernel::store_unaligned(dst, src, A {});
detail::static_check_supported_config<From, A>();
kernel::store_unaligned<A>(dst, src, A {});
}
template <class A = default_arch, class From>
inline void store_as(bool* dst, batch_bool<From, A> const& src, unaligned_mode) noexcept
{
kernel::store(src, dst, A {});
detail::static_check_supported_config<From, A>();
kernel::store<A>(src, dst, A {});
}
template <class To, class A = default_arch, class From>
inline void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, unaligned_mode) noexcept
{
kernel::store_complex_unaligned(dst, src, A {});
detail::static_check_supported_config<std::complex<From>, A>();
kernel::store_complex_unaligned<A>(dst, src, A {});
}
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class To, class A = default_arch, class From, bool i3ec>
inline void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, unaligned_mode) noexcept
{
detail::static_check_supported_config<std::complex<From>, A>();
store_as(reinterpret_cast<std::complex<To>*>(dst), src, unaligned_mode());
}
#endif
@ -2350,14 +2357,14 @@ namespace xsimd
*/
template <class T, class A, class Vt, Vt... Values>
inline typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
swizzle(batch<T, A> const& x, batch_constant<batch<Vt, A>, Values...> mask) noexcept
swizzle(batch<T, A> const& x, batch_constant<Vt, A, Values...> mask) noexcept
{
static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
detail::static_check_supported_config<T, A>();
return kernel::swizzle<A>(x, mask, A {});
}
template <class T, class A, class Vt, Vt... Values>
inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& x, batch_constant<batch<Vt, A>, Values...> mask) noexcept
inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& x, batch_constant<Vt, A, Values...> mask) noexcept
{
static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
detail::static_check_supported_config<T, A>();

View File

@ -25,17 +25,24 @@ namespace xsimd
* @tparam T the scalar type of the associated batch values.
* @tparam A the architecture of the associated batch.
* @tparam Values boolean constants represented by this batch
**/
template <class batch_type, bool... Values>
template <typename T, class A, bool... Values>
struct batch_bool_constant
{
public:
using batch_type = batch_bool<T, A>;
static constexpr std::size_t size = sizeof...(Values);
using arch_type = typename batch_type::arch_type;
using value_type = bool;
static_assert(sizeof...(Values) == batch_type::size, "consistent batch size");
constexpr operator batch_bool<typename batch_type::value_type, arch_type>() const noexcept { return { Values... }; }
public:
/**
* @brief Generate a @p batch_type from this @p batch_bool_constant
*/
constexpr batch_type as_batch_bool() const noexcept { return { Values... }; }
/**
* @brief Implicit conversion to @p batch_type, equivalent to @c as_batch_bool()
*/
constexpr operator batch_type() const noexcept { return as_batch_bool(); }
constexpr bool get(size_t i) const noexcept
{
@ -70,14 +77,14 @@ namespace xsimd
};
template <class F, class SelfPack, class OtherPack, size_t... Indices>
static constexpr batch_bool_constant<batch_type, F()(std::tuple_element<Indices, SelfPack>::type::value, std::tuple_element<Indices, OtherPack>::type::value)...>
static constexpr batch_bool_constant<T, A, F()(std::tuple_element<Indices, SelfPack>::type::value, std::tuple_element<Indices, OtherPack>::type::value)...>
apply(detail::index_sequence<Indices...>)
{
return {};
}
template <class F, bool... OtherValues>
static constexpr auto apply(batch_bool_constant<batch_type, Values...>, batch_bool_constant<batch_type, OtherValues...>)
static constexpr auto apply(batch_bool_constant<T, A, Values...>, batch_bool_constant<T, A, OtherValues...>)
-> decltype(apply<F, std::tuple<std::integral_constant<bool, Values>...>, std::tuple<std::integral_constant<bool, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>()))
{
static_assert(sizeof...(Values) == sizeof...(OtherValues), "compatible constant batches");
@ -85,12 +92,12 @@ namespace xsimd
}
public:
#define MAKE_BINARY_OP(OP, NAME) \
template <bool... OtherValues> \
constexpr auto operator OP(batch_bool_constant<batch_type, OtherValues...> other) const \
-> decltype(apply<NAME>(*this, other)) \
{ \
return apply<NAME>(*this, other); \
#define MAKE_BINARY_OP(OP, NAME) \
template <bool... OtherValues> \
constexpr auto operator OP(batch_bool_constant<T, A, OtherValues...> other) const \
-> decltype(apply<NAME>(*this, other)) \
{ \
return apply<NAME>(*this, other); \
}
MAKE_BINARY_OP(|, logical_or)
@ -101,12 +108,12 @@ namespace xsimd
#undef MAKE_BINARY_OP
constexpr batch_bool_constant<batch_type, !Values...> operator!() const
constexpr batch_bool_constant<T, A, !Values...> operator!() const
{
return {};
}
constexpr batch_bool_constant<batch_type, !Values...> operator~() const
constexpr batch_bool_constant<T, A, !Values...> operator~() const
{
return {};
}
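
With the new spelling, a compile-time blend mask is written directly in terms of the scalar type and architecture. A sketch, assuming an SSE2 build where batch<float, sse2>::size == 4 (the function name is illustrative):

#include "xsimd/xsimd.hpp"

// Take lanes 0 and 2 from t and lanes 1 and 3 from f; the mask resolves at compile time.
inline xsimd::batch<float, xsimd::sse2> blend(xsimd::batch<float, xsimd::sse2> t,
                                              xsimd::batch<float, xsimd::sse2> f)
{
    return xsimd::select(xsimd::batch_bool_constant<float, xsimd::sse2, true, false, true, false> {}, t, f);
}
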
@ -120,88 +127,93 @@ namespace xsimd
* @tparam T the scalar type of the associated batch values.
* @tparam A the architecture of the associated batch.
* @tparam Values constants represented by this batch
**/
template <class batch_type, typename batch_type::value_type... Values>
template <typename T, class A, T... Values>
struct batch_constant
{
static constexpr std::size_t size = sizeof...(Values);
using arch_type = typename batch_type::arch_type;
using batch_type = batch<T, A>;
using value_type = typename batch_type::value_type;
static_assert(sizeof...(Values) == batch_type::size, "consistent batch size");
/**
* @brief Generate a @p batch_type from this @p batch_constant
*/
inline operator batch_type() const noexcept { return { Values... }; }
inline batch_type as_batch() const noexcept { return { Values... }; }
/**
* @brief Implicit conversion to @p batch_type, equivalent to @c as_batch()
*/
inline operator batch_type() const noexcept { return as_batch(); }
/**
* @brief Get the @p i th element of this @p batch_constant
*/
constexpr value_type get(size_t i) const noexcept
constexpr T get(size_t i) const noexcept
{
return get(i, std::array<value_type, size> { Values... });
return get(i, std::array<T, size> { Values... });
}
private:
constexpr value_type get(size_t i, std::array<value_type, size> const& values) const noexcept
constexpr T get(size_t i, std::array<T, size> const& values) const noexcept
{
return values[i];
}
struct arithmetic_add
{
constexpr value_type operator()(value_type x, value_type y) const { return x + y; }
constexpr T operator()(T x, T y) const { return x + y; }
};
struct arithmetic_sub
{
constexpr value_type operator()(value_type x, value_type y) const { return x - y; }
constexpr T operator()(T x, T y) const { return x - y; }
};
struct arithmetic_mul
{
constexpr value_type operator()(value_type x, value_type y) const { return x * y; }
constexpr T operator()(T x, T y) const { return x * y; }
};
struct arithmetic_div
{
constexpr value_type operator()(value_type x, value_type y) const { return x / y; }
constexpr T operator()(T x, T y) const { return x / y; }
};
struct arithmetic_mod
{
constexpr value_type operator()(value_type x, value_type y) const { return x % y; }
constexpr T operator()(T x, T y) const { return x % y; }
};
struct binary_and
{
constexpr value_type operator()(value_type x, value_type y) const { return x & y; }
constexpr T operator()(T x, T y) const { return x & y; }
};
struct binary_or
{
constexpr value_type operator()(value_type x, value_type y) const { return x | y; }
constexpr T operator()(T x, T y) const { return x | y; }
};
struct binary_xor
{
constexpr value_type operator()(value_type x, value_type y) const { return x ^ y; }
constexpr T operator()(T x, T y) const { return x ^ y; }
};
template <class F, class SelfPack, class OtherPack, size_t... Indices>
static constexpr batch_constant<batch_type, F()(std::tuple_element<Indices, SelfPack>::type::value, std::tuple_element<Indices, OtherPack>::type::value)...>
static constexpr batch_constant<T, A, F()(std::tuple_element<Indices, SelfPack>::type::value, std::tuple_element<Indices, OtherPack>::type::value)...>
apply(detail::index_sequence<Indices...>)
{
return {};
}
template <class F, value_type... OtherValues>
static constexpr auto apply(batch_constant<batch_type, Values...>, batch_constant<batch_type, OtherValues...>)
-> decltype(apply<F, std::tuple<std::integral_constant<value_type, Values>...>, std::tuple<std::integral_constant<value_type, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>()))
template <class F, T... OtherValues>
static constexpr auto apply(batch_constant<T, A, Values...>, batch_constant<T, A, OtherValues...>)
-> decltype(apply<F, std::tuple<std::integral_constant<T, Values>...>, std::tuple<std::integral_constant<T, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>()))
{
static_assert(sizeof...(Values) == sizeof...(OtherValues), "compatible constant batches");
return apply<F, std::tuple<std::integral_constant<value_type, Values>...>, std::tuple<std::integral_constant<value_type, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>());
return apply<F, std::tuple<std::integral_constant<T, Values>...>, std::tuple<std::integral_constant<T, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>());
}
public:
#define MAKE_BINARY_OP(OP, NAME) \
template <value_type... OtherValues> \
constexpr auto operator OP(batch_constant<batch_type, OtherValues...> other) const \
-> decltype(apply<NAME>(*this, other)) \
{ \
return apply<NAME>(*this, other); \
#define MAKE_BINARY_OP(OP, NAME) \
template <T... OtherValues> \
constexpr auto operator OP(batch_constant<T, A, OtherValues...> other) const \
-> decltype(apply<NAME>(*this, other)) \
{ \
return apply<NAME>(*this, other); \
}
MAKE_BINARY_OP(+, arithmetic_add)
@ -215,17 +227,17 @@ namespace xsimd
#undef MAKE_BINARY_OP
constexpr batch_constant<batch_type, (value_type)-Values...> operator-() const
constexpr batch_constant<T, A, (T)-Values...> operator-() const
{
return {};
}
constexpr batch_constant<batch_type, (value_type) + Values...> operator+() const
constexpr batch_constant<T, A, (T) + Values...> operator+() const
{
return {};
}
constexpr batch_constant<batch_type, (value_type)~Values...> operator~() const
constexpr batch_constant<T, A, (T)~Values...> operator~() const
{
return {};
}
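
Because every operator above returns a new batch_constant, index arithmetic stays in the type system; nothing is materialized until as_batch() is called. A small sketch, assuming SSE2 (four uint32_t lanes; the alias names are illustrative):

#include <cstdint>
#include "xsimd/xsimd.hpp"

using lo_t = xsimd::batch_constant<uint32_t, xsimd::sse2, 0, 1, 2, 3>;
using four_t = xsimd::batch_constant<uint32_t, xsimd::sse2, 4, 4, 4, 4>;
constexpr auto hi_c = lo_t {} + four_t {}; // its type encodes <uint32_t, sse2, 4, 5, 6, 7>
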
@ -233,15 +245,15 @@ namespace xsimd
namespace detail
{
template <class batch_type, class G, std::size_t... Is>
template <typename T, class A, class G, std::size_t... Is>
inline constexpr auto make_batch_constant(detail::index_sequence<Is...>) noexcept
-> batch_constant<batch_type, (typename batch_type::value_type)G::get(Is, sizeof...(Is))...>
-> batch_constant<T, A, (T)G::get(Is, sizeof...(Is))...>
{
return {};
}
template <class batch_type, class G, std::size_t... Is>
template <typename T, class A, class G, std::size_t... Is>
inline constexpr auto make_batch_bool_constant(detail::index_sequence<Is...>) noexcept
-> batch_bool_constant<batch_type, G::get(Is, sizeof...(Is))...>
-> batch_bool_constant<T, A, G::get(Is, sizeof...(Is))...>
{
return {};
}
@ -268,19 +280,19 @@ namespace xsimd
* };
* @endcode
*/
template <class batch_type, class G>
inline constexpr auto make_batch_constant() noexcept -> decltype(detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>()))
template <typename T, class A, class G>
inline constexpr auto make_batch_constant() noexcept -> decltype(detail::make_batch_constant<T, A, G>(detail::make_index_sequence<batch<T, A>::size>()))
{
return detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>());
return detail::make_batch_constant<T, A, G>(detail::make_index_sequence<batch<T, A>::size>());
}
template <class batch_type, class G>
template <typename T, class A, class G>
inline constexpr auto make_batch_bool_constant() noexcept
-> decltype(detail::make_batch_bool_constant<batch_type, G>(
detail::make_index_sequence<batch_type::size>()))
-> decltype(detail::make_batch_bool_constant<T, A, G>(
detail::make_index_sequence<batch<T, A>::size>()))
{
return detail::make_batch_bool_constant<batch_type, G>(
detail::make_index_sequence<batch_type::size>());
return detail::make_batch_bool_constant<T, A, G>(
detail::make_index_sequence<batch<T, A>::size>());
}
} // namespace xsimd
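
Under the new API the generator-based factory takes the scalar type and architecture as separate template parameters. A sketch assuming SSE2 (the iota generator is illustrative, not part of the patch):

#include <cstddef>
#include <cstdint>
#include "xsimd/xsimd.hpp"

// Illustrative generator: lane index i maps to constant i.
struct iota
{
    static constexpr uint32_t get(std::size_t index, std::size_t /*size*/)
    {
        return static_cast<uint32_t>(index);
    }
};

// Equivalent to batch_constant<uint32_t, sse2, 0, 1, 2, 3> on the 4-lane SSE2 target.
auto indices = xsimd::make_batch_constant<uint32_t, xsimd::sse2, iota>();
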

View File

@ -0,0 +1,80 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_EMULATED_REGISTER_HPP
#define XSIMD_EMULATED_REGISTER_HPP
#include "./xsimd_generic_arch.hpp"
#include "./xsimd_register.hpp"
namespace xsimd
{
/**
* @ingroup architectures
*
* emulated instructions
*/
template <size_t N>
struct emulated : generic
{
static constexpr bool supported() noexcept { return true; }
static constexpr bool available() noexcept { return true; }
static constexpr bool requires_alignment() noexcept { return false; }
static constexpr std::size_t alignment() noexcept { return 8; }
static constexpr char const* name() noexcept { return "emulated"; }
};
namespace types
{
template <size_t N>
struct simd_emulated_bool_register
{
using register_type = std::array<bool, N>;
register_type data;
simd_emulated_bool_register() = default;
simd_emulated_bool_register(register_type r) { data = r; }
operator register_type() const noexcept { return data; }
};
template <typename T, size_t N>
struct get_bool_simd_register<T, emulated<N>>
{
using type = simd_emulated_bool_register<N / (8 * sizeof(T))>;
};
template <typename T, size_t N>
struct simd_register<T, emulated<N>>
{
static_assert(N % (8 * sizeof(T)) == 0, "bit width must be a multiple of scalar width");
using register_type = std::array<T, N / (8 * sizeof(T))>;
register_type data;
inline operator register_type() const noexcept
{
return data;
}
};
template <typename T, size_t N>
struct has_simd_register<T, emulated<N>> : std::is_scalar<T>
{
};
template <typename T, size_t N>
struct has_simd_register<std::complex<T>, emulated<N>> : std::true_type
{
};
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <typename T, bool i3ec, size_t N>
struct has_simd_register<xtl::complex<T, T, i3ec>, emulated<N>> : std::true_type
{
};
#endif
}
}
#endif
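
A standalone check of the register sizing rule encoded by the static_assert above; this mirrors the array layout without pulling in the rest of the backend (the alias name is illustrative):

#include <array>
#include <cstddef>
#include <type_traits>

template <typename T, std::size_t N> // N is the register width in bits
using emulated_register_sketch = std::array<T, N / (8 * sizeof(T))>;

static_assert(std::is_same<emulated_register_sketch<float, 128>, std::array<float, 4>>::value,
              "a 128-bit emulated register holds four floats");
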

View File

@ -10,8 +10,8 @@ origin:
url: https://github.com/QuantStack/xsimd
release: 7080469620c2145fbedf4ef8950406066e1ca2d6 (2024-03-17T21:35:00Z).
revision: 7080469620c2145fbedf4ef8950406066e1ca2d6
release: be9dcb5df413a893fb6646fa950eeb4aeac70ffc (2024-04-20T09:35:04Z).
revision: be9dcb5df413a893fb6646fa950eeb4aeac70ffc
license: BSD-3-Clause