Bug 1891462 - Upgrade xsimd to be9dcb5df413a893fb6646fa950eeb4aeac70ffc r=padenot

WIP

Differential Revision: https://phabricator.services.mozilla.com/D207722
serge-sans-paille 2024-04-20 16:38:48 +00:00
parent d59a006a47
commit e02cc14f5a
22 changed files with 1227 additions and 310 deletions
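Most of the churn in this upgrade tracks one upstream API change: batch_constant and batch_bool_constant now take the scalar type and the architecture as separate template parameters instead of a wrapping batch type, and constant masks are materialized with as_batch() rather than a C-style cast. A minimal before/after sketch distilled from the hunks below (the avx2 instantiation and the variable names are illustrative):

// Before: the constant is parameterized on a batch type.
batch_constant<batch<uint32_t, avx2>, 0, 1, 2, 3, 4, 5, 6, 7> old_mask;
auto r0 = _mm256_permutevar8x32_ps(x, (batch<uint32_t, avx2>)old_mask);
// After: scalar type and architecture are separate parameters,
// and as_batch() replaces the cast.
batch_constant<uint32_t, avx2, 0, 1, 2, 3, 4, 5, 6, 7> new_mask;
auto r1 = _mm256_permutevar8x32_ps(x, new_mask.as_batch());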

View File

@ -2064,7 +2064,7 @@ namespace xsimd
inline T reduce(Op op, batch<T, A> const& self, std::integral_constant<unsigned, Lvl>) noexcept
{
using index_type = as_unsigned_integer_t<T>;
-batch<T, A> split = swizzle(self, make_batch_constant<batch<index_type, A>, split_high<index_type, Lvl / 2>>());
+batch<T, A> split = swizzle(self, make_batch_constant<index_type, A, split_high<index_type, Lvl / 2>>());
return reduce(op, op(split, self), std::integral_constant<unsigned, Lvl / 2>());
}
}
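The generic reduce above proceeds by successive halving: swizzle with split_high pulls the upper Lvl / 2 lanes down into the low positions, op combines the two halves, and the recursion bottoms out at the single-lane specialization. An illustrative trace for a hypothetical 4-lane batch with an add-like Op (assuming split_high moves the high half to the low lanes):

// Lvl = 4: self = {a, b, c, d}; split holds {c, d, ...} in its low lanes,
//          so op(split, self) = {a+c, b+d, ...}
// Lvl = 2: the low halves combine again: {a+b+c+d, ...}
// Lvl = 1: lane 0 holds the full reduction.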

View File

@ -21,10 +21,10 @@
namespace xsimd
{
-template <class batch_type, typename batch_type::value_type... Values>
+template <typename T, class A, T... Values>
struct batch_constant;
-template <class batch_type, bool... Values>
+template <typename T, class A, bool... Values>
struct batch_bool_constant;
namespace kernel
@ -180,7 +180,7 @@ namespace xsimd
}
};
batch<T, A> tmp(val);
-return select(make_batch_bool_constant<batch<T, A>, index_mask>(), self, tmp);
+return select(make_batch_bool_constant<T, A, index_mask>(), self, tmp);
}
// get
@ -295,7 +295,7 @@ namespace xsimd
}
};
-return swizzle(self, make_batch_constant<batch<as_unsigned_integer_t<T>, A>, rotate_generator>(), A {});
+return swizzle(self, make_batch_constant<as_unsigned_integer_t<T>, A, rotate_generator>(), A {});
}
template <size_t N, class A, class T>
@ -316,7 +316,7 @@ namespace xsimd
}
};
-return swizzle(self, make_batch_constant<batch<as_unsigned_integer_t<T>, A>, rotate_generator>(), A {});
+return swizzle(self, make_batch_constant<as_unsigned_integer_t<T>, A, rotate_generator>(), A {});
}
template <size_t N, class A, class T>
@ -412,6 +412,12 @@ namespace xsimd
return true;
}
+template <typename ITy>
+constexpr bool is_zip_lo(size_t, ITy)
+{
+return false;
+}
template <typename ITy0, typename ITy1, typename... ITys>
constexpr bool is_zip_lo(size_t bsize, ITy0 index0, ITy1 index1, ITys... indices)
{
@ -423,6 +429,12 @@ namespace xsimd
return true;
}
+template <typename ITy>
+constexpr bool is_zip_hi(size_t, ITy)
+{
+return false;
+}
template <typename ITy0, typename ITy1, typename... ITys>
constexpr bool is_zip_hi(size_t bsize, ITy0 index0, ITy1 index1, ITys... indices)
{
@ -443,19 +455,19 @@ namespace xsimd
}
template <class A, typename T, typename ITy, ITy... Indices>
-inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept
+inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept
{
constexpr size_t bsize = sizeof...(Indices);
// Detect common patterns
XSIMD_IF_CONSTEXPR(detail::is_swizzle_fst(bsize, Indices...))
{
-return swizzle(x, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? 0 /* never happens */ : Indices)...>());
+return swizzle(x, batch_constant<ITy, A, ((Indices >= bsize) ? 0 /* never happens */ : Indices)...>());
}
XSIMD_IF_CONSTEXPR(detail::is_swizzle_snd(bsize, Indices...))
{
-return swizzle(y, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : 0 /* never happens */)...>());
+return swizzle(y, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : 0 /* never happens */)...>());
}
XSIMD_IF_CONSTEXPR(detail::is_zip_lo(bsize, Indices...))
@ -470,7 +482,7 @@ namespace xsimd
XSIMD_IF_CONSTEXPR(detail::is_select(bsize, Indices...))
{
-return select(batch_bool_constant<batch<T, A>, (Indices < bsize)...>(), x, y);
+return select(batch_bool_constant<T, A, (Indices < bsize)...>(), x, y);
}
#if defined(__has_builtin)
@ -491,9 +503,9 @@ namespace xsimd
#else
// Use a generic_pattern. It is suboptimal but clang optimizes this
// pretty well.
-batch<T, A> x_lane = swizzle(x, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
-batch<T, A> y_lane = swizzle(y, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
-batch_bool_constant<batch<T, A>, (Indices < bsize)...> select_x_lane;
+batch<T, A> x_lane = swizzle(x, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
+batch<T, A> y_lane = swizzle(y, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
+batch_bool_constant<T, A, (Indices < bsize)...> select_x_lane;
return select(select_x_lane, x_lane, y_lane);
#endif
}
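// Illustrative reference (not part of the diff): with bsize == 4, indices
// 0-3 select from x and 4-7 from y, and the detectors above lower these
// constant masks to cheaper primitives:
//   {0, 1, 2, 3} -> is_swizzle_fst: only x referenced -> swizzle(x, ...)
//   {4, 5, 6, 7} -> is_swizzle_snd: only y referenced -> swizzle(y, ...)
//   {0, 4, 1, 5} -> is_zip_lo: interleaved low halves -> zip_lo(x, y)
//   {2, 6, 3, 7} -> is_zip_hi: interleaved high halves -> zip_hi(x, y)
//   {0, 5, 2, 7} -> is_select: lane i taken from lane i of x or y -> select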
@ -530,7 +542,7 @@ namespace xsimd
// swizzle
template <class A, class T, class ITy, ITy... Vs>
-inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<batch<ITy, A>, Vs...> mask, requires_arch<generic>) noexcept
+inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<ITy, A, Vs...> mask, requires_arch<generic>) noexcept
{
return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
}
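// Illustrative (not part of the diff): the same constant mask is applied to
// the real and imaginary planes independently, so for lanes {(a,b), (c,d)}
// a mask of {1, 0} produces {(c,d), (a,b)}.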

View File

@ -1161,22 +1161,22 @@ namespace xsimd
return detail::merge_sse(res_low, res_hi);
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
+inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
{
return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
}
template <class A, bool... Values>
-inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
+inline batch<float, A> select(batch_bool_constant<float, A, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
{
-constexpr auto mask = batch_bool_constant<batch<float, A>, Values...>::mask();
+constexpr auto mask = batch_bool_constant<float, A, Values...>::mask();
return _mm256_blend_ps(false_br, true_br, mask);
}
template <class A, bool... Values>
-inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
+inline batch<double, A> select(batch_bool_constant<double, A, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
{
-constexpr auto mask = batch_bool_constant<batch<double, A>, Values...>::mask();
+constexpr auto mask = batch_bool_constant<double, A, Values...>::mask();
return _mm256_blend_pd(false_br, true_br, mask);
}
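// Illustrative (not part of the diff): mask() folds the Values... pack into
// an immediate with bit i set when lane i picks true_br. For instance
// batch_bool_constant<float, avx, 1, 0, 0, 1, 1, 0, 0, 1> yields 0x99, and
// _mm256_blend_ps(false_br, true_br, 0x99) takes lanes 0, 3, 4 and 7 from
// true_br and the rest from false_br.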
@ -1238,7 +1238,7 @@ namespace xsimd
// shuffle
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7>
-inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx>) noexcept
+inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx>) noexcept
{
constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
// shuffle within lane
@ -1253,7 +1253,7 @@ namespace xsimd
}
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
-inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3> mask, requires_arch<avx>) noexcept
+inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<avx>) noexcept
{
constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3);
// shuffle within lane
@ -1504,7 +1504,7 @@ namespace xsimd
// swizzle (constant mask)
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
+inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
{
// duplicate low and high part of input
__m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
@ -1514,14 +1514,14 @@ namespace xsimd
__m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);
// normalize mask
-batch_constant<batch<uint32_t, A>, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
+batch_constant<uint32_t, A, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
// permute within each lane
-__m256 r0 = _mm256_permutevar_ps(low_low, (batch<uint32_t, A>)half_mask);
-__m256 r1 = _mm256_permutevar_ps(hi_hi, (batch<uint32_t, A>)half_mask);
+__m256 r0 = _mm256_permutevar_ps(low_low, half_mask.as_batch());
+__m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask.as_batch());
// mask to choose the right lane
-batch_bool_constant<batch<uint32_t, A>, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;
+batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;
// blend the two permutes
constexpr auto mask = blend_mask.mask();
@ -1529,7 +1529,7 @@ namespace xsimd
}
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx>) noexcept
+inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx>) noexcept
{
// duplicate low and high part of input
__m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
@ -1539,14 +1539,14 @@ namespace xsimd
__m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
// normalize mask
-batch_constant<batch<uint64_t, A>, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask;
+batch_constant<uint64_t, A, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask;
// permute within each lane
-__m256d r0 = _mm256_permutevar_pd(low_low, (batch<uint64_t, A>)half_mask);
-__m256d r1 = _mm256_permutevar_pd(hi_hi, (batch<uint64_t, A>)half_mask);
+__m256d r0 = _mm256_permutevar_pd(low_low, half_mask.as_batch());
+__m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask.as_batch());
// mask to choose the right lane
-batch_bool_constant<batch<uint64_t, A>, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
+batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
// blend the two permutes
constexpr auto mask = blend_mask.mask();
@ -1564,7 +1564,7 @@ namespace xsimd
uint32_t V7,
detail::enable_sized_integral_t<T, 4> = 0>
inline batch<T, A> swizzle(batch<T, A> const& self,
-batch_constant<batch<uint32_t, A>,
+batch_constant<uint32_t, A,
V0,
V1,
V2,
@ -1588,7 +1588,7 @@ namespace xsimd
detail::enable_sized_integral_t<T, 8> = 0>
inline batch<T, A>
swizzle(batch<T, A> const& self,
-batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> const& mask,
+batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask,
requires_arch<avx>) noexcept
{
return bitwise_cast<T>(

View File

@ -729,9 +729,9 @@ namespace xsimd
}
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
+inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
{
-constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
+constexpr int mask = batch_bool_constant<T, A, Values...>::mask();
// FIXME: for some reason mask here is not considered as an immediate,
// but it's okay for _mm256_blend_epi32
// case 2: return _mm256_blend_epi16(false_br, true_br, mask);
@ -912,36 +912,36 @@ namespace xsimd
// swizzle (constant mask)
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
{
-return _mm256_permutevar8x32_ps(self, (batch<uint32_t, A>)mask);
+return _mm256_permutevar8x32_ps(self, mask.as_batch());
}
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
{
constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
return _mm256_permute4x64_pd(self, mask);
}
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
{
constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
return _mm256_permute4x64_epi64(self, mask);
}
template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
+inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
{
return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx2 {}));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
{
-return _mm256_permutevar8x32_epi32(self, (batch<uint32_t, A>)mask);
+return _mm256_permutevar8x32_epi32(self, mask.as_batch());
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
{
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
}

View File

@ -617,27 +617,27 @@ namespace xsimd
// swizzle (static version)
template <class A, uint16_t... Vs>
-inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
{
-return swizzle(self, (batch<uint16_t, A>)mask, avx512bw {});
+return swizzle(self, mask.as_batch(), avx512bw {});
}
template <class A, uint16_t... Vs>
-inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
{
-return swizzle(self, (batch<uint16_t, A>)mask, avx512bw {});
+return swizzle(self, mask.as_batch(), avx512bw {});
}
template <class A, uint8_t... Vs>
-inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
{
-return swizzle(self, (batch<uint8_t, A>)mask, avx512bw {});
+return swizzle(self, mask.as_batch(), avx512bw {});
}
template <class A, uint8_t... Vs>
-inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
{
-return swizzle(self, (batch<uint8_t, A>)mask, avx512bw {});
+return swizzle(self, mask.as_batch(), avx512bw {});
}
// zip_hi

View File

@ -1422,8 +1422,8 @@ namespace xsimd
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type>
inline T reduce_max(batch<T, A> const& self, requires_arch<avx512f>) noexcept
{
-constexpr batch_constant<batch<uint64_t, A>, 5, 6, 7, 8, 0, 0, 0, 0> mask;
-batch<T, A> step = _mm512_permutexvar_epi64((batch<uint64_t, A>)mask, self);
+constexpr batch_constant<uint64_t, A, 5, 6, 7, 8, 0, 0, 0, 0> mask;
+batch<T, A> step = _mm512_permutexvar_epi64(mask.as_batch(), self);
batch<T, A> acc = max(self, step);
__m256i low = _mm512_castsi512_si256(acc);
return reduce_max(batch<T, avx2>(low));
@ -1433,8 +1433,8 @@ namespace xsimd
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type>
inline T reduce_min(batch<T, A> const& self, requires_arch<avx512f>) noexcept
{
-constexpr batch_constant<batch<uint64_t, A>, 5, 6, 7, 8, 0, 0, 0, 0> mask;
-batch<T, A> step = _mm512_permutexvar_epi64((batch<uint64_t, A>)mask, self);
+constexpr batch_constant<uint64_t, A, 5, 6, 7, 8, 0, 0, 0, 0> mask;
+batch<T, A> step = _mm512_permutexvar_epi64(mask.as_batch(), self);
batch<T, A> acc = min(self, step);
__m256i low = _mm512_castsi512_si256(acc);
return reduce_min(batch<T, avx2>(low));
@ -1571,7 +1571,7 @@ namespace xsimd
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept
+inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept
{
return select(batch_bool<T, A> { Values... }, true_br, false_br, avx512f {});
}
@ -1709,7 +1709,7 @@ namespace xsimd
// shuffle
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7, ITy I8, ITy I9, ITy I10, ITy I11, ITy I12, ITy I13, ITy I14, ITy I15>
inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y,
-batch_constant<batch<ITy, A>, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15> mask,
+batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15> mask,
requires_arch<avx512f>) noexcept
{
constexpr uint32_t smask = (I0 & 0x3) | ((I1 & 0x3) << 2) | ((I2 & 0x3) << 4) | ((I3 & 0x3) << 6);
@ -1726,7 +1726,7 @@ namespace xsimd
}
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7>
-inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx512f>) noexcept
+inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx512f>) noexcept
{
constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3) | ((I4 & 0x1) << 4) | ((I5 & 0x1) << 5) | ((I6 & 0x1) << 6) | ((I7 & 0x1) << 7);
// shuffle within lane
@ -1917,39 +1917,39 @@ namespace xsimd
// swizzle (constant version)
template <class A, uint32_t... Vs>
-inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
-return swizzle(self, (batch<uint32_t, A>)mask, avx512f {});
+return swizzle(self, mask.as_batch(), avx512f {});
}
template <class A, uint64_t... Vs>
-inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
-return swizzle(self, (batch<uint64_t, A>)mask, avx512f {});
+return swizzle(self, mask.as_batch(), avx512f {});
}
template <class A, uint64_t... Vs>
-inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
-return swizzle(self, (batch<uint64_t, A>)mask, avx512f {});
+return swizzle(self, mask.as_batch(), avx512f {});
}
template <class A, uint64_t... Vs>
-inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
-return swizzle(self, (batch<uint64_t, A>)mask, avx512f {});
+return swizzle(self, mask.as_batch(), avx512f {});
}
template <class A, uint32_t... Vs>
-inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
-return swizzle(self, (batch<uint32_t, A>)mask, avx512f {});
+return swizzle(self, mask.as_batch(), avx512f {});
}
template <class A, uint32_t... Vs>
-inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
-return swizzle(self, (batch<uint32_t, A>)mask, avx512f {});
+return swizzle(self, mask.as_batch(), avx512f {});
}
namespace detail
@ -1973,14 +1973,14 @@ namespace xsimd
uint16_t I24, uint16_t I25, uint16_t I26, uint16_t I27, uint16_t I28, uint16_t I29, uint16_t I30, uint16_t I31>
struct fold_batch_constant
{
-using type = batch_constant<batch<uint32_t, A>, I0 / 2, I2 / 2, I4 / 2, I6 / 2, I8 / 2, I10 / 2, I12 / 2, I14 / 2,
+using type = batch_constant<uint32_t, A, I0 / 2, I2 / 2, I4 / 2, I6 / 2, I8 / 2, I10 / 2, I12 / 2, I14 / 2,
I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>;
};
}
template <class A, uint16_t... Idx, class _ = typename std::enable_if<detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value, void>::type>
-inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, Idx...>, requires_arch<avx512f>) noexcept
+inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...>, requires_arch<avx512f>) noexcept
{
constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
@ -1988,13 +1988,13 @@ namespace xsimd
template <class A>
inline batch<uint16_t, A>
-swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept
+swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept
{
// FIXME: this sequence is very inefficient, but it's here to catch
// a pattern generated by detail::reduce from xsimd_generic_math.hpp.
// The whole pattern is actually decently folded by GCC and Clang,
// so bear with it.
-constexpr batch_constant<batch<uint32_t, A>, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
+constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
alignas(A::alignment()) uint16_t buffer[32];
@ -2005,7 +2005,7 @@ namespace xsimd
template <class A, uint16_t... Vs>
inline batch<int16_t, A>
-swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512f>) noexcept
{
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, avx512f {}));
}

View File

@ -0,0 +1,757 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_EMULATED_HPP
#define XSIMD_EMULATED_HPP
#include <complex>
#include <limits>
#include <numeric>
#include <type_traits>
#include "../arch/xsimd_scalar.hpp"
#include "../types/xsimd_emulated_register.hpp"
#include "../types/xsimd_utils.hpp"
namespace xsimd
{
template <typename T, class A, bool... Values>
struct batch_bool_constant;
template <class T_out, class T_in, class A>
inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
{
using namespace types;
// fwd
template <class A, class T, size_t I>
inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
template <class A, typename T, typename ITy, ITy... Indices>
inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept;
namespace detail
{
template <size_t I, class F, class... Bs>
auto emulated_apply(F func, Bs const&... bs) -> decltype(func(bs.data[I]...))
{
return func(bs.data[I]...);
}
template <class F, class B, class... Bs, size_t... Is>
auto emulated_apply(F func, ::xsimd::detail::index_sequence<Is...>, B const& b, Bs const&... bs) -> std::array<decltype(func(b.data[0], bs.data[0]...)), B::size>
{
return { emulated_apply<Is>(func, b, bs...)... };
}
template <class B, class F, class... Bs>
auto emulated_apply(F func, B const& b, Bs const&... bs) -> std::array<decltype(func(b.data[0], bs.data[0]...)), B::size>
{
return emulated_apply(func, ::xsimd::detail::make_index_sequence<B::size>(), b, bs...);
}
}
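// Illustrative (not part of the diff): for a 4-lane batch,
// emulated_apply(f, a, b) expands index_sequence<0, 1, 2, 3> into
// { f(a.data[0], b.data[0]), ..., f(a.data[3], b.data[3]) };
// every lane-wise kernel below is thus one scalar call per lane.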
// abs
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> abs(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v)
{ return xsimd::abs(v); },
self);
}
// add
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::add(v0, v1); },
self, other);
}
// all
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline bool all(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return std::all_of(self.data.begin(), self.data.end(), [](T v)
{ return bool(v); });
}
// any
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline bool any(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return std::any_of(self.data.begin(), self.data.end(), [](T v)
{ return bool(v); });
}
// batch_bool_cast
template <class A, class T_out, class T_in, size_t N = 8 * sizeof(T_in) * batch<T_in, A>::size>
inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<emulated<N>>) noexcept
{
return { self.data };
}
// bitwise_and
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::bitwise_and(v0, v1); },
self, other);
}
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](bool v0, bool v1)
{ return xsimd::bitwise_and(v0, v1); },
self, other);
}
// bitwise_andnot
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::bitwise_andnot(v0, v1); },
self, other);
}
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](bool v0, bool v1)
{ return xsimd::bitwise_andnot(v0, v1); },
self, other);
}
// bitwise_lshift
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([other](T v)
{ return xsimd::bitwise_lshift(v, other); },
self);
}
// bitwise_not
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v)
{ return xsimd::bitwise_not(v); },
self);
}
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](bool v)
{ return xsimd::bitwise_not(v); },
self);
}
// bitwise_or
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::bitwise_or(v0, v1); },
self, other);
}
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](bool v0, bool v1)
{ return xsimd::bitwise_or(v0, v1); },
self, other);
}
// bitwise_rshift
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([other](T v)
{ return xsimd::bitwise_rshift(v, other); },
self);
}
// bitwise_xor
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::bitwise_xor(v0, v1); },
self, other);
}
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](bool v0, bool v1)
{ return xsimd::bitwise_xor(v0, v1); },
self, other);
}
// bitwise_cast
template <class A, class T_in, class T_out, size_t N = 8 * sizeof(T_in) * batch<T_in, A>::size>
inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& self, batch<T_out, A> const&, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T_out, A>::size;
std::array<T_out, size> result;
char* raw_data = reinterpret_cast<char*>(result.data());
const char* raw_input = reinterpret_cast<const char*>(self.data.data());
memcpy(raw_data, raw_input, size * sizeof(T_out));
return result;
}
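// Illustrative (not part of the diff): the memcpy is the usual well-defined
// type punning, so e.g. bitwise_cast<uint32_t>(some_float_batch)
// reinterprets each lane's bits without any value conversion.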
// broadcast
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
batch<T, A> inline broadcast(T val, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> r;
std::fill(r.begin(), r.end(), val);
return r;
}
// store_complex
namespace detail
{
// complex_low
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> complex_low(batch<std::complex<T>, A> const& self, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> result;
for (size_t i = 0; i < size / 2; ++i)
{
result[2 * i] = self.real().data[i];
result[1 + 2 * i] = self.imag().data[i];
}
return result;
}
// complex_high
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> complex_high(batch<std::complex<T>, A> const& self, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> result;
for (size_t i = 0; i < size / 2; ++i)
{
result[2 * i] = self.real().data[i + size / 2];
result[1 + 2 * i] = self.imag().data[i + size / 2];
}
return result;
}
}
// decr_if
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<emulated<N>>) noexcept
{
return self - batch<T, A>(mask.data);
}
// div
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> div(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::div(v0, v1); },
self, other);
}
// fast_cast
namespace detail
{
template <class A, size_t N = 8 * sizeof(float) * batch<float, A>::size>
inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](int32_t v)
{ return float(v); },
self);
}
template <class A, size_t N = 8 * sizeof(float) * batch<float, A>::size>
inline batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](uint32_t v)
{ return float(v); },
self);
}
template <class A, size_t N = 8 * sizeof(double) * batch<double, A>::size>
inline batch<double, A> fast_cast(batch<int64_t, A> const& self, batch<double, A> const&, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](int64_t v)
{ return double(v); },
self);
}
template <class A, size_t N = 8 * sizeof(double) * batch<double, A>::size>
inline batch<double, A> fast_cast(batch<uint64_t, A> const& self, batch<double, A> const&, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](uint64_t v)
{ return double(v); },
self);
}
template <class A, size_t N = 8 * sizeof(int32_t) * batch<int32_t, A>::size>
inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](float v)
{ return int32_t(v); },
self);
}
template <class A, size_t N = 8 * sizeof(double) * batch<double, A>::size>
inline batch<int64_t, A> fast_cast(batch<double, A> const& self, batch<int64_t, A> const&, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](double v)
{ return int64_t(v); },
self);
}
}
// eq
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, emulated<N>> eq(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::eq(v0, v1); },
self, other);
}
template <class A, class T, size_t N = 8 * sizeof(T) * batch_bool<T, A>::size>
inline batch_bool<T, emulated<N>> eq(batch_bool<T, emulated<N>> const& self, batch_bool<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](bool v0, bool v1)
{ return xsimd::eq(v0, v1); },
self, other);
}
// from_bool
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](bool v)
{ return T(v); },
self);
}
// from_mask
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<bool, size> vmask;
for (size_t i = 0; i < size; ++i)
vmask[i] = (mask >> i) & 1u;
return vmask;
}
// ge
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, emulated<N>> ge(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::ge(v0, v1); },
self, other);
}
// gt
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, emulated<N>> gt(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::gt(v0, v1); },
self, other);
}
// haddp
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> haddp(batch<T, A> const* row, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> r;
for (size_t i = 0; i < size; ++i)
r[i] = std::accumulate(row[i].data.begin() + 1, row[i].data.end(), row[i].data.front());
return r;
}
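// Illustrative (not part of the diff): with 4 lanes,
// haddp({r0, r1, r2, r3}) returns { sum(r0), sum(r1), sum(r2), sum(r3) },
// matching the horizontal-add-then-pack contract of the SIMD kernels.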
// incr_if
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<emulated<N>>) noexcept
{
return self + batch<T, A>(mask.data);
}
// insert
template <class A, class T, size_t I, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<emulated<N>>) noexcept
{
batch<T, A> other = self;
other.data[I] = val;
return other;
}
// isnan
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size, class = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
inline batch_bool<T, A> isnan(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v)
{ return xsimd::isnan(v); },
self);
}
// load_aligned
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> res;
std::copy(mem, mem + size, res.begin());
return res;
}
// load_unaligned
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> res;
std::copy(mem, mem + size, res.begin());
return res;
}
// load_complex
namespace detail
{
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<std::complex<T>, A> load_complex(batch<T, A> const& hi, batch<T, A> const& lo, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> real, imag;
for (size_t i = 0; i < size / 2; ++i)
{
real[i] = hi.data[2 * i];
imag[i] = hi.data[1 + 2 * i];
}
for (size_t i = 0; i < size / 2; ++i)
{
real[size / 2 + i] = lo.data[2 * i];
imag[size / 2 + i] = lo.data[1 + 2 * i];
}
return { real, imag };
}
}
// le
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, emulated<N>> le(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::le(v0, v1); },
self, other);
}
// lt
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, emulated<N>> lt(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::lt(v0, v1); },
self, other);
}
// mask
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
uint64_t res = 0;
for (size_t i = 0; i < size; ++i)
res |= (self.data[i] ? 1u : 0u) << i;
return res;
}
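// Illustrative (not part of the diff): mask above is the inverse of
// from_mask. Lanes {true, false, true, true} produce 0b1101, and
// from_mask(..., 0b1101, ...) rebuilds the same lane pattern.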
// max
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::max(v0, v1); },
self, other);
}
// min
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::min(v0, v1); },
self, other);
}
// mul
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::mul(v0, v1); },
self, other);
}
// nearbyint_as_int
template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<as_integer_t<T>, A> nearbyint_as_int(batch<T, A> const& self,
requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v)
{ return xsimd::nearbyint_as_int(v); },
self);
}
// neg
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> neg(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v)
{ return xsimd::neg(v); },
self);
}
// neq
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::neq(v0, v1); },
self, other);
}
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](bool v0, bool v1)
{ return xsimd::neq(v0, v1); },
self, other);
}
// reduce_add
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline T reduce_add(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> buffer;
self.store_unaligned(buffer.data());
return std::accumulate(buffer.begin() + 1, buffer.end(), *buffer.begin());
}
// reduce_max
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline T reduce_max(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return std::accumulate(self.data.begin() + 1, self.data.end(), *self.data.begin(), [](T const& x, T const& y)
{ return xsimd::max(x, y); });
}
// reduce_min
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline T reduce_min(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return std::accumulate(self.data.begin() + 1, self.data.end(), *self.data.begin(), [](T const& x, T const& y)
{ return xsimd::min(x, y); });
}
// rsqrt
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> rsqrt(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v)
{ return xsimd::rsqrt(v); },
self);
}
// select
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](bool c, T t, T f)
{ return xsimd::select(c, t, f); },
cond, true_br, false_br);
}
template <class A, class T, bool... Values>
inline batch<T, A> select(batch_bool_constant<T, A, Values...> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<emulated<8 * sizeof(T) * batch<T, A>::size>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
static_assert(sizeof...(Values) == size, "consistent init");
return select((batch_bool<T, A>)cond, true_br, false_br, emulated<8 * sizeof(T) * size> {});
}
// shuffle
template <class A, typename T, class ITy, ITy... Is>
inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Is...> mask, requires_arch<emulated<8 * sizeof(T) * batch<T, A>::size>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
batch<ITy, A> bmask = mask;
std::array<T, size> res;
for (size_t i = 0; i < size; ++i)
res[i] = bmask.data[i] < size ? x.data[bmask.data[i]] : y.data[bmask.data[i] - size];
return res;
}
// sqrt
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> sqrt(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v)
{ return xsimd::sqrt(v); },
self);
}
// slide_left
template <size_t M, class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> result;
char* raw_data = reinterpret_cast<char*>(result.data());
memset(raw_data, 0, M);
memcpy(raw_data + M, reinterpret_cast<const char*>(x.data.data()), sizeof(T) * result.size() - M);
return result;
}
// slide_right
template <size_t M, class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
std::array<T, size> result;
char* raw_data = reinterpret_cast<char*>(result.data());
memcpy(raw_data, reinterpret_cast<const char*>(x.data.data()) + M, sizeof(T) * result.size() - M);
memset(raw_data + sizeof(T) * result.size() - M, 0, M);
return result;
}
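// Illustrative (not part of the diff): M counts bytes, not lanes. For
// batch<uint32_t> x = {1, 2, 3, 4}, slide_left<4>(x) == {0, 1, 2, 3} and
// slide_right<4>(x) == {2, 3, 4, 0}: one 4-byte lane shifted, zero-filled
// on the vacated end.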
// sadd
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::sadd(v0, v1); },
self, other);
}
// set
template <class A, class T, size_t N, class... Values>
inline batch<T, emulated<N>> set(batch<T, emulated<N>> const&, requires_arch<emulated<N>>, Values... values) noexcept
{
static_assert(sizeof...(Values) == batch<T, emulated<N>>::size, "consistent init");
return { typename batch<T, emulated<N>>::register_type { static_cast<T>(values)... } };
}
template <class A, class T, size_t N, class... Values>
inline batch_bool<T, emulated<N>> set(batch_bool<T, emulated<N>> const&, requires_arch<emulated<N>>, Values... values) noexcept
{
static_assert(sizeof...(Values) == batch<T, emulated<N>>::size, "consistent init");
return { std::array<bool, sizeof...(Values)> { static_cast<bool>(values)... } };
}
// ssub
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::ssub(v0, v1); },
self, other);
}
// store_aligned
template <class A, class T, size_t N>
inline void store_aligned(T* mem, batch<T, emulated<N>> const& self, requires_arch<emulated<N>>) noexcept
{
std::copy(self.data.begin(), self.data.end(), mem);
}
// store_unaligned
template <class A, class T, size_t N>
inline void store_unaligned(T* mem, batch<T, emulated<N>> const& self, requires_arch<emulated<N>>) noexcept
{
std::copy(self.data.begin(), self.data.end(), mem);
}
// sub
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
return detail::emulated_apply([](T v0, T v1)
{ return xsimd::sub(v0, v1); },
self, other);
}
// swizzle
template <class A, typename T, class ITy, ITy... Is>
inline batch<T, A> swizzle(batch<T, A> const& self, batch_constant<ITy, A, Is...> mask, requires_arch<emulated<8 * sizeof(T) * batch<T, A>::size>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
batch<ITy, A> bmask = mask;
std::array<T, size> res;
for (size_t i = 0; i < size; ++i)
res[i] = self.data[bmask.data[i]];
return res;
}
// zip_hi
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
// Note: irregular behavior for odd numbers.
std::array<T, size> res;
if (size % 2)
{
for (size_t i = 0; i < size; ++i)
res[i] = (i % 2 ? self : other).data[size / 2 + i / 2];
}
else
{
for (size_t i = 0; i < size; ++i)
res[i] = (i % 2 ? other : self).data[size / 2 + i / 2];
}
return res;
}
// zip_lo
template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size>
inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept
{
constexpr size_t size = batch<T, A>::size;
// Note: irregular behavior for odd numbers.
std::array<T, size> res;
for (size_t i = 0; i < size; ++i)
res[i] = (i % 2 ? other : self).data[i / 2];
return res;
}
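// Illustrative (not part of the diff): for 4 lanes,
// zip_lo({a0, a1, a2, a3}, {b0, b1, b2, b3}) == {a0, b0, a1, b1} and
// zip_hi of the same operands == {a2, b2, a3, b3}.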
}
}
#endif

View File

@ -16,6 +16,10 @@
#include "./xsimd_generic_fwd.hpp"
+#if XSIMD_WITH_EMULATED
+#include "./xsimd_emulated.hpp"
+#endif
#if XSIMD_WITH_SSE2
#include "./xsimd_sse2.hpp"
#endif

View File

@ -146,7 +146,7 @@ inline float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg;
namespace xsimd
{
-template <class batch_type, bool... Values>
+template <typename T, class A, bool... Values>
struct batch_bool_constant;
namespace kernel
@ -1743,7 +1743,7 @@ namespace xsimd
}
template <class A, class T, bool... b, detail::enable_neon_type_t<T> = 0>
-inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<neon>) noexcept
+inline batch<T, A> select(batch_bool_constant<T, A, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<neon>) noexcept
{
return select(batch_bool<T, A> { b... }, true_br, false_br, neon {});
}
@ -2717,7 +2717,7 @@ namespace xsimd
}
}
-template <class batch_type, typename batch_type::value_type... Values>
+template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
@ -2728,7 +2728,7 @@ namespace xsimd
template <class A, class T, class I, I... idx>
inline batch<T, A> swizzle(batch<T, A> const& self,
-batch_constant<batch<I, A>, idx...>,
+batch_constant<I, A, idx...>,
requires_arch<neon>) noexcept
{
static_assert(batch<T, A>::size == sizeof...(idx), "valid swizzle indices");

View File

@ -21,7 +21,7 @@
namespace xsimd
{
-template <class batch_type, bool... Values>
+template <typename T, class A, bool... Values>
struct batch_bool_constant;
namespace kernel
@ -942,7 +942,7 @@ namespace xsimd
}
template <class A, bool... b>
-inline batch<double, A> select(batch_bool_constant<batch<double, A>, b...> const&,
+inline batch<double, A> select(batch_bool_constant<double, A, b...> const&,
batch<double, A> const& true_br,
batch<double, A> const& false_br,
requires_arch<neon64>) noexcept
@ -1243,7 +1243,7 @@ namespace xsimd
}
}
-template <class batch_type, typename batch_type::value_type... Values>
+template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
@ -1354,42 +1354,40 @@ namespace xsimd
template <class CB1, class CB2, class IS>
struct index_burst_impl;
-template <class B1, class B2, typename B2::value_type... V,
-typename B2::value_type... incr>
-struct index_burst_impl<batch_constant<B1>, batch_constant<B2, V...>,
-integer_sequence<typename B2::value_type, incr...>>
+template <typename T1, class A, typename T2, T2... V,
+T2... incr>
+struct index_burst_impl<batch_constant<T1, A>, batch_constant<T2, A, V...>,
+integer_sequence<T2, incr...>>
{
-using type = batch_constant<B2, V...>;
+using type = batch_constant<T2, A, V...>;
};
-template <class B1, typename B1::value_type V0, typename B1::value_type... V1,
-class B2, typename B2::value_type... V2,
-typename B2::value_type... incr>
-struct index_burst_impl<batch_constant<B1, V0, V1...>, batch_constant<B2, V2...>,
-integer_sequence<typename B2::value_type, incr...>>
+template <typename T1, class A, T1 V0, T1... V1,
+typename T2, T2... V2, T2... incr>
+struct index_burst_impl<batch_constant<T1, A, V0, V1...>, batch_constant<T2, A, V2...>,
+integer_sequence<T2, incr...>>
{
-using value_type = typename B2::value_type;
-using next_input = batch_constant<B1, V1...>;
-using next_output = batch_constant<B2, V2..., (V0 + incr)...>;
-using type = typename index_burst_impl<next_input, next_output, integer_sequence<value_type, incr...>>::type;
+using next_input = batch_constant<T1, A, V1...>;
+using next_output = batch_constant<T2, A, V2..., (V0 + incr)...>;
+using type = typename index_burst_impl<next_input, next_output, integer_sequence<T2, incr...>>::type;
};
template <class B, class T>
struct index_burst;
-template <class B, typename B::value_type... V, class T>
-struct index_burst<batch_constant<B, V...>, T>
+template <typename Tp, class A, Tp... V, typename T>
+struct index_burst<batch_constant<Tp, A, V...>, T>
{
-static constexpr size_t mul = sizeof(typename B::value_type) / sizeof(T);
-using input = batch_constant<B, (mul * V)...>;
-using output = batch_constant<batch<T, typename B::arch_type>>;
+static constexpr size_t mul = sizeof(Tp) / sizeof(T);
+using input = batch_constant<Tp, A, (mul * V)...>;
+using output = batch_constant<T, A>;
using type = typename index_burst_impl<input, output, make_integer_sequence<T, mul>>::type;
};
-template <class B, class T>
+template <class B, typename T>
using index_burst_t = typename index_burst<B, T>::type;
-template <class T, class B>
+template <typename T, class B>
inline index_burst_t<B, T> burst_index(B)
{
return index_burst_t<B, T>();
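// Illustrative (not part of the diff): burst_index widens a coarse index to
// byte granularity for the vqtbl1q-based swizzles below. With T = uint8_t,
// mul = sizeof(uint32_t) / sizeof(uint8_t) = 4, so
// batch_constant<uint32_t, A, 1, 0, 3, 2> bursts to
// batch_constant<uint8_t, A, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11>.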
@ -1399,7 +1397,7 @@ namespace xsimd
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self,
-batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
+batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
requires_arch<neon64>) noexcept
{
return vqtbl1q_u8(self, batch<uint8_t, A>(idx));
@ -1408,7 +1406,7 @@ namespace xsimd
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self,
batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
requires_arch<neon64>) noexcept
{
return vqtbl1q_s8(self, batch<uint8_t, A>(idx));
@ -1416,7 +1414,7 @@ namespace xsimd
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self,
batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx,
batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> idx,
requires_arch<neon64>) noexcept
{
using batch_type = batch<uint8_t, A>;
@ -1425,7 +1423,7 @@ namespace xsimd
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self,
batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx,
batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> idx,
requires_arch<neon64>) noexcept
{
using batch_type = batch<int8_t, A>;
@ -1434,7 +1432,7 @@ namespace xsimd
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self,
batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
requires_arch<neon64>) noexcept
{
using batch_type = batch<uint8_t, A>;
@ -1443,7 +1441,7 @@ namespace xsimd
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self,
batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
requires_arch<neon64>) noexcept
{
using batch_type = batch<int8_t, A>;
@ -1452,7 +1450,7 @@ namespace xsimd
template <class A, uint64_t V0, uint64_t V1>
inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self,
batch_constant<batch<uint64_t, A>, V0, V1> idx,
batch_constant<uint64_t, A, V0, V1> idx,
requires_arch<neon64>) noexcept
{
using batch_type = batch<uint8_t, A>;
@ -1461,7 +1459,7 @@ namespace xsimd
template <class A, uint64_t V0, uint64_t V1>
inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self,
batch_constant<batch<uint64_t, A>, V0, V1> idx,
batch_constant<uint64_t, A, V0, V1> idx,
requires_arch<neon64>) noexcept
{
using batch_type = batch<int8_t, A>;
@ -1470,7 +1468,7 @@ namespace xsimd
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<float, A> swizzle(batch<float, A> const& self,
batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
requires_arch<neon64>) noexcept
{
using batch_type = batch<uint8_t, A>;
@ -1479,7 +1477,7 @@ namespace xsimd
template <class A, uint64_t V0, uint64_t V1>
inline batch<double, A> swizzle(batch<double, A> const& self,
batch_constant<batch<uint64_t, A>, V0, V1> idx,
batch_constant<uint64_t, A, V0, V1> idx,
requires_arch<neon64>) noexcept
{
using batch_type = batch<uint8_t, A>;
@ -1488,7 +1486,7 @@ namespace xsimd
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<std::complex<float>, A> swizzle(batch<std::complex<float>, A> const& self,
batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
batch_constant<uint32_t, A, V0, V1, V2, V3> idx,
requires_arch<neon64>) noexcept
{
return batch<std::complex<float>>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A()));
@ -1496,7 +1494,7 @@ namespace xsimd
template <class A, uint64_t V0, uint64_t V1>
inline batch<std::complex<double>, A> swizzle(batch<std::complex<double>, A> const& self,
batch_constant<batch<uint64_t, A>, V0, V1> idx,
batch_constant<uint64_t, A, V0, V1> idx,
requires_arch<neon64>) noexcept
{
return batch<std::complex<double>>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A()));
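
The pattern repeated throughout this file, and through the rest of the patch, is that index masks now spell out the scalar type and architecture directly instead of wrapping them in a batch. A minimal call-site sketch under that new spelling, assuming a NEON64 target with xsimd on the include path (the function name is illustrative, not part of the patch):

#include <cstdint>
#include "xsimd/xsimd.hpp"

// Reverse the four float lanes with a compile-time mask.
inline xsimd::batch<float, xsimd::neon64> reverse4(xsimd::batch<float, xsimd::neon64> v)
{
    // Old spelling: batch_constant<batch<uint32_t, neon64>, 3, 2, 1, 0>.
    return xsimd::swizzle(v, xsimd::batch_constant<uint32_t, xsimd::neon64, 3, 2, 1, 0> {});
}
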

View File

@ -284,7 +284,7 @@
namespace xsimd
{
template <class batch_type, typename batch_type::value_type... Values>
template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
@ -1150,7 +1150,7 @@ namespace xsimd
// swizzle
template <class A, class T, class I, I... idx>
inline batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<batch<I, A>, idx...>, requires_arch<rvv>) noexcept
inline batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<I, A, idx...>, requires_arch<rvv>) noexcept
{
static_assert(batch<T, A>::size == sizeof...(idx), "invalid swizzle indices");
const batch<I, A> indices { idx... };
@ -1159,11 +1159,11 @@ namespace xsimd
template <class A, class T, class I, I... idx>
inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self,
batch_constant<batch<I, A>, idx...>,
batch_constant<I, A, idx...>,
requires_arch<rvv>) noexcept
{
const auto real = swizzle(self.real(), batch_constant<batch<I, A>, idx...> {}, rvv {});
const auto imag = swizzle(self.imag(), batch_constant<batch<I, A>, idx...> {}, rvv {});
const auto real = swizzle(self.real(), batch_constant<I, A, idx...> {}, rvv {});
const auto imag = swizzle(self.imag(), batch_constant<I, A, idx...> {}, rvv {});
return batch<std::complex<T>>(real, imag);
}
@ -1188,7 +1188,7 @@ namespace xsimd
}
template <class A, class T, bool... b>
inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<rvv>) noexcept
inline batch<T, A> select(batch_bool_constant<T, A, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<rvv>) noexcept
{
return select(batch_bool<T, A> { b... }, true_br, false_br, rvv {});
}

View File

@ -86,6 +86,57 @@ namespace xsimd
using std::tgamma;
using std::trunc;
inline signed char abs(signed char v)
{
return v < 0 ? -v : v;
}
namespace detail
{
// Use a templated type here to prevent automatic instantiation that may
// end up in a warning
template <typename char_type>
inline char abs(char_type v, std::true_type)
{
return v;
}
template <typename char_type>
inline char abs(char_type v, std::false_type)
{
return v < 0 ? -v : v;
}
}
inline char abs(char v)
{
return detail::abs(v, std::is_unsigned<char>::type {});
}
inline short abs(short v)
{
return v < 0 ? -v : v;
}
inline unsigned char abs(unsigned char v)
{
return v;
}
inline unsigned short abs(unsigned short v)
{
return v;
}
inline unsigned int abs(unsigned int v)
{
return v;
}
inline unsigned long abs(unsigned long v)
{
return v;
}
inline unsigned long long abs(unsigned long long v)
{
return v;
}
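
The dedicated `char` overload above exists because the signedness of plain `char` is implementation-defined; the tag dispatch picks an overload at compile time so `-v` is never evaluated when `char` is unsigned. A standalone sketch of the same trick (names are illustrative):

#include <cstdio>
#include <type_traits>

template <typename C>
char abs_impl(C v, std::true_type) { return v; } // char is unsigned: already non-negative
template <typename C>
char abs_impl(C v, std::false_type) { return v < 0 ? -v : v; } // char is signed

int main()
{
    // Resolves to one overload or the other depending on the target ABI.
    std::printf("%d\n", abs_impl(char(-3), std::is_unsigned<char>::type {}));
}
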
#ifndef _WIN32
using std::isfinite;
using std::isinf;
@ -137,7 +188,7 @@ namespace xsimd
#endif
template <class T, class Tp>
inline auto add(T const& x, Tp const& y) noexcept -> decltype(x + y)
inline typename std::common_type<T, Tp>::type add(T const& x, Tp const& y) noexcept
{
return x + y;
}
@ -209,6 +260,15 @@ namespace xsimd
return x & y;
}
template <class T_out, class T_in>
inline T_out bitwise_cast(T_in x) noexcept
{
static_assert(sizeof(T_in) == sizeof(T_out), "bitwise_cast between types of the same size");
T_out r;
std::memcpy((void*)&r, (void*)&x, sizeof(T_in));
return r;
}
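
The scalar bitwise_cast above is the strict-aliasing-safe way to reinterpret an object representation, and it replaces the ad-hoc memcpy pairs in the overloads that follow. A hypothetical usage sketch:

#include <cstdint>
#include <cstdio>
#include <cstring>

template <class T_out, class T_in>
T_out bit_cast_sketch(T_in x) // same contract as the bitwise_cast above
{
    static_assert(sizeof(T_in) == sizeof(T_out), "types must have the same size");
    T_out r;
    std::memcpy(&r, &x, sizeof(T_in));
    return r;
}

int main()
{
    std::printf("%08x\n", (unsigned)bit_cast_sketch<uint32_t>(1.0f)); // IEEE-754: 3f800000
}
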
inline float bitwise_and(float x, float y) noexcept
{
uint32_t ix, iy;
@ -231,35 +291,6 @@ namespace xsimd
return r;
}
template <class T>
inline typename std::enable_if<std::is_integral<T>::value, T>::type
bitwise_andnot(T x, T y) noexcept
{
return x & ~y;
}
inline float bitwise_andnot(float x, float y) noexcept
{
uint32_t ix, iy;
std::memcpy((void*)&ix, (void*)&x, sizeof(float));
std::memcpy((void*)&iy, (void*)&y, sizeof(float));
uint32_t ir = bitwise_andnot(ix, iy);
float r;
std::memcpy((void*)&r, (void*)&ir, sizeof(float));
return r;
}
inline double bitwise_andnot(double x, double y) noexcept
{
uint64_t ix, iy;
std::memcpy((void*)&ix, (void*)&x, sizeof(double));
std::memcpy((void*)&iy, (void*)&y, sizeof(double));
uint64_t ir = bitwise_andnot(ix, iy);
double r;
std::memcpy((void*)&r, (void*)&ir, sizeof(double));
return r;
}
template <class T0, class T1>
inline typename std::enable_if<std::is_integral<T0>::value && std::is_integral<T1>::value, T0>::type
bitwise_lshift(T0 x, T1 shift) noexcept
@ -281,6 +312,11 @@ namespace xsimd
return ~x;
}
inline bool bitwise_not(bool x) noexcept
{
return !x;
}
inline float bitwise_not(float x) noexcept
{
uint32_t ix;
@ -301,6 +337,12 @@ namespace xsimd
return r;
}
template <class T>
inline typename std::enable_if<std::is_scalar<T>::value, T>::type bitwise_andnot(T x, T y) noexcept
{
return bitwise_and(x, bitwise_not(y));
}
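
This composition is what allows the per-type bitwise_andnot overloads above to be deleted: once bitwise_and and bitwise_not each have integer and floating-point versions, andnot follows for every scalar type. A quick sanity sketch of the identity:

#include <cassert>
#include <cstdint>

int main()
{
    uint32_t x = 0xC, y = 0xA; // binary 1100 and 1010
    assert((x & ~y) == 0x4);   // bitwise_andnot(x, y) == bitwise_and(x, bitwise_not(y))
    return 0;
}
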
template <class T>
inline typename std::enable_if<std::is_integral<T>::value, T>::type
bitwise_or(T x, T y) noexcept
@ -360,7 +402,7 @@ namespace xsimd
}
template <class T, class Tp>
inline auto div(T const& x, Tp const& y) noexcept -> decltype(x / y)
inline typename std::common_type<T, Tp>::type div(T const& x, Tp const& y) noexcept
{
return x / y;
}
@ -372,13 +414,13 @@ namespace xsimd
}
template <class T, class Tp>
inline auto mul(T const& x, Tp const& y) noexcept -> decltype(x * y)
inline typename std::common_type<T, Tp>::type mul(T const& x, Tp const& y) noexcept
{
return x * y;
}
template <class T>
inline auto neg(T const& x) noexcept -> decltype(-x)
inline T neg(T const& x) noexcept
{
return -x;
}
@ -776,9 +818,9 @@ namespace xsimd
}
template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
inline bool bitofsign(T const& x) noexcept
inline T bitofsign(T const& x) noexcept
{
return x < T(0);
return T(x < T(0));
}
template <class T>
@ -842,7 +884,7 @@ namespace xsimd
}
template <class T, class Tp>
inline auto sub(T const& x, Tp const& y) noexcept -> decltype(x - y)
inline typename std::common_type<T, Tp>::type sub(T const& x, Tp const& y) noexcept
{
return x - y;
}

View File

@ -20,13 +20,13 @@
namespace xsimd
{
template <class batch_type, bool... Values>
template <typename T, class A, bool... Values>
struct batch_bool_constant;
template <class T_out, class T_in, class A>
inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
template <class batch_type, typename batch_type::value_type... Values>
template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
@ -59,7 +59,7 @@ namespace xsimd
template <class A, class T, size_t I>
inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
template <class A, typename T, typename ITy, ITy... Indices>
inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept;
inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept;
template <class A, class T>
inline batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept;
template <class A, class T>
@ -1216,6 +1216,43 @@ namespace xsimd
return _mm_cvtss_f32(tmp1);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline T reduce_add(batch<T, A> const& self, requires_arch<sse2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
__m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
__m128i tmp2 = _mm_add_epi32(self, tmp1);
__m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01);
__m128i tmp4 = _mm_add_epi32(tmp2, tmp3);
return _mm_cvtsi128_si32(tmp4);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
__m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
__m128i tmp2 = _mm_add_epi64(self, tmp1);
#if defined(__x86_64__)
return _mm_cvtsi128_si64(tmp2);
#else
__m128i m;
_mm_storel_epi64(&m, tmp2);
int64_t i;
std::memcpy(&i, &m, sizeof(i));
return i;
#endif
}
else
{
return hadd(self, generic {});
}
}
template <class A>
inline double reduce_add(batch<double, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self)));
}
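
The 32-bit branch above folds the vector in half twice: _mm_shuffle_epi32 with 0x0E brings lanes 2 and 3 down to positions 0 and 1, and 0x01 then brings lane 1 down to position 0. A scalar model of the same dataflow:

#include <cstdint>
#include <cstdio>

int main()
{
    int32_t v[4] = { 1, 2, 3, 4 };
    int32_t fold0 = v[0] + v[2]; // lane 0 after the 0x0E shuffle + add
    int32_t fold1 = v[1] + v[3]; // lane 1 after the 0x0E shuffle + add
    std::printf("%d\n", fold0 + fold1); // 0x01 shuffle + add -> 10
}
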
// reduce_max
template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
inline T reduce_max(batch<T, A> const& self, requires_arch<sse2>) noexcept
@ -1260,42 +1297,6 @@ namespace xsimd
return acc3.get(0);
}
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline T reduce_add(batch<T, A> const& self, requires_arch<sse2>) noexcept
{
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
{
__m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
__m128i tmp2 = _mm_add_epi32(self, tmp1);
__m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01);
__m128i tmp4 = _mm_add_epi32(tmp2, tmp3);
return _mm_cvtsi128_si32(tmp4);
}
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
{
__m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
__m128i tmp2 = _mm_add_epi64(self, tmp1);
#if defined(__x86_64__)
return _mm_cvtsi128_si64(tmp2);
#else
__m128i m;
_mm_storel_epi64(&m, tmp2);
int64_t i;
std::memcpy(&i, &m, sizeof(i));
return i;
#endif
}
else
{
return hadd(self, generic {});
}
}
template <class A>
inline double reduce_add(batch<double, A> const& self, requires_arch<sse2>) noexcept
{
return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self)));
}
// rsqrt
template <class A>
inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
@ -1321,7 +1322,7 @@ namespace xsimd
return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br));
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
{
return select(batch_bool<T, A> { Values... }, true_br, false_br, sse2 {});
}
@ -1333,7 +1334,7 @@ namespace xsimd
// shuffle
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3> mask, requires_arch<sse2>) noexcept
inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<sse2>) noexcept
{
constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
// shuffle within lane
@ -1347,7 +1348,7 @@ namespace xsimd
}
template <class A, class ITy, ITy I0, ITy I1>
inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1> mask, requires_arch<sse2>) noexcept
inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1> mask, requires_arch<sse2>) noexcept
{
constexpr uint32_t smask = detail::mod_shuffle(I0, I1);
// shuffle within lane
@ -1600,41 +1601,41 @@ namespace xsimd
// swizzle
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
{
constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
return _mm_shuffle_ps(self, self, index);
}
template <class A, uint64_t V0, uint64_t V1>
inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<sse2>) noexcept
inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<sse2>) noexcept
{
constexpr uint32_t index = detail::shuffle(V0, V1);
return _mm_shuffle_pd(self, self, index);
}
template <class A, uint64_t V0, uint64_t V1>
inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<sse2>) noexcept
inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<sse2>) noexcept
{
constexpr uint32_t index = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1);
return _mm_shuffle_epi32(self, index);
}
template <class A, uint64_t V0, uint64_t V1>
inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1> mask, requires_arch<sse2>) noexcept
inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1> mask, requires_arch<sse2>) noexcept
{
return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, sse2 {}));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
{
constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
return _mm_shuffle_epi32(self, index);
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> mask, requires_arch<sse2>) noexcept
inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3> mask, requires_arch<sse2>) noexcept
{
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {}));
}

View File

@ -284,9 +284,9 @@ namespace xsimd
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
{
constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
constexpr int mask = batch_bool_constant<T, A, Values...>::mask();
XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
{
return _mm_blend_epi16(false_br, true_br, mask);
@ -304,19 +304,19 @@ namespace xsimd
}
else
{
return select(batch_bool_constant<batch<T, A>, Values...>(), true_br, false_br, ssse3 {});
return select(batch_bool_constant<T, A, Values...>(), true_br, false_br, ssse3 {});
}
}
template <class A, bool... Values>
inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
inline batch<float, A> select(batch_bool_constant<float, A, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
{
constexpr int mask = batch_bool_constant<batch<float, A>, Values...>::mask();
constexpr int mask = batch_bool_constant<float, A, Values...>::mask();
return _mm_blend_ps(false_br, true_br, mask);
}
template <class A, bool... Values>
inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
inline batch<double, A> select(batch_bool_constant<double, A, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
{
constexpr int mask = batch_bool_constant<batch<double, A>, Values...>::mask();
constexpr int mask = batch_bool_constant<double, A, Values...>::mask();
return _mm_blend_pd(false_br, true_br, mask);
}

View File

@ -140,32 +140,32 @@ namespace xsimd
// swizzle (constant mask)
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<ssse3>) noexcept
inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<ssse3>) noexcept
{
constexpr batch_constant<batch<uint8_t, A>, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1,
constexpr batch_constant<uint8_t, A, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1,
2 * V4, 2 * V4 + 1, 2 * V5, 2 * V5 + 1, 2 * V6, 2 * V6 + 1, 2 * V7, 2 * V7 + 1>
mask8;
return _mm_shuffle_epi8(self, (batch<uint8_t, A>)mask8);
return _mm_shuffle_epi8(self, mask8.as_batch());
}
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept
{
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, ssse3 {}));
}
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
{
return swizzle(self, (batch<uint8_t, A>)mask, ssse3 {});
return swizzle(self, mask.as_batch(), ssse3 {});
}
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
{
return swizzle(self, (batch<uint8_t, A>)mask, ssse3 {});
return swizzle(self, mask.as_batch(), ssse3 {});
}
}

View File

@ -20,7 +20,7 @@
namespace xsimd
{
template <class batch_type, typename batch_type::value_type... Values>
template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
@ -739,19 +739,19 @@ namespace xsimd
// swizzle (static)
template <class A, class T, class I, I... idx>
inline batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<batch<I, A>, idx...> indices, requires_arch<sve>) noexcept
inline batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<I, A, idx...> indices, requires_arch<sve>) noexcept
{
static_assert(batch<T, A>::size == sizeof...(idx), "invalid swizzle indices");
return swizzle(arg, (batch<I, A>)indices, sve {});
return swizzle(arg, indices.as_batch(), sve {});
}
template <class A, class T, class I, I... idx>
inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& arg,
batch_constant<batch<I, A>, idx...> indices,
batch_constant<I, A, idx...> indices,
requires_arch<sve>) noexcept
{
static_assert(batch<std::complex<T>, A>::size == sizeof...(idx), "invalid swizzle indices");
return swizzle(arg, (batch<I, A>)indices, sve {});
return swizzle(arg, indices.as_batch(), sve {});
}
/*************
@ -811,7 +811,7 @@ namespace xsimd
}
template <class A, class T, bool... b>
inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sve>) noexcept
inline batch<T, A> select(batch_bool_constant<T, A, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sve>) noexcept
{
return select(batch_bool<T, A> { b... }, true_br, false_br, sve {});
}

View File

@ -19,13 +19,13 @@
namespace xsimd
{
template <class batch_type, bool... Values>
template <typename T, class A, bool... Values>
struct batch_bool_constant;
template <class T_out, class T_in, class A>
inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
template <class batch_type, typename batch_type::value_type... Values>
template <typename T, class A, T... Values>
struct batch_constant;
namespace kernel
@ -36,7 +36,7 @@ namespace xsimd
template <class A, class T, size_t I>
inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
template <class A, typename T, typename ITy, ITy... Indices>
inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept;
inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept;
template <class A, class T>
inline batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept;
@ -1275,7 +1275,7 @@ namespace xsimd
return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond));
}
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<wasm>) noexcept
inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<wasm>) noexcept
{
return select(batch_bool<T, A> { Values... }, true_br, false_br, wasm {});
}
@ -1287,13 +1287,13 @@ namespace xsimd
// shuffle
template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3>, requires_arch<wasm>) noexcept
inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3>, requires_arch<wasm>) noexcept
{
return wasm_i32x4_shuffle(x, y, I0, I1, I2, I3);
}
template <class A, class ITy, ITy I0, ITy I1>
inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1>, requires_arch<wasm>) noexcept
inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1>, requires_arch<wasm>) noexcept
{
return wasm_i64x2_shuffle(x, y, I0, I1);
}
@ -1515,63 +1515,63 @@ namespace xsimd
// swizzle
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
{
return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3);
}
template <class A, uint64_t V0, uint64_t V1>
inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<wasm>) noexcept
inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<wasm>) noexcept
{
return wasm_i64x2_shuffle(self, self, V0, V1);
}
template <class A, uint64_t V0, uint64_t V1>
inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<wasm>) noexcept
inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<wasm>) noexcept
{
return wasm_i64x2_shuffle(self, self, V0, V1);
}
template <class A, uint64_t V0, uint64_t V1>
inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1> mask, requires_arch<wasm>) noexcept
inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1> mask, requires_arch<wasm>) noexcept
{
return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, wasm {}));
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
{
return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3);
}
template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> mask, requires_arch<wasm>) noexcept
inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3> mask, requires_arch<wasm>) noexcept
{
return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, wasm {}));
}
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<wasm>) noexcept
inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<wasm>) noexcept
{
return wasm_i16x8_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7);
}
template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<wasm>) noexcept
inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<wasm>) noexcept
{
return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, wasm {}));
}
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15>, requires_arch<wasm>) noexcept
inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15>, requires_arch<wasm>) noexcept
{
return wasm_i8x16_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15);
}
template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<wasm>) noexcept
inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<wasm>) noexcept
{
return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, wasm {}));
}

View File

@ -46,3 +46,7 @@
#include "xsimd_rvv_register.hpp"
#include "xsimd_wasm_register.hpp"
#if XSIMD_WITH_EMULATED
#include "xsimd_emulated_register.hpp"
#endif

View File

@ -2031,7 +2031,7 @@ namespace xsimd
* @return the result of the selection.
*/
template <class T, class A, bool... Values>
inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br) noexcept
inline batch<T, A> select(batch_bool_constant<T, A, Values...> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br) noexcept
{
detail::static_check_supported_config<T, A>();
return kernel::select<A>(cond, true_br, false_br, A {});
@ -2047,7 +2047,7 @@ namespace xsimd
* element of \c x and \c y. Each element of the mask indexes the vector that
* would be formed by the concatenation of \c x and \c y. For instance
* \code{.cpp}
* batch_constant<batch<uint32_t, sse2>, 0, 4, 3, 7>
* batch_constant<uint32_t, sse2, 0, 4, 3, 7>
* \endcode
* Picks \c x[0], \c y[0], \c x[3], \c y[3]
*
@ -2055,7 +2055,7 @@ namespace xsimd
*/
template <class T, class A, class Vt, Vt... Values>
inline typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<Vt, A>, Values...> mask) noexcept
shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<Vt, A, Values...> mask) noexcept
{
static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
detail::static_check_supported_config<T, A>();
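
A usage sketch matching the documented mask above, assuming an SSE2 build (the lane values are illustrative):

#include <cstddef>
#include <cstdio>
#include "xsimd/xsimd.hpp"

int main()
{
    using batch = xsimd::batch<float, xsimd::sse2>;
    batch x { 10.f, 11.f, 12.f, 13.f };
    batch y { 20.f, 21.f, 22.f, 23.f };
    auto r = xsimd::shuffle(x, y, xsimd::batch_constant<uint32_t, xsimd::sse2, 0, 4, 3, 7> {});
    for (std::size_t i = 0; i < batch::size; ++i)
        std::printf("%g ", r.get(i)); // 10 20 13 23
}
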
@ -2210,19 +2210,22 @@ namespace xsimd
template <class To, class A = default_arch, class From>
inline void store_as(To* dst, batch<From, A> const& src, aligned_mode) noexcept
{
kernel::store_aligned(dst, src, A {});
detail::static_check_supported_config<From, A>();
kernel::store_aligned<A>(dst, src, A {});
}
template <class A = default_arch, class From>
inline void store_as(bool* dst, batch_bool<From, A> const& src, aligned_mode) noexcept
{
kernel::store(src, dst, A {});
detail::static_check_supported_config<From, A>();
kernel::store<A>(src, dst, A {});
}
template <class To, class A = default_arch, class From>
inline void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
{
kernel::store_complex_aligned(dst, src, A {});
detail::static_check_supported_config<std::complex<From>, A>();
kernel::store_complex_aligned<A>(dst, src, A {});
}
#ifdef XSIMD_ENABLE_XTL_COMPLEX
@ -2244,25 +2247,29 @@ namespace xsimd
template <class To, class A = default_arch, class From>
inline void store_as(To* dst, batch<From, A> const& src, unaligned_mode) noexcept
{
kernel::store_unaligned(dst, src, A {});
detail::static_check_supported_config<From, A>();
kernel::store_unaligned<A>(dst, src, A {});
}
template <class A = default_arch, class From>
inline void store_as(bool* dst, batch_bool<From, A> const& src, unaligned_mode) noexcept
{
kernel::store(src, dst, A {});
detail::static_check_supported_config<From, A>();
kernel::store<A>(src, dst, A {});
}
template <class To, class A = default_arch, class From>
inline void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, unaligned_mode) noexcept
{
kernel::store_complex_unaligned(dst, src, A {});
detail::static_check_supported_config<std::complex<From>, A>();
kernel::store_complex_unaligned<A>(dst, src, A {});
}
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <class To, class A = default_arch, class From, bool i3ec>
inline void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, unaligned_mode) noexcept
{
detail::static_check_supported_config<std::complex<From>, A>();
store_as(reinterpret_cast<std::complex<To>*>(dst), src, unaligned_mode());
}
#endif
@ -2350,14 +2357,14 @@ namespace xsimd
*/
template <class T, class A, class Vt, Vt... Values>
inline typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
swizzle(batch<T, A> const& x, batch_constant<batch<Vt, A>, Values...> mask) noexcept
swizzle(batch<T, A> const& x, batch_constant<Vt, A, Values...> mask) noexcept
{
static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
detail::static_check_supported_config<T, A>();
return kernel::swizzle<A>(x, mask, A {});
}
template <class T, class A, class Vt, Vt... Values>
inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& x, batch_constant<batch<Vt, A>, Values...> mask) noexcept
inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& x, batch_constant<Vt, A, Values...> mask) noexcept
{
static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
detail::static_check_supported_config<T, A>();

View File

@ -25,17 +25,24 @@ namespace xsimd
* @tparam T the scalar type of the associated batch values.
* @tparam A the architecture of the associated batch.
* @tparam Values boolean constants represented by this batch
**/
template <class batch_type, bool... Values>
template <typename T, class A, bool... Values>
struct batch_bool_constant
{
public:
using batch_type = batch_bool<T, A>;
static constexpr std::size_t size = sizeof...(Values);
using arch_type = typename batch_type::arch_type;
using value_type = bool;
static_assert(sizeof...(Values) == batch_type::size, "consistent batch size");
constexpr operator batch_bool<typename batch_type::value_type, arch_type>() const noexcept { return { Values... }; }
public:
/**
* @brief Generate a @p batch_type from this @p batch_bool_constant
*/
constexpr batch_type as_batch_bool() const noexcept { return { Values... }; }
/**
* @brief Implicit conversion to @p batch_type, equivalent to @c as_batch_bool()
*/
constexpr operator batch_type() const noexcept { return as_batch_bool(); }
constexpr bool get(size_t i) const noexcept
{
@ -70,14 +77,14 @@ namespace xsimd
};
template <class F, class SelfPack, class OtherPack, size_t... Indices>
static constexpr batch_bool_constant<batch_type, F()(std::tuple_element<Indices, SelfPack>::type::value, std::tuple_element<Indices, OtherPack>::type::value)...>
static constexpr batch_bool_constant<T, A, F()(std::tuple_element<Indices, SelfPack>::type::value, std::tuple_element<Indices, OtherPack>::type::value)...>
apply(detail::index_sequence<Indices...>)
{
return {};
}
template <class F, bool... OtherValues>
static constexpr auto apply(batch_bool_constant<batch_type, Values...>, batch_bool_constant<batch_type, OtherValues...>)
static constexpr auto apply(batch_bool_constant<T, A, Values...>, batch_bool_constant<T, A, OtherValues...>)
-> decltype(apply<F, std::tuple<std::integral_constant<bool, Values>...>, std::tuple<std::integral_constant<bool, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>()))
{
static_assert(sizeof...(Values) == sizeof...(OtherValues), "compatible constant batches");
@ -85,12 +92,12 @@ namespace xsimd
}
public:
#define MAKE_BINARY_OP(OP, NAME) \
template <bool... OtherValues> \
constexpr auto operator OP(batch_bool_constant<batch_type, OtherValues...> other) const \
-> decltype(apply<NAME>(*this, other)) \
{ \
return apply<NAME>(*this, other); \
#define MAKE_BINARY_OP(OP, NAME) \
template <bool... OtherValues> \
constexpr auto operator OP(batch_bool_constant<T, A, OtherValues...> other) const \
-> decltype(apply<NAME>(*this, other)) \
{ \
return apply<NAME>(*this, other); \
}
MAKE_BINARY_OP(|, logical_or)
@ -101,12 +108,12 @@ namespace xsimd
#undef MAKE_BINARY_OP
constexpr batch_bool_constant<batch_type, !Values...> operator!() const
constexpr batch_bool_constant<T, A, !Values...> operator!() const
{
return {};
}
constexpr batch_bool_constant<batch_type, !Values...> operator~() const
constexpr batch_bool_constant<T, A, !Values...> operator~() const
{
return {};
}
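
With the new spelling, a compile-time blend mask is written directly in terms of the scalar type and architecture. A sketch, assuming an SSE2 build where batch<float, sse2>::size == 4 (the function name is illustrative):

#include "xsimd/xsimd.hpp"

// Take lanes 0 and 2 from t and lanes 1 and 3 from f; the mask resolves at compile time.
inline xsimd::batch<float, xsimd::sse2> blend(xsimd::batch<float, xsimd::sse2> t,
                                              xsimd::batch<float, xsimd::sse2> f)
{
    return xsimd::select(xsimd::batch_bool_constant<float, xsimd::sse2, true, false, true, false> {}, t, f);
}
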
@ -120,88 +127,93 @@ namespace xsimd
* @tparam T the scalar type of the associated batch values.
* @tparam A the architecture of the associated batch.
* @tparam Values constants represented by this batch
**/
template <class batch_type, typename batch_type::value_type... Values>
template <typename T, class A, T... Values>
struct batch_constant
{
static constexpr std::size_t size = sizeof...(Values);
using arch_type = typename batch_type::arch_type;
using batch_type = batch<T, A>;
using value_type = typename batch_type::value_type;
static_assert(sizeof...(Values) == batch_type::size, "consistent batch size");
/**
* @brief Generate a @p batch_type from this @p batch_constant
*/
inline operator batch_type() const noexcept { return { Values... }; }
inline batch_type as_batch() const noexcept { return { Values... }; }
/**
* @brief Implicit conversion to @p batch_type, equivalent to @c as_batch()
*/
inline operator batch_type() const noexcept { return as_batch(); }
/**
* @brief Get the @p i th element of this @p batch_constant
*/
constexpr value_type get(size_t i) const noexcept
constexpr T get(size_t i) const noexcept
{
return get(i, std::array<value_type, size> { Values... });
return get(i, std::array<T, size> { Values... });
}
private:
constexpr value_type get(size_t i, std::array<value_type, size> const& values) const noexcept
constexpr T get(size_t i, std::array<T, size> const& values) const noexcept
{
return values[i];
}
struct arithmetic_add
{
constexpr value_type operator()(value_type x, value_type y) const { return x + y; }
constexpr T operator()(T x, T y) const { return x + y; }
};
struct arithmetic_sub
{
constexpr value_type operator()(value_type x, value_type y) const { return x - y; }
constexpr T operator()(T x, T y) const { return x - y; }
};
struct arithmetic_mul
{
constexpr value_type operator()(value_type x, value_type y) const { return x * y; }
constexpr T operator()(T x, T y) const { return x * y; }
};
struct arithmetic_div
{
constexpr value_type operator()(value_type x, value_type y) const { return x / y; }
constexpr T operator()(T x, T y) const { return x / y; }
};
struct arithmetic_mod
{
constexpr value_type operator()(value_type x, value_type y) const { return x % y; }
constexpr T operator()(T x, T y) const { return x % y; }
};
struct binary_and
{
constexpr value_type operator()(value_type x, value_type y) const { return x & y; }
constexpr T operator()(T x, T y) const { return x & y; }
};
struct binary_or
{
constexpr value_type operator()(value_type x, value_type y) const { return x | y; }
constexpr T operator()(T x, T y) const { return x | y; }
};
struct binary_xor
{
constexpr value_type operator()(value_type x, value_type y) const { return x ^ y; }
constexpr T operator()(T x, T y) const { return x ^ y; }
};
template <class F, class SelfPack, class OtherPack, size_t... Indices>
static constexpr batch_constant<batch_type, F()(std::tuple_element<Indices, SelfPack>::type::value, std::tuple_element<Indices, OtherPack>::type::value)...>
static constexpr batch_constant<T, A, F()(std::tuple_element<Indices, SelfPack>::type::value, std::tuple_element<Indices, OtherPack>::type::value)...>
apply(detail::index_sequence<Indices...>)
{
return {};
}
template <class F, value_type... OtherValues>
static constexpr auto apply(batch_constant<batch_type, Values...>, batch_constant<batch_type, OtherValues...>)
-> decltype(apply<F, std::tuple<std::integral_constant<value_type, Values>...>, std::tuple<std::integral_constant<value_type, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>()))
template <class F, T... OtherValues>
static constexpr auto apply(batch_constant<T, A, Values...>, batch_constant<T, A, OtherValues...>)
-> decltype(apply<F, std::tuple<std::integral_constant<T, Values>...>, std::tuple<std::integral_constant<T, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>()))
{
static_assert(sizeof...(Values) == sizeof...(OtherValues), "compatible constant batches");
return apply<F, std::tuple<std::integral_constant<value_type, Values>...>, std::tuple<std::integral_constant<value_type, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>());
return apply<F, std::tuple<std::integral_constant<T, Values>...>, std::tuple<std::integral_constant<T, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>());
}
public:
#define MAKE_BINARY_OP(OP, NAME) \
template <value_type... OtherValues> \
constexpr auto operator OP(batch_constant<batch_type, OtherValues...> other) const \
-> decltype(apply<NAME>(*this, other)) \
{ \
return apply<NAME>(*this, other); \
#define MAKE_BINARY_OP(OP, NAME) \
template <T... OtherValues> \
constexpr auto operator OP(batch_constant<T, A, OtherValues...> other) const \
-> decltype(apply<NAME>(*this, other)) \
{ \
return apply<NAME>(*this, other); \
}
MAKE_BINARY_OP(+, arithmetic_add)
@ -215,17 +227,17 @@ namespace xsimd
#undef MAKE_BINARY_OP
constexpr batch_constant<batch_type, (value_type)-Values...> operator-() const
constexpr batch_constant<T, A, (T)-Values...> operator-() const
{
return {};
}
constexpr batch_constant<batch_type, (value_type) + Values...> operator+() const
constexpr batch_constant<T, A, (T) + Values...> operator+() const
{
return {};
}
constexpr batch_constant<batch_type, (value_type)~Values...> operator~() const
constexpr batch_constant<T, A, (T)~Values...> operator~() const
{
return {};
}
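
Because every operator above returns a new batch_constant, index arithmetic stays in the type system; nothing is materialized until as_batch() is called. A small sketch, assuming SSE2 (four uint32_t lanes; the alias names are illustrative):

#include <cstdint>
#include "xsimd/xsimd.hpp"

using lo_t = xsimd::batch_constant<uint32_t, xsimd::sse2, 0, 1, 2, 3>;
using four_t = xsimd::batch_constant<uint32_t, xsimd::sse2, 4, 4, 4, 4>;
constexpr auto hi_c = lo_t {} + four_t {}; // its type encodes <uint32_t, sse2, 4, 5, 6, 7>
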
@ -233,15 +245,15 @@ namespace xsimd
namespace detail
{
template <class batch_type, class G, std::size_t... Is>
template <typename T, class A, class G, std::size_t... Is>
inline constexpr auto make_batch_constant(detail::index_sequence<Is...>) noexcept
-> batch_constant<batch_type, (typename batch_type::value_type)G::get(Is, sizeof...(Is))...>
-> batch_constant<T, A, (T)G::get(Is, sizeof...(Is))...>
{
return {};
}
template <class batch_type, class G, std::size_t... Is>
template <typename T, class A, class G, std::size_t... Is>
inline constexpr auto make_batch_bool_constant(detail::index_sequence<Is...>) noexcept
-> batch_bool_constant<batch_type, G::get(Is, sizeof...(Is))...>
-> batch_bool_constant<T, A, G::get(Is, sizeof...(Is))...>
{
return {};
}
@ -268,19 +280,19 @@ namespace xsimd
* };
* @endcode
*/
template <class batch_type, class G>
inline constexpr auto make_batch_constant() noexcept -> decltype(detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>()))
template <typename T, class A, class G>
inline constexpr auto make_batch_constant() noexcept -> decltype(detail::make_batch_constant<T, A, G>(detail::make_index_sequence<batch<T, A>::size>()))
{
return detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>());
return detail::make_batch_constant<T, A, G>(detail::make_index_sequence<batch<T, A>::size>());
}
template <class batch_type, class G>
template <typename T, class A, class G>
inline constexpr auto make_batch_bool_constant() noexcept
-> decltype(detail::make_batch_bool_constant<batch_type, G>(
detail::make_index_sequence<batch_type::size>()))
-> decltype(detail::make_batch_bool_constant<T, A, G>(
detail::make_index_sequence<batch<T, A>::size>()))
{
return detail::make_batch_bool_constant<batch_type, G>(
detail::make_index_sequence<batch_type::size>());
return detail::make_batch_bool_constant<T, A, G>(
detail::make_index_sequence<batch<T, A>::size>());
}
} // namespace xsimd
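
Under the new API the generator-based factory takes the scalar type and architecture as separate template parameters. A sketch assuming SSE2 (the iota generator is illustrative, not part of the patch):

#include <cstddef>
#include <cstdint>
#include "xsimd/xsimd.hpp"

// Illustrative generator: lane index i maps to constant i.
struct iota
{
    static constexpr uint32_t get(std::size_t index, std::size_t /*size*/)
    {
        return static_cast<uint32_t>(index);
    }
};

// Equivalent to batch_constant<uint32_t, sse2, 0, 1, 2, 3> on the 4-lane SSE2 target.
auto indices = xsimd::make_batch_constant<uint32_t, xsimd::sse2, iota>();
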

View File

@ -0,0 +1,80 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/
#ifndef XSIMD_EMULATED_REGISTER_HPP
#define XSIMD_EMULATED_REGISTER_HPP
#include "./xsimd_generic_arch.hpp"
#include "./xsimd_register.hpp"
namespace xsimd
{
/**
* @ingroup architectures
*
* emulated instructions
*/
template <size_t N>
struct emulated : generic
{
static constexpr bool supported() noexcept { return true; }
static constexpr bool available() noexcept { return true; }
static constexpr bool requires_alignment() noexcept { return false; }
static constexpr std::size_t alignment() noexcept { return 8; }
static constexpr char const* name() noexcept { return "emulated"; }
};
namespace types
{
template <size_t N>
struct simd_emulated_bool_register
{
using register_type = std::array<bool, N>;
register_type data;
simd_emulated_bool_register() = default;
simd_emulated_bool_register(register_type r) { data = r; }
operator register_type() const noexcept { return data; }
};
template <typename T, size_t N>
struct get_bool_simd_register<T, emulated<N>>
{
using type = simd_emulated_bool_register<N / (8 * sizeof(T))>;
};
template <typename T, size_t N>
struct simd_register<T, emulated<N>>
{
static_assert(N % (8 * sizeof(T)) == 0, "bit width must be a multiple of scalar width");
using register_type = std::array<T, N / (8 * sizeof(T))>;
register_type data;
inline operator register_type() const noexcept
{
return data;
}
};
template <typename T, size_t N>
struct has_simd_register<T, emulated<N>> : std::is_scalar<T>
{
};
template <typename T, size_t N>
struct has_simd_register<std::complex<T>, emulated<N>> : std::true_type
{
};
#ifdef XSIMD_ENABLE_XTL_COMPLEX
template <typename T, bool i3ec, size_t N>
struct has_simd_register<xtl::complex<T, T, i3ec>, emulated<N>> : std::true_type
{
};
#endif
}
}
#endif
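
A standalone check of the register sizing rule encoded by the static_assert above; this mirrors the array layout without pulling in the rest of the backend (the alias name is illustrative):

#include <array>
#include <cstddef>
#include <type_traits>

template <typename T, std::size_t N> // N is the register width in bits
using emulated_register_sketch = std::array<T, N / (8 * sizeof(T))>;

static_assert(std::is_same<emulated_register_sketch<float, 128>, std::array<float, 4>>::value,
              "a 128-bit emulated register holds four floats");
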

View File

@ -10,8 +10,8 @@ origin:
url: https://github.com/QuantStack/xsimd
release: 7080469620c2145fbedf4ef8950406066e1ca2d6 (2024-03-17T21:35:00Z).
revision: 7080469620c2145fbedf4ef8950406066e1ca2d6
release: be9dcb5df413a893fb6646fa950eeb4aeac70ffc (2024-04-20T09:35:04Z).
revision: be9dcb5df413a893fb6646fa950eeb4aeac70ffc
license: BSD-3-Clause