Mirror of https://github.com/mozilla/gecko-dev.git, synced 2024-11-23 04:41:11 +00:00
Bug 1927534 - Update xsimd to e384105a2a3809c319f0740e2ebf6166da895fcb r=padenot
Differential Revision: https://phabricator.services.mozilla.com/D227075
This commit is contained in:
parent 0fffe21e9d
commit b6f870224e
@@ -639,6 +639,32 @@ namespace xsimd
            hi.store_unaligned(buffer + real_batch::size);
        }

        // transpose
        template <class A, class T>
        XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<generic>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            alignas(A::alignment()) T scratch_buffer[batch<T, A>::size * batch<T, A>::size];
            for (size_t i = 0; i < batch<T, A>::size; ++i)
            {
                matrix_begin[i].store_aligned(&scratch_buffer[i * batch<T, A>::size]);
            }
            // FIXME: this is super naive we can probably do better.
            for (size_t i = 0; i < batch<T, A>::size; ++i)
            {
                for (size_t j = 0; j < i; ++j)
                {
                    std::swap(scratch_buffer[i * batch<T, A>::size + j],
                              scratch_buffer[j * batch<T, A>::size + i]);
                }
            }
            for (size_t i = 0; i < batch<T, A>::size; ++i)
            {
                matrix_begin[i] = batch<T, A>::load_aligned(&scratch_buffer[i * batch<T, A>::size]);
            }
        }

    }

}
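The generic kernel above is the scalar textbook algorithm applied to an aligned scratch buffer: spill every row, swap across the diagonal, reload. For illustration only (not part of the patch), a minimal standalone C++ sketch of the same idea on plain arrays:

#include <algorithm>
#include <cassert>
#include <cstddef>

// Transpose an N x N row-major matrix in place by swapping across the
// diagonal -- the same strategy the generic xsimd kernel applies to its
// aligned scratch buffer.
template <std::size_t N>
void naive_transpose(float (&m)[N][N])
{
    for (std::size_t i = 0; i < N; ++i)
        for (std::size_t j = 0; j < i; ++j)
            std::swap(m[i][j], m[j][i]);
}

int main()
{
    float m[4][4];
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
            m[i][j] = static_cast<float>(i * 4 + j);
    naive_transpose(m);
    assert(m[1][2] == 2.0f * 4 + 1); // element (2,1) of the original matrix
    return 0;
}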
@@ -1594,6 +1594,87 @@ namespace xsimd
            return bitwise_cast<T>(
                swizzle(bitwise_cast<double>(self), mask));
        }

        // transpose
        template <class A>
        XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<avx>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            // See
            // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2
            auto r0 = matrix_begin[0], r1 = matrix_begin[1],
                 r2 = matrix_begin[2], r3 = matrix_begin[3],
                 r4 = matrix_begin[4], r5 = matrix_begin[5],
                 r6 = matrix_begin[6], r7 = matrix_begin[7];

            auto t0 = _mm256_unpacklo_ps(r0, r1);
            auto t1 = _mm256_unpackhi_ps(r0, r1);
            auto t2 = _mm256_unpacklo_ps(r2, r3);
            auto t3 = _mm256_unpackhi_ps(r2, r3);
            auto t4 = _mm256_unpacklo_ps(r4, r5);
            auto t5 = _mm256_unpackhi_ps(r4, r5);
            auto t6 = _mm256_unpacklo_ps(r6, r7);
            auto t7 = _mm256_unpackhi_ps(r6, r7);

            r0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1, 0, 1, 0));
            r1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3, 2, 3, 2));
            r2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1, 0, 1, 0));
            r3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3, 2, 3, 2));
            r4 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1, 0, 1, 0));
            r5 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3, 2, 3, 2));
            r6 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1, 0, 1, 0));
            r7 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3, 2, 3, 2));

            matrix_begin[0] = _mm256_permute2f128_ps(r0, r4, 0x20);
            matrix_begin[1] = _mm256_permute2f128_ps(r1, r5, 0x20);
            matrix_begin[2] = _mm256_permute2f128_ps(r2, r6, 0x20);
            matrix_begin[3] = _mm256_permute2f128_ps(r3, r7, 0x20);
            matrix_begin[4] = _mm256_permute2f128_ps(r0, r4, 0x31);
            matrix_begin[5] = _mm256_permute2f128_ps(r1, r5, 0x31);
            matrix_begin[6] = _mm256_permute2f128_ps(r2, r6, 0x31);
            matrix_begin[7] = _mm256_permute2f128_ps(r3, r7, 0x31);
        }

        template <class A>
        XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<avx>) noexcept
        {
            return transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
        }
        template <class A>
        XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<avx>) noexcept
        {
            return transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
        }

        template <class A>
        XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<avx>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            auto r0 = matrix_begin[0], r1 = matrix_begin[1],
                 r2 = matrix_begin[2], r3 = matrix_begin[3];

            auto t0 = _mm256_unpacklo_pd(r0, r1); // r00 r10 r01 r11
            auto t1 = _mm256_unpackhi_pd(r0, r1); // r02 r12 r03 r13
            auto t2 = _mm256_unpacklo_pd(r2, r3); // r20 r30 r21 r31
            auto t3 = _mm256_unpackhi_pd(r2, r3); // r22 r32 r23 r33

            matrix_begin[0] = _mm256_permute2f128_pd(t0, t2, 0x20);
            matrix_begin[1] = _mm256_permute2f128_pd(t1, t3, 0x20);
            matrix_begin[2] = _mm256_permute2f128_pd(t0, t2, 0x31);
            matrix_begin[3] = _mm256_permute2f128_pd(t1, t3, 0x31);
        }

        template <class A>
        XSIMD_INLINE void transpose(batch<uint64_t, A>* matrix_begin, batch<uint64_t, A>* matrix_end, requires_arch<avx>) noexcept
        {
            return transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
        }
        template <class A>
        XSIMD_INLINE void transpose(batch<int64_t, A>* matrix_begin, batch<int64_t, A>* matrix_end, requires_arch<avx>) noexcept
        {
            return transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
        }

        // trunc
        template <class A>
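A convenient way to convince oneself that the unpack/shuffle/permute2f128 network above really performs an 8x8 transpose is to drive it from plain arrays and compare against the naive definition. The following standalone check is illustrative only; it assumes an AVX-capable host and simply mirrors the kernel's intrinsic sequence:

#include <immintrin.h>
#include <cassert>

// Verify out[i][j] == in[j][i] after running the same three-stage network
// the AVX kernel uses: unpack within row pairs, shuffle within 128-bit
// lanes, then exchange 128-bit lanes between the two row groups.
int main()
{
    alignas(32) float in[8][8], out[8][8];
    for (int i = 0; i < 8; ++i)
        for (int j = 0; j < 8; ++j)
            in[i][j] = static_cast<float>(i * 8 + j);

    __m256 r[8], t[8];
    for (int i = 0; i < 8; ++i)
        r[i] = _mm256_load_ps(in[i]);

    // Stage 1: interleave consecutive row pairs within each 128-bit lane.
    for (int i = 0; i < 4; ++i)
    {
        t[2 * i] = _mm256_unpacklo_ps(r[2 * i], r[2 * i + 1]);
        t[2 * i + 1] = _mm256_unpackhi_ps(r[2 * i], r[2 * i + 1]);
    }
    // Stage 2: within each 128-bit lane, gather a 4-row column segment.
    for (int h = 0; h < 2; ++h)
    {
        r[4 * h + 0] = _mm256_shuffle_ps(t[4 * h + 0], t[4 * h + 2], _MM_SHUFFLE(1, 0, 1, 0));
        r[4 * h + 1] = _mm256_shuffle_ps(t[4 * h + 0], t[4 * h + 2], _MM_SHUFFLE(3, 2, 3, 2));
        r[4 * h + 2] = _mm256_shuffle_ps(t[4 * h + 1], t[4 * h + 3], _MM_SHUFFLE(1, 0, 1, 0));
        r[4 * h + 3] = _mm256_shuffle_ps(t[4 * h + 1], t[4 * h + 3], _MM_SHUFFLE(3, 2, 3, 2));
    }
    // Stage 3: exchange 128-bit lanes between the lower and upper row groups.
    for (int i = 0; i < 4; ++i)
    {
        _mm256_store_ps(out[i], _mm256_permute2f128_ps(r[i], r[i + 4], 0x20));
        _mm256_store_ps(out[i + 4], _mm256_permute2f128_ps(r[i], r[i + 4], 0x31));
    }

    for (int i = 0; i < 8; ++i)
        for (int j = 0; j < 8; ++j)
            assert(out[i][j] == in[j][i]);
    return 0;
}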
@@ -1748,6 +1748,69 @@ namespace xsimd
            return select(batch_bool<T, A> { b... }, true_br, false_br, neon {});
        }

        /*************
         * transpose *
         *************/
        template <class A>
        XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<neon>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
            auto t01 = vtrnq_f32(r0, r1);
            auto t23 = vtrnq_f32(r2, r3);
            matrix_begin[0] = vcombine_f32(vget_low_f32(t01.val[0]), vget_low_f32(t23.val[0]));
            matrix_begin[1] = vcombine_f32(vget_low_f32(t01.val[1]), vget_low_f32(t23.val[1]));
            matrix_begin[2] = vcombine_f32(vget_high_f32(t01.val[0]), vget_high_f32(t23.val[0]));
            matrix_begin[3] = vcombine_f32(vget_high_f32(t01.val[1]), vget_high_f32(t23.val[1]));
        }
        template <class A>
        XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<neon>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<uint32_t, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
            auto t01 = vtrnq_u32(r0, r1);
            auto t23 = vtrnq_u32(r2, r3);
            matrix_begin[0] = vcombine_u32(vget_low_u32(t01.val[0]), vget_low_u32(t23.val[0]));
            matrix_begin[1] = vcombine_u32(vget_low_u32(t01.val[1]), vget_low_u32(t23.val[1]));
            matrix_begin[2] = vcombine_u32(vget_high_u32(t01.val[0]), vget_high_u32(t23.val[0]));
            matrix_begin[3] = vcombine_u32(vget_high_u32(t01.val[1]), vget_high_u32(t23.val[1]));
        }
        template <class A>
        XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<neon>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<int32_t, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
            auto t01 = vtrnq_s32(r0, r1);
            auto t23 = vtrnq_s32(r2, r3);
            matrix_begin[0] = vcombine_s32(vget_low_s32(t01.val[0]), vget_low_s32(t23.val[0]));
            matrix_begin[1] = vcombine_s32(vget_low_s32(t01.val[1]), vget_low_s32(t23.val[1]));
            matrix_begin[2] = vcombine_s32(vget_high_s32(t01.val[0]), vget_high_s32(t23.val[0]));
            matrix_begin[3] = vcombine_s32(vget_high_s32(t01.val[1]), vget_high_s32(t23.val[1]));
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            auto r0 = matrix_begin[0], r1 = matrix_begin[1];
            matrix_begin[0] = vcombine_u64(vget_low_u64(r0), vget_low_u64(r1));
            matrix_begin[1] = vcombine_u64(vget_high_u64(r0), vget_high_u64(r1));
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            auto r0 = matrix_begin[0], r1 = matrix_begin[1];
            matrix_begin[0] = vcombine_s64(vget_low_s64(r0), vget_low_s64(r1));
            matrix_begin[1] = vcombine_s64(vget_high_s64(r0), vget_high_s64(r1));
        }

        /**********
         * zip_lo *
         **********/

@@ -2737,6 +2800,7 @@ namespace xsimd
                return set(batch<T, A>(), A(), data[idx]...);
            }
        }
    }

}

#undef WRAP_BINARY_INT_EXCLUDING_64
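The NEON 4x4 path relies on vtrnq interleaving the even and odd lanes of a row pair; combining the low (respectively high) halves of the two resulting pairs then yields the transposed rows. A scalar model of that data movement, illustrative only and not the intrinsics themselves:

#include <array>
#include <cassert>
#include <utility>

using Row = std::array<float, 4>;

// Models vtrnq_f32: val[0] takes the even lanes of (a, b) interleaved,
// val[1] the odd lanes.
static std::pair<Row, Row> vtrn_model(const Row& a, const Row& b)
{
    return { Row { a[0], b[0], a[2], b[2] }, Row { a[1], b[1], a[3], b[3] } };
}

int main()
{
    std::array<Row, 4> m;
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
            m[i][j] = static_cast<float>(i * 4 + j);

    auto t01 = vtrn_model(m[0], m[1]);
    auto t23 = vtrn_model(m[2], m[3]);

    // vcombine(vget_low(x), vget_low(y)) and vcombine(vget_high(x), vget_high(y))
    std::array<Row, 4> out = {
        Row { t01.first[0], t01.first[1], t23.first[0], t23.first[1] },
        Row { t01.second[0], t01.second[1], t23.second[0], t23.second[1] },
        Row { t01.first[2], t01.first[3], t23.first[2], t23.first[3] },
        Row { t01.second[2], t01.second[3], t23.second[2], t23.second[3] },
    };

    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
            assert(out[i][j] == m[j][i]);
    return 0;
}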
@@ -949,6 +949,37 @@ namespace xsimd
            return select(batch_bool<double, A> { b... }, true_br, false_br, neon64 {});
        }

        template <class A>
        XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<neon64>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            auto r0 = matrix_begin[0], r1 = matrix_begin[1];
            matrix_begin[0] = vzip1q_f64(r0, r1);
            matrix_begin[1] = vzip2q_f64(r0, r1);
        }

        template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
        XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon64>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            auto r0 = matrix_begin[0], r1 = matrix_begin[1];
            matrix_begin[0] = vzip1q_u64(r0, r1);
            matrix_begin[1] = vzip2q_u64(r0, r1);
        }

        template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
        XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<neon64>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            auto r0 = matrix_begin[0], r1 = matrix_begin[1];
            matrix_begin[0] = vzip1q_s64(r0, r1);
            matrix_begin[1] = vzip2q_s64(r0, r1);
        }

        /**********
         * zip_lo *
         **********/
@@ -1640,6 +1640,50 @@ namespace xsimd
            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {}));
        }

        // transpose
        template <class A>
        XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<sse2>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<float, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];
            _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
            matrix_begin[0] = r0;
            matrix_begin[1] = r1;
            matrix_begin[2] = r2;
            matrix_begin[3] = r3;
        }
        template <class A>
        XSIMD_INLINE void transpose(batch<uint32_t, A>* matrix_begin, batch<uint32_t, A>* matrix_end, requires_arch<sse2>) noexcept
        {
            transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
        }
        template <class A>
        XSIMD_INLINE void transpose(batch<int32_t, A>* matrix_begin, batch<int32_t, A>* matrix_end, requires_arch<sse2>) noexcept
        {
            transpose(reinterpret_cast<batch<float, A>*>(matrix_begin), reinterpret_cast<batch<float, A>*>(matrix_end), A {});
        }

        template <class A>
        XSIMD_INLINE void transpose(batch<double, A>* matrix_begin, batch<double, A>* matrix_end, requires_arch<sse2>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<double, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            auto r0 = matrix_begin[0], r1 = matrix_begin[1];
            matrix_begin[0] = _mm_unpacklo_pd(r0, r1);
            matrix_begin[1] = _mm_unpackhi_pd(r0, r1);
        }
        template <class A>
        XSIMD_INLINE void transpose(batch<uint64_t, A>* matrix_begin, batch<uint64_t, A>* matrix_end, requires_arch<sse2>) noexcept
        {
            transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
        }
        template <class A>
        XSIMD_INLINE void transpose(batch<int64_t, A>* matrix_begin, batch<int64_t, A>* matrix_end, requires_arch<sse2>) noexcept
        {
            transpose(reinterpret_cast<batch<double, A>*>(matrix_begin), reinterpret_cast<batch<double, A>*>(matrix_end), A {});
        }

        // zip_hi
        template <class A>
        XSIMD_INLINE batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
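The 32-bit SSE2 path delegates to the `_MM_TRANSPOSE4_PS` macro from `<xmmintrin.h>`, which rewrites its four register arguments in place; that is why the kernel copies the batches into locals first. A minimal standalone usage sketch (illustrative only, assumes an SSE-capable host):

#include <xmmintrin.h>
#include <cassert>

int main()
{
    alignas(16) float m[4][4];
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j)
            m[i][j] = static_cast<float>(i * 4 + j);

    __m128 r0 = _mm_load_ps(m[0]);
    __m128 r1 = _mm_load_ps(m[1]);
    __m128 r2 = _mm_load_ps(m[2]);
    __m128 r3 = _mm_load_ps(m[3]);

    // The macro modifies r0..r3 in place.
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);

    _mm_store_ps(m[0], r0);
    _mm_store_ps(m[1], r1);
    _mm_store_ps(m[2], r2);
    _mm_store_ps(m[3], r3);

    assert(m[0][1] == 4.0f); // was element (1,0) before the transpose
    return 0;
}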
@@ -39,6 +39,8 @@ namespace xsimd
        XSIMD_INLINE batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept;
        template <class A, class T>
        XSIMD_INLINE batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept;
        template <class A, class T>
        XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<generic>) noexcept;

        // abs
        template <class A, class T, typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>

@@ -1576,6 +1578,40 @@ namespace xsimd
            return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, wasm {}));
        }

        // transpose
        template <class A, class T>
        XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end, requires_arch<wasm>) noexcept
        {
            assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
            (void)matrix_end;
            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
            {
                auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3];

                auto t0 = wasm_i32x4_shuffle(r0, r1, 0, 4, 1, 5); // r0[0] r1[0] r0[1] r1[1]
                auto t1 = wasm_i32x4_shuffle(r0, r1, 2, 6, 3, 7); // r0[2] r1[2] r0[3] r1[3]

                auto t2 = wasm_i32x4_shuffle(r2, r3, 0, 4, 1, 5); // r2[0] r3[0] r2[1] r3[1]
                auto t3 = wasm_i32x4_shuffle(r2, r3, 2, 6, 3, 7); // r2[2] r3[2] r2[3] r3[3]

                matrix_begin[0] = wasm_i32x4_shuffle(t0, t2, 0, 1, 4, 5); // r0[0] r1[0] r2[0] r3[0]
                matrix_begin[1] = wasm_i32x4_shuffle(t0, t2, 2, 3, 6, 7); // r0[1] r1[1] r2[1] r3[1]
                matrix_begin[2] = wasm_i32x4_shuffle(t1, t3, 0, 1, 4, 5); // r0[2] r1[2] r2[2] r3[2]
                matrix_begin[3] = wasm_i32x4_shuffle(t1, t3, 2, 3, 6, 7); // r0[3] r1[3] r2[3] r3[3]
            }
            else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
            {
                auto r0 = matrix_begin[0], r1 = matrix_begin[1];

                matrix_begin[0] = wasm_i64x2_shuffle(r0, r1, 0, 2);
                matrix_begin[1] = wasm_i64x2_shuffle(r0, r1, 1, 3);
            }
            else
            {
                transpose(matrix_begin, matrix_end, generic {});
            }
        }

        // trunc
        template <class A>
        XSIMD_INLINE batch<float, A> trunc(batch<float, A> const& self, requires_arch<wasm>) noexcept
@@ -28,13 +28,14 @@ namespace xsimd
     *
     * @defgroup batch_arithmetic Arithmetic operators
     * @defgroup batch_constant Constant batches
     * @defgroup batch_cond Conditional operators
     * @defgroup batch_data_transfer Memory operators
     * @defgroup batch_math Basic math operators
     * @defgroup batch_math_extra Extra math operators
     * @defgroup batch_fp Floating point manipulation
     * @defgroup batch_rounding Rounding operators
     * @defgroup batch_conversion Conversion operators
     * @defgroup batch_complex_op Complex operators
     * @defgroup batch_complex Complex operators
     * @defgroup batch_logical Logical operators
     * @defgroup batch_bitwise Bitwise operators
     * @defgroup batch_reducers Reducers

@@ -1890,7 +1891,7 @@ namespace xsimd
    }

    /**
     * @ingroup rotate_left
     * @ingroup batch_data_transfer
     *
     * Slide the whole batch to the left by \c n bytes, and reintroduce the
     * slided out elements from the right. This is different from

@@ -1908,7 +1909,7 @@ namespace xsimd
    }

    /**
     * @ingroup rotate_right
     * @ingroup batch_data_transfer
     *
     * Slide the whole batch to the right by \c n bytes, and reintroduce the
     * slided out elements from the left. This is different from

@@ -2021,7 +2022,7 @@ namespace xsimd
    }

    /**
     * @ingroup batch_miscellaneous
     * @ingroup batch_cond
     *
     * Ternary operator for batches: selects values from the batches \c true_br or \c false_br
     * depending on the boolean values in the constant batch \c cond. Equivalent to

@@ -2042,7 +2043,7 @@ namespace xsimd
    }

    /**
     * @ingroup batch_miscellaneous
     * @ingroup batch_cond
     *
     * Ternary operator for batches: selects values from the batches \c true_br or \c false_br
     * depending on the boolean values in the constant batch \c cond. Equivalent to

@@ -2063,7 +2064,7 @@ namespace xsimd
    }

    /**
     * @ingroup batch_miscellaneous
     * @ingroup batch_cond
     *
     * Ternary operator for batches: selects values from the batches \c true_br or \c false_br
     * depending on the boolean values in the constant batch \c cond. Equivalent to

@@ -2515,6 +2516,23 @@ namespace xsimd
        return batch_cast<as_integer_t<T>>(x);
    }

    /**
     * @ingroup batch_data_transfer
     *
     * Transposes in place the matrix whose lines are each of the batches passed as
     * arguments.
     * @param matrix_begin pointer to the first line of the matrix to transpose
     * @param matrix_end pointer to one element after the last line of the matrix to transpose
     *
     */
    template <class T, class A>
    XSIMD_INLINE void transpose(batch<T, A>* matrix_begin, batch<T, A>* matrix_end) noexcept
    {
        assert((matrix_end - matrix_begin == batch<T, A>::size) && "correctly sized matrix");
        detail::static_check_supported_config<T, A>();
        return kernel::transpose(matrix_begin, matrix_end, A {});
    }

    /**
     * @ingroup batch_rounding
     *
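The new public entry point only validates the matrix size and dispatches to the per-architecture kernels for the default architecture. A usage sketch of that API (illustrative only), written to be width-agnostic since the row count always equals `batch::size`:

#include <xsimd/xsimd.hpp>
#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    using batch = xsimd::batch<float>;
    constexpr std::size_t n = batch::size;

    // Build an n x n row-major matrix, one batch per row.
    std::vector<float> data(n * n);
    for (std::size_t i = 0; i < n * n; ++i)
        data[i] = static_cast<float>(i);

    std::vector<batch> rows(n);
    for (std::size_t i = 0; i < n; ++i)
        rows[i] = batch::load_unaligned(&data[i * n]);

    // In-place transpose of the batch "matrix".
    xsimd::transpose(rows.data(), rows.data() + rows.size());

    // Row i now holds column i of the original matrix.
    std::vector<float> out(n * n);
    for (std::size_t i = 0; i < n; ++i)
        rows[i].store_unaligned(&out[i * n]);
    for (std::size_t i = 0; i < n; ++i)
        for (std::size_t j = 0; j < n; ++j)
            assert(out[i * n + j] == data[j * n + i]);
    return 0;
}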
@@ -95,8 +95,10 @@ namespace xsimd
        }                                                                        \
        template <>                                                              \
        XSIMD_INLINE type bitcast<type>(type x) noexcept { return x; }           \
        static XSIMD_INLINE byte_type as_bytes(type x) noexcept                  \
        template <class U>                                                       \
        static XSIMD_INLINE byte_type as_bytes(U x) noexcept                     \
        {                                                                        \
            static_assert(std::is_same<U, type>::value, "inconsistent conversion types"); \
            const auto words = XSIMD_RVV_JOINT5(__riscv_vreinterpret_, u, s, m, vmul)(x); \
            return XSIMD_RVV_JOINT5(__riscv_vreinterpret_, u, 8, m, vmul)(words); \
        }                                                                        \

@@ -125,6 +127,83 @@ namespace xsimd
#undef XSIMD_RVV_MAKE_TYPES
#undef XSIMD_RVV_MAKE_TYPE

        // Specialization needed for #1058
        template <>
        XSIMD_INLINE rvv_type_info<int8_t, rvv_width_m1 * 8>::type
        rvv_type_info<int8_t, rvv_width_m1 * 8>::bitcast<__rvv_uint8m8_t>(
            __rvv_uint8m8_t x) noexcept
        {
            return __riscv_vreinterpret_i8m8(x);
        }
        template <>
        XSIMD_INLINE rvv_type_info<int8_t, rvv_width_m1 * 1>::type
        rvv_type_info<int8_t, rvv_width_m1 * 1>::bitcast<__rvv_uint8m1_t>(
            __rvv_uint8m1_t x) noexcept
        {
            return __riscv_vreinterpret_i8m1(x);
        }
        template <>
        XSIMD_INLINE rvv_type_info<uint16_t, rvv_width_m1 * 1>::type
        rvv_type_info<uint16_t, rvv_width_m1 * 1>::bitcast<__rvv_uint8m1_t>(
            __rvv_uint8m1_t x) noexcept
        {
            return __riscv_vreinterpret_u16m1(x);
        }
        template <>
        XSIMD_INLINE rvv_type_info<uint32_t, rvv_width_m1 * 1>::type
        rvv_type_info<uint32_t, rvv_width_m1 * 1>::bitcast<__rvv_uint8m1_t>(
            __rvv_uint8m1_t x) noexcept
        {
            return __riscv_vreinterpret_u32m1(x);
        }
        template <>
        XSIMD_INLINE rvv_type_info<uint64_t, rvv_width_m1 * 1>::type
        rvv_type_info<uint64_t, rvv_width_m1 * 1>::bitcast<__rvv_uint8m1_t>(
            __rvv_uint8m1_t x) noexcept
        {
            return __riscv_vreinterpret_u64m1(x);
        }

        //

        template <>
        XSIMD_INLINE rvv_type_info<int8_t, rvv_width_m1 * 8>::byte_type
        rvv_type_info<int8_t, rvv_width_m1 * 8>::as_bytes<__rvv_int8m8_t>(__rvv_int8m8_t x) noexcept
        {
            return __riscv_vreinterpret_u8m8(x);
        }
        template <>
        XSIMD_INLINE rvv_type_info<int8_t, rvv_width_m1 * 1>::byte_type
        rvv_type_info<int8_t, rvv_width_m1 * 1>::as_bytes<__rvv_int8m1_t>(__rvv_int8m1_t x) noexcept
        {
            return __riscv_vreinterpret_u8m1(x);
        }

        template <>
        XSIMD_INLINE rvv_type_info<uint8_t, rvv_width_m1 * 1>::byte_type
        rvv_type_info<uint8_t, rvv_width_m1 * 1>::as_bytes<__rvv_uint8m1_t>(__rvv_uint8m1_t x) noexcept
        {
            return x;
        }
        template <>
        XSIMD_INLINE rvv_type_info<uint16_t, rvv_width_m1 * 1>::byte_type
        rvv_type_info<uint16_t, rvv_width_m1 * 1>::as_bytes<__rvv_uint16m1_t>(__rvv_uint16m1_t x) noexcept
        {
            return __riscv_vreinterpret_u8m1(x);
        }
        template <>
        XSIMD_INLINE rvv_type_info<uint32_t, rvv_width_m1 * 1>::byte_type
        rvv_type_info<uint32_t, rvv_width_m1 * 1>::as_bytes<__rvv_uint32m1_t>(__rvv_uint32m1_t x) noexcept
        {
            return __riscv_vreinterpret_u8m1(x);
        }
        template <>
        XSIMD_INLINE rvv_type_info<uint64_t, rvv_width_m1 * 1>::byte_type
        rvv_type_info<uint64_t, rvv_width_m1 * 1>::as_bytes<__rvv_uint64m1_t>(__rvv_uint64m1_t x) noexcept
        {
            return __riscv_vreinterpret_u8m1(x);
        }

        // rvv_blob is storage-type abstraction for a vector register.
        template <class T, size_t Width>
        struct rvv_blob : public rvv_type_info<T, Width>
third_party/xsimd/moz.yaml (vendored, 4 lines changed)
@@ -10,8 +10,8 @@ origin:
  url: https://github.com/QuantStack/xsimd

-  release: 50a69bf8bc892b854a0490ba6cc9a73031347f01 (2024-09-18T13:09:40Z).
-  revision: 50a69bf8bc892b854a0490ba6cc9a73031347f01
+  release: e384105a2a3809c319f0740e2ebf6166da895fcb (2024-10-16T06:11:04Z).
+  revision: e384105a2a3809c319f0740e2ebf6166da895fcb

  license: BSD-3-Clause