mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2024-11-24 06:10:12 +00:00
[libomptarget][devicertl][nfc] Remove some cuda intrinsics, simplify
[libomptarget][devicertl][nfc] Remove some cuda intrinsics, simplify. Replace __popc, __ffs with clang intrinsics. Move kmpc_impl_min to the only file that uses it and replace the template with an explicitly typed function. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D95060
This commit is contained in:
parent
ca4ed1e7ae
commit
ea616f9026
@ -65,6 +65,10 @@ enum DATA_SHARING_SIZES {
|
||||
DS_Max_Warp_Number = 16,
|
||||
};
|
||||
|
||||
// Lane mask constant with every bit of __kmpc_impl_lanemask_t set.
enum : __kmpc_impl_lanemask_t {
  __kmpc_impl_all_lanes = ~static_cast<__kmpc_impl_lanemask_t>(0)
};
|
||||
|
||||
INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
|
||||
lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF));
|
||||
hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
|
||||
@ -74,28 +78,15 @@ INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
|
||||
return (((uint64_t)hi) << 32) | (uint64_t)lo;
|
||||
}
|
||||
|
||||
// Mask value in which all bits of the lane-mask type are ones (bitwise
// complement of a zero of that type).
enum : __kmpc_impl_lanemask_t {
  __kmpc_impl_all_lanes = ~static_cast<__kmpc_impl_lanemask_t>(0)
};
|
||||
|
||||
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
|
||||
|
||||
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
|
||||
|
||||
DEVICE uint32_t __kmpc_impl_smid();
|
||||
|
||||
DEVICE double __kmpc_impl_get_wtick();
|
||||
|
||||
DEVICE double __kmpc_impl_get_wtime();
|
||||
|
||||
// Returns one plus the index of the least significant set bit of x, or 0 when
// x is 0. The `long long` flavour of the builtin is used so that all 64 bits
// of x are examined even on a target where `long` is only 32 bits wide.
INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsll(x); }
|
||||
|
||||
// Returns the number of set bits in x. The `long long` flavour of the builtin
// is used so that all 64 bits of x are counted even on a target where `long`
// is only 32 bits wide.
INLINE uint64_t __kmpc_impl_popc(uint64_t x) {
  return __builtin_popcountll(x);
}
|
||||
|
||||
// Returns the smaller of x and y as ordered by operator<; y is returned when
// neither compares less than the other.
template <typename T> INLINE T __kmpc_impl_min(T x, T y) {
  if (x < y)
    return x;
  return y;
}
|
||||
|
||||
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();
|
||||
|
||||
DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var,
|
||||
|
@ -184,6 +184,8 @@ INLINE static uint32_t roundToWarpsize(uint32_t s) {
|
||||
return (s & ~(unsigned)(WARPSIZE - 1));
|
||||
}
|
||||
|
||||
// Returns the smaller of the two unsigned 32-bit values x and y.
INLINE static uint32_t kmpcMin(uint32_t x, uint32_t y) {
  if (y <= x)
    return y;
  return x;
}
|
||||
|
||||
DEVICE static volatile uint32_t IterCnt = 0;
|
||||
DEVICE static volatile uint32_t Cnt = 0;
|
||||
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
|
||||
@ -261,14 +263,14 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
|
||||
// by returning 1 in the thread holding the reduction result.
|
||||
|
||||
// Check if this is the very last team.
|
||||
unsigned NumRecs = __kmpc_impl_min(NumTeams, uint32_t(num_of_records));
|
||||
unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
|
||||
if (ChunkTeamCount == NumTeams - Bound - 1) {
|
||||
//
|
||||
// Last team processing.
|
||||
//
|
||||
if (ThreadId >= NumRecs)
|
||||
return 0;
|
||||
NumThreads = roundToWarpsize(__kmpc_impl_min(NumThreads, NumRecs));
|
||||
NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
|
||||
if (ThreadId >= NumThreads)
|
||||
return 0;
|
||||
|
||||
@ -283,7 +285,7 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
|
||||
|
||||
// When we have more than [warpsize] number of threads
|
||||
// a block reduction is performed here.
|
||||
uint32_t ActiveThreads = __kmpc_impl_min(NumRecs, NumThreads);
|
||||
uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
|
||||
if (ActiveThreads > WARPSIZE) {
|
||||
uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
|
||||
// Gather all the reduced values from each warp
|
||||
|
@ -93,13 +93,8 @@ DEVICE uint32_t __kmpc_impl_smid();
|
||||
DEVICE double __kmpc_impl_get_wtick();
|
||||
DEVICE double __kmpc_impl_get_wtime();
|
||||
|
||||
// Thin wrapper over the CUDA __ffs intrinsic: yields the 1-based position of
// the least significant set bit of x (0 when no bit is set), converted to
// uint32_t.
INLINE uint32_t __kmpc_impl_ffs(uint32_t x) {
  const uint32_t FirstSet = __ffs(x);
  return FirstSet;
}
|
||||
|
||||
// Thin wrapper over the CUDA __popc intrinsic: yields the number of set bits
// in x, converted to uint32_t.
INLINE uint32_t __kmpc_impl_popc(uint32_t x) {
  const uint32_t SetBits = __popc(x);
  return SetBits;
}
|
||||
|
||||
// Generic minimum helper: forwards both arguments to the `min` function that
// is in scope and returns its result converted to T.
template <typename T> INLINE T __kmpc_impl_min(T x, T y) {
  T Smaller = min(x, y);
  return Smaller;
}
|
||||
// Returns one plus the index of the least significant set bit of x, or 0 when
// x is 0, via the compiler's __builtin_ffs.
INLINE uint32_t __kmpc_impl_ffs(uint32_t x) {
  const uint32_t FirstSet = __builtin_ffs(x);
  return FirstSet;
}
|
||||
// Returns the number of set bits in x, via the compiler's
// __builtin_popcount.
INLINE uint32_t __kmpc_impl_popc(uint32_t x) {
  const uint32_t SetBits = __builtin_popcount(x);
  return SetBits;
}
|
||||
|
||||
#ifndef CUDA_VERSION
|
||||
#error CUDA_VERSION macro is undefined, something wrong with cuda.
|
||||
|
Loading…
Reference in New Issue
Block a user