mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2024-12-21 06:52:10 +00:00
[libomptarget] Refactor activemask macro to inline function
Summary: [libomptarget] Refactor activemask macro to inline function See also abandoned D66846, split into this diff and others. Reviewers: jdoerfert, ABataev, grokos, ronlieb, gregrodgers Reviewed By: jdoerfert, ABataev Subscribers: openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D66851 llvm-svn: 370781
This commit is contained in:
parent
dfde7b09c8
commit
bbdd282371
@ -20,7 +20,7 @@ INLINE static unsigned getLaneId() { return threadIdx.x % WARPSIZE; }
|
||||
|
||||
// Return true if this is the first active thread in the warp.
|
||||
INLINE static bool IsWarpMasterActiveThread() {
|
||||
unsigned long long Mask = __ACTIVEMASK();
|
||||
unsigned long long Mask = __kmpc_impl_activemask();
|
||||
unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE);
|
||||
unsigned long long Sh = Mask << ShNum;
|
||||
// Truncate Sh to the 32 lower bits
|
||||
@ -112,7 +112,7 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
|
||||
(unsigned long long)SharingDefaultDataSize);
|
||||
|
||||
unsigned WID = getWarpId();
|
||||
unsigned CurActiveThreads = __ACTIVEMASK();
|
||||
__kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask();
|
||||
|
||||
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
|
||||
void *&StackP = DataSharingState.StackPtr[WID];
|
||||
@ -252,7 +252,7 @@ EXTERN void __kmpc_data_sharing_environment_end(
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t CurActive = __ACTIVEMASK();
|
||||
__kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
|
||||
|
||||
// Only the warp master can restore the stack and frame information, and only
|
||||
// if there are no other threads left behind in this environment (i.e. the
|
||||
@ -378,7 +378,7 @@ INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
|
||||
// Frame pointer must be visible to all workers in the same warp.
|
||||
const unsigned WID = getWarpId();
|
||||
void *FrameP = 0;
|
||||
int32_t CurActive = __ACTIVEMASK();
|
||||
__kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
|
||||
|
||||
if (IsWarpMaster) {
|
||||
// SlotP will point to either the shared memory slot or an existing
|
||||
|
@ -389,7 +389,7 @@ public:
|
||||
}
|
||||
|
||||
INLINE static uint64_t NextIter() {
|
||||
__kmpc_impl_lanemask_t active = __ACTIVEMASK();
|
||||
__kmpc_impl_lanemask_t active = __kmpc_impl_activemask();
|
||||
uint32_t leader = __kmpc_impl_ffs(active) - 1;
|
||||
uint32_t change = __kmpc_impl_popc(active);
|
||||
__kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt();
|
||||
|
@ -45,17 +45,6 @@
|
||||
#define BARRIER_COUNTER 0
|
||||
#define ORDERED_COUNTER 1
|
||||
|
||||
// Macros for Cuda intrinsics
|
||||
// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
|
||||
// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask().
|
||||
#ifndef CUDA_VERSION
|
||||
#error CUDA_VERSION macro is undefined, something wrong with cuda.
|
||||
#elif CUDA_VERSION >= 9000
|
||||
#define __ACTIVEMASK() __activemask()
|
||||
#else
|
||||
#define __ACTIVEMASK() __ballot(1)
|
||||
#endif // CUDA_VERSION
|
||||
|
||||
// arguments needed for L0 parallelism only.
|
||||
class omptarget_nvptx_SharedArgs {
|
||||
public:
|
||||
|
@ -53,7 +53,7 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
|
||||
uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
|
||||
*LaneSource += __kmpc_impl_ffs(WorkRemaining);
|
||||
*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
|
||||
uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
|
||||
__kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
|
||||
*LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
|
||||
|
||||
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
|
||||
@ -126,7 +126,7 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
|
||||
uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
|
||||
*LaneSource += __kmpc_impl_ffs(WorkRemaining);
|
||||
*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
|
||||
uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
|
||||
__kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
|
||||
uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
|
||||
|
||||
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
|
||||
|
@ -61,12 +61,12 @@ INLINE static uint32_t
|
||||
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
|
||||
uint32_t size, remote_id, physical_lane_id;
|
||||
physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
|
||||
uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
|
||||
uint32_t Liveness = __ACTIVEMASK();
|
||||
__kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
|
||||
__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
|
||||
uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
|
||||
uint32_t lanemask_gt = __kmpc_impl_lanemask_gt();
|
||||
__kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt();
|
||||
do {
|
||||
Liveness = __ACTIVEMASK();
|
||||
Liveness = __kmpc_impl_activemask();
|
||||
remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
|
||||
size = __kmpc_impl_popc(Liveness);
|
||||
logical_lane_id /= 2;
|
||||
@ -81,7 +81,7 @@ int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars,
|
||||
size_t reduce_size, void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct,
|
||||
kmp_InterWarpCopyFctPtr cpyFct) {
|
||||
uint32_t Liveness = __ACTIVEMASK();
|
||||
__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
|
||||
if (Liveness == 0xffffffff) {
|
||||
gpu_regular_warp_reduce(reduce_data, shflFct);
|
||||
return GetThreadIdInBlock() % WARPSIZE ==
|
||||
@ -142,7 +142,7 @@ static int32_t nvptx_parallel_reduce_nowait(
|
||||
}
|
||||
return BlockThreadId == 0;
|
||||
#else
|
||||
uint32_t Liveness = __ACTIVEMASK();
|
||||
__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
|
||||
if (Liveness == 0xffffffff) // Full warp
|
||||
gpu_regular_warp_reduce(reduce_data, shflFct);
|
||||
else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
|
||||
@ -317,7 +317,7 @@ static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
|
||||
ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
|
||||
|
||||
// Reduce across warps to the warp master.
|
||||
uint32_t Liveness = __ACTIVEMASK();
|
||||
__kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
|
||||
if (Liveness == 0xffffffff) // Full warp
|
||||
gpu_regular_warp_reduce(reduce_data, shflFct);
|
||||
else // Partial warp but contiguous lanes
|
||||
|
@ -204,9 +204,9 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
|
||||
// Parallel level
|
||||
|
||||
INLINE void IncParallelLevel(bool ActiveParallel) {
|
||||
unsigned Active = __ACTIVEMASK();
|
||||
__kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
|
||||
__kmpc_impl_syncwarp(Active);
|
||||
unsigned LaneMaskLt = __kmpc_impl_lanemask_lt();
|
||||
__kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
|
||||
unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
|
||||
if (Rank == 0) {
|
||||
parallelLevel[GetWarpId()] +=
|
||||
@ -217,9 +217,9 @@ INLINE void IncParallelLevel(bool ActiveParallel) {
|
||||
}
|
||||
|
||||
INLINE void DecParallelLevel(bool ActiveParallel) {
|
||||
unsigned Active = __ACTIVEMASK();
|
||||
__kmpc_impl_lanemask_t Active = __kmpc_impl_activemask();
|
||||
__kmpc_impl_syncwarp(Active);
|
||||
unsigned LaneMaskLt = __kmpc_impl_lanemask_lt();
|
||||
__kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
|
||||
unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
|
||||
if (Rank == 0) {
|
||||
parallelLevel[GetWarpId()] -=
|
||||
|
@ -142,7 +142,7 @@ EXTERN void __kmpc_flush(kmp_Ident *loc) {
|
||||
|
||||
EXTERN int32_t __kmpc_warp_active_thread_mask() {
|
||||
PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");
|
||||
return __ACTIVEMASK();
|
||||
return __kmpc_impl_activemask();
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -48,6 +48,16 @@ INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); }
|
||||
#error CUDA_VERSION macro is undefined, something wrong with cuda.
|
||||
#endif
|
||||
|
||||
// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
|
||||
|
||||
INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
|
||||
#if CUDA_VERSION >= 9000
|
||||
return __activemask();
|
||||
#else
|
||||
return __ballot(1);
|
||||
#endif
|
||||
}
|
||||
|
||||
// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
|
||||
|
||||
INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
|
||||
|
Loading…
Reference in New Issue
Block a user