mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2024-11-28 16:11:29 +00:00
[OPENMP][NVPTX]Fix dynamic scheduling.
Summary: The previous implementation could crash at runtime when the number of teams was greater than 1024. This patch fixes that problem and also reduces the number of atomic operations by a factor of 32. Reviewers: grokos, gtbercea, kkwli0 Subscribers: guansong, jfb, openmp-commits, caomhin Differential Revision: https://reviews.llvm.org/D56332 llvm-svn: 350524
This commit is contained in:
parent
2cd40c0170
commit
26e6c86b79
@ -352,18 +352,18 @@ public:
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
|
||||
(unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
|
||||
tid));
|
||||
|
||||
} else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
|
||||
__kmpc_barrier(loc, threadId);
|
||||
// save sched state
|
||||
int teamId = GetOmpTeamId();
|
||||
// save data
|
||||
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
|
||||
if (GetThreadIdInBlock() == 0) {
|
||||
if (chunk < 1)
|
||||
chunk = 1;
|
||||
omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk;
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
|
||||
if (chunk < 1)
|
||||
chunk = 1;
|
||||
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
|
||||
__kmpc_barrier(loc, threadId);
|
||||
if (tid == 0) {
|
||||
omptarget_nvptx_threadPrivateContext->Cnt() = 0;
|
||||
__threadfence_block();
|
||||
}
|
||||
__kmpc_barrier(loc, threadId);
|
||||
PRINT(LD_LOOP,
|
||||
@ -371,21 +371,45 @@ public:
|
||||
", chunk %" PRIu64 "\n",
|
||||
(int)tnum,
|
||||
(unsigned long long)
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId),
|
||||
omptarget_nvptx_threadPrivateContext->Chunk(teamId));
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->Chunk(tid));
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Support for dispatch next
|
||||
|
||||
// Returns, in every participating lane, a 64-bit value assembled from the two
// 32-bit halves that __SHFL_SYNC delivers from lane `leader`.
//   active - lane mask passed through to __SHFL_SYNC
//   val    - this lane's 64-bit input; overwritten with the shuffled result
//   leader - source lane index passed through to __SHFL_SYNC
// NOTE(review): __SHFL_SYNC is defined outside this view; presumably it wraps
// the warp shuffle intrinsic and moves one 32-bit value per call, which is why
// the value is split into halves — confirm.
INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
|
||||
int lo, hi;
|
||||
// Unpack the 64-bit value into two 32-bit registers (lo, hi).
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
|
||||
hi = __SHFL_SYNC(active, hi, leader);
|
||||
lo = __SHFL_SYNC(active, lo, leader);
|
||||
// Repack the two shuffled halves into a single 64-bit value.
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
|
||||
return val;
|
||||
}
|
||||
|
||||
// Returns an iteration number for the calling lane, taken from the counter
// exposed by omptarget_nvptx_threadPrivateContext->Cnt().
// Only the lane with rank 0 performs the atomicAdd; it advances the counter by
// the number of lanes set in `active`, and every lane then returns the
// pre-increment counter value plus its own rank.
// NOTE(review): __ACTIVEMASK() is defined outside this view; presumably it
// yields the mask of lanes currently executing together — confirm.
INLINE static uint64_t NextIter() {
|
||||
unsigned int active = __ACTIVEMASK();
|
||||
// Index of the lowest set bit in `active`; this is the lane whose rank is 0.
int leader = __ffs(active) - 1;
|
||||
// Number of lanes set in `active`; the counter is advanced by this amount.
int change = __popc(active);
|
||||
unsigned lane_mask_lt;
|
||||
// Read the PTX special register holding the mask of lanes with a lower lane
// id than the current one.
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt));
|
||||
// rank = number of lanes in `active` that have a lower lane id than this one.
unsigned int rank = __popc(active & lane_mask_lt);
|
||||
// NOTE(review): warp_res is assigned only on the rank-0 lane below; the other
// lanes pass an unassigned value to Shuffle, which replaces it with the value
// shuffled from `leader`. Consider initializing it — confirm.
uint64_t warp_res;
|
||||
if (rank == 0) {
|
||||
warp_res = atomicAdd(
|
||||
(unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(),
|
||||
change);
|
||||
}
|
||||
// Distribute the value obtained by the rank-0 lane to the lanes in `active`.
warp_res = Shuffle(active, warp_res, leader);
|
||||
return warp_res + rank;
|
||||
}
|
||||
|
||||
INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
|
||||
int64_t &loopLowerBound,
|
||||
T loopUpperBound) {
|
||||
// calculate lower bound for all lanes in the warp
|
||||
lb = atomicAdd((unsigned long long *)&loopLowerBound,
|
||||
(unsigned long long)chunkSize);
|
||||
T loopLowerBound, T loopUpperBound) {
|
||||
T N = NextIter();
|
||||
lb = loopLowerBound + N * chunkSize;
|
||||
ub = lb + chunkSize - 1; // Clang uses i <= ub
|
||||
|
||||
// 3 result cases:
|
||||
@ -461,11 +485,10 @@ public:
|
||||
schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
|
||||
"bad sched");
|
||||
T myLb, myUb;
|
||||
int teamId = GetOmpTeamId();
|
||||
int finished = DynamicNextChunk(
|
||||
myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(teamId),
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId));
|
||||
myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid),
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
|
||||
|
||||
if (finished == FINISHED)
|
||||
return DISPATCH_FINISHED;
|
||||
|
@ -344,6 +344,7 @@ public:
|
||||
INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
|
||||
|
||||
INLINE void InitThreadPrivateContext(int tid);
|
||||
// Returns a mutable reference to the `cnt` member, so callers can both assign
// to it and take its address.
INLINE uint64_t &Cnt() { return cnt; }
|
||||
|
||||
private:
|
||||
// team context for this team
|
||||
@ -366,6 +367,7 @@ private:
|
||||
// state for dispatch with dyn/guided OR static (never use both at a time)
|
||||
int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
|
||||
int64_t stride[MAX_THREADS_PER_TEAM];
|
||||
uint64_t cnt;
|
||||
};
|
||||
|
||||
/// Device environment data
|
||||
|
Loading…
Reference in New Issue
Block a user