[OPENMP][NVPTX]Fix dynamic scheduling.

Summary:
The previous implementation may cause a runtime crash when the number of
teams is > 1024. This patch fixes that problem and reduces the number of
atomic operations by a factor of 32.

Reviewers: grokos, gtbercea, kkwli0

Subscribers: guansong, jfb, openmp-commits, caomhin

Differential Revision: https://reviews.llvm.org/D56332

llvm-svn: 350524
This commit is contained in:
Alexey Bataev 2019-01-07 14:25:25 +00:00
parent 2cd40c0170
commit 26e6c86b79
2 changed files with 47 additions and 22 deletions

View File

@ -352,18 +352,18 @@ public:
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
(unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
tid));
} else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
__kmpc_barrier(loc, threadId);
// save sched state
int teamId = GetOmpTeamId();
// save data
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
if (GetThreadIdInBlock() == 0) {
if (chunk < 1)
chunk = 1;
omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk;
omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
if (chunk < 1)
chunk = 1;
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
__kmpc_barrier(loc, threadId);
if (tid == 0) {
omptarget_nvptx_threadPrivateContext->Cnt() = 0;
__threadfence_block();
}
__kmpc_barrier(loc, threadId);
PRINT(LD_LOOP,
@ -371,21 +371,45 @@ public:
", chunk %" PRIu64 "\n",
(int)tnum,
(unsigned long long)
omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId),
omptarget_nvptx_threadPrivateContext->Chunk(teamId));
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
omptarget_nvptx_threadPrivateContext->Chunk(tid));
}
}
////////////////////////////////////////////////////////////////////////////////
// Support for dispatch next
/// Pass a 64-bit value through __SHFL_SYNC as two 32-bit halves.
///
/// The value is unpacked into two 32-bit registers, each half is handed to
/// __SHFL_SYNC with the same mask and source lane, and the two results are
/// packed back into one 64-bit value.
/// NOTE(review): __SHFL_SYNC is defined elsewhere; presumably it wraps the
/// CUDA __shfl_sync intrinsic so every participating lane ends up with the
/// value held by lane `leader` -- confirm against its definition.
///
/// \param active mask forwarded to both __SHFL_SYNC calls.
/// \param val    64-bit value contributed by the calling lane.
/// \param leader lane argument forwarded to both __SHFL_SYNC calls.
/// \return the 64-bit value rebuilt from the two shuffled halves.
INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
  int low_word, high_word;
  // Unpack the 64-bit input into a pair of 32-bit registers.
  asm volatile("mov.b64 {%0,%1}, %2;"
               : "=r"(low_word), "=r"(high_word)
               : "l"(val));
  // Shuffle each half separately (high half first, then low half).
  high_word = __SHFL_SYNC(active, high_word, leader);
  low_word = __SHFL_SYNC(active, low_word, leader);
  // Pack the shuffled halves back into a single 64-bit value.
  int64_t merged;
  asm volatile("mov.b64 %0, {%1,%2};"
               : "=l"(merged)
               : "r"(low_word), "r"(high_word));
  return merged;
}
/// Reserve the next iteration number(s) from the shared counter Cnt() using
/// one atomicAdd for all lanes that are active at the call.
///
/// The lane with rank 0 (no active lane below it) adds the number of active
/// lanes to Cnt() and receives the counter's previous value. That value is
/// then distributed through Shuffle, and every lane returns it plus its own
/// rank among the active lanes, so the lanes get consecutive numbers.
///
/// \return the iteration number assigned to the calling lane.
INLINE static uint64_t NextIter() {
  // Mask of the lanes executing this call together.
  unsigned int active = __ACTIVEMASK();
  // Index of the lowest set bit in the mask: the lane that does the atomic.
  int leader = __ffs(active) - 1;
  // Number of active lanes == how far the shared counter must advance.
  int change = __popc(active);
  // Mask of all lanes with an index lower than the calling lane's.
  unsigned lane_mask_lt;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt));
  // Position of this lane among the active lanes (0 for the leader).
  unsigned int rank = __popc(active & lane_mask_lt);
  // Initialised for every lane: only rank 0 assigns it below, but all lanes
  // pass it by value to Shuffle, which would otherwise read an indeterminate
  // value in the non-leader lanes.
  uint64_t warp_res = 0;
  if (rank == 0) {
    warp_res = atomicAdd(
        (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(),
        change);
  }
  // Hand the leader's pre-increment counter value to the other lanes.
  warp_res = Shuffle(active, warp_res, leader);
  return warp_res + rank;
}
INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
int64_t &loopLowerBound,
T loopUpperBound) {
// calculate lower bound for all lanes in the warp
lb = atomicAdd((unsigned long long *)&loopLowerBound,
(unsigned long long)chunkSize);
T loopLowerBound, T loopUpperBound) {
T N = NextIter();
lb = loopLowerBound + N * chunkSize;
ub = lb + chunkSize - 1; // Clang uses i <= ub
// 3 result cases:
@ -461,11 +485,10 @@ public:
schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
"bad sched");
T myLb, myUb;
int teamId = GetOmpTeamId();
int finished = DynamicNextChunk(
myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(teamId),
omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId));
myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid),
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
if (finished == FINISHED)
return DISPATCH_FINISHED;

View File

@ -344,6 +344,7 @@ public:
INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
INLINE void InitThreadPrivateContext(int tid);
INLINE uint64_t &Cnt() { return cnt; }
private:
// team context for this team
@ -366,6 +367,7 @@ private:
// state for dispatch with dyn/guided OR static (never use both at a time)
int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
int64_t stride[MAX_THREADS_PER_TEAM];
uint64_t cnt;
};
/// Device environment data