[OPENMP][NVPTX]Fix dynamic scheduling.

Summary: The previous implementation could cause a runtime crash when the number of teams is > 1024. This patch fixes that problem and reduces the number of atomic operations by a factor of 32. Reviewers: grokos, gtbercea, kkwli0 Subscribers: guansong, jfb, openmp-commits, caomhin Differential Revision: https://reviews.llvm.org/D56332 llvm-svn: 350524
2024-11-28 16:11:29 +00:00 · 2019-01-07 14:25:25 +00:00 · 2019-01-07 14:25:25 +00:00 · 26e6c86b79
commit 26e6c86b79
parent 2cd40c0170
2 changed files with 47 additions and 22 deletions
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
@ -352,18 +352,18 @@ public:
                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
            (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
                tid));
-
    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
-      __kmpc_barrier(loc, threadId);
-      // save sched state
-      int teamId = GetOmpTeamId();
+      // save data
      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
-      if (GetThreadIdInBlock() == 0) {
-        if (chunk < 1)
-          chunk = 1;
-        omptarget_nvptx_threadPrivateContext->Chunk(teamId) = chunk;
-        omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId) = ub;
-        omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId) = lb;
+      if (chunk < 1)
+        chunk = 1;
+      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
+      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
+      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
+      __kmpc_barrier(loc, threadId);
+      if (tid == 0) {
+        omptarget_nvptx_threadPrivateContext->Cnt() = 0;
+        __threadfence_block();
      }
      __kmpc_barrier(loc, threadId);
      PRINT(LD_LOOP,
@ -371,21 +371,45 @@ public:
            ", chunk %" PRIu64 "\n",
            (int)tnum,
            (unsigned long long)
-                omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
-            omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId),
-            omptarget_nvptx_threadPrivateContext->Chunk(teamId));
+                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
+            omptarget_nvptx_threadPrivateContext->Chunk(tid));
    }
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch next

+  // Shuffles a 64-bit value across lanes by splitting it into two 32-bit
+  // halves, passing each half through __SHFL_SYNC, and reassembling the result.
+  //   active - lane mask forwarded unchanged to __SHFL_SYNC.
+  //   val    - 64-bit value contributed by the calling lane.
+  //   leader - lane argument forwarded unchanged to __SHFL_SYNC.
+  // Returns the 64-bit value rebuilt from the two shuffled halves.
+  // NOTE(review): __SHFL_SYNC is defined outside this hunk; presumably it wraps
+  // the warp shuffle intrinsic so every lane receives `leader`'s value --
+  // confirm against its definition.
+  INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
+    int lo, hi;
+    // Unpack the 64-bit operand into two 32-bit registers.
+    asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
+    hi = __SHFL_SYNC(active, hi, leader);
+    lo = __SHFL_SYNC(active, lo, leader);
+    // Pack the two shuffled halves back into one 64-bit value.
+    asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
+    return val;
+  }
+
+  // Hands out the next iteration index from the counter returned by Cnt(),
+  // issuing a single atomicAdd for the whole group of lanes in `active`
+  // instead of one per lane.
+  // Returns the counter value fetched by the rank-0 lane plus this lane's rank
+  // within `active`.
+  // NOTE(review): __ACTIVEMASK is defined outside this hunk; presumably it
+  // yields the mask of currently active lanes in the warp -- confirm.
+  INLINE static uint64_t NextIter() {
+    unsigned int active = __ACTIVEMASK();
+    // Index of the lowest set bit in `active` (__ffs is 1-based).
+    int leader = __ffs(active) - 1;
+    // Number of set bits in `active`; this is how far the counter advances.
+    int change = __popc(active);
+    unsigned lane_mask_lt;
+    // Read the %lanemask_lt special register.
+    asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt));
+    // Count of bits set in both `active` and the lanemask_lt value; the lane
+    // for which this is 0 performs the atomic below.
+    unsigned int rank = __popc(active & lane_mask_lt);
+    // NOTE(review): warp_res has no initializer and is assigned only when
+    // rank == 0, yet every lane passes it by value to Shuffle below. The
+    // result is overwritten by Shuffle's return value, but consider
+    // initializing it to avoid reading an indeterminate value.
+    uint64_t warp_res;
+    if (rank == 0) {
+      // atomicAdd returns the value held before the addition.
+      warp_res = atomicAdd(
+          (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(),
+          change);
+    }
+    // Replace each lane's warp_res with the value Shuffle returns for `leader`.
+    warp_res = Shuffle(active, warp_res, leader);
+    return warp_res + rank;
+  }
+
  INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
-                                     int64_t &loopLowerBound,
-                                     T loopUpperBound) {
-    // calculate lower bound for all lanes in the warp
-    lb = atomicAdd((unsigned long long *)&loopLowerBound,
-                   (unsigned long long)chunkSize);
+                                     T loopLowerBound, T loopUpperBound) {
+    T N = NextIter();
+    lb = loopLowerBound + N * chunkSize;
    ub = lb + chunkSize - 1;  // Clang uses i <= ub

    // 3 result cases:
@ -461,11 +485,10 @@ public:
            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
            "bad sched");
    T myLb, myUb;
-    int teamId = GetOmpTeamId();
    int finished = DynamicNextChunk(
-        myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(teamId),
-        omptarget_nvptx_threadPrivateContext->NextLowerBound(teamId),
-        omptarget_nvptx_threadPrivateContext->LoopUpperBound(teamId));
+        myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid),
+        omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
+        omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));

    if (finished == FINISHED)
      return DISPATCH_FINISHED;
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@ -344,6 +344,7 @@ public:
  INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }

  INLINE void InitThreadPrivateContext(int tid);
+  INLINE uint64_t &Cnt() { return cnt; }

 private:
  // team context for this team
@ -366,6 +367,7 @@ private:
  // state for dispatch with dyn/guided OR static (never use both at a time)
  int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
  int64_t stride[MAX_THREADS_PER_TEAM];
+  uint64_t cnt;
 };

 /// Device envrionment data