[OpenMP] Simplify parallel reductions (#70983)

A lot of the code dates from a time when we had multiple parallel levels. The new runtime is much simpler, so the code can be simplified considerably, which should also speed up reductions.
2024-11-26 23:21:11 +00:00 · 2023-11-02 15:50:05 -07:00 · 2023-11-02 15:50:05 -07:00 · e9a48f9e05
commit e9a48f9e05
parent eab828d46c
2 changed files with 46 additions and 97 deletions
--- a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
+++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp
@ -44,119 +44,45 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
  }
 }

-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
-static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
-                                          ShuffleReductFnTy shflFct) {
-  uint32_t size, remote_id, physical_lane_id;
-  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
-  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
-  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
-  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
-  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
-  do {
-    Liveness = mapping::activemask();
-    remote_id = utils::ffs(Liveness & lanemask_gt);
-    size = utils::popc(Liveness);
-    logical_lane_id /= 2;
-    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
-            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
-  } while (logical_lane_id % 2 == 0 && size > 1);
-  return (logical_lane_id == 0);
-}
-#endif
-
-static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
-                                            uint64_t reduce_size,
-                                            void *reduce_data,
+static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                            ShuffleReductFnTy shflFct,
-                                            InterWarpCopyFnTy cpyFct,
-                                            bool isSPMDExecutionMode, bool) {
-  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
-  if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false))
-    BlockThreadId = 0;
+                                            InterWarpCopyFnTy cpyFct) {
  uint32_t NumThreads = omp_get_num_threads();
+  // Handle degenerated parallel regions, including all nested ones, first.
  if (NumThreads == 1)
    return 1;
-    /*
-     * This reduce function handles reduction within a team. It handles
-     * parallel regions in both L1 and L2 parallelism levels. It also
-     * supports Generic, SPMD, and NoOMP modes.
-     *
-     * 1. Reduce within a warp.
-     * 2. Warp master copies value to warp 0 via shared memory.
-     * 3. Warp 0 reduces to a single value.
-     * 4. The reduced value is available in the thread that returns 1.
-     */

-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  uint32_t WarpsNeeded =
+  /*
+   * 1. Reduce within a warp.
+   * 2. Warp master copies value to warp 0 via shared memory.
+   * 3. Warp 0 reduces to a single value.
+   * 4. The reduced value is available in the thread that returns 1.
+   */
+
+  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
+  uint32_t NumWarps =
      (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-  uint32_t WarpId = mapping::getWarpIdInBlock();

-  // Volta execution model:
  // For the Generic execution mode a parallel region either has 1 thread and
  // beyond that, always a multiple of 32. For the SPMD execution mode we may
  // have any number of threads.
-  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
+  gpu_regular_warp_reduce(reduce_data, shflFct);

  // When we have more than [mapping::getWarpSize()] number of threads
  // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
  if (NumThreads > mapping::getWarpSize()) {
    // Gather all the reduced values from each warp
    // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
+    cpyFct(reduce_data, NumWarps);

-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
+    if (BlockThreadId < mapping::getWarpSize())
+      gpu_irregular_warp_reduce(reduce_data, shflFct, NumWarps, BlockThreadId);
  }
+
+  // In Generic and in SPMD mode block thread Id 0 is what we want.
+  // It's either the main thread in SPMD mode or the "acting" main thread in the
+  // parallel region.
  return BlockThreadId == 0;
-#else
-  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
-  if (Liveness == lanes::All) // Full warp
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/utils::popc(Liveness),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
-  else { // Dispersed lanes. Only threads in L2
-         // parallel region may enter here; return
-         // early.
-    return gpu_irregular_simd_reduce(reduce_data, shflFct);
-  }
-
-  // When we have more than [mapping::getWarpSize()] number of threads
-  // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
-  if (NumThreads > mapping::getWarpSize()) {
-    uint32_t WarpsNeeded =
-        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-    // Gather all the reduced values from each warp
-    // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
-
-    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
-
-    return BlockThreadId == 0;
-  }
-
-  // Get the OMP thread Id. This is different from BlockThreadId in the case of
-  // an L2 parallel region.
-  return TId == 0;
-#endif // __CUDA_ARCH__ >= 700
 }

 uint32_t roundToWarpsize(uint32_t s) {
@ -173,9 +99,7 @@ extern "C" {
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
    void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
-  return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
-                                      shflFct, cpyFct, mapping::isSPMDMode(),
-                                      false);
+  return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
 }

 /// Mostly like _v2 but with the builtin assumption that we have less than
--- a/openmp/libomptarget/test/offloading/generic_reduction.c
+++ b/openmp/libomptarget/test/offloading/generic_reduction.c
@ -0,0 +1,25 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compileoptxx-run-and-check-generic
+
+#include <omp.h>
+#include <stdio.h>
+__attribute__((optnone)) void optnone(void) {}
+
+int main() {
+  int sum = 0, nt;
+#pragma omp target teams map(tofrom : sum, nt) num_teams(1)
+  {
+    nt = 3 * omp_get_max_threads();
+    optnone();
+#pragma omp parallel reduction(+ : sum)
+    sum += 1;
+#pragma omp parallel reduction(+ : sum)
+    sum += 1;
+#pragma omp parallel reduction(+ : sum)
+    sum += 1;
+  }
+  // CHECK: nt: [[NT:.*]]
+  // CHECK: sum: [[NT]]
+  printf("nt: %i\n", nt);
+  printf("sum: %i\n", sum);
+}