[OpenMP] Simplify parallel reductions (#70983)

A lot of this code dates from a time when the runtime supported multiple parallel levels.
The new runtime is much simpler, so the code can be simplified considerably,
which should speed up reductions as well.
Johannes Doerfert 2023-11-02 15:50:05 -07:00 committed by GitHub
parent eab828d46c
commit e9a48f9e05
2 changed files with 46 additions and 97 deletions


@@ -44,119 +44,45 @@ void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct,
   }
 }
 
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
-static uint32_t gpu_irregular_simd_reduce(void *reduce_data,
-                                          ShuffleReductFnTy shflFct) {
-  uint32_t size, remote_id, physical_lane_id;
-  physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize();
-  __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT();
-  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
-  uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2;
-  __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT();
-  do {
-    Liveness = mapping::activemask();
-    remote_id = utils::ffs(Liveness & lanemask_gt);
-    size = utils::popc(Liveness);
-    logical_lane_id /= 2;
-    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
-            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
-  } while (logical_lane_id % 2 == 0 && size > 1);
-  return (logical_lane_id == 0);
-}
-#endif
-
-static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars,
-                                            uint64_t reduce_size,
-                                            void *reduce_data,
+static int32_t nvptx_parallel_reduce_nowait(void *reduce_data,
                                             ShuffleReductFnTy shflFct,
-                                            InterWarpCopyFnTy cpyFct,
-                                            bool isSPMDExecutionMode, bool) {
-  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
-  if (mapping::isMainThreadInGenericMode(/* IsSPMD */ false))
-    BlockThreadId = 0;
+                                            InterWarpCopyFnTy cpyFct) {
   uint32_t NumThreads = omp_get_num_threads();
+  // Handle degenerated parallel regions, including all nested ones, first.
   if (NumThreads == 1)
     return 1;
 
-  /*
-   * This reduce function handles reduction within a team. It handles
-   * parallel regions in both L1 and L2 parallelism levels. It also
-   * supports Generic, SPMD, and NoOMP modes.
-   *
-   * 1. Reduce within a warp.
-   * 2. Warp master copies value to warp 0 via shared memory.
-   * 3. Warp 0 reduces to a single value.
-   * 4. The reduced value is available in the thread that returns 1.
-   */
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
-  uint32_t WarpsNeeded =
+  /*
+   * 1. Reduce within a warp.
+   * 2. Warp master copies value to warp 0 via shared memory.
+   * 3. Warp 0 reduces to a single value.
+   * 4. The reduced value is available in the thread that returns 1.
+   */
+
+  uint32_t BlockThreadId = mapping::getThreadIdInBlock();
+  uint32_t NumWarps =
       (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-  uint32_t WarpId = mapping::getWarpIdInBlock();
-
-  // Volta execution model:
-  // For the Generic execution mode a parallel region either has 1 thread and
-  // beyond that, always a multiple of 32. For the SPMD execution mode we may
-  // have any number of threads.
-  if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1))
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/NumThreads % mapping::getWarpSize(),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
+  gpu_regular_warp_reduce(reduce_data, shflFct);
 
   // When we have more than [mapping::getWarpSize()] number of threads
   // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
   if (NumThreads > mapping::getWarpSize()) {
     // Gather all the reduced values from each warp
     // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
+    cpyFct(reduce_data, NumWarps);
 
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
+    if (BlockThreadId < mapping::getWarpSize())
+      gpu_irregular_warp_reduce(reduce_data, shflFct, NumWarps, BlockThreadId);
   }
+
+  // In Generic and in SPMD mode block thread Id 0 is what we want.
+  // It's either the main thread in SPMD mode or the "acting" main thread in the
+  // parallel region.
   return BlockThreadId == 0;
-#else
-  __kmpc_impl_lanemask_t Liveness = mapping::activemask();
-  if (Liveness == lanes::All) // Full warp
-    gpu_regular_warp_reduce(reduce_data, shflFct);
-  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
-    gpu_irregular_warp_reduce(reduce_data, shflFct,
-                              /*LaneCount=*/utils::popc(Liveness),
-                              /*LaneId=*/mapping::getThreadIdInBlock() %
-                                  mapping::getWarpSize());
-  else { // Dispersed lanes. Only threads in L2
-         // parallel region may enter here; return
-         // early.
-    return gpu_irregular_simd_reduce(reduce_data, shflFct);
-  }
-
-  // When we have more than [mapping::getWarpSize()] number of threads
-  // a block reduction is performed here.
-  //
-  // Only L1 parallel region can enter this if condition.
-  if (NumThreads > mapping::getWarpSize()) {
-    uint32_t WarpsNeeded =
-        (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize();
-    // Gather all the reduced values from each warp
-    // to the first warp.
-    cpyFct(reduce_data, WarpsNeeded);
-
-    uint32_t WarpId = BlockThreadId / mapping::getWarpSize();
-    if (WarpId == 0)
-      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
-                                BlockThreadId);
-
-    return BlockThreadId == 0;
-  }
-
-  // Get the OMP thread Id. This is different from BlockThreadId in the case of
-  // an L2 parallel region.
-  return TId == 0;
-#endif // __CUDA_ARCH__ >= 700
 }
 
 uint32_t roundToWarpsize(uint32_t s) {
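
The four-step comment in the simplified nvptx_parallel_reduce_nowait describes the usual two-stage block reduction on NVIDIA GPUs: reduce within each warp, stage the warp leaders' partials, then let the first warp finish the job. For orientation, here is a minimal standalone CUDA sketch of that pattern for a plain int '+' reduction. The kernel and helper names, the use of __shfl_down_sync and shared-memory staging, and the fixed 256-thread launch are illustrative assumptions; they are not the DeviceRTL's actual shflFct/cpyFct machinery.

#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

// Step 1: reduce within a warp; after the loop, lane 0 holds the warp's sum.
// Assumes all 32 lanes of the calling warp are active.
__device__ int warpReduceSum(int Val) {
  for (int Offset = 16; Offset > 0; Offset /= 2)
    Val += __shfl_down_sync(0xffffffffu, Val, Offset);
  return Val;
}

// Steps 2-4: warp leaders publish their partial sums to shared memory, warp 0
// reduces those partials, and thread 0 combines the block's value into *Out.
// Assumes blockDim.x is a multiple of 32 and at most 1024.
__global__ void blockReduceSum(const int *In, int *Out, int N) {
  int Tid = threadIdx.x;
  int Idx = blockIdx.x * blockDim.x + Tid;
  int Val = (Idx < N) ? In[Idx] : 0;

  __shared__ int Partials[32];
  Val = warpReduceSum(Val);     // step 1
  if (Tid % 32 == 0)
    Partials[Tid / 32] = Val;   // step 2: one partial value per warp
  __syncthreads();

  if (Tid < 32) {               // step 3: warp 0 reduces the partials
    int NumWarps = blockDim.x / 32;
    Val = (Tid < NumWarps) ? Partials[Tid] : 0;
    Val = warpReduceSum(Val);
    if (Tid == 0)               // step 4: one thread owns the block's result
      atomicAdd(Out, Val);      // combine across blocks
  }
}

int main() {
  const int N = 1 << 20;
  std::vector<int> Host(N, 1);
  int *In = nullptr, *Out = nullptr;
  cudaMalloc(&In, N * sizeof(int));
  cudaMalloc(&Out, sizeof(int));
  cudaMemcpy(In, Host.data(), N * sizeof(int), cudaMemcpyHostToDevice);
  cudaMemset(Out, 0, sizeof(int));
  blockReduceSum<<<(N + 255) / 256, 256>>>(In, Out, N);
  int Sum = 0;
  cudaMemcpy(&Sum, Out, sizeof(int), cudaMemcpyDeviceToHost);
  printf("sum: %d (expected %d)\n", Sum, N);
  cudaFree(In);
  cudaFree(Out);
  return 0;
}

The DeviceRTL version follows the same control flow but is shape-agnostic: the per-element combine and the inter-warp staging are supplied by compiler-generated callbacks through the shflFct and cpyFct parameters.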
@@ -173,9 +99,7 @@ extern "C" {
 int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
     IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
     void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) {
-  return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data,
-                                      shflFct, cpyFct, mapping::isSPMDMode(),
-                                      false);
+  return nvptx_parallel_reduce_nowait(reduce_data, shflFct, cpyFct);
 }
 
 /// Mostly like _v2 but with the builtin assumption that we have less than


@@ -0,0 +1,25 @@
+// RUN: %libomptarget-compilexx-run-and-check-generic
+// RUN: %libomptarget-compileoptxx-run-and-check-generic
+
+#include <omp.h>
+#include <stdio.h>
+
+__attribute__((optnone)) void optnone(void) {}
+int main() {
+  int sum = 0, nt;
+#pragma omp target teams map(tofrom : sum, nt) num_teams(1)
+  {
+    nt = 3 * omp_get_max_threads();
+    optnone();
+#pragma omp parallel reduction(+ : sum)
+    sum += 1;
+#pragma omp parallel reduction(+ : sum)
+    sum += 1;
+#pragma omp parallel reduction(+ : sum)
+    sum += 1;
+  }
+  // CHECK: nt: [[NT:.*]]
+  // CHECK: sum: [[NT]]
+  printf("nt: %i\n", nt);
+  printf("sum: %i\n", sum);
+}
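
Sanity check of the expected output: assuming each of the three parallel regions launches omp_get_max_threads() threads (the default when no num_threads clause is given), every thread contributes one increment to the '+' reduction, so sum == 3 * omp_get_max_threads() == nt. For example, with 256 threads per team both printed values are 3 * 256 = 768, which is what the FileCheck lines capture and compare.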