[OPENMP][NVPTX]Improve number of threads counter, NFC.

Summary:
Patch improves performance of the full runtime mode by moving
number-of-threads counter to the shared memory. It also allows to save
global memory.

Reviewers: grokos, gtbercea, kkwli0

Subscribers: guansong, jfb, jdoerfert, openmp-commits, caomhin

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D61785

llvm-svn: 360457
This commit is contained in:
Alexey Bataev 2019-05-10 18:56:05 +00:00
parent 6c3ae79e9b
commit f62c266de7
11 changed files with 47 additions and 83 deletions

View File

@ -45,9 +45,7 @@ EXTERN void omp_set_num_threads(int num) {
} }
EXTERN int omp_get_num_threads(void) { EXTERN int omp_get_num_threads(void) {
bool isSPMDExecutionMode = isSPMDMode(); int rc = GetNumberOfOmpThreads(isSPMDMode());
int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
int rc = GetNumberOfOmpThreads(tid, isSPMDExecutionMode);
PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc); PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc);
return rc; return rc;
} }
@ -156,10 +154,7 @@ EXTERN int omp_get_ancestor_thread_num(int level) {
int rc = -1; int rc = -1;
// If level is 0 or all parallel regions are not active - return 0. // If level is 0 or all parallel regions are not active - return 0.
unsigned parLevel = parallelLevel[GetWarpId()]; unsigned parLevel = parallelLevel[GetWarpId()];
if (level == 0 || (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) {
level <= parLevel)) {
rc = 0;
} else if (level > 0) {
int totLevel = omp_get_level(); int totLevel = omp_get_level();
if (level <= totLevel) { if (level <= totLevel) {
omptarget_nvptx_TaskDescr *currTaskDescr = omptarget_nvptx_TaskDescr *currTaskDescr =
@ -179,8 +174,7 @@ EXTERN int omp_get_ancestor_thread_num(int level) {
(currTaskDescr->IsParallelConstruct() ? "par" : "task"), (currTaskDescr->IsParallelConstruct() ? "par" : "task"),
(int)currTaskDescr->InParallelRegion(), (int)sched, (int)currTaskDescr->InParallelRegion(), (int)sched,
currTaskDescr->RuntimeChunkSize(), currTaskDescr->RuntimeChunkSize(),
(int)currTaskDescr->ThreadId(), (int)currTaskDescr->ThreadId(), (int)threadsInTeam,
(int)currTaskDescr->ThreadsInTeam(),
(int)currTaskDescr->NThreads()); (int)currTaskDescr->NThreads());
} }
@ -196,6 +190,12 @@ EXTERN int omp_get_ancestor_thread_num(int level) {
} while (currTaskDescr); } while (currTaskDescr);
ASSERT0(LT_FUSSY, !steps, "expected to find all steps"); ASSERT0(LT_FUSSY, !steps, "expected to find all steps");
} }
} else if (level == 0 ||
(level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
level <= parLevel) ||
(level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL &&
level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) {
rc = 0;
} }
PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level, PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level,
rc) rc)
@ -208,30 +208,14 @@ EXTERN int omp_get_team_size(int level) {
int rc = -1; int rc = -1;
unsigned parLevel = parallelLevel[GetWarpId()]; unsigned parLevel = parallelLevel[GetWarpId()];
// If level is 0 or all parallel regions are not active - return 1. // If level is 0 or all parallel regions are not active - return 1.
if (level == 0 || (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) {
level <= parLevel)) { rc = threadsInTeam;
} else if (level == 0 ||
(level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
level <= parLevel) ||
(level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL &&
level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) {
rc = 1; rc = 1;
} else if (level > 0) {
int totLevel = omp_get_level();
if (level <= totLevel) {
omptarget_nvptx_TaskDescr *currTaskDescr =
getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
int steps = totLevel - level;
ASSERT0(LT_FUSSY, currTaskDescr,
"do not expect fct to be called in a non-active thread");
do {
if (currTaskDescr->IsParallelConstruct()) {
if (!steps) {
// found the level
rc = currTaskDescr->ThreadsInTeam();
break;
}
steps--;
}
currTaskDescr = currTaskDescr->GetPrevTaskDescr();
} while (currTaskDescr);
ASSERT0(LT_FUSSY, !steps, "expected to find all steps");
}
} }
PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc) PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc)
return rc; return rc;

View File

@ -99,12 +99,9 @@ public:
// When IsRuntimeUninitialized is true, we assume that the caller is // When IsRuntimeUninitialized is true, we assume that the caller is
// in an L0 parallel region and that all worker threads participate. // in an L0 parallel region and that all worker threads participate.
int tid = GetLogicalThreadIdInBlock(IsSPMDExecutionMode);
// Assume we are in teams region or that we use a single block // Assume we are in teams region or that we use a single block
// per target region // per target region
ST numberOfActiveOMPThreads = ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode);
GetNumberOfOmpThreads(tid, IsSPMDExecutionMode);
// All warps that are in excess of the maximum requested, do // All warps that are in excess of the maximum requested, do
// not execute the loop // not execute the loop
@ -212,7 +209,7 @@ public:
} }
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
T tnum = currTaskDescr->ThreadsInTeam(); T tnum = GetNumberOfOmpThreads(checkSPMDMode(loc));
T tripCount = ub - lb + 1; // +1 because ub is inclusive T tripCount = ub - lb + 1; // +1 because ub is inclusive
ASSERT0(LT_FUSSY, threadId < tnum, ASSERT0(LT_FUSSY, threadId < tnum,
"current thread is not needed here; error"); "current thread is not needed here; error");
@ -455,7 +452,7 @@ public:
// automatically selects thread or warp ID based on selected implementation // automatically selects thread or warp ID based on selected implementation
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc)), ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(checkSPMDMode(loc)),
"current thread is not needed here; error"); "current thread is not needed here; error");
// retrieve schedule // retrieve schedule
kmp_sched_t schedule = kmp_sched_t schedule =
@ -509,7 +506,7 @@ public:
PRINT(LD_LOOP, PRINT(LD_LOOP,
"Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, " "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
"last %d\n", "last %d\n",
(int)GetNumberOfOmpThreads(tid, isSPMDMode()), (int)GetNumberOfOmpThreads(isSPMDMode()),
(int)GetNumberOfWorkersInTeam(), (long long)*plower, (int)GetNumberOfWorkersInTeam(), (long long)*plower,
(long long)*pupper, (long long)*pstride, (int)*plast); (long long)*pupper, (long long)*pstride, (int)*plast);
return DISPATCH_NOTFINISHED; return DISPATCH_NOTFINISHED;
@ -782,8 +779,7 @@ EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid,
"Expected non-SPMD mode + initialized runtime."); "Expected non-SPMD mode + initialized runtime.");
omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); uint32_t NumThreads = GetNumberOfOmpThreads(checkSPMDMode(loc));
uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc));
uint64_t *Buffer = teamDescr.getLastprivateIterBuffer(); uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
for (unsigned i = 0; i < varNum; i++) { for (unsigned i = 0; i < varNum; i++) {
// Reset buffer. // Reset buffer.

View File

@ -33,6 +33,7 @@ __device__ __shared__ uint32_t usedSlotIdx;
__device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; __device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
__device__ __shared__ uint16_t threadLimit; __device__ __shared__ uint16_t threadLimit;
__device__ __shared__ uint16_t threadsInTeam;
// Pointer to this team's OpenMP state object // Pointer to this team's OpenMP state object
__device__ __shared__ __device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;

View File

@ -137,8 +137,7 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
omptarget_nvptx_TaskDescr *newTaskDescr = omptarget_nvptx_TaskDescr *newTaskDescr =
omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
newTaskDescr->InitLevelOneTaskDescr(ThreadLimit, newTaskDescr->InitLevelOneTaskDescr(currTeamDescr.LevelZeroTaskDescr());
currTeamDescr.LevelZeroTaskDescr());
// install new top descriptor // install new top descriptor
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
newTaskDescr); newTaskDescr);
@ -147,7 +146,7 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
PRINT(LD_PAR, PRINT(LD_PAR,
"thread will execute parallel region with id %d in a team of " "thread will execute parallel region with id %d in a team of "
"%d threads\n", "%d threads\n",
(int)newTaskDescr->ThreadId(), (int)newTaskDescr->ThreadsInTeam()); (int)newTaskDescr->ThreadId(), (int)ThreadLimit);
if (RequiresDataSharing && GetLaneId() == 0) { if (RequiresDataSharing && GetLaneId() == 0) {
// Warp master innitializes data sharing environment. // Warp master innitializes data sharing environment.

View File

@ -166,7 +166,6 @@ public:
// methods for other fields // methods for other fields
INLINE uint16_t &NThreads() { return items.nthreads; } INLINE uint16_t &NThreads() { return items.nthreads; }
INLINE uint16_t &ThreadId() { return items.threadId; } INLINE uint16_t &ThreadId() { return items.threadId; }
INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; }
INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; } INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
@ -174,14 +173,12 @@ public:
} }
// init & copy // init & copy
INLINE void InitLevelZeroTaskDescr(bool isSPMDExecutionMode); INLINE void InitLevelZeroTaskDescr(bool isSPMDExecutionMode);
INLINE void InitLevelOneTaskDescr(uint16_t tnum, INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr);
omptarget_nvptx_TaskDescr *parentTaskDescr);
INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr, INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr);
uint16_t tnum);
INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
uint16_t tid, uint16_t tnum); uint16_t tid, uint16_t tnum);
@ -213,7 +210,6 @@ private:
uint8_t unused; uint8_t unused;
uint16_t nthreads; // thread num for subsequent parallel regions uint16_t nthreads; // thread num for subsequent parallel regions
uint16_t threadId; // thread id uint16_t threadId; // thread id
uint16_t threadsInTeam; // threads in current team
uint64_t runtimeChunkSize; // runtime chunk size uint64_t runtimeChunkSize; // runtime chunk size
} items; } items;
omptarget_nvptx_TaskDescr *prev; omptarget_nvptx_TaskDescr *prev;
@ -407,6 +403,7 @@ extern __device__ __shared__ uint32_t usedSlotIdx;
extern __device__ __shared__ uint8_t extern __device__ __shared__ uint8_t
parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
extern __device__ __shared__ uint16_t threadLimit; extern __device__ __shared__ uint16_t threadLimit;
extern __device__ __shared__ uint16_t threadsInTeam;
extern __device__ __shared__ extern __device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;

View File

@ -42,14 +42,13 @@ omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr(bool isSPMDExecutionMode) {
items.nthreads = GetNumberOfProcsInTeam(isSPMDExecutionMode); items.nthreads = GetNumberOfProcsInTeam(isSPMDExecutionMode);
; // threads: whatever was alloc by kernel ; // threads: whatever was alloc by kernel
items.threadId = 0; // is master items.threadId = 0; // is master
items.threadsInTeam = 1; // sequential
items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1 items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
} }
// This is called when all threads are started together in SPMD mode. // This is called when all threads are started together in SPMD mode.
// OMP directives include target parallel, target distribute parallel for, etc. // OMP directives include target parallel, target distribute parallel for, etc.
INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr( INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
uint16_t tnum, omptarget_nvptx_TaskDescr *parentTaskDescr) { omptarget_nvptx_TaskDescr *parentTaskDescr) {
// slow method // slow method
// flag: // flag:
// default sched is static, // default sched is static,
@ -61,7 +60,6 @@ INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
items.nthreads = 0; // # threads for subsequent parallel region items.nthreads = 0; // # threads for subsequent parallel region
items.threadId = items.threadId =
GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
items.threadsInTeam = tnum;
items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1 items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
prev = parentTaskDescr; prev = parentTaskDescr;
} }
@ -91,12 +89,11 @@ INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
} }
INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr( INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
omptarget_nvptx_TaskDescr *masterTaskDescr, uint16_t tnum) { omptarget_nvptx_TaskDescr *masterTaskDescr) {
CopyParent(masterTaskDescr); CopyParent(masterTaskDescr);
// overrwrite specific items; // overrwrite specific items;
items.flags |= items.flags |=
TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
items.threadsInTeam = tnum; // set number of threads
} }
INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr( INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
@ -121,7 +118,6 @@ INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) { omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
CopyParent(parentTaskDescr); CopyParent(parentTaskDescr);
items.flags |= TaskDescr_InParL2P; // In L2+ parallelism items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
items.threadsInTeam = tnum; // set number of threads
items.threadId = tid; items.threadId = tid;
} }

View File

@ -264,7 +264,8 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
// Set number of threads on work descriptor. // Set number of threads on work descriptor.
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr, NumThreads); workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr);
threadsInTeam = NumThreads;
} }
// All workers call this function. Deactivate those not needed. // All workers call this function. Deactivate those not needed.
@ -294,7 +295,7 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
// Set to true for workers participating in the parallel region. // Set to true for workers participating in the parallel region.
bool isActive = false; bool isActive = false;
// Initialize state for active threads. // Initialize state for active threads.
if (threadId < workDescr.WorkTaskDescr()->ThreadsInTeam()) { if (threadId < threadsInTeam) {
// init work descriptor from workdesccr // init work descriptor from workdesccr
omptarget_nvptx_TaskDescr *newTaskDescr = omptarget_nvptx_TaskDescr *newTaskDescr =
omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
@ -310,7 +311,7 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
(int)newTaskDescr->ThreadId(), (int)newTaskDescr->NThreads()); (int)newTaskDescr->ThreadId(), (int)newTaskDescr->NThreads());
isActive = true; isActive = true;
IncParallelLevel(workDescr.WorkTaskDescr()->ThreadsInTeam() != 1); IncParallelLevel(threadsInTeam != 1);
} }
return isActive; return isActive;
@ -328,7 +329,7 @@ EXTERN void __kmpc_kernel_end_parallel() {
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, currTaskDescr->GetPrevTaskDescr()); threadId, currTaskDescr->GetPrevTaskDescr());
DecParallelLevel(currTaskDescr->ThreadsInTeam() != 1); DecParallelLevel(threadsInTeam != 1);
} }
//////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////
@ -367,7 +368,6 @@ EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
// - each thread becomes ID 0 in its serialized parallel, and // - each thread becomes ID 0 in its serialized parallel, and
// - there is only one thread per team // - there is only one thread per team
newTaskDescr->ThreadId() = 0; newTaskDescr->ThreadId() = 0;
newTaskDescr->ThreadsInTeam() = 1;
// set new task descriptor as top // set new task descriptor as top
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,

View File

@ -20,8 +20,7 @@
EXTERN EXTERN
int32_t __gpu_block_reduce() { int32_t __gpu_block_reduce() {
bool isSPMDExecutionMode = isSPMDMode(); bool isSPMDExecutionMode = isSPMDMode();
int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); int nt = GetNumberOfOmpThreads(isSPMDExecutionMode);
int nt = GetNumberOfOmpThreads(tid, isSPMDExecutionMode);
if (nt != blockDim.x) if (nt != blockDim.x)
return 0; return 0;
unsigned tnum = __ACTIVEMASK(); unsigned tnum = __ACTIVEMASK();
@ -39,7 +38,7 @@ int32_t __kmpc_reduce_gpu(kmp_Ident *loc, int32_t global_tid, int32_t num_vars,
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
int numthread; int numthread;
if (currTaskDescr->IsParallelConstruct()) { if (currTaskDescr->IsParallelConstruct()) {
numthread = GetNumberOfOmpThreads(threadId, checkSPMDMode(loc)); numthread = GetNumberOfOmpThreads(checkSPMDMode(loc));
} else { } else {
numthread = GetNumberOfOmpTeams(); numthread = GetNumberOfOmpTeams();
} }
@ -147,8 +146,7 @@ static int32_t nvptx_parallel_reduce_nowait(
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
bool isSPMDExecutionMode, bool isRuntimeUninitialized) { bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
uint32_t NumThreads = uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
GetNumberOfOmpThreads(BlockThreadId, isSPMDExecutionMode);
if (NumThreads == 1) if (NumThreads == 1)
return 1; return 1;
/* /*
@ -279,9 +277,8 @@ static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
// In generic mode only the team master participates in the teams // In generic mode only the team master participates in the teams
// reduction because the workers are waiting for parallel work. // reduction because the workers are waiting for parallel work.
uint32_t NumThreads = uint32_t NumThreads =
isSPMDExecutionMode isSPMDExecutionMode ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true) : /*Master thread only*/ 1;
: /*Master thread only*/ 1;
uint32_t TeamId = GetBlockIdInKernel(); uint32_t TeamId = GetBlockIdInKernel();
uint32_t NumTeams = GetNumberOfBlocksInKernel(); uint32_t NumTeams = GetNumberOfBlocksInKernel();
__shared__ volatile bool IsLastTeam; __shared__ volatile bool IsLastTeam;
@ -473,9 +470,8 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
// In generic mode only the team master participates in the teams // In generic mode only the team master participates in the teams
// reduction because the workers are waiting for parallel work. // reduction because the workers are waiting for parallel work.
uint32_t NumThreads = uint32_t NumThreads =
checkSPMDMode(loc) checkSPMDMode(loc) ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true) : /*Master thread only*/ 1;
: /*Master thread only*/ 1;
uint32_t TeamId = GetBlockIdInKernel(); uint32_t TeamId = GetBlockIdInKernel();
uint32_t NumTeams = GetNumberOfBlocksInKernel(); uint32_t NumTeams = GetNumberOfBlocksInKernel();
__shared__ unsigned Bound; __shared__ unsigned Bound;

View File

@ -54,8 +54,7 @@ INLINE int GetOmpThreadId(int threadId,
INLINE int GetOmpTeamId(); // omp_team_num INLINE int GetOmpTeamId(); // omp_team_num
// get OpenMP number of threads and team // get OpenMP number of threads and team
INLINE int GetNumberOfOmpThreads(int threadId, INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
bool isSPMDExecutionMode); // omp_num_threads
INLINE int GetNumberOfOmpTeams(); // omp_num_teams INLINE int GetNumberOfOmpTeams(); // omp_num_teams
// get OpenMP number of procs // get OpenMP number of procs

View File

@ -165,18 +165,16 @@ INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) {
return rc; return rc;
} }
INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode) { INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) {
// omp_num_threads // omp_num_threads
int rc; int rc;
if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) { int Level = parallelLevel[GetWarpId()];
if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) {
rc = 1; rc = 1;
} else if (isSPMDExecutionMode) { } else if (isSPMDExecutionMode) {
rc = GetNumberOfThreadsInBlock(); rc = GetNumberOfThreadsInBlock();
} else { } else {
omptarget_nvptx_TaskDescr *currTaskDescr = rc = threadsInTeam;
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
rc = currTaskDescr->ThreadsInTeam();
} }
return rc; return rc;

View File

@ -46,10 +46,8 @@ EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
__kmpc_barrier_simple_spmd(loc_ref, tid); __kmpc_barrier_simple_spmd(loc_ref, tid);
} else { } else {
tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref)); tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref));
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
int numberOfActiveOMPThreads = int numberOfActiveOMPThreads =
GetNumberOfOmpThreads(tid, checkSPMDMode(loc_ref)); GetNumberOfOmpThreads(checkSPMDMode(loc_ref));
if (numberOfActiveOMPThreads > 1) { if (numberOfActiveOMPThreads > 1) {
if (checkSPMDMode(loc_ref)) { if (checkSPMDMode(loc_ref)) {
__kmpc_barrier_simple_spmd(loc_ref, tid); __kmpc_barrier_simple_spmd(loc_ref, tid);