[OPENMP][NVPTX]Improve number of threads counter, NFC.
Summary:
Patch improves performance of the full runtime mode by moving the
number-of-threads counter to shared memory. It also saves global memory.

Reviewers: grokos, gtbercea, kkwli0

Subscribers: guansong, jfb, jdoerfert, openmp-commits, caomhin

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D61785

llvm-svn: 360457
This commit is contained in:
  parent 6c3ae79e9b
  commit f62c266de7
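The core of the change is the reworked GetNumberOfOmpThreads(bool) in the device runtime: instead of walking a task-descriptor chain held in global memory, the non-SPMD path now does a single load of the team-wide threadsInTeam counter kept in CUDA __shared__ memory, which __kmpc_kernel_prepare_parallel publishes once per parallel region. The sketch below illustrates that access pattern in a standalone CUDA program; threadsInTeam and the SPMD/non-SPMD split mirror the patch, while the Sketch suffix, the demo kernel, and main are hypothetical additions for illustration only.

#include <cstdint>
#include <cstdio>

// One copy per thread block, mirroring the patch's team-wide counter.
__device__ __shared__ uint16_t threadsInTeam;

// Simplified analogue of GetNumberOfOmpThreads after this patch: the
// non-SPMD path is one shared-memory load instead of a walk over task
// descriptors stored in global memory. (Hypothetical name.)
__device__ int GetNumberOfOmpThreadsSketch(bool isSPMDExecutionMode) {
  if (isSPMDExecutionMode)
    return blockDim.x;  // SPMD: every thread in the block participates
  return threadsInTeam; // non-SPMD: read the block-shared counter
}

// Hypothetical kernel showing the counter's lifecycle: the block master
// publishes the value once, then all threads can read it cheaply.
__global__ void demoKernel(int requested) {
  if (threadIdx.x == 0)
    threadsInTeam = static_cast<uint16_t>(requested);
  __syncthreads();
  if (threadIdx.x == 0)
    printf("num threads = %d\n", GetNumberOfOmpThreadsSketch(false));
}

int main() {
  demoKernel<<<1, 32>>>(8);
  cudaDeviceSynchronize();
  return 0;
}

Shared memory is a per-block, low-latency resource, and since every thread in a team needs the same value, a single block-wide copy suffices; that is where both the speedup and the global-memory savings of this patch come from.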
@@ -45,9 +45,7 @@ EXTERN void omp_set_num_threads(int num) {
 }
 
 EXTERN int omp_get_num_threads(void) {
-  bool isSPMDExecutionMode = isSPMDMode();
-  int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
-  int rc = GetNumberOfOmpThreads(tid, isSPMDExecutionMode);
+  int rc = GetNumberOfOmpThreads(isSPMDMode());
   PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc);
   return rc;
 }

@@ -156,10 +154,7 @@ EXTERN int omp_get_ancestor_thread_num(int level) {
   int rc = -1;
   // If level is 0 or all parallel regions are not active - return 0.
   unsigned parLevel = parallelLevel[GetWarpId()];
-  if (level == 0 || (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
-                     level <= parLevel)) {
-    rc = 0;
-  } else if (level > 0) {
+  if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) {
     int totLevel = omp_get_level();
     if (level <= totLevel) {
       omptarget_nvptx_TaskDescr *currTaskDescr =

@@ -179,8 +174,7 @@ EXTERN int omp_get_ancestor_thread_num(int level) {
                 (currTaskDescr->IsParallelConstruct() ? "par" : "task"),
                 (int)currTaskDescr->InParallelRegion(), (int)sched,
                 currTaskDescr->RuntimeChunkSize(),
-                (int)currTaskDescr->ThreadId(),
-                (int)currTaskDescr->ThreadsInTeam(),
+                (int)currTaskDescr->ThreadId(), (int)threadsInTeam,
                 (int)currTaskDescr->NThreads());
         }
 
@@ -196,6 +190,12 @@ EXTERN int omp_get_ancestor_thread_num(int level) {
       } while (currTaskDescr);
       ASSERT0(LT_FUSSY, !steps, "expected to find all steps");
     }
+  } else if (level == 0 ||
+             (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
+              level <= parLevel) ||
+             (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL &&
+              level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) {
+    rc = 0;
   }
   PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level,
         rc)

@@ -208,30 +208,14 @@ EXTERN int omp_get_team_size(int level) {
   int rc = -1;
   unsigned parLevel = parallelLevel[GetWarpId()];
   // If level is 0 or all parallel regions are not active - return 1.
-  if (level == 0 || (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
-                     level <= parLevel)) {
+  if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) {
+    rc = threadsInTeam;
+  } else if (level == 0 ||
+             (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
+              level <= parLevel) ||
+             (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL &&
+              level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) {
     rc = 1;
-  } else if (level > 0) {
-    int totLevel = omp_get_level();
-    if (level <= totLevel) {
-      omptarget_nvptx_TaskDescr *currTaskDescr =
-          getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
-      int steps = totLevel - level;
-      ASSERT0(LT_FUSSY, currTaskDescr,
-              "do not expect fct to be called in a non-active thread");
-      do {
-        if (currTaskDescr->IsParallelConstruct()) {
-          if (!steps) {
-            // found the level
-            rc = currTaskDescr->ThreadsInTeam();
-            break;
-          }
-          steps--;
-        }
-        currTaskDescr = currTaskDescr->GetPrevTaskDescr();
-      } while (currTaskDescr);
-      ASSERT0(LT_FUSSY, !steps, "expected to find all steps");
-    }
   }
   PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc)
   return rc;

@@ -99,12 +99,9 @@ public:
     // When IsRuntimeUninitialized is true, we assume that the caller is
     // in an L0 parallel region and that all worker threads participate.
 
-    int tid = GetLogicalThreadIdInBlock(IsSPMDExecutionMode);
-
     // Assume we are in teams region or that we use a single block
     // per target region
-    ST numberOfActiveOMPThreads =
-        GetNumberOfOmpThreads(tid, IsSPMDExecutionMode);
+    ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode);
 
     // All warps that are in excess of the maximum requested, do
     // not execute the loop

@@ -212,7 +209,7 @@ public:
     }
     int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
     omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
-    T tnum = currTaskDescr->ThreadsInTeam();
+    T tnum = GetNumberOfOmpThreads(checkSPMDMode(loc));
     T tripCount = ub - lb + 1; // +1 because ub is inclusive
     ASSERT0(LT_FUSSY, threadId < tnum,
             "current thread is not needed here; error");

@@ -455,7 +452,7 @@ public:
 
     // automatically selects thread or warp ID based on selected implementation
     int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
-    ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc)),
+    ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(checkSPMDMode(loc)),
             "current thread is not needed here; error");
     // retrieve schedule
     kmp_sched_t schedule =

@@ -509,7 +506,7 @@ public:
     PRINT(LD_LOOP,
           "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
           "last %d\n",
-          (int)GetNumberOfOmpThreads(tid, isSPMDMode()),
+          (int)GetNumberOfOmpThreads(isSPMDMode()),
          (int)GetNumberOfWorkersInTeam(), (long long)*plower,
          (long long)*pupper, (long long)*pstride, (int)*plast);
     return DISPATCH_NOTFINISHED;

@@ -782,8 +779,7 @@ EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid,
           "Expected non-SPMD mode + initialized runtime.");
 
   omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
-  int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
-  uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc));
+  uint32_t NumThreads = GetNumberOfOmpThreads(checkSPMDMode(loc));
   uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
   for (unsigned i = 0; i < varNum; i++) {
     // Reset buffer.

@@ -33,6 +33,7 @@ __device__ __shared__ uint32_t usedSlotIdx;
 
 __device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
 __device__ __shared__ uint16_t threadLimit;
+__device__ __shared__ uint16_t threadsInTeam;
 // Pointer to this team's OpenMP state object
 __device__ __shared__
     omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;

@@ -137,8 +137,7 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
   omptarget_nvptx_TaskDescr *newTaskDescr =
       omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
   ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
-  newTaskDescr->InitLevelOneTaskDescr(ThreadLimit,
-                                      currTeamDescr.LevelZeroTaskDescr());
+  newTaskDescr->InitLevelOneTaskDescr(currTeamDescr.LevelZeroTaskDescr());
   // install new top descriptor
   omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                              newTaskDescr);

@@ -147,7 +146,7 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
   PRINT(LD_PAR,
         "thread will execute parallel region with id %d in a team of "
         "%d threads\n",
-        (int)newTaskDescr->ThreadId(), (int)newTaskDescr->ThreadsInTeam());
+        (int)newTaskDescr->ThreadId(), (int)ThreadLimit);
 
   if (RequiresDataSharing && GetLaneId() == 0) {
     // Warp master innitializes data sharing environment.

@@ -166,7 +166,6 @@ public:
   // methods for other fields
   INLINE uint16_t &NThreads() { return items.nthreads; }
   INLINE uint16_t &ThreadId() { return items.threadId; }
-  INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; }
   INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
   INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
   INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {

@@ -174,14 +173,12 @@ public:
   }
   // init & copy
   INLINE void InitLevelZeroTaskDescr(bool isSPMDExecutionMode);
-  INLINE void InitLevelOneTaskDescr(uint16_t tnum,
-                                    omptarget_nvptx_TaskDescr *parentTaskDescr);
+  INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr);
   INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
   INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
   INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
   INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
-  INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr,
-                              uint16_t tnum);
+  INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr);
   INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
   INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
                                    uint16_t tid, uint16_t tnum);

@@ -213,7 +210,6 @@ private:
     uint8_t unused;
     uint16_t nthreads; // thread num for subsequent parallel regions
     uint16_t threadId; // thread id
-    uint16_t threadsInTeam; // threads in current team
     uint64_t runtimeChunkSize; // runtime chunk size
   } items;
   omptarget_nvptx_TaskDescr *prev;

@@ -407,6 +403,7 @@ extern __device__ __shared__ uint32_t usedSlotIdx;
 extern __device__ __shared__ uint8_t
     parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
 extern __device__ __shared__ uint16_t threadLimit;
+extern __device__ __shared__ uint16_t threadsInTeam;
 extern __device__ __shared__
     omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
 

@@ -42,14 +42,13 @@ omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr(bool isSPMDExecutionMode) {
   items.nthreads = GetNumberOfProcsInTeam(isSPMDExecutionMode);
   ; // threads: whatever was alloc by kernel
   items.threadId = 0; // is master
-  items.threadsInTeam = 1; // sequential
   items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
 }
 
 // This is called when all threads are started together in SPMD mode.
 // OMP directives include target parallel, target distribute parallel for, etc.
 INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
-    uint16_t tnum, omptarget_nvptx_TaskDescr *parentTaskDescr) {
+    omptarget_nvptx_TaskDescr *parentTaskDescr) {
   // slow method
   // flag:
   // default sched is static,

@@ -61,7 +60,6 @@ INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
   items.nthreads = 0; // # threads for subsequent parallel region
   items.threadId =
       GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
-  items.threadsInTeam = tnum;
   items.runtimeChunkSize = 1; // prefered chunking statik with chunk 1
   prev = parentTaskDescr;
 }

@@ -91,12 +89,11 @@ INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
 }
 
 INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
-    omptarget_nvptx_TaskDescr *masterTaskDescr, uint16_t tnum) {
+    omptarget_nvptx_TaskDescr *masterTaskDescr) {
   CopyParent(masterTaskDescr);
   // overrwrite specific items;
   items.flags |=
       TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
-  items.threadsInTeam = tnum; // set number of threads
 }
 
 INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(

@@ -121,7 +118,6 @@ INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
     omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
   CopyParent(parentTaskDescr);
   items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
-  items.threadsInTeam = tnum; // set number of threads
   items.threadId = tid;
 }
 

@@ -264,7 +264,8 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
 
   // Set number of threads on work descriptor.
   omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
-  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr, NumThreads);
+  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr);
+  threadsInTeam = NumThreads;
 }
 
 // All workers call this function. Deactivate those not needed.

@@ -294,7 +295,7 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
   // Set to true for workers participating in the parallel region.
   bool isActive = false;
   // Initialize state for active threads.
-  if (threadId < workDescr.WorkTaskDescr()->ThreadsInTeam()) {
+  if (threadId < threadsInTeam) {
     // init work descriptor from workdesccr
     omptarget_nvptx_TaskDescr *newTaskDescr =
         omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);

@@ -310,7 +311,7 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
           (int)newTaskDescr->ThreadId(), (int)newTaskDescr->NThreads());
 
     isActive = true;
-    IncParallelLevel(workDescr.WorkTaskDescr()->ThreadsInTeam() != 1);
+    IncParallelLevel(threadsInTeam != 1);
   }
 
   return isActive;

@@ -328,7 +329,7 @@ EXTERN void __kmpc_kernel_end_parallel() {
   omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
       threadId, currTaskDescr->GetPrevTaskDescr());
 
-  DecParallelLevel(currTaskDescr->ThreadsInTeam() != 1);
+  DecParallelLevel(threadsInTeam != 1);
 }
 
 ////////////////////////////////////////////////////////////////////////////////

@@ -367,7 +368,6 @@ EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
   // - each thread becomes ID 0 in its serialized parallel, and
   // - there is only one thread per team
   newTaskDescr->ThreadId() = 0;
-  newTaskDescr->ThreadsInTeam() = 1;
 
   // set new task descriptor as top
   omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,

@@ -20,8 +20,7 @@
 EXTERN
 int32_t __gpu_block_reduce() {
   bool isSPMDExecutionMode = isSPMDMode();
-  int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
-  int nt = GetNumberOfOmpThreads(tid, isSPMDExecutionMode);
+  int nt = GetNumberOfOmpThreads(isSPMDExecutionMode);
   if (nt != blockDim.x)
     return 0;
   unsigned tnum = __ACTIVEMASK();

@@ -39,7 +38,7 @@ int32_t __kmpc_reduce_gpu(kmp_Ident *loc, int32_t global_tid, int32_t num_vars,
   omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
   int numthread;
   if (currTaskDescr->IsParallelConstruct()) {
-    numthread = GetNumberOfOmpThreads(threadId, checkSPMDMode(loc));
+    numthread = GetNumberOfOmpThreads(checkSPMDMode(loc));
   } else {
     numthread = GetNumberOfOmpTeams();
   }

@@ -147,8 +146,7 @@ static int32_t nvptx_parallel_reduce_nowait(
     kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
     bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
   uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
-  uint32_t NumThreads =
-      GetNumberOfOmpThreads(BlockThreadId, isSPMDExecutionMode);
+  uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
   if (NumThreads == 1)
     return 1;
   /*

@@ -279,9 +277,8 @@ static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
   // In generic mode only the team master participates in the teams
   // reduction because the workers are waiting for parallel work.
   uint32_t NumThreads =
-      isSPMDExecutionMode
-          ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true)
-          : /*Master thread only*/ 1;
+      isSPMDExecutionMode ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
+                          : /*Master thread only*/ 1;
   uint32_t TeamId = GetBlockIdInKernel();
   uint32_t NumTeams = GetNumberOfBlocksInKernel();
   __shared__ volatile bool IsLastTeam;

@@ -473,9 +470,8 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
   // In generic mode only the team master participates in the teams
   // reduction because the workers are waiting for parallel work.
   uint32_t NumThreads =
-      checkSPMDMode(loc)
-          ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true)
-          : /*Master thread only*/ 1;
+      checkSPMDMode(loc) ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
+                         : /*Master thread only*/ 1;
   uint32_t TeamId = GetBlockIdInKernel();
   uint32_t NumTeams = GetNumberOfBlocksInKernel();
   __shared__ unsigned Bound;

@@ -54,8 +54,7 @@ INLINE int GetOmpThreadId(int threadId,
 INLINE int GetOmpTeamId(); // omp_team_num
 
 // get OpenMP number of threads and team
-INLINE int GetNumberOfOmpThreads(int threadId,
-                                 bool isSPMDExecutionMode); // omp_num_threads
+INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
 INLINE int GetNumberOfOmpTeams(); // omp_num_teams
 
 // get OpenMP number of procs

@@ -165,18 +165,16 @@ INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) {
   return rc;
 }
 
-INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode) {
+INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) {
   // omp_num_threads
   int rc;
-  if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) {
+  int Level = parallelLevel[GetWarpId()];
+  if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) {
     rc = 1;
   } else if (isSPMDExecutionMode) {
     rc = GetNumberOfThreadsInBlock();
   } else {
-    omptarget_nvptx_TaskDescr *currTaskDescr =
-        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
-    ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
-    rc = currTaskDescr->ThreadsInTeam();
+    rc = threadsInTeam;
   }
 
   return rc;

@@ -46,10 +46,8 @@ EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
     __kmpc_barrier_simple_spmd(loc_ref, tid);
   } else {
     tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref));
-    omptarget_nvptx_TaskDescr *currTaskDescr =
-        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
     int numberOfActiveOMPThreads =
-        GetNumberOfOmpThreads(tid, checkSPMDMode(loc_ref));
+        GetNumberOfOmpThreads(checkSPMDMode(loc_ref));
     if (numberOfActiveOMPThreads > 1) {
       if (checkSPMDMode(loc_ref)) {
         __kmpc_barrier_simple_spmd(loc_ref, tid);