[OpenMP] Unify omptarget API and usage wrt. __tgt_async_info

This patch unifies our libomptarget API in two ways:
  - always pass a `__tgt_async_info` object, the Queue member decides if
    it is in use or not.
  - (almost) always synchronize in the interface layer and not in the
    omptarget layer.

A side effect is that we now put all constructor and static initializer
kernels in a stream too, if the device utilizes `__tgt_async_info`.

The patch contains a TODO which can be addressed as we add support for
asynchronous malloc and free in the plugin API. This is the only
`synchronizeAsyncInfo` left in the omptarget layer.

Side note: On a V100 system the GridMini performance for small sizes
more than doubled.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D96379
This commit is contained in:
Johannes Doerfert 2021-02-10 11:06:00 -06:00
parent a2fc0d34db
commit 758b849931
7 changed files with 121 additions and 106 deletions

View File

@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "device.h"
#include "omptarget.h"
#include "private.h"
#include "rtl.h"
@ -171,11 +172,13 @@ EXTERN int omp_target_memcpy(void *dst, void *src, size_t length,
} else if (src_device == omp_get_initial_device()) {
DP("copy from host to device\n");
DeviceTy &DstDev = PM->Devices[dst_device];
rc = DstDev.submitData(dstAddr, srcAddr, length, nullptr);
AsyncInfoTy AsyncInfo(DstDev);
rc = DstDev.submitData(dstAddr, srcAddr, length, AsyncInfo);
} else if (dst_device == omp_get_initial_device()) {
DP("copy from device to host\n");
DeviceTy &SrcDev = PM->Devices[src_device];
rc = SrcDev.retrieveData(dstAddr, srcAddr, length, nullptr);
AsyncInfoTy AsyncInfo(SrcDev);
rc = SrcDev.retrieveData(dstAddr, srcAddr, length, AsyncInfo);
} else {
DP("copy from device to device\n");
DeviceTy &SrcDev = PM->Devices[src_device];
@ -183,15 +186,21 @@ EXTERN int omp_target_memcpy(void *dst, void *src, size_t length,
// First try to use D2D memcpy, which is more efficient. If it fails, fall
// back to the less efficient copy through a temporary host buffer.
if (SrcDev.isDataExchangable(DstDev)) {
rc = SrcDev.dataExchange(srcAddr, DstDev, dstAddr, length, nullptr);
AsyncInfoTy AsyncInfo(SrcDev);
rc = SrcDev.dataExchange(srcAddr, DstDev, dstAddr, length, AsyncInfo);
if (rc == OFFLOAD_SUCCESS)
return OFFLOAD_SUCCESS;
}
void *buffer = malloc(length);
rc = SrcDev.retrieveData(buffer, srcAddr, length, nullptr);
if (rc == OFFLOAD_SUCCESS)
rc = DstDev.submitData(dstAddr, buffer, length, nullptr);
{
AsyncInfoTy AsyncInfo(SrcDev);
rc = SrcDev.retrieveData(buffer, srcAddr, length, AsyncInfo);
}
if (rc == OFFLOAD_SUCCESS) {
AsyncInfoTy AsyncInfo(SrcDev);
rc = DstDev.submitData(dstAddr, buffer, length, AsyncInfo);
}
free(buffer);
}

View File

@ -415,27 +415,27 @@ int32_t DeviceTy::deleteData(void *TgtPtrBegin) {
// Submit data to device
int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
__tgt_async_info *AsyncInfoPtr) {
if (!AsyncInfoPtr || !RTL->data_submit_async || !RTL->synchronize)
AsyncInfoTy &AsyncInfo) {
if (!AsyncInfo || !RTL->data_submit_async || !RTL->synchronize)
return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
else
return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
AsyncInfoPtr);
AsyncInfo);
}
// Retrieve data from device
int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin,
int64_t Size, __tgt_async_info *AsyncInfoPtr) {
if (!AsyncInfoPtr || !RTL->data_retrieve_async || !RTL->synchronize)
int64_t Size, AsyncInfoTy &AsyncInfo) {
if (!RTL->data_retrieve_async || !RTL->synchronize)
return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
else
return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
AsyncInfoPtr);
AsyncInfo);
}
// Copy data from current device to destination device directly
int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
int64_t Size, __tgt_async_info *AsyncInfo) {
int64_t Size, AsyncInfoTy &AsyncInfo) {
if (!AsyncInfo || !RTL->data_exchange_async || !RTL->synchronize) {
assert(RTL->data_exchange && "RTL->data_exchange is nullptr");
return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
@ -448,13 +448,13 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
// Run region on device
int32_t DeviceTy::runRegion(void *TgtEntryPtr, void **TgtVarsPtr,
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
__tgt_async_info *AsyncInfoPtr) {
if (!AsyncInfoPtr || !RTL->run_region || !RTL->synchronize)
AsyncInfoTy &AsyncInfo) {
if (!RTL->run_region || !RTL->synchronize)
return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
TgtVarsSize);
else
return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
TgtOffsets, TgtVarsSize, AsyncInfoPtr);
TgtOffsets, TgtVarsSize, AsyncInfo);
}
// Run team region on device.
@ -462,15 +462,15 @@ int32_t DeviceTy::runTeamRegion(void *TgtEntryPtr, void **TgtVarsPtr,
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
int32_t NumTeams, int32_t ThreadLimit,
uint64_t LoopTripCount,
__tgt_async_info *AsyncInfoPtr) {
if (!AsyncInfoPtr || !RTL->run_team_region_async || !RTL->synchronize)
AsyncInfoTy &AsyncInfo) {
if (!RTL->run_team_region_async || !RTL->synchronize)
return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit,
LoopTripCount);
else
return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
TgtOffsets, TgtVarsSize, NumTeams,
ThreadLimit, LoopTripCount, AsyncInfoPtr);
ThreadLimit, LoopTripCount, AsyncInfo);
}
// Whether data can be copied to DstDevice directly
@ -485,9 +485,9 @@ bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) {
return false;
}
int32_t DeviceTy::synchronize(__tgt_async_info *AsyncInfoPtr) {
int32_t DeviceTy::synchronize(AsyncInfoTy &AsyncInfo) {
if (RTL->synchronize)
return RTL->synchronize(RTLDeviceID, AsyncInfoPtr);
return RTL->synchronize(RTLDeviceID, AsyncInfo);
return OFFLOAD_SUCCESS;
}

View File

@ -22,13 +22,13 @@
#include <set>
#include <vector>
#include "omptarget.h"
#include "rtl.h"
// Forward declarations.
struct RTLInfoTy;
struct __tgt_bin_desc;
struct __tgt_target_table;
struct __tgt_async_info;
using map_var_info_t = void *;
@ -200,24 +200,24 @@ struct DeviceTy {
// synchronous.
// Copy data from host to device
int32_t submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
__tgt_async_info *AsyncInfoPtr);
AsyncInfoTy &AsyncInfo);
// Copy data from device back to host
int32_t retrieveData(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size,
__tgt_async_info *AsyncInfoPtr);
AsyncInfoTy &AsyncInfo);
// Copy data from current device to destination device directly
int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
int64_t Size, __tgt_async_info *AsyncInfo);
int64_t Size, AsyncInfoTy &AsyncInfo);
int32_t runRegion(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets,
int32_t TgtVarsSize, __tgt_async_info *AsyncInfoPtr);
int32_t TgtVarsSize, AsyncInfoTy &AsyncInfo);
int32_t runTeamRegion(void *TgtEntryPtr, void **TgtVarsPtr,
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
int32_t NumTeams, int32_t ThreadLimit,
uint64_t LoopTripCount, __tgt_async_info *AsyncInfoPtr);
uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo);
/// Synchronize device/queue/event based on \p AsyncInfo and return
/// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails.
int32_t synchronize(__tgt_async_info *AsyncInfoPtr);
int32_t synchronize(AsyncInfoTy &AsyncInfo);
private:
// Call to RTL

View File

@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "device.h"
#include "omptarget.h"
#include "private.h"
#include "rtl.h"
@ -183,8 +184,11 @@ EXTERN void __tgt_target_data_begin_mapper(ident_t *loc, int64_t device_id,
}
#endif
AsyncInfoTy AsyncInfo(Device);
int rc = targetDataBegin(loc, Device, arg_num, args_base, args, arg_sizes,
arg_types, arg_names, arg_mappers, nullptr);
arg_types, arg_names, arg_mappers, AsyncInfo);
if (rc == OFFLOAD_SUCCESS)
rc = AsyncInfo.synchronize();
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
}
@ -270,8 +274,11 @@ EXTERN void __tgt_target_data_end_mapper(ident_t *loc, int64_t device_id,
}
#endif
AsyncInfoTy AsyncInfo(Device);
int rc = targetDataEnd(loc, Device, arg_num, args_base, args, arg_sizes,
arg_types, arg_names, arg_mappers, nullptr);
arg_types, arg_names, arg_mappers, AsyncInfo);
if (rc == OFFLOAD_SUCCESS)
rc = AsyncInfo.synchronize();
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
}
@ -335,8 +342,11 @@ EXTERN void __tgt_target_data_update_mapper(ident_t *loc, int64_t device_id,
arg_names, "Updating OpenMP data");
DeviceTy &Device = PM->Devices[device_id];
AsyncInfoTy AsyncInfo(Device);
int rc = targetDataUpdate(loc, Device, arg_num, args_base, args, arg_sizes,
arg_types, arg_names, arg_mappers, nullptr);
arg_types, arg_names, arg_mappers, AsyncInfo);
if (rc == OFFLOAD_SUCCESS)
rc = AsyncInfo.synchronize();
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
}
@ -408,9 +418,12 @@ EXTERN int __tgt_target_mapper(ident_t *loc, int64_t device_id, void *host_ptr,
#endif
DeviceTy &Device = PM->Devices[device_id];
int rc =
target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes,
arg_types, arg_names, arg_mappers, 0, 0, false /*team*/, nullptr);
AsyncInfoTy AsyncInfo(Device);
int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes,
arg_types, arg_names, arg_mappers, 0, 0, false /*team*/,
AsyncInfo);
if (rc == OFFLOAD_SUCCESS)
rc = AsyncInfo.synchronize();
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
return rc;
}
@ -490,9 +503,12 @@ EXTERN int __tgt_target_teams_mapper(ident_t *loc, int64_t device_id,
#endif
DeviceTy &Device = PM->Devices[device_id];
AsyncInfoTy AsyncInfo(Device);
int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes,
arg_types, arg_names, arg_mappers, team_num, thread_limit,
true /*team*/, nullptr);
true /*team*/, AsyncInfo);
if (rc == OFFLOAD_SUCCESS)
rc = AsyncInfo.synchronize();
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
return rc;
}

View File

@ -23,7 +23,7 @@ int AsyncInfoTy::synchronize() {
int Result = OFFLOAD_SUCCESS;
if (AsyncInfo.Queue) {
// If we have a queue we need to synchronize it now.
Result = Device.synchronize(&AsyncInfo);
Result = Device.synchronize(*this);
assert(AsyncInfo.Queue == nullptr &&
"The device plugin should have nulled the queue to indicate there "
"are no outstanding actions!");
@ -166,6 +166,7 @@ static int InitLibrary(DeviceTy &Device) {
* Run ctors for static objects
*/
if (!Device.PendingCtorsDtors.empty()) {
AsyncInfoTy AsyncInfo(Device);
// Call all ctors for all libraries registered so far
for (auto &lib : Device.PendingCtorsDtors) {
if (!lib.second.PendingCtors.empty()) {
@ -174,7 +175,7 @@ static int InitLibrary(DeviceTy &Device) {
void *ctor = entry;
int rc =
target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, 1, 1, true /*team*/, nullptr);
nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo);
if (rc != OFFLOAD_SUCCESS) {
REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
Device.PendingGlobalsMtx.unlock();
@ -186,6 +187,9 @@ static int InitLibrary(DeviceTy &Device) {
DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
}
}
// All constructors have been issued, wait for them now.
if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
}
Device.HasPendingGlobals = false;
Device.PendingGlobalsMtx.unlock();
@ -226,6 +230,7 @@ static int32_t getParentIndex(int64_t type) {
int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
int64_t arg_size, int64_t arg_type,
map_var_info_t arg_names, void *arg_mapper,
AsyncInfoTy &AsyncInfo,
TargetDataFuncPtrTy target_data_function) {
TIMESCOPE_WITH_IDENT(loc);
DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));
@ -256,11 +261,10 @@ int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
MapperArgNames[I] = C.Name;
}
int rc = target_data_function(loc, Device, MapperComponents.Components.size(),
MapperArgsBase.data(), MapperArgs.data(),
MapperArgSizes.data(), MapperArgTypes.data(),
MapperArgNames.data(), /*arg_mappers*/ nullptr,
/* AsyncInfoTy */ nullptr);
int rc = target_data_function(
loc, Device, MapperComponents.Components.size(), MapperArgsBase.data(),
MapperArgs.data(), MapperArgSizes.data(), MapperArgTypes.data(),
MapperArgNames.data(), /*arg_mappers*/ nullptr, AsyncInfo);
return rc;
}
@ -269,7 +273,7 @@ int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
void **args_base, void **args, int64_t *arg_sizes,
int64_t *arg_types, map_var_info_t *arg_names,
void **arg_mappers, AsyncInfoTy *AsyncInfo) {
void **arg_mappers, AsyncInfoTy &AsyncInfo) {
// process each input.
for (int32_t i = 0; i < arg_num; ++i) {
// Ignore private variables and arrays - there is no mapping for them.
@ -286,7 +290,7 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i];
int rc = targetDataMapper(loc, Device, args_base[i], args[i],
arg_sizes[i], arg_types[i], arg_name,
arg_mappers[i], targetDataBegin);
arg_mappers[i], AsyncInfo, targetDataBegin);
if (rc != OFFLOAD_SUCCESS) {
REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
@ -416,7 +420,7 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
int rt =
Device.submitData(TgtPtrBegin, HstPtrBegin, data_size, *AsyncInfo);
Device.submitData(TgtPtrBegin, HstPtrBegin, data_size, AsyncInfo);
if (rt != OFFLOAD_SUCCESS) {
REPORT("Copying data to device failed.\n");
return OFFLOAD_FAIL;
@ -430,7 +434,7 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
int rt = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
sizeof(void *), *AsyncInfo);
sizeof(void *), AsyncInfo);
if (rt != OFFLOAD_SUCCESS) {
REPORT("Copying data to device failed.\n");
return OFFLOAD_FAIL;
@ -470,7 +474,7 @@ struct DeallocTgtPtrInfo {
int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
void **ArgBases, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy *AsyncInfo) {
void **ArgMappers, AsyncInfoTy &AsyncInfo) {
int Ret;
std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
// process each input.
@ -488,9 +492,9 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
DP("Calling targetDataMapper for the %dth argument\n", I);
map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
Ret =
targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
ArgTypes[I], ArgName, ArgMappers[I], targetDataEnd);
Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
targetDataEnd);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
@ -585,7 +589,7 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
*AsyncInfo);
AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data from device failed.\n");
return OFFLOAD_FAIL;
@ -637,17 +641,13 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
}
}
// TODO: We should not synchronize here but pass the AsyncInfo object to the
// allocate/deallocate device APIs.
//
// We need to synchronize before deallocating data.
// If AsyncInfo is nullptr, the previous data transfer (if has) will be
// synchronous, so we don't need to synchronize again. If AsyncInfo->Queue is
// nullptr, there is no data transfer happened because once there is,
// AsyncInfo->Queue will not be nullptr, so again, we don't need to
// synchronize.
if (AsyncInfo) {
Ret = AsyncInfo->synchronize();
if (Ret != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
}
Ret = AsyncInfo.synchronize();
if (Ret != OFFLOAD_SUCCESS)
return OFFLOAD_FAIL;
// Deallocate target pointer
for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
@ -664,7 +664,7 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
void *HstPtrBegin, int64_t ArgSize,
int64_t ArgType) {
int64_t ArgType, AsyncInfoTy &AsyncInfo) {
TIMESCOPE_WITH_IDENT(loc);
bool IsLast, IsHostPtr;
void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSize, IsLast, false,
@ -690,7 +690,7 @@ static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
if (ArgType & OMP_TGT_MAPTYPE_FROM) {
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, nullptr);
int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data from device failed.\n");
return OFFLOAD_FAIL;
@ -717,7 +717,7 @@ static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
if (ArgType & OMP_TGT_MAPTYPE_TO) {
DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, nullptr);
int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data to device failed.\n");
return OFFLOAD_FAIL;
@ -737,7 +737,7 @@ static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
"pointer " DPxMOD "\n",
DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr));
Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal,
sizeof(void *), nullptr);
sizeof(void *), AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data to device failed.\n");
Device.ShadowMtx.unlock();
@ -753,8 +753,8 @@ static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
void *ArgsBase,
__tgt_target_non_contig *NonContig,
uint64_t Size, int64_t ArgType,
int CurrentDim, int DimSize,
uint64_t Offset) {
int CurrentDim, int DimSize, uint64_t Offset,
AsyncInfoTy &AsyncInfo) {
TIMESCOPE_WITH_IDENT(loc);
int Ret = OFFLOAD_SUCCESS;
if (CurrentDim < DimSize) {
@ -766,7 +766,7 @@ static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
if (CurrentDim != DimSize - 1 || I == 0) {
Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size,
ArgType, CurrentDim + 1, DimSize,
Offset + CurOffset);
Offset + CurOffset, AsyncInfo);
// Stop the whole process if any contiguous piece returns anything
// other than OFFLOAD_SUCCESS.
if (Ret != OFFLOAD_SUCCESS)
@ -778,7 +778,8 @@ static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
" len %" PRIu64 "\n",
DPxPTR(Ptr), Offset, Size);
Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType);
Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType,
AsyncInfo);
}
return Ret;
}
@ -794,12 +795,10 @@ static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
}
/// Internal function to pass data to/from the target.
// AsyncInfo is currently unused, added here so targetDataUpdate has the
// same signature as targetDataBegin and targetDataEnd.
int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
void **ArgsBase, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *ArgNames,
void **ArgMappers, AsyncInfoTy *AsyncInfo) {
void **ArgMappers, AsyncInfoTy &AsyncInfo) {
// process each input.
for (int32_t I = 0; I < ArgNum; ++I) {
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
@ -814,7 +813,7 @@ int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
ArgTypes[I], ArgName, ArgMappers[I],
ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
targetDataUpdate);
if (Ret != OFFLOAD_SUCCESS) {
@ -837,10 +836,10 @@ int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
Ret = targetDataNonContiguous(
loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
/*current_dim=*/0, DimSize - MergedDim, /*offset=*/0);
/*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
} else {
Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
ArgTypes[I]);
ArgTypes[I], AsyncInfo);
}
if (Ret == OFFLOAD_FAIL)
return OFFLOAD_FAIL;
@ -950,7 +949,7 @@ class PrivateArgumentManagerTy {
/// A reference to the \p DeviceTy object
DeviceTy &Device;
/// A reference to a \p AsyncInfoTy object
AsyncInfoTy *AsyncInfo;
AsyncInfoTy &AsyncInfo;
// TODO: What would be the best value here? Should we make it configurable?
// If the size is larger than this threshold, we will allocate and transfer it
@ -959,7 +958,7 @@ class PrivateArgumentManagerTy {
public:
/// Constructor
PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy *AsyncInfo)
PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
: Device(Dev), AsyncInfo(AsyncInfo) {}
/// Add a private argument
@ -986,7 +985,7 @@ public:
#endif
// If first-private, copy data from host
if (IsFirstPrivate) {
int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, *AsyncInfo);
int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
DP("Copying data to device failed, failed.\n");
return OFFLOAD_FAIL;
@ -1042,7 +1041,7 @@ public:
FirstPrivateArgSize, DPxPTR(TgtPtr));
// Transfer data to target device
int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
FirstPrivateArgSize, *AsyncInfo);
FirstPrivateArgSize, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
DP("Failed to submit data of private arguments.\n");
return OFFLOAD_FAIL;
@ -1090,7 +1089,7 @@ static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
std::vector<void *> &TgtArgs,
std::vector<ptrdiff_t> &TgtOffsets,
PrivateArgumentManagerTy &PrivateArgumentManager,
AsyncInfoTy *AsyncInfo) {
AsyncInfoTy &AsyncInfo) {
TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc);
DeviceTy &Device = PM->Devices[DeviceId];
int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
@ -1141,7 +1140,7 @@ static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
sizeof(void *), *AsyncInfo);
sizeof(void *), AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data to device failed.\n");
return OFFLOAD_FAIL;
@ -1211,7 +1210,7 @@ static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
int64_t *ArgSizes, int64_t *ArgTypes,
map_var_info_t *ArgNames, void **ArgMappers,
PrivateArgumentManagerTy &PrivateArgumentManager,
AsyncInfoTy *AsyncInfo) {
AsyncInfoTy &AsyncInfo) {
TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc);
DeviceTy &Device = PM->Devices[DeviceId];
@ -1243,7 +1242,7 @@ static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy *AsyncInfo) {
int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) {
int32_t DeviceId = Device.DeviceID;
TableMap *TM = getTableMap(HostPtr);
@ -1264,12 +1263,6 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
}
assert(TargetTable && "Global data has not been mapped\n");
// TODO: This will go away as soon as we consequently pass in async info
// objects (as references).
AsyncInfoTy InternalAsyncInfo(Device);
if (!AsyncInfo)
AsyncInfo = &InternalAsyncInfo;
std::vector<void *> TgtArgs;
std::vector<ptrdiff_t> TgtOffsets;
@ -1301,10 +1294,10 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
if (IsTeamConstruct)
Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
TgtArgs.size(), TeamNum, ThreadLimit,
LoopTripCount, *AsyncInfo);
LoopTripCount, AsyncInfo);
else
Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
TgtArgs.size(), *AsyncInfo);
TgtArgs.size(), AsyncInfo);
}
if (Ret != OFFLOAD_SUCCESS) {
@ -1322,13 +1315,6 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
REPORT("Failed to process data after launching the kernel.\n");
return OFFLOAD_FAIL;
}
} else {
// TODO: We should not synchronize here but on the outer level once we pass
// in a reference AsyncInfo object.
// If ArgNum is zero, but AsyncInfo.Queue is valid, then the kernel doesn't
// have any arguments, and the device supports async operations, so we need a
// sync at this point.
return AsyncInfo->synchronize();
}
return OFFLOAD_SUCCESS;

View File

@ -23,23 +23,23 @@
extern int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
void **args_base, void **args, int64_t *arg_sizes,
int64_t *arg_types, map_var_info_t *arg_names,
void **arg_mappers, AsyncInfoTy *AsyncInfo);
void **arg_mappers, AsyncInfoTy &AsyncInfo);
extern int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
void **ArgBases, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *arg_names,
void **ArgMappers, AsyncInfoTy *AsyncInfo);
void **ArgMappers, AsyncInfoTy &AsyncInfo);
extern int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t arg_num,
void **args_base, void **args, int64_t *arg_sizes,
int64_t *arg_types, map_var_info_t *arg_names,
void **arg_mappers, AsyncInfoTy *AsyncInfo);
void **arg_mappers, AsyncInfoTy &AsyncInfo);
extern int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
void **ArgBases, void **Args, int64_t *ArgSizes,
int64_t *ArgTypes, map_var_info_t *arg_names,
void **ArgMappers, int32_t TeamNum, int32_t ThreadLimit,
int IsTeamConstruct, AsyncInfoTy *AsyncInfo);
int IsTeamConstruct, AsyncInfoTy &AsyncInfo);
extern int CheckDeviceAndCtors(int64_t device_id);
@ -76,7 +76,7 @@ typedef void (*MapperFuncPtrTy)(void *, void *, void *, int64_t, int64_t,
// targetDataEnd and targetDataUpdate).
typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **,
void **, int64_t *, int64_t *,
map_var_info_t *, void **, AsyncInfoTy *);
map_var_info_t *, void **, AsyncInfoTy &);
// Implemented in libomp, they are called from within __tgt_* functions.
#ifdef __cplusplus

View File

@ -400,16 +400,20 @@ void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) {
DeviceTy &Device = PM->Devices[FoundRTL->Idx + i];
Device.PendingGlobalsMtx.lock();
if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) {
AsyncInfoTy AsyncInfo(Device);
for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) {
int rc =
target(nullptr, Device, dtor, 0, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, 1, 1, true /*team*/, nullptr);
int rc = target(nullptr, Device, dtor, 0, nullptr, nullptr, nullptr,
nullptr, nullptr, nullptr, 1, 1, true /*team*/,
AsyncInfo);
if (rc != OFFLOAD_SUCCESS) {
DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor));
}
}
// Remove this library's entry from PendingCtorsDtors
Device.PendingCtorsDtors.erase(desc);
// All destructors have been issued, wait for them now.
if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
DP("Failed synchronizing destructors kernels.\n");
}
Device.PendingGlobalsMtx.unlock();
}