mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2024-11-24 06:10:12 +00:00
[OpenMP] Unify omptarget API and usage wrt. __tgt_async_info
This patch unifies our libomptarget API in two ways:
- always pass a `__tgt_async_info` object; the Queue member decides whether it is in use or not.
- (almost) always synchronize in the interface layer and not in the omptarget layer.

A side effect is that we now put all constructor and static initializer kernels in a stream too, if the device utilizes `__tgt_async_info`.

The patch contains a TODO which can be addressed as we add support for asynchronous malloc and free in the plugin API. This is the only `synchronizeAsyncInfo` left in the omptarget layer.

Side note: On a V100 system the GridMini performance for small sizes more than doubled.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D96379
This commit is contained in:
parent
a2fc0d34db
commit
758b849931
@ -11,6 +11,7 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "device.h"
|
||||
#include "omptarget.h"
|
||||
#include "private.h"
|
||||
#include "rtl.h"
|
||||
|
||||
@ -171,11 +172,13 @@ EXTERN int omp_target_memcpy(void *dst, void *src, size_t length,
|
||||
} else if (src_device == omp_get_initial_device()) {
|
||||
DP("copy from host to device\n");
|
||||
DeviceTy &DstDev = PM->Devices[dst_device];
|
||||
rc = DstDev.submitData(dstAddr, srcAddr, length, nullptr);
|
||||
AsyncInfoTy AsyncInfo(DstDev);
|
||||
rc = DstDev.submitData(dstAddr, srcAddr, length, AsyncInfo);
|
||||
} else if (dst_device == omp_get_initial_device()) {
|
||||
DP("copy from device to host\n");
|
||||
DeviceTy &SrcDev = PM->Devices[src_device];
|
||||
rc = SrcDev.retrieveData(dstAddr, srcAddr, length, nullptr);
|
||||
AsyncInfoTy AsyncInfo(SrcDev);
|
||||
rc = SrcDev.retrieveData(dstAddr, srcAddr, length, AsyncInfo);
|
||||
} else {
|
||||
DP("copy from device to device\n");
|
||||
DeviceTy &SrcDev = PM->Devices[src_device];
|
||||
@ -183,15 +186,21 @@ EXTERN int omp_target_memcpy(void *dst, void *src, size_t length,
|
||||
// First try to use D2D memcpy which is more efficient. If fails, fall back
|
||||
// to the inefficient way.
|
||||
if (SrcDev.isDataExchangable(DstDev)) {
|
||||
rc = SrcDev.dataExchange(srcAddr, DstDev, dstAddr, length, nullptr);
|
||||
AsyncInfoTy AsyncInfo(SrcDev);
|
||||
rc = SrcDev.dataExchange(srcAddr, DstDev, dstAddr, length, AsyncInfo);
|
||||
if (rc == OFFLOAD_SUCCESS)
|
||||
return OFFLOAD_SUCCESS;
|
||||
}
|
||||
|
||||
void *buffer = malloc(length);
|
||||
rc = SrcDev.retrieveData(buffer, srcAddr, length, nullptr);
|
||||
if (rc == OFFLOAD_SUCCESS)
|
||||
rc = DstDev.submitData(dstAddr, buffer, length, nullptr);
|
||||
{
|
||||
AsyncInfoTy AsyncInfo(SrcDev);
|
||||
rc = SrcDev.retrieveData(buffer, srcAddr, length, AsyncInfo);
|
||||
}
|
||||
if (rc == OFFLOAD_SUCCESS) {
|
||||
AsyncInfoTy AsyncInfo(SrcDev);
|
||||
rc = DstDev.submitData(dstAddr, buffer, length, AsyncInfo);
|
||||
}
|
||||
free(buffer);
|
||||
}
|
||||
|
||||
|
@ -415,27 +415,27 @@ int32_t DeviceTy::deleteData(void *TgtPtrBegin) {
|
||||
|
||||
// Submit data to device
|
||||
int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
|
||||
__tgt_async_info *AsyncInfoPtr) {
|
||||
if (!AsyncInfoPtr || !RTL->data_submit_async || !RTL->synchronize)
|
||||
AsyncInfoTy &AsyncInfo) {
|
||||
if (!AsyncInfo || !RTL->data_submit_async || !RTL->synchronize)
|
||||
return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
|
||||
else
|
||||
return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
|
||||
AsyncInfoPtr);
|
||||
AsyncInfo);
|
||||
}
|
||||
|
||||
// Retrieve data from device
|
||||
int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin,
|
||||
int64_t Size, __tgt_async_info *AsyncInfoPtr) {
|
||||
if (!AsyncInfoPtr || !RTL->data_retrieve_async || !RTL->synchronize)
|
||||
int64_t Size, AsyncInfoTy &AsyncInfo) {
|
||||
if (!RTL->data_retrieve_async || !RTL->synchronize)
|
||||
return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
|
||||
else
|
||||
return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
|
||||
AsyncInfoPtr);
|
||||
AsyncInfo);
|
||||
}
|
||||
|
||||
// Copy data from current device to destination device directly
|
||||
int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
|
||||
int64_t Size, __tgt_async_info *AsyncInfo) {
|
||||
int64_t Size, AsyncInfoTy &AsyncInfo) {
|
||||
if (!AsyncInfo || !RTL->data_exchange_async || !RTL->synchronize) {
|
||||
assert(RTL->data_exchange && "RTL->data_exchange is nullptr");
|
||||
return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
|
||||
@ -448,13 +448,13 @@ int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
|
||||
// Run region on device
|
||||
int32_t DeviceTy::runRegion(void *TgtEntryPtr, void **TgtVarsPtr,
|
||||
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
|
||||
__tgt_async_info *AsyncInfoPtr) {
|
||||
if (!AsyncInfoPtr || !RTL->run_region || !RTL->synchronize)
|
||||
AsyncInfoTy &AsyncInfo) {
|
||||
if (!RTL->run_region || !RTL->synchronize)
|
||||
return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
|
||||
TgtVarsSize);
|
||||
else
|
||||
return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
|
||||
TgtOffsets, TgtVarsSize, AsyncInfoPtr);
|
||||
TgtOffsets, TgtVarsSize, AsyncInfo);
|
||||
}
|
||||
|
||||
// Run team region on device.
|
||||
@ -462,15 +462,15 @@ int32_t DeviceTy::runTeamRegion(void *TgtEntryPtr, void **TgtVarsPtr,
|
||||
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
|
||||
int32_t NumTeams, int32_t ThreadLimit,
|
||||
uint64_t LoopTripCount,
|
||||
__tgt_async_info *AsyncInfoPtr) {
|
||||
if (!AsyncInfoPtr || !RTL->run_team_region_async || !RTL->synchronize)
|
||||
AsyncInfoTy &AsyncInfo) {
|
||||
if (!RTL->run_team_region_async || !RTL->synchronize)
|
||||
return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
|
||||
TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit,
|
||||
LoopTripCount);
|
||||
else
|
||||
return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
|
||||
TgtOffsets, TgtVarsSize, NumTeams,
|
||||
ThreadLimit, LoopTripCount, AsyncInfoPtr);
|
||||
ThreadLimit, LoopTripCount, AsyncInfo);
|
||||
}
|
||||
|
||||
// Whether data can be copied to DstDevice directly
|
||||
@ -485,9 +485,9 @@ bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int32_t DeviceTy::synchronize(__tgt_async_info *AsyncInfoPtr) {
|
||||
int32_t DeviceTy::synchronize(AsyncInfoTy &AsyncInfo) {
|
||||
if (RTL->synchronize)
|
||||
return RTL->synchronize(RTLDeviceID, AsyncInfoPtr);
|
||||
return RTL->synchronize(RTLDeviceID, AsyncInfo);
|
||||
return OFFLOAD_SUCCESS;
|
||||
}
|
||||
|
||||
|
@ -22,13 +22,13 @@
|
||||
#include <set>
|
||||
#include <vector>
|
||||
|
||||
#include "omptarget.h"
|
||||
#include "rtl.h"
|
||||
|
||||
// Forward declarations.
|
||||
struct RTLInfoTy;
|
||||
struct __tgt_bin_desc;
|
||||
struct __tgt_target_table;
|
||||
struct __tgt_async_info;
|
||||
|
||||
using map_var_info_t = void *;
|
||||
|
||||
@ -200,24 +200,24 @@ struct DeviceTy {
|
||||
// synchronous.
|
||||
// Copy data from host to device
|
||||
int32_t submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
|
||||
__tgt_async_info *AsyncInfoPtr);
|
||||
AsyncInfoTy &AsyncInfo);
|
||||
// Copy data from device back to host
|
||||
int32_t retrieveData(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size,
|
||||
__tgt_async_info *AsyncInfoPtr);
|
||||
AsyncInfoTy &AsyncInfo);
|
||||
// Copy data from current device to destination device directly
|
||||
int32_t dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
|
||||
int64_t Size, __tgt_async_info *AsyncInfo);
|
||||
int64_t Size, AsyncInfoTy &AsyncInfo);
|
||||
|
||||
int32_t runRegion(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets,
|
||||
int32_t TgtVarsSize, __tgt_async_info *AsyncInfoPtr);
|
||||
int32_t TgtVarsSize, AsyncInfoTy &AsyncInfo);
|
||||
int32_t runTeamRegion(void *TgtEntryPtr, void **TgtVarsPtr,
|
||||
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
|
||||
int32_t NumTeams, int32_t ThreadLimit,
|
||||
uint64_t LoopTripCount, __tgt_async_info *AsyncInfoPtr);
|
||||
uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo);
|
||||
|
||||
/// Synchronize device/queue/event based on \p AsyncInfo and return
|
||||
/// OFFLOAD_SUCCESS/OFFLOAD_FAIL when succeeds/fails.
|
||||
int32_t synchronize(__tgt_async_info *AsyncInfoPtr);
|
||||
int32_t synchronize(AsyncInfoTy &AsyncInfo);
|
||||
|
||||
private:
|
||||
// Call to RTL
|
||||
|
@ -12,6 +12,7 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "device.h"
|
||||
#include "omptarget.h"
|
||||
#include "private.h"
|
||||
#include "rtl.h"
|
||||
|
||||
@ -183,8 +184,11 @@ EXTERN void __tgt_target_data_begin_mapper(ident_t *loc, int64_t device_id,
|
||||
}
|
||||
#endif
|
||||
|
||||
AsyncInfoTy AsyncInfo(Device);
|
||||
int rc = targetDataBegin(loc, Device, arg_num, args_base, args, arg_sizes,
|
||||
arg_types, arg_names, arg_mappers, nullptr);
|
||||
arg_types, arg_names, arg_mappers, AsyncInfo);
|
||||
if (rc == OFFLOAD_SUCCESS)
|
||||
rc = AsyncInfo.synchronize();
|
||||
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
|
||||
}
|
||||
|
||||
@ -270,8 +274,11 @@ EXTERN void __tgt_target_data_end_mapper(ident_t *loc, int64_t device_id,
|
||||
}
|
||||
#endif
|
||||
|
||||
AsyncInfoTy AsyncInfo(Device);
|
||||
int rc = targetDataEnd(loc, Device, arg_num, args_base, args, arg_sizes,
|
||||
arg_types, arg_names, arg_mappers, nullptr);
|
||||
arg_types, arg_names, arg_mappers, AsyncInfo);
|
||||
if (rc == OFFLOAD_SUCCESS)
|
||||
rc = AsyncInfo.synchronize();
|
||||
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
|
||||
}
|
||||
|
||||
@ -335,8 +342,11 @@ EXTERN void __tgt_target_data_update_mapper(ident_t *loc, int64_t device_id,
|
||||
arg_names, "Updating OpenMP data");
|
||||
|
||||
DeviceTy &Device = PM->Devices[device_id];
|
||||
AsyncInfoTy AsyncInfo(Device);
|
||||
int rc = targetDataUpdate(loc, Device, arg_num, args_base, args, arg_sizes,
|
||||
arg_types, arg_names, arg_mappers, nullptr);
|
||||
arg_types, arg_names, arg_mappers, AsyncInfo);
|
||||
if (rc == OFFLOAD_SUCCESS)
|
||||
rc = AsyncInfo.synchronize();
|
||||
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
|
||||
}
|
||||
|
||||
@ -408,9 +418,12 @@ EXTERN int __tgt_target_mapper(ident_t *loc, int64_t device_id, void *host_ptr,
|
||||
#endif
|
||||
|
||||
DeviceTy &Device = PM->Devices[device_id];
|
||||
int rc =
|
||||
target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes,
|
||||
arg_types, arg_names, arg_mappers, 0, 0, false /*team*/, nullptr);
|
||||
AsyncInfoTy AsyncInfo(Device);
|
||||
int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes,
|
||||
arg_types, arg_names, arg_mappers, 0, 0, false /*team*/,
|
||||
AsyncInfo);
|
||||
if (rc == OFFLOAD_SUCCESS)
|
||||
rc = AsyncInfo.synchronize();
|
||||
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
|
||||
return rc;
|
||||
}
|
||||
@ -490,9 +503,12 @@ EXTERN int __tgt_target_teams_mapper(ident_t *loc, int64_t device_id,
|
||||
#endif
|
||||
|
||||
DeviceTy &Device = PM->Devices[device_id];
|
||||
AsyncInfoTy AsyncInfo(Device);
|
||||
int rc = target(loc, Device, host_ptr, arg_num, args_base, args, arg_sizes,
|
||||
arg_types, arg_names, arg_mappers, team_num, thread_limit,
|
||||
true /*team*/, nullptr);
|
||||
true /*team*/, AsyncInfo);
|
||||
if (rc == OFFLOAD_SUCCESS)
|
||||
rc = AsyncInfo.synchronize();
|
||||
HandleTargetOutcome(rc == OFFLOAD_SUCCESS, loc);
|
||||
return rc;
|
||||
}
|
||||
|
@ -23,7 +23,7 @@ int AsyncInfoTy::synchronize() {
|
||||
int Result = OFFLOAD_SUCCESS;
|
||||
if (AsyncInfo.Queue) {
|
||||
// If we have a queue we need to synchronize it now.
|
||||
Result = Device.synchronize(&AsyncInfo);
|
||||
Result = Device.synchronize(*this);
|
||||
assert(AsyncInfo.Queue == nullptr &&
|
||||
"The device plugin should have nulled the queue to indicate there "
|
||||
"are no outstanding actions!");
|
||||
@ -166,6 +166,7 @@ static int InitLibrary(DeviceTy &Device) {
|
||||
* Run ctors for static objects
|
||||
*/
|
||||
if (!Device.PendingCtorsDtors.empty()) {
|
||||
AsyncInfoTy AsyncInfo(Device);
|
||||
// Call all ctors for all libraries registered so far
|
||||
for (auto &lib : Device.PendingCtorsDtors) {
|
||||
if (!lib.second.PendingCtors.empty()) {
|
||||
@ -174,7 +175,7 @@ static int InitLibrary(DeviceTy &Device) {
|
||||
void *ctor = entry;
|
||||
int rc =
|
||||
target(nullptr, Device, ctor, 0, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, 1, 1, true /*team*/, nullptr);
|
||||
nullptr, nullptr, nullptr, 1, 1, true /*team*/, AsyncInfo);
|
||||
if (rc != OFFLOAD_SUCCESS) {
|
||||
REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
|
||||
Device.PendingGlobalsMtx.unlock();
|
||||
@ -186,6 +187,9 @@ static int InitLibrary(DeviceTy &Device) {
|
||||
DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
|
||||
}
|
||||
}
|
||||
// All constructors have been issued, wait for them now.
|
||||
if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
|
||||
return OFFLOAD_FAIL;
|
||||
}
|
||||
Device.HasPendingGlobals = false;
|
||||
Device.PendingGlobalsMtx.unlock();
|
||||
@ -226,6 +230,7 @@ static int32_t getParentIndex(int64_t type) {
|
||||
int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
|
||||
int64_t arg_size, int64_t arg_type,
|
||||
map_var_info_t arg_names, void *arg_mapper,
|
||||
AsyncInfoTy &AsyncInfo,
|
||||
TargetDataFuncPtrTy target_data_function) {
|
||||
TIMESCOPE_WITH_IDENT(loc);
|
||||
DP("Calling the mapper function " DPxMOD "\n", DPxPTR(arg_mapper));
|
||||
@ -256,11 +261,10 @@ int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
|
||||
MapperArgNames[I] = C.Name;
|
||||
}
|
||||
|
||||
int rc = target_data_function(loc, Device, MapperComponents.Components.size(),
|
||||
MapperArgsBase.data(), MapperArgs.data(),
|
||||
MapperArgSizes.data(), MapperArgTypes.data(),
|
||||
MapperArgNames.data(), /*arg_mappers*/ nullptr,
|
||||
/* AsyncInfoTy */ nullptr);
|
||||
int rc = target_data_function(
|
||||
loc, Device, MapperComponents.Components.size(), MapperArgsBase.data(),
|
||||
MapperArgs.data(), MapperArgSizes.data(), MapperArgTypes.data(),
|
||||
MapperArgNames.data(), /*arg_mappers*/ nullptr, AsyncInfo);
|
||||
|
||||
return rc;
|
||||
}
|
||||
@ -269,7 +273,7 @@ int targetDataMapper(ident_t *loc, DeviceTy &Device, void *arg_base, void *arg,
|
||||
int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
|
||||
void **args_base, void **args, int64_t *arg_sizes,
|
||||
int64_t *arg_types, map_var_info_t *arg_names,
|
||||
void **arg_mappers, AsyncInfoTy *AsyncInfo) {
|
||||
void **arg_mappers, AsyncInfoTy &AsyncInfo) {
|
||||
// process each input.
|
||||
for (int32_t i = 0; i < arg_num; ++i) {
|
||||
// Ignore private variables and arrays - there is no mapping for them.
|
||||
@ -286,7 +290,7 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
|
||||
map_var_info_t arg_name = (!arg_names) ? nullptr : arg_names[i];
|
||||
int rc = targetDataMapper(loc, Device, args_base[i], args[i],
|
||||
arg_sizes[i], arg_types[i], arg_name,
|
||||
arg_mappers[i], targetDataBegin);
|
||||
arg_mappers[i], AsyncInfo, targetDataBegin);
|
||||
|
||||
if (rc != OFFLOAD_SUCCESS) {
|
||||
REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
|
||||
@ -416,7 +420,7 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
|
||||
DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
|
||||
data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
|
||||
int rt =
|
||||
Device.submitData(TgtPtrBegin, HstPtrBegin, data_size, *AsyncInfo);
|
||||
Device.submitData(TgtPtrBegin, HstPtrBegin, data_size, AsyncInfo);
|
||||
if (rt != OFFLOAD_SUCCESS) {
|
||||
REPORT("Copying data to device failed.\n");
|
||||
return OFFLOAD_FAIL;
|
||||
@ -430,7 +434,7 @@ int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
|
||||
uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
|
||||
void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
|
||||
int rt = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase,
|
||||
sizeof(void *), *AsyncInfo);
|
||||
sizeof(void *), AsyncInfo);
|
||||
if (rt != OFFLOAD_SUCCESS) {
|
||||
REPORT("Copying data to device failed.\n");
|
||||
return OFFLOAD_FAIL;
|
||||
@ -470,7 +474,7 @@ struct DeallocTgtPtrInfo {
|
||||
int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
|
||||
void **ArgBases, void **Args, int64_t *ArgSizes,
|
||||
int64_t *ArgTypes, map_var_info_t *ArgNames,
|
||||
void **ArgMappers, AsyncInfoTy *AsyncInfo) {
|
||||
void **ArgMappers, AsyncInfoTy &AsyncInfo) {
|
||||
int Ret;
|
||||
std::vector<DeallocTgtPtrInfo> DeallocTgtPtrs;
|
||||
// process each input.
|
||||
@ -488,9 +492,9 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
|
||||
DP("Calling targetDataMapper for the %dth argument\n", I);
|
||||
|
||||
map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
|
||||
Ret =
|
||||
targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
|
||||
ArgTypes[I], ArgName, ArgMappers[I], targetDataEnd);
|
||||
Ret = targetDataMapper(loc, Device, ArgBases[I], Args[I], ArgSizes[I],
|
||||
ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
|
||||
targetDataEnd);
|
||||
|
||||
if (Ret != OFFLOAD_SUCCESS) {
|
||||
REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"
|
||||
@ -585,7 +589,7 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
|
||||
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
|
||||
DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
|
||||
Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize,
|
||||
*AsyncInfo);
|
||||
AsyncInfo);
|
||||
if (Ret != OFFLOAD_SUCCESS) {
|
||||
REPORT("Copying data from device failed.\n");
|
||||
return OFFLOAD_FAIL;
|
||||
@ -637,17 +641,13 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: We should not synchronize here but pass the AsyncInfo object to the
|
||||
// allocate/deallocate device APIs.
|
||||
//
|
||||
// We need to synchronize before deallocating data.
|
||||
// If AsyncInfo is nullptr, the previous data transfer (if has) will be
|
||||
// synchronous, so we don't need to synchronize again. If AsyncInfo->Queue is
|
||||
// nullptr, there is no data transfer happened because once there is,
|
||||
// AsyncInfo->Queue will not be nullptr, so again, we don't need to
|
||||
// synchronize.
|
||||
if (AsyncInfo) {
|
||||
Ret = AsyncInfo->synchronize();
|
||||
if (Ret != OFFLOAD_SUCCESS)
|
||||
return OFFLOAD_FAIL;
|
||||
}
|
||||
Ret = AsyncInfo.synchronize();
|
||||
if (Ret != OFFLOAD_SUCCESS)
|
||||
return OFFLOAD_FAIL;
|
||||
|
||||
// Deallocate target pointer
|
||||
for (DeallocTgtPtrInfo &Info : DeallocTgtPtrs) {
|
||||
@ -664,7 +664,7 @@ int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
|
||||
|
||||
static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
|
||||
void *HstPtrBegin, int64_t ArgSize,
|
||||
int64_t ArgType) {
|
||||
int64_t ArgType, AsyncInfoTy &AsyncInfo) {
|
||||
TIMESCOPE_WITH_IDENT(loc);
|
||||
bool IsLast, IsHostPtr;
|
||||
void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, ArgSize, IsLast, false,
|
||||
@ -690,7 +690,7 @@ static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
|
||||
if (ArgType & OMP_TGT_MAPTYPE_FROM) {
|
||||
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
|
||||
ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
|
||||
int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, nullptr);
|
||||
int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
|
||||
if (Ret != OFFLOAD_SUCCESS) {
|
||||
REPORT("Copying data from device failed.\n");
|
||||
return OFFLOAD_FAIL;
|
||||
@ -717,7 +717,7 @@ static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
|
||||
if (ArgType & OMP_TGT_MAPTYPE_TO) {
|
||||
DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
|
||||
ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
|
||||
int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, nullptr);
|
||||
int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
|
||||
if (Ret != OFFLOAD_SUCCESS) {
|
||||
REPORT("Copying data to device failed.\n");
|
||||
return OFFLOAD_FAIL;
|
||||
@ -737,7 +737,7 @@ static int targetDataContiguous(ident_t *loc, DeviceTy &Device, void *ArgsBase,
|
||||
"pointer " DPxMOD "\n",
|
||||
DPxPTR(IT->second.TgtPtrVal), DPxPTR(IT->second.TgtPtrAddr));
|
||||
Ret = Device.submitData(IT->second.TgtPtrAddr, &IT->second.TgtPtrVal,
|
||||
sizeof(void *), nullptr);
|
||||
sizeof(void *), AsyncInfo);
|
||||
if (Ret != OFFLOAD_SUCCESS) {
|
||||
REPORT("Copying data to device failed.\n");
|
||||
Device.ShadowMtx.unlock();
|
||||
@ -753,8 +753,8 @@ static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
|
||||
void *ArgsBase,
|
||||
__tgt_target_non_contig *NonContig,
|
||||
uint64_t Size, int64_t ArgType,
|
||||
int CurrentDim, int DimSize,
|
||||
uint64_t Offset) {
|
||||
int CurrentDim, int DimSize, uint64_t Offset,
|
||||
AsyncInfoTy &AsyncInfo) {
|
||||
TIMESCOPE_WITH_IDENT(loc);
|
||||
int Ret = OFFLOAD_SUCCESS;
|
||||
if (CurrentDim < DimSize) {
|
||||
@ -766,7 +766,7 @@ static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
|
||||
if (CurrentDim != DimSize - 1 || I == 0) {
|
||||
Ret = targetDataNonContiguous(loc, Device, ArgsBase, NonContig, Size,
|
||||
ArgType, CurrentDim + 1, DimSize,
|
||||
Offset + CurOffset);
|
||||
Offset + CurOffset, AsyncInfo);
|
||||
// Stop the whole process if any contiguous piece returns anything
|
||||
// other than OFFLOAD_SUCCESS.
|
||||
if (Ret != OFFLOAD_SUCCESS)
|
||||
@ -778,7 +778,8 @@ static int targetDataNonContiguous(ident_t *loc, DeviceTy &Device,
|
||||
DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64
|
||||
" len %" PRIu64 "\n",
|
||||
DPxPTR(Ptr), Offset, Size);
|
||||
Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType);
|
||||
Ret = targetDataContiguous(loc, Device, ArgsBase, Ptr, Size, ArgType,
|
||||
AsyncInfo);
|
||||
}
|
||||
return Ret;
|
||||
}
|
||||
@ -794,12 +795,10 @@ static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig,
|
||||
}
|
||||
|
||||
/// Internal function to pass data to/from the target.
|
||||
// AsyncInfo is currently unused, added here so targetDataUpdate has the
|
||||
// same signature as targetDataBegin and targetDataEnd.
|
||||
int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
|
||||
void **ArgsBase, void **Args, int64_t *ArgSizes,
|
||||
int64_t *ArgTypes, map_var_info_t *ArgNames,
|
||||
void **ArgMappers, AsyncInfoTy *AsyncInfo) {
|
||||
void **ArgMappers, AsyncInfoTy &AsyncInfo) {
|
||||
// process each input.
|
||||
for (int32_t I = 0; I < ArgNum; ++I) {
|
||||
if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
|
||||
@ -814,7 +813,7 @@ int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
|
||||
|
||||
map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
|
||||
int Ret = targetDataMapper(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
|
||||
ArgTypes[I], ArgName, ArgMappers[I],
|
||||
ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
|
||||
targetDataUpdate);
|
||||
|
||||
if (Ret != OFFLOAD_SUCCESS) {
|
||||
@ -837,10 +836,10 @@ int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
|
||||
int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize);
|
||||
Ret = targetDataNonContiguous(
|
||||
loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I],
|
||||
/*current_dim=*/0, DimSize - MergedDim, /*offset=*/0);
|
||||
/*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo);
|
||||
} else {
|
||||
Ret = targetDataContiguous(loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
|
||||
ArgTypes[I]);
|
||||
ArgTypes[I], AsyncInfo);
|
||||
}
|
||||
if (Ret == OFFLOAD_FAIL)
|
||||
return OFFLOAD_FAIL;
|
||||
@ -950,7 +949,7 @@ class PrivateArgumentManagerTy {
|
||||
/// A reference to the \p DeviceTy object
|
||||
DeviceTy &Device;
|
||||
/// A reference to a \p AsyncInfoTy object
|
||||
AsyncInfoTy *AsyncInfo;
|
||||
AsyncInfoTy &AsyncInfo;
|
||||
|
||||
// TODO: What would be the best value here? Should we make it configurable?
|
||||
// If the size is larger than this threshold, we will allocate and transfer it
|
||||
@ -959,7 +958,7 @@ class PrivateArgumentManagerTy {
|
||||
|
||||
public:
|
||||
/// Constructor
|
||||
PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy *AsyncInfo)
|
||||
PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo)
|
||||
: Device(Dev), AsyncInfo(AsyncInfo) {}
|
||||
|
||||
/// Add a private argument
|
||||
@ -986,7 +985,7 @@ public:
|
||||
#endif
|
||||
// If first-private, copy data from host
|
||||
if (IsFirstPrivate) {
|
||||
int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, *AsyncInfo);
|
||||
int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo);
|
||||
if (Ret != OFFLOAD_SUCCESS) {
|
||||
DP("Copying data to device failed, failed.\n");
|
||||
return OFFLOAD_FAIL;
|
||||
@ -1042,7 +1041,7 @@ public:
|
||||
FirstPrivateArgSize, DPxPTR(TgtPtr));
|
||||
// Transfer data to target device
|
||||
int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(),
|
||||
FirstPrivateArgSize, *AsyncInfo);
|
||||
FirstPrivateArgSize, AsyncInfo);
|
||||
if (Ret != OFFLOAD_SUCCESS) {
|
||||
DP("Failed to submit data of private arguments.\n");
|
||||
return OFFLOAD_FAIL;
|
||||
@ -1090,7 +1089,7 @@ static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
|
||||
std::vector<void *> &TgtArgs,
|
||||
std::vector<ptrdiff_t> &TgtOffsets,
|
||||
PrivateArgumentManagerTy &PrivateArgumentManager,
|
||||
AsyncInfoTy *AsyncInfo) {
|
||||
AsyncInfoTy &AsyncInfo) {
|
||||
TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", loc);
|
||||
DeviceTy &Device = PM->Devices[DeviceId];
|
||||
int Ret = targetDataBegin(loc, Device, ArgNum, ArgBases, Args, ArgSizes,
|
||||
@ -1141,7 +1140,7 @@ static int processDataBefore(ident_t *loc, int64_t DeviceId, void *HostPtr,
|
||||
DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
|
||||
DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));
|
||||
Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin,
|
||||
sizeof(void *), *AsyncInfo);
|
||||
sizeof(void *), AsyncInfo);
|
||||
if (Ret != OFFLOAD_SUCCESS) {
|
||||
REPORT("Copying data to device failed.\n");
|
||||
return OFFLOAD_FAIL;
|
||||
@ -1211,7 +1210,7 @@ static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
|
||||
int64_t *ArgSizes, int64_t *ArgTypes,
|
||||
map_var_info_t *ArgNames, void **ArgMappers,
|
||||
PrivateArgumentManagerTy &PrivateArgumentManager,
|
||||
AsyncInfoTy *AsyncInfo) {
|
||||
AsyncInfoTy &AsyncInfo) {
|
||||
TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", loc);
|
||||
DeviceTy &Device = PM->Devices[DeviceId];
|
||||
|
||||
@ -1243,7 +1242,7 @@ static int processDataAfter(ident_t *loc, int64_t DeviceId, void *HostPtr,
|
||||
int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
|
||||
void **ArgBases, void **Args, int64_t *ArgSizes, int64_t *ArgTypes,
|
||||
map_var_info_t *ArgNames, void **ArgMappers, int32_t TeamNum,
|
||||
int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy *AsyncInfo) {
|
||||
int32_t ThreadLimit, int IsTeamConstruct, AsyncInfoTy &AsyncInfo) {
|
||||
int32_t DeviceId = Device.DeviceID;
|
||||
|
||||
TableMap *TM = getTableMap(HostPtr);
|
||||
@ -1264,12 +1263,6 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
|
||||
}
|
||||
assert(TargetTable && "Global data has not been mapped\n");
|
||||
|
||||
// TODO: This will go away as soon as we consequently pass in async info
|
||||
// objects (as references).
|
||||
AsyncInfoTy InternalAsyncInfo(Device);
|
||||
if (!AsyncInfo)
|
||||
AsyncInfo = &InternalAsyncInfo;
|
||||
|
||||
std::vector<void *> TgtArgs;
|
||||
std::vector<ptrdiff_t> TgtOffsets;
|
||||
|
||||
@ -1301,10 +1294,10 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
|
||||
if (IsTeamConstruct)
|
||||
Ret = Device.runTeamRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
|
||||
TgtArgs.size(), TeamNum, ThreadLimit,
|
||||
LoopTripCount, *AsyncInfo);
|
||||
LoopTripCount, AsyncInfo);
|
||||
else
|
||||
Ret = Device.runRegion(TgtEntryPtr, &TgtArgs[0], &TgtOffsets[0],
|
||||
TgtArgs.size(), *AsyncInfo);
|
||||
TgtArgs.size(), AsyncInfo);
|
||||
}
|
||||
|
||||
if (Ret != OFFLOAD_SUCCESS) {
|
||||
@ -1322,13 +1315,6 @@ int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
|
||||
REPORT("Failed to process data after launching the kernel.\n");
|
||||
return OFFLOAD_FAIL;
|
||||
}
|
||||
} else {
|
||||
// TODO: We should not synchronize here but on the outer level once we pass
|
||||
// in a reference AsyncInfo object.
|
||||
// If ArgNum is zero, but AsyncInfo.Queue is valid, then the kernel doesn't
|
||||
// have any argument, and the device supports async operations, so we need a
|
||||
// sync at this point.
|
||||
return AsyncInfo->synchronize();
|
||||
}
|
||||
|
||||
return OFFLOAD_SUCCESS;
|
||||
|
@ -23,23 +23,23 @@
|
||||
extern int targetDataBegin(ident_t *loc, DeviceTy &Device, int32_t arg_num,
|
||||
void **args_base, void **args, int64_t *arg_sizes,
|
||||
int64_t *arg_types, map_var_info_t *arg_names,
|
||||
void **arg_mappers, AsyncInfoTy *AsyncInfo);
|
||||
void **arg_mappers, AsyncInfoTy &AsyncInfo);
|
||||
|
||||
extern int targetDataEnd(ident_t *loc, DeviceTy &Device, int32_t ArgNum,
|
||||
void **ArgBases, void **Args, int64_t *ArgSizes,
|
||||
int64_t *ArgTypes, map_var_info_t *arg_names,
|
||||
void **ArgMappers, AsyncInfoTy *AsyncInfo);
|
||||
void **ArgMappers, AsyncInfoTy &AsyncInfo);
|
||||
|
||||
extern int targetDataUpdate(ident_t *loc, DeviceTy &Device, int32_t arg_num,
|
||||
void **args_base, void **args, int64_t *arg_sizes,
|
||||
int64_t *arg_types, map_var_info_t *arg_names,
|
||||
void **arg_mappers, AsyncInfoTy *AsyncInfo);
|
||||
void **arg_mappers, AsyncInfoTy &AsyncInfo);
|
||||
|
||||
extern int target(ident_t *loc, DeviceTy &Device, void *HostPtr, int32_t ArgNum,
|
||||
void **ArgBases, void **Args, int64_t *ArgSizes,
|
||||
int64_t *ArgTypes, map_var_info_t *arg_names,
|
||||
void **ArgMappers, int32_t TeamNum, int32_t ThreadLimit,
|
||||
int IsTeamConstruct, AsyncInfoTy *AsyncInfo);
|
||||
int IsTeamConstruct, AsyncInfoTy &AsyncInfo);
|
||||
|
||||
extern int CheckDeviceAndCtors(int64_t device_id);
|
||||
|
||||
@ -76,7 +76,7 @@ typedef void (*MapperFuncPtrTy)(void *, void *, void *, int64_t, int64_t,
|
||||
// targetDataEnd and targetDataUpdate).
|
||||
typedef int (*TargetDataFuncPtrTy)(ident_t *, DeviceTy &, int32_t, void **,
|
||||
void **, int64_t *, int64_t *,
|
||||
map_var_info_t *, void **, AsyncInfoTy *);
|
||||
map_var_info_t *, void **, AsyncInfoTy &);
|
||||
|
||||
// Implemented in libomp, they are called from within __tgt_* functions.
|
||||
#ifdef __cplusplus
|
||||
|
@ -400,16 +400,20 @@ void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) {
|
||||
DeviceTy &Device = PM->Devices[FoundRTL->Idx + i];
|
||||
Device.PendingGlobalsMtx.lock();
|
||||
if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) {
|
||||
AsyncInfoTy AsyncInfo(Device);
|
||||
for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) {
|
||||
int rc =
|
||||
target(nullptr, Device, dtor, 0, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, 1, 1, true /*team*/, nullptr);
|
||||
int rc = target(nullptr, Device, dtor, 0, nullptr, nullptr, nullptr,
|
||||
nullptr, nullptr, nullptr, 1, 1, true /*team*/,
|
||||
AsyncInfo);
|
||||
if (rc != OFFLOAD_SUCCESS) {
|
||||
DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor));
|
||||
}
|
||||
}
|
||||
// Remove this library's entry from PendingCtorsDtors
|
||||
Device.PendingCtorsDtors.erase(desc);
|
||||
// All destructors have been issued, wait for them now.
|
||||
if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
|
||||
DP("Failed synchronizing destructors kernels.\n");
|
||||
}
|
||||
Device.PendingGlobalsMtx.unlock();
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user