Mirror of https://github.com/capstone-engine/llvm-capstone.git, synced 2024-11-26 23:21:11 +00:00
[openmp] Fixed Support for VA for record-replay. (#70396)
This commit was previously discussed on Phabricator (https://reviews.llvm.org/D157186). Record-replay currently fails on AMD because it conflicts with the heap memory allocator introduced in #69806; the workaround is to set `LIBOMPTARGET_HEAP_SIZE=0` during both the record run and the replay run.
This commit is contained in: parent d346c82435, commit d6a3d6b96d
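For context, the patch threads one new parameter, the requested virtual-address start of the record-replay allocation, from the `__tgt_activate_record_replay` entry point down to the plugins. A minimal sketch of driving the updated interface from a host tool follows; the device ID, memory size, and recorded address are illustrative placeholders, not values from the commit:

// Sketch only: all values below are illustrative.
// Per the note above, run both phases with LIBOMPTARGET_HEAP_SIZE=0 on AMD.
#include <cstdint>

extern "C" int __tgt_activate_record_replay(int64_t DeviceId,
                                            uint64_t MemorySize, void *VAddr,
                                            bool IsRecord, bool SaveOutput);

int main() {
  const int64_t DeviceId = 0;              // illustrative device
  const uint64_t MemorySize = 64ULL << 30; // 64 GB, the size the old comment
                                           // below mentions as the default
  // Recording: pass nullptr so the runtime suggests its own aligned address.
  __tgt_activate_record_replay(DeviceId, MemorySize, /*VAddr=*/nullptr,
                               /*IsRecord=*/true, /*SaveOutput=*/false);
  // Replaying: pass the "BumpAllocVAStart" captured in the kernel JSON so the
  // replay mapping lands at the recorded address.
  void *Recorded = reinterpret_cast<void *>(0x200000000000); // illustrative
  __tgt_activate_record_replay(DeviceId, MemorySize, Recorded,
                               /*IsRecord=*/false, /*SaveOutput=*/true);
  return 0;
}

During recording a null `VAddr` lets the runtime pick an address; during replay the runtime reports an error if the mapping cannot be placed at the recorded address, as the diff below shows.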
@@ -253,6 +253,11 @@ template <typename Ty> Ty *alignPtr(Ty *Ptr, int64_t Alignment) {
   return std::align(Alignment, sizeof(char), Ptr, Space);
 }
 
+/// Round up \p V to a \p Boundary.
+template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
+  return (V + Boundary - 1) / Boundary * Boundary;
+}
+
 } // namespace target
 } // namespace omp
 } // namespace llvm
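The new roundUp helper ceils a value to the next multiple of a boundary; later in this patch the CUDA plugin uses it to round mapping sizes up to the device page granularity (`Size = roundUp(Size, Granularity);`). A self-contained check of its behavior, with a 2 MiB boundary standing in for a real granularity:

#include <cassert>
#include <cstddef>

template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
  return (V + Boundary - 1) / Boundary * Boundary;
}

int main() {
  constexpr size_t Granularity = 2 * 1024 * 1024; // 2 MiB, illustrative
  assert(roundUp<size_t>(1, Granularity) == Granularity);
  assert(roundUp<size_t>(Granularity, Granularity) == Granularity);
  assert(roundUp<size_t>(Granularity + 1, Granularity) == 2 * Granularity);
  return 0;
}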
@@ -439,7 +439,7 @@ void __tgt_set_info_flag(uint32_t);
 int __tgt_print_device_info(int64_t DeviceId);
 
 int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
-                                 bool IsRecord, bool SaveOutput);
+                                 void *VAddr, bool IsRecord, bool SaveOutput);
 
 #ifdef __cplusplus
 }
@@ -73,7 +73,8 @@ struct RTLInfoTy {
   typedef int32_t(data_notify_mapped_ty)(int32_t, void *, int64_t);
   typedef int32_t(data_notify_unmapped_ty)(int32_t, void *);
   typedef int32_t(set_device_offset_ty)(int32_t);
-  typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, bool, bool);
+  typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, void *, bool,
+                                             bool);
 
   int32_t Idx = -1; // RTL index, index is the number of devices
                     // of other RTLs that were registered before,
@@ -2579,6 +2579,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     DeviceMemoryPoolSize = Value;
     return Plugin::success();
   }
+  Error getDeviceMemorySize(uint64_t &Value) override {
+    for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
+      if (Pool->isGlobal()) {
+        hsa_status_t Status =
+            Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
+        return Plugin::check(Status, "Error in getting device memory size: %s");
+      }
+    }
+    return Plugin::error("getDeviceMemorySize:: no global pool");
+  }
 
   /// AMDGPU-specific function to get device attributes.
   template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
@@ -49,40 +49,87 @@ private:
   void *MemoryStart;
   void *MemoryPtr;
   size_t MemorySize;
+  size_t TotalSize;
   GenericDeviceTy *Device;
   std::mutex AllocationLock;
 
   RRStatusTy Status;
   bool ReplaySaveOutput;
-  uint64_t DeviceMemorySize;
 
-  // Record/replay pre-allocates the largest possible device memory using the
-  // default kind.
-  // TODO: Expand allocation to include other kinds (device, host, shared) and
-  //       possibly use a MemoryManager to track (de-)allocations for
-  //       storing/retrieving when recording/replaying.
-  Error preallocateDeviceMemory(uint64_t DeviceMemorySize) {
-    // Pre-allocate memory on device. Starts with 64GB and subtracts in steps
-    // of 1GB until allocation succeeds.
-    const size_t MAX_MEMORY_ALLOCATION = DeviceMemorySize;
+  void *suggestAddress(uint64_t MaxMemoryAllocation) {
+    // Get a valid pointer address for this system
+    void *Addr =
+        Device->allocate(1024, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT);
+    Device->free(Addr);
+    // Align Address to MaxMemoryAllocation
+    Addr = (void *)alignPtr((Addr), MaxMemoryAllocation);
+    return Addr;
+  }
+
+  Error preAllocateVAMemory(uint64_t MaxMemoryAllocation, void *VAddr) {
+    size_t ASize = MaxMemoryAllocation;
+
+    if (!VAddr && isRecording())
+      VAddr = suggestAddress(MaxMemoryAllocation);
+
+    DP("Request %ld bytes allocated at %p\n", MaxMemoryAllocation, VAddr);
+
+    if (auto Err = Device->memoryVAMap(&MemoryStart, VAddr, &ASize))
+      return Err;
+
+    if (isReplaying() && VAddr != MemoryStart) {
+      return Plugin::error("Record-Replay cannot assign the"
+                           "requested recorded address (%p, %p)",
+                           VAddr, MemoryStart);
+    }
+
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
+         "Allocated %" PRIu64 " bytes at %p for replay.\n", ASize, MemoryStart);
+
+    MemoryPtr = MemoryStart;
+    MemorySize = 0;
+    TotalSize = ASize;
+    return Plugin::success();
+  }
+
+  Error preAllocateHeuristic(uint64_t MaxMemoryAllocation, void *VAddr) {
+    const size_t MAX_MEMORY_ALLOCATION = MaxMemoryAllocation;
     constexpr size_t STEP = 1024 * 1024 * 1024ULL;
     MemoryStart = nullptr;
-    for (size_t Try = MAX_MEMORY_ALLOCATION; Try > 0; Try -= STEP) {
-      MemoryStart =
-          Device->allocate(Try, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT);
+    for (TotalSize = MAX_MEMORY_ALLOCATION; TotalSize > 0; TotalSize -= STEP) {
+      MemoryStart = Device->allocate(TotalSize, /* HstPtr */ nullptr,
+                                     TARGET_ALLOC_DEFAULT);
       if (MemoryStart)
         break;
     }
 
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
+         "Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize,
+         MemoryStart);
+
     if (!MemoryStart)
       return Plugin::error("Allocating record/replay memory");
 
+    if (VAddr && VAddr != MemoryStart)
+      return Plugin::error("Cannot allocate recorded address");
+
     MemoryPtr = MemoryStart;
     MemorySize = 0;
 
     return Plugin::success();
   }
 
+  Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr) {
+    if (Device->supportVAManagement())
+      return preAllocateVAMemory(DeviceMemorySize, ReqVAddr);
+
+    uint64_t DevMemSize;
+    if (Device->getDeviceMemorySize(DevMemSize))
+      return Plugin::error("Cannot determine Device Memory Size");
+
+    return preAllocateHeuristic(DevMemSize, ReqVAddr);
+  }
+
   void dumpDeviceMemory(StringRef Filename) {
     ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
         WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize);
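suggestAddress relies on a small trick during recording: allocate a throwaway device pointer, free it, and round it up to the full allocation size, so the VA reservation request is aligned and likely to be honored verbatim on replay. A sketch of that rounding step under the assumption of a power-of-two request size; the bit-mask rounding here is a stand-in, not the in-tree alignPtr, which is built on std::align:

#include <cassert>
#include <cstdint>

// Assumption: Alignment is a power of two (a 64 GB request is).
static void *alignPtrUp(void *Ptr, uint64_t Alignment) {
  uintptr_t P = reinterpret_cast<uintptr_t>(Ptr);
  return reinterpret_cast<void *>((P + Alignment - 1) & ~(Alignment - 1));
}

int main() {
  const uint64_t MaxMemoryAllocation = 1ULL << 36;        // 64 GB
  void *Probe = reinterpret_cast<void *>(0x7f1234567890); // stand-in for the
                                                          // freed 1 KiB probe
  uintptr_t Suggested =
      reinterpret_cast<uintptr_t>(alignPtrUp(Probe, MaxMemoryAllocation));
  assert(Suggested % MaxMemoryAllocation == 0);
  assert(Suggested >= reinterpret_cast<uintptr_t>(Probe));
  return 0;
}

Note how the new preallocateDeviceMemory dispatches: the VA path runs only when the device reports supportVAManagement(), otherwise the old shrink-until-it-fits heuristic is kept, now seeded with the real device memory size.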
@@ -114,8 +161,7 @@ public:
   bool isSaveOutputEnabled() const { return ReplaySaveOutput; }
 
   RecordReplayTy()
-      : Status(RRStatusTy::RRDeactivated), ReplaySaveOutput(false),
-        DeviceMemorySize(-1) {}
+      : Status(RRStatusTy::RRDeactivated), ReplaySaveOutput(false) {}
 
   void saveImage(const char *Name, const DeviceImageTy &Image) {
     SmallString<128> ImageName = {Name, ".image"};
@@ -197,6 +243,7 @@ public:
     JsonKernelInfo["LoopTripCount"] = LoopTripCount;
     JsonKernelInfo["DeviceMemorySize"] = MemorySize;
     JsonKernelInfo["DeviceId"] = Device->getDeviceId();
+    JsonKernelInfo["BumpAllocVAStart"] = (intptr_t)MemoryStart;
 
     json::Array JsonArgPtrs;
     for (int I = 0; I < NumArgs; ++I)
@@ -244,27 +291,33 @@ public:
     return Alloc;
   }
 
-  Error init(GenericDeviceTy *Device, uint64_t MemSize, RRStatusTy Status,
-             bool SaveOutput) {
+  Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr,
+             RRStatusTy Status, bool SaveOutput) {
     this->Device = Device;
     this->Status = Status;
-    this->DeviceMemorySize = MemSize;
     this->ReplaySaveOutput = SaveOutput;
 
-    if (auto Err = preallocateDeviceMemory(MemSize))
+    if (auto Err = preallocateDeviceMemory(MemSize, VAddr))
       return Err;
 
     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
          "Record Replay Initialized (%p)"
          " as starting address, %lu Memory Size"
          " and set on status %s\n",
-         MemoryStart, MemSize,
+         MemoryStart, TotalSize,
          Status == RRStatusTy::RRRecording ? "Recording" : "Replaying");
 
     return Plugin::success();
   }
 
-  void deinit() { Device->free(MemoryStart); }
+  void deinit() {
+    if (Device->supportVAManagement()) {
+      if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize))
+        report_fatal_error("Error on releasing virtual memory space");
+    } else {
+      Device->free(MemoryStart);
+    }
+  }
 
 } RecordReplay;
 
@@ -1184,6 +1237,19 @@ Error GenericDeviceTy::queryAsync(__tgt_async_info *AsyncInfo) {
   return queryAsyncImpl(*AsyncInfo);
 }
 
+Error GenericDeviceTy::memoryVAMap(void **Addr, void *VAddr, size_t *RSize) {
+  return Plugin::error("Device does not suppport VA Management");
+}
+
+Error GenericDeviceTy::memoryVAUnMap(void *VAddr, size_t Size) {
+  return Plugin::error("Device does not suppport VA Management");
+}
+
+Error GenericDeviceTy::getDeviceMemorySize(uint64_t &DSize) {
+  return Plugin::error(
+      "Mising getDeviceMemorySize impelmentation (required by RR-heuristic");
+}
+
 Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
                                             TargetAllocTy Kind) {
   void *Alloc = nullptr;
@@ -1552,8 +1618,8 @@ int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId,
   return Plugin::get().isDataExchangable(SrcDeviceId, DstDeviceId);
 }
 
-int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId,
-                                           uint64_t MemorySize, bool isRecord,
+int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
+                                           void *VAddr, bool isRecord,
                                            bool SaveOutput) {
   GenericPluginTy &Plugin = Plugin::get();
   GenericDeviceTy &Device = Plugin.getDevice(DeviceId);
@@ -1561,7 +1627,8 @@ int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId,
       isRecord ? RecordReplayTy::RRStatusTy::RRRecording
                : RecordReplayTy::RRStatusTy::RRReplaying;
 
-  if (auto Err = RecordReplay.init(&Device, MemorySize, Status, SaveOutput)) {
+  if (auto Err =
+          RecordReplay.init(&Device, MemorySize, VAddr, Status, SaveOutput)) {
     REPORT("WARNING RR did not intialize RR-properly with %lu bytes"
            "(Error: %s)\n",
            MemorySize, toString(std::move(Err)).data());
@@ -655,6 +655,21 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   Error queryAsync(__tgt_async_info *AsyncInfo);
   virtual Error queryAsyncImpl(__tgt_async_info &AsyncInfo) = 0;
 
+  /// Check whether the architecture supports VA management
+  virtual bool supportVAManagement() const { return false; }
+
+  /// Get the total device memory size
+  virtual Error getDeviceMemorySize(uint64_t &DSize);
+
+  /// Allocates \p RSize bytes (rounded up to page size) and hints the driver to
+  /// map it to \p VAddr. The obtained address is stored in \p Addr. At return
+  /// \p RSize contains the actual size which can be equal or larger than the
+  /// requested size.
+  virtual Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize);
+
+  /// De-allocates device memory and unmaps the virtual address \p VAddr
+  virtual Error memoryVAUnMap(void *VAddr, size_t Size);
+
   /// Allocate data on the device or involving the device.
   Expected<void *> dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind);
 
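The in/out contract of memoryVAMap is easy to misread: \p RSize carries the requested size in and the page-rounded mapped size out, and the returned \p Addr may differ from the \p VAddr hint. A stand-in sketch of a conforming caller; MyDeviceTy is a placeholder that only mimics the base-class defaults, not the in-tree GenericDeviceTy:

#include <cstddef>
#include <cstdio>

// Placeholder device type mirroring the defaults declared above.
struct MyDeviceTy {
  bool supportVAManagement() const { return false; } // base default
  int memoryVAMap(void **Addr, void *VAddr, size_t *RSize) {
    (void)Addr; (void)VAddr; (void)RSize;
    return 1; // non-zero stands in for the "no VA Management" Error
  }
};

int main() {
  MyDeviceTy Device;
  void *Addr = nullptr;
  size_t RSize = (1 << 20) + 123; // requested; a real plugin rounds this up
  if (!Device.supportVAManagement() ||
      Device.memoryVAMap(&Addr, /*VAddr=*/nullptr, &RSize)) {
    // Mirrors preallocateDeviceMemory: fall back to heuristic pre-allocation.
    std::puts("VA management unavailable; using heuristic pre-allocation");
    return 0;
  }
  std::printf("mapped %zu bytes at %p\n", RSize, Addr);
}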
@@ -81,6 +81,16 @@ DLWRAP(cuEventDestroy, 1)
 
 DLWRAP_FINALIZE()
 
+DLWRAP(cuMemUnmap, 2)
+DLWRAP(cuMemRelease, 1)
+DLWRAP(cuMemAddressFree, 2)
+DLWRAP(cuMemGetInfo, 2)
+DLWRAP(cuMemAddressReserve, 5)
+DLWRAP(cuMemMap, 5)
+DLWRAP(cuMemCreate, 4)
+DLWRAP(cuMemSetAccess, 4)
+DLWRAP(cuMemGetAllocationGranularity, 3)
+
 #ifndef DYNAMIC_CUDA_PATH
 #define DYNAMIC_CUDA_PATH "libcuda.so"
 #endif
@@ -26,6 +26,71 @@ typedef struct CUevent_st *CUevent;
 
 #define CU_DEVICE_INVALID ((CUdevice)-2)
 
+typedef unsigned long long CUmemGenericAllocationHandle_v1;
+typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle;
+
+#define CU_DEVICE_INVALID ((CUdevice)-2)
+
+typedef enum CUmemAllocationGranularity_flags_enum {
+  CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0,
+  CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1
+} CUmemAllocationGranularity_flags;
+
+typedef enum CUmemAccess_flags_enum {
+  CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0,
+  CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1,
+  CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3,
+  CU_MEM_ACCESS_FLAGS_PROT_MAX = 0x7FFFFFFF
+} CUmemAccess_flags;
+
+typedef enum CUmemLocationType_enum {
+  CU_MEM_LOCATION_TYPE_INVALID = 0x0,
+  CU_MEM_LOCATION_TYPE_DEVICE = 0x1,
+  CU_MEM_LOCATION_TYPE_MAX = 0x7FFFFFFF
+} CUmemLocationType;
+
+typedef struct CUmemLocation_st {
+  CUmemLocationType type;
+  int id;
+} CUmemLocation_v1;
+typedef CUmemLocation_v1 CUmemLocation;
+
+typedef struct CUmemAccessDesc_st {
+  CUmemLocation location;
+  CUmemAccess_flags flags;
+} CUmemAccessDesc_v1;
+
+typedef CUmemAccessDesc_v1 CUmemAccessDesc;
+
+typedef enum CUmemAllocationType_enum {
+  CU_MEM_ALLOCATION_TYPE_INVALID = 0x0,
+  CU_MEM_ALLOCATION_TYPE_PINNED = 0x1,
+  CU_MEM_ALLOCATION_TYPE_MAX = 0x7FFFFFFF
+} CUmemAllocationType;
+
+typedef enum CUmemAllocationHandleType_enum {
+  CU_MEM_HANDLE_TYPE_NONE = 0x0,
+  CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1,
+  CU_MEM_HANDLE_TYPE_WIN32 = 0x2,
+  CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4,
+  CU_MEM_HANDLE_TYPE_MAX = 0x7FFFFFFF
+} CUmemAllocationHandleType;
+
+typedef struct CUmemAllocationProp_st {
+  CUmemAllocationType type;
+  CUmemAllocationHandleType requestedHandleTypes;
+  CUmemLocation location;
+
+  void *win32HandleMetaData;
+  struct {
+    unsigned char compressionType;
+    unsigned char gpuDirectRDMACapable;
+    unsigned short usage;
+    unsigned char reserved[4];
+  } allocFlags;
+} CUmemAllocationProp_v1;
+typedef CUmemAllocationProp_v1 CUmemAllocationProp;
+
 typedef enum cudaError_enum {
   CUDA_SUCCESS = 0,
   CUDA_ERROR_INVALID_VALUE = 1,
@@ -268,4 +333,21 @@ CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int);
 CUresult cuEventSynchronize(CUevent);
 CUresult cuEventDestroy(CUevent);
 
+CUresult cuMemUnmap(CUdeviceptr ptr, size_t size);
+CUresult cuMemRelease(CUmemGenericAllocationHandle handle);
+CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size);
+CUresult cuMemGetInfo(size_t *free, size_t *total);
+CUresult cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment,
+                             CUdeviceptr addr, unsigned long long flags);
+CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
+                  CUmemGenericAllocationHandle handle,
+                  unsigned long long flags);
+CUresult cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
+                     const CUmemAllocationProp *prop, unsigned long long flags);
+CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
+                        const CUmemAccessDesc *desc, size_t count);
+CUresult cuMemGetAllocationGranularity(size_t *granularity,
+                                       const CUmemAllocationProp *prop,
+                                       CUmemAllocationGranularity_flags option);
+
 #endif
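These declarations mirror the CUDA driver's virtual memory management API. For orientation before the plugin hunk below, a condensed sketch of the canonical map/unmap sequence built from exactly these entry points; the device ID and size are illustrative and most error checks are elided, unlike the full implementation that follows:

// Condensed VMM sequence (illustrative; see memoryVAMap/memoryVAUnMap below).
#include "cuda.h" // the dynamic_cuda header extended above

static CUresult mapAtHint(int DeviceId, CUdeviceptr Hint, size_t Size,
                          CUdeviceptr *Out,
                          CUmemGenericAllocationHandle *OutHandle) {
  CUmemAllocationProp Prop = {};
  Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; // the only type NVIDIA supports
  Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  Prop.location.id = DeviceId;

  size_t Granularity = 0;
  if (cuMemGetAllocationGranularity(&Granularity, &Prop,
                                    CU_MEM_ALLOC_GRANULARITY_MINIMUM) !=
          CUDA_SUCCESS ||
      Granularity == 0)
    return CUDA_ERROR_INVALID_VALUE;
  Size = (Size + Granularity - 1) / Granularity * Granularity; // roundUp

  cuMemCreate(OutHandle, Size, &Prop, 0);     // physical backing
  cuMemAddressReserve(Out, Size, 0, Hint, 0); // VA range, hinted at Hint
  cuMemMap(*Out, Size, 0, *OutHandle, 0);     // bind VA to backing

  CUmemAccessDesc Desc = {};
  Desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  Desc.location.id = DeviceId;
  Desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  return cuMemSetAccess(*Out, Size, &Desc, 1); // make the range accessible
}

static void unmapAll(CUdeviceptr Ptr, size_t Size,
                     CUmemGenericAllocationHandle Handle) {
  cuMemUnmap(Ptr, Size);       // detach VA from backing
  cuMemRelease(Handle);        // free the physical allocation
  cuMemAddressFree(Ptr, Size); // return the VA range
}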
@@ -517,6 +517,116 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Plugin::check(Res, "Error in cuStreamSynchronize: %s");
   }
 
+  /// CUDA support VA management
+  bool supportVAManagement() const override { return true; }
+
+  /// Allocates \p RSize bytes (rounded up to page size) and hints the cuda
+  /// driver to map it to \p VAddr. The obtained address is stored in \p Addr.
+  /// At return \p RSize contains the actual size
+  Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize) override {
+    CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
+    auto IHandle = DeviceMMaps.find(DVAddr);
+    size_t Size = *RSize;
+
+    if (Size == 0)
+      return Plugin::error("Memory Map Size must be larger than 0");
+
+    // Check if we have already mapped this address
+    if (IHandle != DeviceMMaps.end())
+      return Plugin::error("Address already memory mapped");
+
+    CUmemAllocationProp Prop = {};
+    size_t Granularity = 0;
+
+    size_t Free, Total;
+    CUresult Res = cuMemGetInfo(&Free, &Total);
+    if (auto Err = Plugin::check(Res, "Error in cuMemGetInfo: %s"))
+      return Err;
+
+    if (Size >= Free) {
+      *Addr = nullptr;
+      return Plugin::error(
+          "Canot map memory size larger than the available device memory");
+    }
+
+    // currently NVidia only supports pinned device types
+    Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+    Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+
+    Prop.location.id = DeviceId;
+    cuMemGetAllocationGranularity(&Granularity, &Prop,
+                                  CU_MEM_ALLOC_GRANULARITY_MINIMUM);
+    if (auto Err =
+            Plugin::check(Res, "Error in cuMemGetAllocationGranularity: %s"))
+      return Err;
+
+    if (Granularity == 0)
+      return Plugin::error("Wrong device Page size");
+
+    // Ceil to page size.
+    Size = roundUp(Size, Granularity);
+
+    // Create a handler of our allocation
+    CUmemGenericAllocationHandle AHandle;
+    Res = cuMemCreate(&AHandle, Size, &Prop, 0);
+    if (auto Err = Plugin::check(Res, "Error in cuMemCreate: %s"))
+      return Err;
+
+    CUdeviceptr DevPtr = 0;
+    Res = cuMemAddressReserve(&DevPtr, Size, 0, DVAddr, 0);
+    if (auto Err = Plugin::check(Res, "Error in cuMemAddressReserve: %s"))
+      return Err;
+
+    Res = cuMemMap(DevPtr, Size, 0, AHandle, 0);
+    if (auto Err = Plugin::check(Res, "Error in cuMemMap: %s"))
+      return Err;
+
+    CUmemAccessDesc ADesc = {};
+    ADesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    ADesc.location.id = DeviceId;
+    ADesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+
+    // Sets address
+    Res = cuMemSetAccess(DevPtr, Size, &ADesc, 1);
+    if (auto Err = Plugin::check(Res, "Error in cuMemSetAccess: %s"))
+      return Err;
+
+    *Addr = reinterpret_cast<void *>(DevPtr);
+    *RSize = Size;
+    DeviceMMaps.insert({DevPtr, AHandle});
+    return Plugin::success();
+  }
+
+  /// De-allocates device memory and Unmaps the Virtual Addr
+  Error memoryVAUnMap(void *VAddr, size_t Size) override {
+    CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
+    auto IHandle = DeviceMMaps.find(DVAddr);
+    // Mapping does not exist
+    if (IHandle == DeviceMMaps.end()) {
+      return Plugin::error("Addr is not MemoryMapped");
+    }
+
+    if (IHandle == DeviceMMaps.end())
+      return Plugin::error("Addr is not MemoryMapped");
+
+    CUmemGenericAllocationHandle &AllocHandle = IHandle->second;
+
+    CUresult Res = cuMemUnmap(DVAddr, Size);
+    if (auto Err = Plugin::check(Res, "Error in cuMemUnmap: %s"))
+      return Err;
+
+    Res = cuMemRelease(AllocHandle);
+    if (auto Err = Plugin::check(Res, "Error in cuMemRelease: %s"))
+      return Err;
+
+    Res = cuMemAddressFree(DVAddr, Size);
+    if (auto Err = Plugin::check(Res, "Error in cuMemAddressFree: %s"))
+      return Err;
+
+    DeviceMMaps.erase(IHandle);
+    return Plugin::success();
+  }
+
   /// Query for the completion of the pending operations on the async info.
   Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
@@ -859,6 +969,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
   Error setDeviceHeapSize(uint64_t Value) override {
     return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
   }
+  Error getDeviceMemorySize(uint64_t &Value) override {
+    CUresult Res = cuDeviceTotalMem(&Value, Device);
+    return Plugin::check(Res, "Error in getDeviceMemorySize %s");
+  }
 
   /// CUDA-specific functions for getting and setting context limits.
   Error setCtxLimit(CUlimit Kind, uint64_t Value) {
@@ -907,6 +1021,9 @@ private:
   /// The CUDA device handler.
   CUdevice Device = CU_DEVICE_INVALID;
 
+  /// The memory mapped addresses and their handles
+  std::unordered_map<CUdeviceptr, CUmemGenericAllocationHandle> DeviceMMaps;
+
   /// The compute capability of the corresponding CUDA device.
   struct ComputeCapabilityTy {
     uint32_t Major;
@@ -482,7 +482,8 @@ void *DeviceTy::getTgtPtrBegin(HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin,
 int DeviceTy::eraseMapEntry(HDTTMapAccessorTy &HDTTMap,
                             HostDataToTargetTy *Entry, int64_t Size) {
   assert(Entry && "Trying to delete a null entry from the HDTT map.");
-  assert(Entry->getTotalRefCount() == 0 && Entry->getDataEndThreadCount() == 0 &&
+  assert(Entry->getTotalRefCount() == 0 &&
+         Entry->getDataEndThreadCount() == 0 &&
          "Trying to delete entry that is in use or owned by another thread.");
 
   INFO(OMP_INFOTYPE_MAPPING_CHANGED, DeviceID,
@@ -546,7 +547,7 @@ void DeviceTy::init() {
 
     RTL->activate_record_replay(RTLDeviceID,
                                 OMPX_DeviceMemorySize * 1024 * 1024 * 1024,
-                                true, OMPX_ReplaySaveOutput);
+                                nullptr, true, OMPX_ReplaySaveOutput);
   }
 
   IsInit = true;
@@ -346,7 +346,8 @@ EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
 /// /param SaveOutput Store the device memory after kernel
 /// execution on persistent storage
 EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
-                                        bool IsRecord, bool SaveOutput) {
+                                        void *VAddr, bool IsRecord,
+                                        bool SaveOutput) {
   if (!deviceIsReady(DeviceId)) {
     DP("Device %" PRId64 " is not ready\n", DeviceId);
     return OMP_TGT_FAIL;
@@ -354,7 +355,7 @@ EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
 
   DeviceTy &Device = *PM->Devices[DeviceId];
   [[maybe_unused]] int Rc =
-      target_activate_rr(Device, MemorySize, IsRecord, SaveOutput);
+      target_activate_rr(Device, MemorySize, VAddr, IsRecord, SaveOutput);
   assert(Rc == OFFLOAD_SUCCESS &&
          "__tgt_activate_record_replay unexpected failure!");
   return OMP_TGT_SUCCESS;
@@ -827,14 +827,13 @@ postProcessingTargetDataEnd(DeviceTy *Device,
     // remaining shadow pointer entries for this struct.
     const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
     if (HasFrom) {
-      Entry->foreachShadowPointerInfo(
-          [&](const ShadowPtrInfoTy &ShadowPtr) {
-            *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
-            DP("Restoring original host pointer value " DPxMOD " for host "
-               "pointer " DPxMOD "\n",
-               DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
-            return OFFLOAD_SUCCESS;
-          });
+      Entry->foreachShadowPointerInfo([&](const ShadowPtrInfoTy &ShadowPtr) {
+        *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
+        DP("Restoring original host pointer value " DPxMOD " for host "
+           "pointer " DPxMOD "\n",
+           DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
+        return OFFLOAD_SUCCESS;
+      });
     }
 
     // Give up the lock as we either don't need it anymore (e.g., done with
@@ -1713,9 +1712,9 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
 /// Enables the record replay mechanism by pre-allocating MemorySize
 /// and informing the record-replayer of whether to store the output
 /// in some file.
-int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, bool isRecord,
-                       bool SaveOutput) {
-  return Device.RTL->activate_record_replay(Device.DeviceID, MemorySize,
+int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
+                       bool isRecord, bool SaveOutput) {
+  return Device.RTL->activate_record_replay(Device.DeviceID, MemorySize, VAddr,
                                             isRecord, SaveOutput);
 }
 
@@ -42,7 +42,7 @@ extern int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
                   KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo);
 
 extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize,
-                              bool isRecord, bool SaveOutput);
+                              void *ReqAddr, bool isRecord, bool SaveOutput);
 
 extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
                          void *DeviceMemory, int64_t DeviceMemorySize,
@@ -87,6 +87,9 @@ int main(int argc, char **argv) {
   for (auto It : *TgtArgOffsetsArray)
     TgtArgOffsets.push_back(static_cast<ptrdiff_t>(It.getAsInteger().value()));
 
+  void *BAllocStart = reinterpret_cast<void *>(
+      JsonKernelInfo->getAsObject()->getInteger("BumpAllocVAStart").value());
+
   __tgt_offload_entry KernelEntry = {nullptr, nullptr, 0, 0, 0};
   std::string KernelEntryName = KernelFunc.value().str();
   KernelEntry.name = const_cast<char *>(KernelEntryName.c_str());
@@ -125,8 +128,8 @@ int main(int argc, char **argv) {
 
   __tgt_register_lib(&Desc);
 
-  int Rc = __tgt_activate_record_replay(DeviceId, DeviceMemorySize, false,
-                                        VerifyOpt);
+  int Rc = __tgt_activate_record_replay(DeviceId, DeviceMemorySize, BAllocStart,
+                                        false, VerifyOpt);
 
   if (Rc != OMP_TGT_SUCCESS) {
     report_fatal_error("Cannot activate record replay\n");