[OpenMP] Be more forgiving during record and replay

When we record and replay kernels we should not error out early if there
is a chance the program might still run fine. This patch will:
1) Fall back to the allocation heuristic if the VAMap doesn't work.
2) Adjust the memory start to match the required address if possible.
3) Adjust the (guessed) pointer arguments if the memory start adjustment
   is impossible. This allows kernels without indirect accesses to work,
   while kernels with indirect accesses will most likely still fail.
This commit is contained in:
Johannes Doerfert 2023-11-16 16:09:05 -08:00
parent 41566fb852
commit f48c4d8aa1
8 changed files with 82 additions and 35 deletions

View File

@ -450,7 +450,8 @@ void __tgt_set_info_flag(uint32_t);
int __tgt_print_device_info(int64_t DeviceId);
int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
void *VAddr, bool IsRecord, bool SaveOutput);
void *VAddr, bool IsRecord, bool SaveOutput,
uint64_t &ReqPtrArgOffset);
#ifdef __cplusplus
}

View File

@ -20,6 +20,7 @@
#include "omptarget.h"
#include <cstdint>
#include <list>
#include <map>
#include <mutex>
@ -74,7 +75,7 @@ struct RTLInfoTy {
typedef int32_t(data_notify_unmapped_ty)(int32_t, void *);
typedef int32_t(set_device_offset_ty)(int32_t);
typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, void *, bool,
bool);
bool, uint64_t &);
int32_t Idx = -1; // RTL index, index is the number of devices
// of other RTLs that were registered before,

View File

@ -55,6 +55,8 @@ private:
RRStatusTy Status;
bool ReplaySaveOutput;
bool UsedVAMap = false;
uintptr_t MemoryOffset = 0;
void *suggestAddress(uint64_t MaxMemoryAllocation) {
// Get a valid pointer address for this system
@ -89,10 +91,12 @@ private:
MemoryPtr = MemoryStart;
MemorySize = 0;
TotalSize = ASize;
UsedVAMap = true;
return Plugin::success();
}
Error preAllocateHeuristic(uint64_t MaxMemoryAllocation, void *VAddr) {
Error preAllocateHeuristic(uint64_t MaxMemoryAllocation,
uint64_t RequiredMemoryAllocation, void *VAddr) {
const size_t MAX_MEMORY_ALLOCATION = MaxMemoryAllocation;
constexpr size_t STEP = 1024 * 1024 * 1024ULL;
MemoryStart = nullptr;
@ -102,32 +106,55 @@ private:
if (MemoryStart)
break;
}
if (!MemoryStart)
return Plugin::error("Allocating record/replay memory");
if (VAddr && VAddr != MemoryStart)
MemoryOffset = uintptr_t(VAddr) - uintptr_t(MemoryStart);
MemoryPtr = MemoryStart;
MemorySize = 0;
// Check if we need adjustment.
if (MemoryOffset > 0 &&
TotalSize >= RequiredMemoryAllocation + MemoryOffset) {
// If we are off but "before" the required address and with enough space,
// we just "allocate" the offset to match the required address.
MemoryPtr = (char *)MemoryPtr + MemoryOffset;
MemorySize += MemoryOffset;
MemoryOffset = 0;
assert(MemoryPtr == VAddr && "Expected offset adjustment to work");
} else if (MemoryOffset) {
// If we are off and in a situation we cannot just "waste" memory to force
// a match, we hope adjusting the arguments is sufficient.
REPORT(
"WARNING Failed to allocate replay memory at required location %p, "
"got %p, trying to offset argument pointers by %" PRIi64 "\n",
VAddr, MemoryStart, MemoryOffset);
}
INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
"Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize,
MemoryStart);
if (!MemoryStart)
return Plugin::error("Allocating record/replay memory");
if (VAddr && VAddr != MemoryStart)
return Plugin::error("Cannot allocate recorded address");
MemoryPtr = MemoryStart;
MemorySize = 0;
return Plugin::success();
}
Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr) {
if (Device->supportVAManagement())
return preAllocateVAMemory(DeviceMemorySize, ReqVAddr);
if (Device->supportVAManagement()) {
auto Err = preAllocateVAMemory(DeviceMemorySize, ReqVAddr);
if (Err) {
REPORT("WARNING VA mapping failed, fallback to heuristic: "
"(Error: %s)\n",
toString(std::move(Err)).data());
}
}
uint64_t DevMemSize;
if (Device->getDeviceMemorySize(DevMemSize))
return Plugin::error("Cannot determine Device Memory Size");
return preAllocateHeuristic(DevMemSize, ReqVAddr);
return preAllocateHeuristic(DevMemSize, DeviceMemorySize, ReqVAddr);
}
void dumpDeviceMemory(StringRef Filename) {
@ -293,7 +320,7 @@ public:
}
Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr,
RRStatusTy Status, bool SaveOutput) {
RRStatusTy Status, bool SaveOutput, uint64_t &ReqPtrArgOffset) {
this->Device = Device;
this->Status = Status;
this->ReplaySaveOutput = SaveOutput;
@ -308,11 +335,14 @@ public:
MemoryStart, TotalSize,
Status == RRStatusTy::RRRecording ? "Recording" : "Replaying");
// Tell the user to offset pointer arguments as the memory allocation does
// not match.
ReqPtrArgOffset = MemoryOffset;
return Plugin::success();
}
void deinit() {
if (Device->supportVAManagement()) {
if (UsedVAMap) {
if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize))
report_fatal_error("Error on releasing virtual memory space");
} else {
@ -1694,15 +1724,16 @@ int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId,
int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
void *VAddr, bool isRecord,
bool SaveOutput) {
bool SaveOutput,
uint64_t &ReqPtrArgOffset) {
GenericPluginTy &Plugin = Plugin::get();
GenericDeviceTy &Device = Plugin.getDevice(DeviceId);
RecordReplayTy::RRStatusTy Status =
isRecord ? RecordReplayTy::RRStatusTy::RRRecording
: RecordReplayTy::RRStatusTy::RRReplaying;
if (auto Err =
RecordReplay.init(&Device, MemorySize, VAddr, Status, SaveOutput)) {
if (auto Err = RecordReplay.init(&Device, MemorySize, VAddr, Status,
SaveOutput, ReqPtrArgOffset)) {
REPORT("WARNING RR did not intialize RR-properly with %lu bytes"
"(Error: %s)\n",
MemorySize, toString(std::move(Err)).data());

View File

@ -539,15 +539,10 @@ void DeviceTy::init() {
// Enables saving the device memory kernel output post execution if set.
llvm::omp::target::BoolEnvar OMPX_ReplaySaveOutput(
"LIBOMPTARGET_RR_SAVE_OUTPUT", false);
// Sets the maximum to pre-allocate device memory.
llvm::omp::target::UInt64Envar OMPX_DeviceMemorySize(
"LIBOMPTARGET_RR_DEVMEM_SIZE", 16);
DP("Activating Record-Replay for Device %d with %lu GB memory\n",
RTLDeviceID, OMPX_DeviceMemorySize.get());
RTL->activate_record_replay(RTLDeviceID,
OMPX_DeviceMemorySize * 1024 * 1024 * 1024,
nullptr, true, OMPX_ReplaySaveOutput);
uint64_t ReqPtrArgOffset;
RTL->activate_record_replay(RTLDeviceID, 0, nullptr, true,
OMPX_ReplaySaveOutput, ReqPtrArgOffset);
}
IsInit = true;

View File

@ -21,6 +21,7 @@
#include "Utilities.h"
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <mutex>
@ -347,15 +348,16 @@ EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
/// execution on persistent storage
EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
void *VAddr, bool IsRecord,
bool SaveOutput) {
bool SaveOutput,
uint64_t &ReqPtrArgOffset) {
if (!deviceIsReady(DeviceId)) {
DP("Device %" PRId64 " is not ready\n", DeviceId);
return OMP_TGT_FAIL;
}
DeviceTy &Device = *PM->Devices[DeviceId];
[[maybe_unused]] int Rc =
target_activate_rr(Device, MemorySize, VAddr, IsRecord, SaveOutput);
[[maybe_unused]] int Rc = target_activate_rr(
Device, MemorySize, VAddr, IsRecord, SaveOutput, ReqPtrArgOffset);
assert(Rc == OFFLOAD_SUCCESS &&
"__tgt_activate_record_replay unexpected failure!");
return OMP_TGT_SUCCESS;

View File

@ -1725,9 +1725,11 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
/// and informing the record-replayer of whether to store the output
/// in some file.
int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
bool isRecord, bool SaveOutput) {
bool isRecord, bool SaveOutput,
uint64_t &ReqPtrArgOffset) {
return Device.RTL->activate_record_replay(Device.DeviceID, MemorySize, VAddr,
isRecord, SaveOutput);
isRecord, SaveOutput,
ReqPtrArgOffset);
}
/// Executes a kernel using pre-recorded information for loading to

View File

@ -42,7 +42,8 @@ extern int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo);
extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize,
void *ReqAddr, bool isRecord, bool SaveOutput);
void *ReqAddr, bool isRecord, bool SaveOutput,
uint64_t &ReqPtrArgOffset);
extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
void *DeviceMemory, int64_t DeviceMemorySize,

View File

@ -16,6 +16,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/MemoryBuffer.h"
#include <cstdint>
#include <cstdlib>
using namespace llvm;
@ -128,8 +129,9 @@ int main(int argc, char **argv) {
__tgt_register_lib(&Desc);
uint64_t ReqPtrArgOffset = 0;
int Rc = __tgt_activate_record_replay(DeviceId, DeviceMemorySize, BAllocStart,
false, VerifyOpt);
false, VerifyOpt, ReqPtrArgOffset);
if (Rc != OMP_TGT_SUCCESS) {
report_fatal_error("Cannot activate record replay\n");
@ -149,6 +151,18 @@ int main(int argc, char **argv) {
const_cast<char *>(DeviceMemoryMB.get()->getBuffer().data()),
DeviceMemoryMB.get()->getBufferSize());
// If necessary, adjust pointer arguments.
if (ReqPtrArgOffset) {
for (auto *&Arg : TgtArgs) {
auto ArgInt = uintptr_t(Arg);
// Try to find pointer arguments.
if (ArgInt < uintptr_t(BAllocStart) ||
ArgInt >= uintptr_t(BAllocStart) + DeviceMemorySize)
continue;
Arg = reinterpret_cast<void *>(ArgInt - ReqPtrArgOffset);
}
}
__tgt_target_kernel_replay(
/* Loc */ nullptr, DeviceId, KernelEntry.addr, (char *)recored_data,
DeviceMemoryMB.get()->getBufferSize(), TgtArgs.data(),