Mirror of https://github.com/capstone-engine/llvm-capstone.git, synced 2024-11-26 23:21:11 +00:00
[openmp] Fixed Support for VA for record-replay. (#70396)
This commit was previously discussed on Phabricator (https://reviews.llvm.org/D157186). Record-replay currently fails on AMD because it conflicts with the heap memory allocator introduced in #69806; the workaround is to set `LIBOMPTARGET_HEAP_SIZE=0` during both the record run and the replay run.
This commit is contained in: parent d346c82435, commit d6a3d6b96d
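For context, the patch threads one new parameter, the requested virtual-address start of the record-replay allocation, from the `__tgt_activate_record_replay` entry point down to the plugins. A minimal sketch of driving the updated interface from a host tool follows; the device ID, memory size, and recorded address are illustrative placeholders, not values from the commit:

// Sketch only: all values below are illustrative.
// Per the note above, run both phases with LIBOMPTARGET_HEAP_SIZE=0 on AMD.
#include <cstdint>

extern "C" int __tgt_activate_record_replay(int64_t DeviceId,
                                            uint64_t MemorySize, void *VAddr,
                                            bool IsRecord, bool SaveOutput);

int main() {
  const int64_t DeviceId = 0;              // illustrative device
  const uint64_t MemorySize = 64ULL << 30; // 64 GB, the size the old comment
                                           // below mentions as the default
  // Recording: pass nullptr so the runtime suggests its own aligned address.
  __tgt_activate_record_replay(DeviceId, MemorySize, /*VAddr=*/nullptr,
                               /*IsRecord=*/true, /*SaveOutput=*/false);
  // Replaying: pass the "BumpAllocVAStart" captured in the kernel JSON so the
  // replay mapping lands at the recorded address.
  void *Recorded = reinterpret_cast<void *>(0x200000000000); // illustrative
  __tgt_activate_record_replay(DeviceId, MemorySize, Recorded,
                               /*IsRecord=*/false, /*SaveOutput=*/true);
  return 0;
}

During recording a null `VAddr` lets the runtime pick an address; during replay the runtime reports an error if the mapping cannot be placed at the recorded address, as the diff below shows.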
@@ -253,6 +253,11 @@ template <typename Ty> Ty *alignPtr(Ty *Ptr, int64_t Alignment) {
   return std::align(Alignment, sizeof(char), Ptr, Space);
 }
 
+/// Round up \p V to a \p Boundary.
+template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
+  return (V + Boundary - 1) / Boundary * Boundary;
+}
+
 } // namespace target
 } // namespace omp
 } // namespace llvm
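The new roundUp helper ceils a value to the next multiple of a boundary; later in this patch the CUDA plugin uses it to round mapping sizes up to the device page granularity (`Size = roundUp(Size, Granularity);`). A self-contained check of its behavior, with a 2 MiB boundary standing in for a real granularity:

#include <cassert>
#include <cstddef>

template <typename Ty> inline Ty roundUp(Ty V, Ty Boundary) {
  return (V + Boundary - 1) / Boundary * Boundary;
}

int main() {
  constexpr size_t Granularity = 2 * 1024 * 1024; // 2 MiB, illustrative
  assert(roundUp<size_t>(1, Granularity) == Granularity);
  assert(roundUp<size_t>(Granularity, Granularity) == Granularity);
  assert(roundUp<size_t>(Granularity + 1, Granularity) == 2 * Granularity);
  return 0;
}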
@@ -439,7 +439,7 @@ void __tgt_set_info_flag(uint32_t);
 int __tgt_print_device_info(int64_t DeviceId);
 
 int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
-                                 bool IsRecord, bool SaveOutput);
+                                 void *VAddr, bool IsRecord, bool SaveOutput);
 
 #ifdef __cplusplus
 }
@@ -73,7 +73,8 @@ struct RTLInfoTy {
   typedef int32_t(data_notify_mapped_ty)(int32_t, void *, int64_t);
   typedef int32_t(data_notify_unmapped_ty)(int32_t, void *);
   typedef int32_t(set_device_offset_ty)(int32_t);
-  typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, bool, bool);
+  typedef int32_t(activate_record_replay_ty)(int32_t, uint64_t, void *, bool,
+                                             bool);
 
   int32_t Idx = -1; // RTL index, index is the number of devices
                     // of other RTLs that were registered before,
@@ -2579,6 +2579,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     DeviceMemoryPoolSize = Value;
     return Plugin::success();
   }
+  Error getDeviceMemorySize(uint64_t &Value) override {
+    for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
+      if (Pool->isGlobal()) {
+        hsa_status_t Status =
+            Pool->getAttrRaw(HSA_AMD_MEMORY_POOL_INFO_SIZE, Value);
+        return Plugin::check(Status, "Error in getting device memory size: %s");
+      }
+    }
+    return Plugin::error("getDeviceMemorySize:: no global pool");
+  }
 
   /// AMDGPU-specific function to get device attributes.
   template <typename Ty> Error getDeviceAttr(uint32_t Kind, Ty &Value) {
@@ -49,40 +49,87 @@ private:
   void *MemoryStart;
   void *MemoryPtr;
   size_t MemorySize;
+  size_t TotalSize;
   GenericDeviceTy *Device;
   std::mutex AllocationLock;
 
   RRStatusTy Status;
   bool ReplaySaveOutput;
-  uint64_t DeviceMemorySize;
 
-  // Record/replay pre-allocates the largest possible device memory using the
-  // default kind.
-  // TODO: Expand allocation to include other kinds (device, host, shared) and
-  //       possibly use a MemoryManager to track (de-)allocations for
-  //       storing/retrieving when recording/replaying.
-  Error preallocateDeviceMemory(uint64_t DeviceMemorySize) {
-    // Pre-allocate memory on device. Starts with 64GB and subtracts in steps
-    // of 1GB until allocation succeeds.
-    const size_t MAX_MEMORY_ALLOCATION = DeviceMemorySize;
+  void *suggestAddress(uint64_t MaxMemoryAllocation) {
+    // Get a valid pointer address for this system
+    void *Addr =
+        Device->allocate(1024, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT);
+    Device->free(Addr);
+    // Align Address to MaxMemoryAllocation
+    Addr = (void *)alignPtr((Addr), MaxMemoryAllocation);
+    return Addr;
+  }
+
+  Error preAllocateVAMemory(uint64_t MaxMemoryAllocation, void *VAddr) {
+    size_t ASize = MaxMemoryAllocation;
+
+    if (!VAddr && isRecording())
+      VAddr = suggestAddress(MaxMemoryAllocation);
+
+    DP("Request %ld bytes allocated at %p\n", MaxMemoryAllocation, VAddr);
+
+    if (auto Err = Device->memoryVAMap(&MemoryStart, VAddr, &ASize))
+      return Err;
+
+    if (isReplaying() && VAddr != MemoryStart) {
+      return Plugin::error("Record-Replay cannot assign the"
+                           "requested recorded address (%p, %p)",
+                           VAddr, MemoryStart);
+    }
+
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
+         "Allocated %" PRIu64 " bytes at %p for replay.\n", ASize, MemoryStart);
+
+    MemoryPtr = MemoryStart;
+    MemorySize = 0;
+    TotalSize = ASize;
+    return Plugin::success();
+  }
+
+  Error preAllocateHeuristic(uint64_t MaxMemoryAllocation, void *VAddr) {
+    const size_t MAX_MEMORY_ALLOCATION = MaxMemoryAllocation;
     constexpr size_t STEP = 1024 * 1024 * 1024ULL;
     MemoryStart = nullptr;
-    for (size_t Try = MAX_MEMORY_ALLOCATION; Try > 0; Try -= STEP) {
-      MemoryStart =
-          Device->allocate(Try, /* HstPtr */ nullptr, TARGET_ALLOC_DEFAULT);
+    for (TotalSize = MAX_MEMORY_ALLOCATION; TotalSize > 0; TotalSize -= STEP) {
+      MemoryStart = Device->allocate(TotalSize, /* HstPtr */ nullptr,
+                                     TARGET_ALLOC_DEFAULT);
       if (MemoryStart)
         break;
     }
 
+    INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
+         "Allocated %" PRIu64 " bytes at %p for replay.\n", TotalSize,
+         MemoryStart);
+
     if (!MemoryStart)
       return Plugin::error("Allocating record/replay memory");
 
+    if (VAddr && VAddr != MemoryStart)
+      return Plugin::error("Cannot allocate recorded address");
+
     MemoryPtr = MemoryStart;
     MemorySize = 0;
 
     return Plugin::success();
   }
 
+  Error preallocateDeviceMemory(uint64_t DeviceMemorySize, void *ReqVAddr) {
+    if (Device->supportVAManagement())
+      return preAllocateVAMemory(DeviceMemorySize, ReqVAddr);
+
+    uint64_t DevMemSize;
+    if (Device->getDeviceMemorySize(DevMemSize))
+      return Plugin::error("Cannot determine Device Memory Size");
+
+    return preAllocateHeuristic(DevMemSize, ReqVAddr);
+  }
+
   void dumpDeviceMemory(StringRef Filename) {
     ErrorOr<std::unique_ptr<WritableMemoryBuffer>> DeviceMemoryMB =
         WritableMemoryBuffer::getNewUninitMemBuffer(MemorySize);
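suggestAddress relies on a small trick during recording: allocate a throwaway device pointer, free it, and round it up to the full allocation size, so the VA reservation request is aligned and likely to be honored verbatim on replay. A sketch of that rounding step under the assumption of a power-of-two request size; the bit-mask rounding here is a stand-in, not the in-tree alignPtr, which is built on std::align:

#include <cassert>
#include <cstdint>

// Assumption: Alignment is a power of two (a 64 GB request is).
static void *alignPtrUp(void *Ptr, uint64_t Alignment) {
  uintptr_t P = reinterpret_cast<uintptr_t>(Ptr);
  return reinterpret_cast<void *>((P + Alignment - 1) & ~(Alignment - 1));
}

int main() {
  const uint64_t MaxMemoryAllocation = 1ULL << 36;        // 64 GB
  void *Probe = reinterpret_cast<void *>(0x7f1234567890); // stand-in for the
                                                          // freed 1 KiB probe
  uintptr_t Suggested =
      reinterpret_cast<uintptr_t>(alignPtrUp(Probe, MaxMemoryAllocation));
  assert(Suggested % MaxMemoryAllocation == 0);
  assert(Suggested >= reinterpret_cast<uintptr_t>(Probe));
  return 0;
}

Note how the new preallocateDeviceMemory dispatches: the VA path runs only when the device reports supportVAManagement(), otherwise the old shrink-until-it-fits heuristic is kept, now seeded with the real device memory size.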
@@ -114,8 +161,7 @@ public:
   bool isSaveOutputEnabled() const { return ReplaySaveOutput; }
 
   RecordReplayTy()
-      : Status(RRStatusTy::RRDeactivated), ReplaySaveOutput(false),
-        DeviceMemorySize(-1) {}
+      : Status(RRStatusTy::RRDeactivated), ReplaySaveOutput(false) {}
 
   void saveImage(const char *Name, const DeviceImageTy &Image) {
     SmallString<128> ImageName = {Name, ".image"};
@@ -197,6 +243,7 @@ public:
     JsonKernelInfo["LoopTripCount"] = LoopTripCount;
     JsonKernelInfo["DeviceMemorySize"] = MemorySize;
     JsonKernelInfo["DeviceId"] = Device->getDeviceId();
+    JsonKernelInfo["BumpAllocVAStart"] = (intptr_t)MemoryStart;
 
     json::Array JsonArgPtrs;
     for (int I = 0; I < NumArgs; ++I)
@@ -244,27 +291,33 @@ public:
     return Alloc;
   }
 
-  Error init(GenericDeviceTy *Device, uint64_t MemSize, RRStatusTy Status,
-             bool SaveOutput) {
+  Error init(GenericDeviceTy *Device, uint64_t MemSize, void *VAddr,
+             RRStatusTy Status, bool SaveOutput) {
     this->Device = Device;
     this->Status = Status;
-    this->DeviceMemorySize = MemSize;
     this->ReplaySaveOutput = SaveOutput;
 
-    if (auto Err = preallocateDeviceMemory(MemSize))
+    if (auto Err = preallocateDeviceMemory(MemSize, VAddr))
       return Err;
 
     INFO(OMP_INFOTYPE_PLUGIN_KERNEL, Device->getDeviceId(),
          "Record Replay Initialized (%p)"
          " as starting address, %lu Memory Size"
          " and set on status %s\n",
-         MemoryStart, MemSize,
+         MemoryStart, TotalSize,
          Status == RRStatusTy::RRRecording ? "Recording" : "Replaying");
 
     return Plugin::success();
   }
 
-  void deinit() { Device->free(MemoryStart); }
+  void deinit() {
+    if (Device->supportVAManagement()) {
+      if (auto Err = Device->memoryVAUnMap(MemoryStart, TotalSize))
+        report_fatal_error("Error on releasing virtual memory space");
+    } else {
+      Device->free(MemoryStart);
+    }
+  }
 
 } RecordReplay;
 
@@ -1184,6 +1237,19 @@ Error GenericDeviceTy::queryAsync(__tgt_async_info *AsyncInfo) {
   return queryAsyncImpl(*AsyncInfo);
 }
 
+Error GenericDeviceTy::memoryVAMap(void **Addr, void *VAddr, size_t *RSize) {
+  return Plugin::error("Device does not suppport VA Management");
+}
+
+Error GenericDeviceTy::memoryVAUnMap(void *VAddr, size_t Size) {
+  return Plugin::error("Device does not suppport VA Management");
+}
+
+Error GenericDeviceTy::getDeviceMemorySize(uint64_t &DSize) {
+  return Plugin::error(
+      "Mising getDeviceMemorySize impelmentation (required by RR-heuristic");
+}
+
 Expected<void *> GenericDeviceTy::dataAlloc(int64_t Size, void *HostPtr,
                                             TargetAllocTy Kind) {
   void *Alloc = nullptr;
@@ -1552,8 +1618,8 @@ int32_t __tgt_rtl_is_data_exchangable(int32_t SrcDeviceId,
   return Plugin::get().isDataExchangable(SrcDeviceId, DstDeviceId);
 }
 
-int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId,
-                                           uint64_t MemorySize, bool isRecord,
+int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
+                                           void *VAddr, bool isRecord,
                                            bool SaveOutput) {
   GenericPluginTy &Plugin = Plugin::get();
   GenericDeviceTy &Device = Plugin.getDevice(DeviceId);
@@ -1561,7 +1627,8 @@ int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId,
       isRecord ? RecordReplayTy::RRStatusTy::RRRecording
                : RecordReplayTy::RRStatusTy::RRReplaying;
 
-  if (auto Err = RecordReplay.init(&Device, MemorySize, Status, SaveOutput)) {
+  if (auto Err =
+          RecordReplay.init(&Device, MemorySize, VAddr, Status, SaveOutput)) {
     REPORT("WARNING RR did not intialize RR-properly with %lu bytes"
            "(Error: %s)\n",
            MemorySize, toString(std::move(Err)).data());
@@ -655,6 +655,21 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   Error queryAsync(__tgt_async_info *AsyncInfo);
   virtual Error queryAsyncImpl(__tgt_async_info &AsyncInfo) = 0;
 
+  /// Check whether the architecture supports VA management
+  virtual bool supportVAManagement() const { return false; }
+
+  /// Get the total device memory size
+  virtual Error getDeviceMemorySize(uint64_t &DSize);
+
+  /// Allocates \p RSize bytes (rounded up to page size) and hints the driver to
+  /// map it to \p VAddr. The obtained address is stored in \p Addr. At return
+  /// \p RSize contains the actual size which can be equal or larger than the
+  /// requested size.
+  virtual Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize);
+
+  /// De-allocates device memory and unmaps the virtual address \p VAddr
+  virtual Error memoryVAUnMap(void *VAddr, size_t Size);
+
   /// Allocate data on the device or involving the device.
   Expected<void *> dataAlloc(int64_t Size, void *HostPtr, TargetAllocTy Kind);
 
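The in/out contract of memoryVAMap is easy to misread: \p RSize carries the requested size in and the page-rounded mapped size out, and the returned \p Addr may differ from the \p VAddr hint. A stand-in sketch of a conforming caller; MyDeviceTy is a placeholder that only mimics the base-class defaults, not the in-tree GenericDeviceTy:

#include <cstddef>
#include <cstdio>

// Placeholder device type mirroring the defaults declared above.
struct MyDeviceTy {
  bool supportVAManagement() const { return false; } // base default
  int memoryVAMap(void **Addr, void *VAddr, size_t *RSize) {
    (void)Addr; (void)VAddr; (void)RSize;
    return 1; // non-zero stands in for the "no VA Management" Error
  }
};

int main() {
  MyDeviceTy Device;
  void *Addr = nullptr;
  size_t RSize = (1 << 20) + 123; // requested; a real plugin rounds this up
  if (!Device.supportVAManagement() ||
      Device.memoryVAMap(&Addr, /*VAddr=*/nullptr, &RSize)) {
    // Mirrors preallocateDeviceMemory: fall back to heuristic pre-allocation.
    std::puts("VA management unavailable; using heuristic pre-allocation");
    return 0;
  }
  std::printf("mapped %zu bytes at %p\n", RSize, Addr);
}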
@@ -81,6 +81,16 @@ DLWRAP(cuEventDestroy, 1)
 
 DLWRAP_FINALIZE()
 
+DLWRAP(cuMemUnmap, 2)
+DLWRAP(cuMemRelease, 1)
+DLWRAP(cuMemAddressFree, 2)
+DLWRAP(cuMemGetInfo, 2)
+DLWRAP(cuMemAddressReserve, 5)
+DLWRAP(cuMemMap, 5)
+DLWRAP(cuMemCreate, 4)
+DLWRAP(cuMemSetAccess, 4)
+DLWRAP(cuMemGetAllocationGranularity, 3)
+
 #ifndef DYNAMIC_CUDA_PATH
 #define DYNAMIC_CUDA_PATH "libcuda.so"
 #endif
@@ -26,6 +26,71 @@ typedef struct CUevent_st *CUevent;
 
 #define CU_DEVICE_INVALID ((CUdevice)-2)
 
+typedef unsigned long long CUmemGenericAllocationHandle_v1;
+typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle;
+
+#define CU_DEVICE_INVALID ((CUdevice)-2)
+
+typedef enum CUmemAllocationGranularity_flags_enum {
+  CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0,
+  CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1
+} CUmemAllocationGranularity_flags;
+
+typedef enum CUmemAccess_flags_enum {
+  CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0,
+  CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1,
+  CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3,
+  CU_MEM_ACCESS_FLAGS_PROT_MAX = 0x7FFFFFFF
+} CUmemAccess_flags;
+
+typedef enum CUmemLocationType_enum {
+  CU_MEM_LOCATION_TYPE_INVALID = 0x0,
+  CU_MEM_LOCATION_TYPE_DEVICE = 0x1,
+  CU_MEM_LOCATION_TYPE_MAX = 0x7FFFFFFF
+} CUmemLocationType;
+
+typedef struct CUmemLocation_st {
+  CUmemLocationType type;
+  int id;
+} CUmemLocation_v1;
+typedef CUmemLocation_v1 CUmemLocation;
+
+typedef struct CUmemAccessDesc_st {
+  CUmemLocation location;
+  CUmemAccess_flags flags;
+} CUmemAccessDesc_v1;
+
+typedef CUmemAccessDesc_v1 CUmemAccessDesc;
+
+typedef enum CUmemAllocationType_enum {
+  CU_MEM_ALLOCATION_TYPE_INVALID = 0x0,
+  CU_MEM_ALLOCATION_TYPE_PINNED = 0x1,
+  CU_MEM_ALLOCATION_TYPE_MAX = 0x7FFFFFFF
+} CUmemAllocationType;
+
+typedef enum CUmemAllocationHandleType_enum {
+  CU_MEM_HANDLE_TYPE_NONE = 0x0,
+  CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1,
+  CU_MEM_HANDLE_TYPE_WIN32 = 0x2,
+  CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4,
+  CU_MEM_HANDLE_TYPE_MAX = 0x7FFFFFFF
+} CUmemAllocationHandleType;
+
+typedef struct CUmemAllocationProp_st {
+  CUmemAllocationType type;
+  CUmemAllocationHandleType requestedHandleTypes;
+  CUmemLocation location;
+
+  void *win32HandleMetaData;
+  struct {
+    unsigned char compressionType;
+    unsigned char gpuDirectRDMACapable;
+    unsigned short usage;
+    unsigned char reserved[4];
+  } allocFlags;
+} CUmemAllocationProp_v1;
+typedef CUmemAllocationProp_v1 CUmemAllocationProp;
+
 typedef enum cudaError_enum {
   CUDA_SUCCESS = 0,
   CUDA_ERROR_INVALID_VALUE = 1,
@@ -268,4 +333,21 @@ CUresult cuStreamWaitEvent(CUstream, CUevent, unsigned int);
 CUresult cuEventSynchronize(CUevent);
 CUresult cuEventDestroy(CUevent);
 
+CUresult cuMemUnmap(CUdeviceptr ptr, size_t size);
+CUresult cuMemRelease(CUmemGenericAllocationHandle handle);
+CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size);
+CUresult cuMemGetInfo(size_t *free, size_t *total);
+CUresult cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment,
+                             CUdeviceptr addr, unsigned long long flags);
+CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset,
+                  CUmemGenericAllocationHandle handle,
+                  unsigned long long flags);
+CUresult cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
+                     const CUmemAllocationProp *prop, unsigned long long flags);
+CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
+                        const CUmemAccessDesc *desc, size_t count);
+CUresult cuMemGetAllocationGranularity(size_t *granularity,
+                                       const CUmemAllocationProp *prop,
+                                       CUmemAllocationGranularity_flags option);
+
 #endif
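These declarations mirror the CUDA driver's virtual memory management API. For orientation before the plugin hunk below, a condensed sketch of the canonical map/unmap sequence built from exactly these entry points; the device ID and size are illustrative and most error checks are elided, unlike the full implementation that follows:

// Condensed VMM sequence (illustrative; see memoryVAMap/memoryVAUnMap below).
#include "cuda.h" // the dynamic_cuda header extended above

static CUresult mapAtHint(int DeviceId, CUdeviceptr Hint, size_t Size,
                          CUdeviceptr *Out,
                          CUmemGenericAllocationHandle *OutHandle) {
  CUmemAllocationProp Prop = {};
  Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; // the only type NVIDIA supports
  Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  Prop.location.id = DeviceId;

  size_t Granularity = 0;
  if (cuMemGetAllocationGranularity(&Granularity, &Prop,
                                    CU_MEM_ALLOC_GRANULARITY_MINIMUM) !=
          CUDA_SUCCESS ||
      Granularity == 0)
    return CUDA_ERROR_INVALID_VALUE;
  Size = (Size + Granularity - 1) / Granularity * Granularity; // roundUp

  cuMemCreate(OutHandle, Size, &Prop, 0);     // physical backing
  cuMemAddressReserve(Out, Size, 0, Hint, 0); // VA range, hinted at Hint
  cuMemMap(*Out, Size, 0, *OutHandle, 0);     // bind VA to backing

  CUmemAccessDesc Desc = {};
  Desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  Desc.location.id = DeviceId;
  Desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  return cuMemSetAccess(*Out, Size, &Desc, 1); // make the range accessible
}

static void unmapAll(CUdeviceptr Ptr, size_t Size,
                     CUmemGenericAllocationHandle Handle) {
  cuMemUnmap(Ptr, Size);       // detach VA from backing
  cuMemRelease(Handle);        // free the physical allocation
  cuMemAddressFree(Ptr, Size); // return the VA range
}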
@@ -517,6 +517,116 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Plugin::check(Res, "Error in cuStreamSynchronize: %s");
   }
 
+  /// CUDA support VA management
+  bool supportVAManagement() const override { return true; }
+
+  /// Allocates \p RSize bytes (rounded up to page size) and hints the cuda
+  /// driver to map it to \p VAddr. The obtained address is stored in \p Addr.
+  /// At return \p RSize contains the actual size
+  Error memoryVAMap(void **Addr, void *VAddr, size_t *RSize) override {
+    CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
+    auto IHandle = DeviceMMaps.find(DVAddr);
+    size_t Size = *RSize;
+
+    if (Size == 0)
+      return Plugin::error("Memory Map Size must be larger than 0");
+
+    // Check if we have already mapped this address
+    if (IHandle != DeviceMMaps.end())
+      return Plugin::error("Address already memory mapped");
+
+    CUmemAllocationProp Prop = {};
+    size_t Granularity = 0;
+
+    size_t Free, Total;
+    CUresult Res = cuMemGetInfo(&Free, &Total);
+    if (auto Err = Plugin::check(Res, "Error in cuMemGetInfo: %s"))
+      return Err;
+
+    if (Size >= Free) {
+      *Addr = nullptr;
+      return Plugin::error(
+          "Canot map memory size larger than the available device memory");
+    }
+
+    // currently NVidia only supports pinned device types
+    Prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+    Prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+
+    Prop.location.id = DeviceId;
+    cuMemGetAllocationGranularity(&Granularity, &Prop,
+                                  CU_MEM_ALLOC_GRANULARITY_MINIMUM);
+    if (auto Err =
+            Plugin::check(Res, "Error in cuMemGetAllocationGranularity: %s"))
+      return Err;
+
+    if (Granularity == 0)
+      return Plugin::error("Wrong device Page size");
+
+    // Ceil to page size.
+    Size = roundUp(Size, Granularity);
+
+    // Create a handler of our allocation
+    CUmemGenericAllocationHandle AHandle;
+    Res = cuMemCreate(&AHandle, Size, &Prop, 0);
+    if (auto Err = Plugin::check(Res, "Error in cuMemCreate: %s"))
+      return Err;
+
+    CUdeviceptr DevPtr = 0;
+    Res = cuMemAddressReserve(&DevPtr, Size, 0, DVAddr, 0);
+    if (auto Err = Plugin::check(Res, "Error in cuMemAddressReserve: %s"))
+      return Err;
+
+    Res = cuMemMap(DevPtr, Size, 0, AHandle, 0);
+    if (auto Err = Plugin::check(Res, "Error in cuMemMap: %s"))
+      return Err;
+
+    CUmemAccessDesc ADesc = {};
+    ADesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    ADesc.location.id = DeviceId;
+    ADesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+
+    // Sets address
+    Res = cuMemSetAccess(DevPtr, Size, &ADesc, 1);
+    if (auto Err = Plugin::check(Res, "Error in cuMemSetAccess: %s"))
+      return Err;
+
+    *Addr = reinterpret_cast<void *>(DevPtr);
+    *RSize = Size;
+    DeviceMMaps.insert({DevPtr, AHandle});
+    return Plugin::success();
+  }
+
+  /// De-allocates device memory and Unmaps the Virtual Addr
+  Error memoryVAUnMap(void *VAddr, size_t Size) override {
+    CUdeviceptr DVAddr = reinterpret_cast<CUdeviceptr>(VAddr);
+    auto IHandle = DeviceMMaps.find(DVAddr);
+    // Mapping does not exist
+    if (IHandle == DeviceMMaps.end()) {
+      return Plugin::error("Addr is not MemoryMapped");
+    }
+
+    if (IHandle == DeviceMMaps.end())
+      return Plugin::error("Addr is not MemoryMapped");
+
+    CUmemGenericAllocationHandle &AllocHandle = IHandle->second;
+
+    CUresult Res = cuMemUnmap(DVAddr, Size);
+    if (auto Err = Plugin::check(Res, "Error in cuMemUnmap: %s"))
+      return Err;
+
+    Res = cuMemRelease(AllocHandle);
+    if (auto Err = Plugin::check(Res, "Error in cuMemRelease: %s"))
+      return Err;
+
+    Res = cuMemAddressFree(DVAddr, Size);
+    if (auto Err = Plugin::check(Res, "Error in cuMemAddressFree: %s"))
+      return Err;
+
+    DeviceMMaps.erase(IHandle);
+    return Plugin::success();
+  }
+
   /// Query for the completion of the pending operations on the async info.
   Error queryAsyncImpl(__tgt_async_info &AsyncInfo) override {
     CUstream Stream = reinterpret_cast<CUstream>(AsyncInfo.Queue);
@@ -859,6 +969,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
   Error setDeviceHeapSize(uint64_t Value) override {
     return setCtxLimit(CU_LIMIT_MALLOC_HEAP_SIZE, Value);
   }
+  Error getDeviceMemorySize(uint64_t &Value) override {
+    CUresult Res = cuDeviceTotalMem(&Value, Device);
+    return Plugin::check(Res, "Error in getDeviceMemorySize %s");
+  }
 
   /// CUDA-specific functions for getting and setting context limits.
   Error setCtxLimit(CUlimit Kind, uint64_t Value) {
@@ -907,6 +1021,9 @@ private:
   /// The CUDA device handler.
   CUdevice Device = CU_DEVICE_INVALID;
 
+  /// The memory mapped addresses and their handles
+  std::unordered_map<CUdeviceptr, CUmemGenericAllocationHandle> DeviceMMaps;
+
   /// The compute capability of the corresponding CUDA device.
   struct ComputeCapabilityTy {
     uint32_t Major;
@@ -482,7 +482,8 @@ void *DeviceTy::getTgtPtrBegin(HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin,
 int DeviceTy::eraseMapEntry(HDTTMapAccessorTy &HDTTMap,
                             HostDataToTargetTy *Entry, int64_t Size) {
   assert(Entry && "Trying to delete a null entry from the HDTT map.");
-  assert(Entry->getTotalRefCount() == 0 && Entry->getDataEndThreadCount() == 0 &&
+  assert(Entry->getTotalRefCount() == 0 &&
+         Entry->getDataEndThreadCount() == 0 &&
          "Trying to delete entry that is in use or owned by another thread.");
 
   INFO(OMP_INFOTYPE_MAPPING_CHANGED, DeviceID,
@@ -546,7 +547,7 @@ void DeviceTy::init() {
 
     RTL->activate_record_replay(RTLDeviceID,
                                 OMPX_DeviceMemorySize * 1024 * 1024 * 1024,
-                                true, OMPX_ReplaySaveOutput);
+                                nullptr, true, OMPX_ReplaySaveOutput);
   }
 
   IsInit = true;
@@ -346,7 +346,8 @@ EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
 /// /param SaveOutput Store the device memory after kernel
 /// execution on persistent storage
 EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
-                                        bool IsRecord, bool SaveOutput) {
+                                        void *VAddr, bool IsRecord,
+                                        bool SaveOutput) {
   if (!deviceIsReady(DeviceId)) {
     DP("Device %" PRId64 " is not ready\n", DeviceId);
     return OMP_TGT_FAIL;
@@ -354,7 +355,7 @@ EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
 
   DeviceTy &Device = *PM->Devices[DeviceId];
   [[maybe_unused]] int Rc =
-      target_activate_rr(Device, MemorySize, IsRecord, SaveOutput);
+      target_activate_rr(Device, MemorySize, VAddr, IsRecord, SaveOutput);
   assert(Rc == OFFLOAD_SUCCESS &&
          "__tgt_activate_record_replay unexpected failure!");
   return OMP_TGT_SUCCESS;
@@ -827,14 +827,13 @@ postProcessingTargetDataEnd(DeviceTy *Device,
     // remaining shadow pointer entries for this struct.
     const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
     if (HasFrom) {
-      Entry->foreachShadowPointerInfo(
-          [&](const ShadowPtrInfoTy &ShadowPtr) {
-            *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
-            DP("Restoring original host pointer value " DPxMOD " for host "
-               "pointer " DPxMOD "\n",
-               DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
-            return OFFLOAD_SUCCESS;
-          });
+      Entry->foreachShadowPointerInfo([&](const ShadowPtrInfoTy &ShadowPtr) {
+        *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
+        DP("Restoring original host pointer value " DPxMOD " for host "
+           "pointer " DPxMOD "\n",
+           DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
+        return OFFLOAD_SUCCESS;
+      });
     }
 
     // Give up the lock as we either don't need it anymore (e.g., done with
@@ -1713,9 +1712,9 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
 /// Enables the record replay mechanism by pre-allocating MemorySize
 /// and informing the record-replayer of whether to store the output
 /// in some file.
-int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, bool isRecord,
-                       bool SaveOutput) {
-  return Device.RTL->activate_record_replay(Device.DeviceID, MemorySize,
+int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
+                       bool isRecord, bool SaveOutput) {
+  return Device.RTL->activate_record_replay(Device.DeviceID, MemorySize, VAddr,
                                             isRecord, SaveOutput);
 }
 
@@ -42,7 +42,7 @@ extern int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
                   KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo);
 
 extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize,
-                              bool isRecord, bool SaveOutput);
+                              void *ReqAddr, bool isRecord, bool SaveOutput);
 
 extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
                          void *DeviceMemory, int64_t DeviceMemorySize,
@@ -87,6 +87,9 @@ int main(int argc, char **argv) {
   for (auto It : *TgtArgOffsetsArray)
     TgtArgOffsets.push_back(static_cast<ptrdiff_t>(It.getAsInteger().value()));
 
+  void *BAllocStart = reinterpret_cast<void *>(
+      JsonKernelInfo->getAsObject()->getInteger("BumpAllocVAStart").value());
+
   __tgt_offload_entry KernelEntry = {nullptr, nullptr, 0, 0, 0};
   std::string KernelEntryName = KernelFunc.value().str();
   KernelEntry.name = const_cast<char *>(KernelEntryName.c_str());
@@ -125,8 +128,8 @@ int main(int argc, char **argv) {
 
   __tgt_register_lib(&Desc);
 
-  int Rc = __tgt_activate_record_replay(DeviceId, DeviceMemorySize, false,
-                                        VerifyOpt);
+  int Rc = __tgt_activate_record_replay(DeviceId, DeviceMemorySize, BAllocStart,
+                                        false, VerifyOpt);
 
   if (Rc != OMP_TGT_SUCCESS) {
     report_fatal_error("Cannot activate record replay\n");