mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2024-10-07 02:43:57 +00:00
[OpenMP] Enable automatic unified shared memory on MI300A. (#77512)
This patch enables applications that did not request OpenMP unified_shared_memory to run with the same zero-copy behavior, where mapped memory does not result in extra memory allocations and memory copies, but CPU-allocated memory is accessed from the device. The name for this behavior is "automatic zero-copy" and it relies on detecting: that the runtime is running on a MI300A, that the user did not select unified_shared_memory in their program, and that XNACK (unified memory support) is enabled in the current GPU configuration. If all these conditions are met, then automatic zero-copy is triggered. This patch also introduces an environment variable OMPX_APU_MAPS that, if set, triggers automatic zero-copy also on non APU GPUs (e.g., on discrete GPUs). This patch is still missing support for global variables, which will be provided in a subsequent patch. Co-authored-by: Thorsten Blass <thorsten.blass@amd.com>
This commit is contained in:
parent
4897b9888f
commit
ae99966a27
@ -219,6 +219,9 @@ int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
|
||||
void *VAddr, bool isRecord,
|
||||
bool SaveOutput,
|
||||
uint64_t &ReqPtrArgOffset);
|
||||
|
||||
// Returns true if the device \p DeviceId suggests to use auto zero-copy.
|
||||
int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId);
|
||||
}
|
||||
|
||||
#endif // OMPTARGET_SHARED_PLUGIN_API_H
|
||||
|
@ -47,3 +47,4 @@ PLUGIN_API_HANDLE(data_notify_mapped, false);
|
||||
PLUGIN_API_HANDLE(data_notify_unmapped, false);
|
||||
PLUGIN_API_HANDLE(set_device_offset, false);
|
||||
PLUGIN_API_HANDLE(initialize_record_replay, false);
|
||||
PLUGIN_API_HANDLE(use_auto_zero_copy, false);
|
||||
|
@ -33,7 +33,12 @@ enum OpenMPOffloadingRequiresDirFlags : int64_t {
|
||||
/// unified_shared_memory clause.
|
||||
OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
|
||||
/// dynamic_allocators clause.
|
||||
OMP_REQ_DYNAMIC_ALLOCATORS = 0x010
|
||||
OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
|
||||
/// Auto zero-copy extension:
|
||||
/// when running on an APU, the GPU plugin may decide to
|
||||
/// run in zero-copy even though the user did not program
|
||||
/// their application with unified_shared_memory requirement.
|
||||
OMPX_REQ_AUTO_ZERO_COPY = 0x020
|
||||
};
|
||||
|
||||
class RequirementCollection {
|
||||
@ -65,6 +70,14 @@ public:
|
||||
return;
|
||||
}
|
||||
|
||||
// Auto zero-copy is only valid when no other requirement has been set
|
||||
// and it is computed at device initialization time, after the requirement
|
||||
// flag has already been set to OMP_REQ_NONE.
|
||||
if (SetFlags == OMP_REQ_NONE && NewFlags == OMPX_REQ_AUTO_ZERO_COPY) {
|
||||
SetFlags = NewFlags;
|
||||
return;
|
||||
}
|
||||
|
||||
// If multiple compilation units are present enforce
|
||||
// consistency across all of them for require clauses:
|
||||
// - reverse_offload
|
||||
|
@ -164,6 +164,9 @@ struct DeviceTy {
|
||||
/// Print all offload entries to stderr.
|
||||
void dumpOffloadEntries();
|
||||
|
||||
/// Ask the device whether the runtime should use auto zero-copy.
|
||||
bool useAutoZeroCopy();
|
||||
|
||||
private:
|
||||
/// Deinitialize the device (and plugin).
|
||||
void deinit();
|
||||
|
@ -63,6 +63,7 @@ typedef enum {
|
||||
} hsa_amd_memory_pool_access_t;
|
||||
|
||||
typedef enum hsa_amd_agent_info_s {
|
||||
HSA_AMD_AGENT_INFO_CHIP_ID = 0xA000,
|
||||
HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001,
|
||||
HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002,
|
||||
HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003,
|
||||
|
@ -184,6 +184,29 @@ Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
|
||||
#endif
|
||||
}
|
||||
|
||||
Expected<std::string> getTargetTripleAndFeatures(hsa_agent_t Agent) {
|
||||
std::string Target;
|
||||
auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
|
||||
uint32_t Length;
|
||||
hsa_status_t Status;
|
||||
Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
|
||||
if (Status != HSA_STATUS_SUCCESS)
|
||||
return Status;
|
||||
|
||||
llvm::SmallVector<char> ISAName(Length);
|
||||
Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin());
|
||||
if (Status != HSA_STATUS_SUCCESS)
|
||||
return Status;
|
||||
|
||||
llvm::StringRef TripleTarget(ISAName.begin(), Length);
|
||||
if (TripleTarget.consume_front("amdgcn-amd-amdhsa"))
|
||||
Target = TripleTarget.ltrim('-').rtrim('\0').str();
|
||||
return HSA_STATUS_SUCCESS;
|
||||
});
|
||||
if (Err)
|
||||
return Err;
|
||||
return Target;
|
||||
}
|
||||
} // namespace utils
|
||||
|
||||
/// Utility class representing generic resource references to AMDGPU resources.
|
||||
@ -1849,8 +1872,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
|
||||
OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
|
||||
OMPX_UseMultipleSdmaEngines(
|
||||
"LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
|
||||
AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
|
||||
AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}
|
||||
OMPX_ApuMaps("OMPX_APU_MAPS", false), AMDGPUStreamManager(*this, Agent),
|
||||
AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent),
|
||||
HostDevice(HostDevice) {}
|
||||
|
||||
~AMDGPUDeviceTy() {}
|
||||
|
||||
@ -1941,6 +1965,19 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
|
||||
if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals))
|
||||
return Err;
|
||||
|
||||
// Detect if XNACK is enabled
|
||||
auto TargeTripleAndFeaturesOrError =
|
||||
utils::getTargetTripleAndFeatures(Agent);
|
||||
if (!TargeTripleAndFeaturesOrError)
|
||||
return TargeTripleAndFeaturesOrError.takeError();
|
||||
if (static_cast<StringRef>(*TargeTripleAndFeaturesOrError)
|
||||
.contains("xnack+"))
|
||||
IsXnackEnabled = true;
|
||||
|
||||
// detect if device is an APU.
|
||||
if (auto Err = checkIfAPU())
|
||||
return Err;
|
||||
|
||||
return Plugin::success();
|
||||
}
|
||||
|
||||
@ -2650,6 +2687,21 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
|
||||
return Plugin::success();
|
||||
}
|
||||
|
||||
/// Returns true if auto zero-copy the best configuration for the current
|
||||
/// arch.
|
||||
/// On AMDGPUs, automatic zero-copy is turned on
|
||||
/// when running on an APU with XNACK (unified memory) support
|
||||
/// enabled. On discrete GPUs, automatic zero-copy is triggered
|
||||
/// if the user sets the environment variable OMPX_APU_MAPS=1
|
||||
/// and if XNACK is enabled. The rationale is that zero-copy
|
||||
/// is the best configuration (performance, memory footprint) on APUs,
|
||||
/// while it is often not the best on discrete GPUs.
|
||||
/// XNACK can be enabled with a kernel boot parameter or with
|
||||
/// the HSA_XNACK environment variable.
|
||||
bool useAutoZeroCopyImpl() override {
|
||||
return ((IsAPU || OMPX_ApuMaps) && IsXnackEnabled);
|
||||
}
|
||||
|
||||
/// Getters and setters for stack and heap sizes.
|
||||
Error getDeviceStackSize(uint64_t &Value) override {
|
||||
Value = StackSize;
|
||||
@ -2749,6 +2801,34 @@ private:
|
||||
return Err;
|
||||
}
|
||||
|
||||
/// Detect if current architecture is an APU.
|
||||
Error checkIfAPU() {
|
||||
// TODO: replace with ROCr API once it becomes available.
|
||||
llvm::StringRef StrGfxName(ComputeUnitKind);
|
||||
IsAPU = llvm::StringSwitch<bool>(StrGfxName)
|
||||
.Case("gfx940", true)
|
||||
.Default(false);
|
||||
if (IsAPU)
|
||||
return Plugin::success();
|
||||
|
||||
bool MayBeAPU = llvm::StringSwitch<bool>(StrGfxName)
|
||||
.Case("gfx942", true)
|
||||
.Default(false);
|
||||
if (!MayBeAPU)
|
||||
return Plugin::success();
|
||||
|
||||
// can be MI300A or MI300X
|
||||
uint32_t ChipID = 0;
|
||||
if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID))
|
||||
return Err;
|
||||
|
||||
if (!(ChipID & 0x1)) {
|
||||
IsAPU = true;
|
||||
return Plugin::success();
|
||||
}
|
||||
return Plugin::success();
|
||||
}
|
||||
|
||||
/// Envar for controlling the number of HSA queues per device. High number of
|
||||
/// queues may degrade performance.
|
||||
UInt32Envar OMPX_NumQueues;
|
||||
@ -2785,6 +2865,10 @@ private:
|
||||
/// Use ROCm 5.7 interface for multiple SDMA engines
|
||||
BoolEnvar OMPX_UseMultipleSdmaEngines;
|
||||
|
||||
/// Value of OMPX_APU_MAPS env var used to force
|
||||
/// automatic zero-copy behavior on non-APU GPUs.
|
||||
BoolEnvar OMPX_ApuMaps;
|
||||
|
||||
/// Stream manager for AMDGPU streams.
|
||||
AMDGPUStreamManagerTy AMDGPUStreamManager;
|
||||
|
||||
@ -2815,6 +2899,13 @@ private:
|
||||
/// The current size of the stack that will be used in cases where it could
|
||||
/// not be statically determined.
|
||||
uint64_t StackSize = 16 * 1024 /* 16 KB */;
|
||||
|
||||
/// Is the plugin associated with an APU?
|
||||
bool IsAPU = false;
|
||||
|
||||
/// True is the system is configured with XNACK-Enabled.
|
||||
/// False otherwise.
|
||||
bool IsXnackEnabled = false;
|
||||
};
|
||||
|
||||
Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
|
||||
@ -3059,30 +3150,13 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
|
||||
std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName();
|
||||
|
||||
for (hsa_agent_t Agent : KernelAgents) {
|
||||
std::string Target;
|
||||
auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
|
||||
uint32_t Length;
|
||||
hsa_status_t Status;
|
||||
Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
|
||||
if (Status != HSA_STATUS_SUCCESS)
|
||||
return Status;
|
||||
|
||||
llvm::SmallVector<char> ISAName(Length);
|
||||
Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin());
|
||||
if (Status != HSA_STATUS_SUCCESS)
|
||||
return Status;
|
||||
|
||||
llvm::StringRef TripleTarget(ISAName.begin(), Length);
|
||||
if (TripleTarget.consume_front("amdgcn-amd-amdhsa"))
|
||||
Target = TripleTarget.ltrim('-').rtrim('\0').str();
|
||||
return HSA_STATUS_SUCCESS;
|
||||
});
|
||||
if (Err)
|
||||
return std::move(Err);
|
||||
|
||||
auto TargeTripleAndFeaturesOrError =
|
||||
utils::getTargetTripleAndFeatures(Agent);
|
||||
if (!TargeTripleAndFeaturesOrError)
|
||||
return TargeTripleAndFeaturesOrError.takeError();
|
||||
if (!utils::isImageCompatibleWithEnv(Processor ? *Processor : "",
|
||||
ElfOrErr->getPlatformFlags(),
|
||||
Target))
|
||||
*TargeTripleAndFeaturesOrError))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -883,6 +883,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
|
||||
|
||||
virtual Error getDeviceStackSize(uint64_t &V) = 0;
|
||||
|
||||
/// Returns true if current plugin architecture is an APU
|
||||
/// and unified_shared_memory was not requested by the program.
|
||||
bool useAutoZeroCopy();
|
||||
virtual bool useAutoZeroCopyImpl() { return false; }
|
||||
|
||||
private:
|
||||
/// Register offload entry for global variable.
|
||||
Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
|
||||
|
@ -1555,6 +1555,8 @@ Error GenericDeviceTy::syncEvent(void *EventPtr) {
|
||||
return syncEventImpl(EventPtr);
|
||||
}
|
||||
|
||||
bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); }
|
||||
|
||||
Error GenericPluginTy::init() {
|
||||
auto NumDevicesOrErr = initImpl();
|
||||
if (!NumDevicesOrErr)
|
||||
@ -2067,6 +2069,14 @@ int32_t __tgt_rtl_set_device_offset(int32_t DeviceIdOffset) {
|
||||
return OFFLOAD_SUCCESS;
|
||||
}
|
||||
|
||||
int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId) {
|
||||
// Automatic zero-copy only applies to programs that did
|
||||
// not request unified_shared_memory and are deployed on an
|
||||
// APU with XNACK enabled.
|
||||
if (Plugin::get().getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY)
|
||||
return false;
|
||||
return Plugin::get().getDevice(DeviceId).useAutoZeroCopy();
|
||||
}
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -252,8 +252,10 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
|
||||
MESSAGE("device mapping required by 'present' map type modifier does not "
|
||||
"exist for host address " DPxMOD " (%" PRId64 " bytes)",
|
||||
DPxPTR(HstPtrBegin), Size);
|
||||
} else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY &&
|
||||
!HasCloseModifier) {
|
||||
} else if ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY &&
|
||||
!HasCloseModifier) ||
|
||||
(PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY)) {
|
||||
|
||||
// If unified shared memory is active, implicitly mapped variables that are
|
||||
// not privatized use host address. Any explicitly mapped variables also use
|
||||
// host address where correctness is not impeded. In all other cases maps
|
||||
@ -261,6 +263,10 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer(
|
||||
// In addition to the mapping rules above, the close map modifier forces the
|
||||
// mapping of the variable to the device.
|
||||
if (Size) {
|
||||
INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID,
|
||||
"Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
|
||||
"memory\n",
|
||||
DPxPTR((uintptr_t)HstPtrBegin), Size);
|
||||
DP("Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
|
||||
"memory\n",
|
||||
DPxPTR((uintptr_t)HstPtrBegin), Size);
|
||||
@ -415,7 +421,8 @@ TargetPointerResultTy MappingInfoTy::getTgtPtrBegin(
|
||||
LR.TPR.getEntry()->dynRefCountToStr().c_str(), DynRefCountAction,
|
||||
LR.TPR.getEntry()->holdRefCountToStr().c_str(), HoldRefCountAction);
|
||||
LR.TPR.TargetPointer = (void *)TP;
|
||||
} else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) {
|
||||
} else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY ||
|
||||
PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY) {
|
||||
// If the value isn't found in the mapping and unified shared memory
|
||||
// is on then it means we have stumbled upon a value which we need to
|
||||
// use directly from the host.
|
||||
|
@ -144,6 +144,9 @@ void PluginAdaptorTy::initDevices(PluginManager &PM) {
|
||||
|
||||
int32_t NumPD = getNumberOfPluginDevices();
|
||||
ExclusiveDevicesAccessor->reserve(DeviceOffset + NumPD);
|
||||
// Auto zero-copy is a per-device property. We need to ensure
|
||||
// that all devices are suggesting to use it.
|
||||
bool UseAutoZeroCopy = !(NumPD == 0);
|
||||
for (int32_t PDevI = 0, UserDevId = DeviceOffset; PDevI < NumPD; PDevI++) {
|
||||
auto Device = std::make_unique<DeviceTy>(this, UserDevId, PDevI);
|
||||
if (auto Err = Device->init()) {
|
||||
@ -151,12 +154,20 @@ void PluginAdaptorTy::initDevices(PluginManager &PM) {
|
||||
toString(std::move(Err)).c_str());
|
||||
continue;
|
||||
}
|
||||
UseAutoZeroCopy = UseAutoZeroCopy && Device->useAutoZeroCopy();
|
||||
|
||||
ExclusiveDevicesAccessor->push_back(std::move(Device));
|
||||
++NumberOfUserDevices;
|
||||
++UserDevId;
|
||||
}
|
||||
|
||||
// Auto Zero-Copy can only be currently triggered when the system is an
|
||||
// homogeneous APU architecture without attached discrete GPUs.
|
||||
// If all devices suggest to use it, change requirment flags to trigger
|
||||
// zero-copy behavior when mapping memory.
|
||||
if (UseAutoZeroCopy)
|
||||
PM.addRequirements(OMPX_REQ_AUTO_ZERO_COPY);
|
||||
|
||||
DP("Plugin adaptor " DPxMOD " has index %d, exposes %d out of %d devices!\n",
|
||||
DPxPTR(LibraryHandler.get()), DeviceOffset, NumberOfUserDevices,
|
||||
NumberOfPluginDevices);
|
||||
|
@ -339,3 +339,9 @@ void DeviceTy::dumpOffloadEntries() {
|
||||
fprintf(stderr, " %11s: %s\n", Kind, It.second.getNameAsCStr());
|
||||
}
|
||||
}
|
||||
|
||||
bool DeviceTy::useAutoZeroCopy() {
|
||||
if (RTL->use_auto_zero_copy)
|
||||
return RTL->use_auto_zero_copy(RTLDeviceID);
|
||||
return false;
|
||||
}
|
||||
|
57
openmp/libomptarget/test/mapping/auto_zero_copy.cpp
Normal file
57
openmp/libomptarget/test/mapping/auto_zero_copy.cpp
Normal file
@ -0,0 +1,57 @@
|
||||
// clang-format off
|
||||
// RUN: %libomptarget-compilexx-generic
|
||||
// RUN: env OMPX_APU_MAPS=1 HSA_XNACK=1 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \
|
||||
// RUN: | %fcheck-generic -check-prefix=INFO_ZERO -check-prefix=CHECK
|
||||
|
||||
// RUN: %libomptarget-compilexx-generic
|
||||
// RUN: env HSA_XNACK=0 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \
|
||||
// RUN: | %fcheck-generic -check-prefix=INFO_COPY -check-prefix=CHECK
|
||||
|
||||
// UNSUPPORTED: aarch64-unknown-linux-gnu
|
||||
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
|
||||
// UNSUPPORTED: nvptx64-nvidia-cuda
|
||||
// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
|
||||
// UNSUPPORTED: x86_64-pc-linux-gnu
|
||||
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
|
||||
|
||||
// REQUIRES: unified_shared_memory
|
||||
|
||||
// clang-format on
|
||||
|
||||
#include <cstdio>
|
||||
|
||||
int main() {
|
||||
int n = 1024;
|
||||
|
||||
// test various mapping types
|
||||
int *a = new int[n];
|
||||
int k = 3;
|
||||
int b[n];
|
||||
|
||||
for (int i = 0; i < n; i++)
|
||||
b[i] = i;
|
||||
|
||||
// clang-format off
|
||||
// INFO_ZERO: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
|
||||
// INFO_ZERO: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
|
||||
|
||||
// INFO_COPY: Creating new map entry with HstPtrBase=0x{{.*}}, HstPtrBegin=0x{{.*}}, TgtAllocBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096,
|
||||
// INFO_COPY: Creating new map entry with HstPtrBase=0x{{.*}}, HstPtrBegin=0x{{.*}}, TgtAllocBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096,
|
||||
// INFO_COPY: Mapping exists with HstPtrBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, DynRefCount=1 (update suppressed)
|
||||
// INFO_COPY: Mapping exists with HstPtrBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, DynRefCount=1 (update suppressed)
|
||||
// clang-format on
|
||||
#pragma omp target teams distribute parallel for map(tofrom : a[ : n]) \
|
||||
map(to : b[ : n])
|
||||
for (int i = 0; i < n; i++)
|
||||
a[i] = i + b[i] + k;
|
||||
|
||||
int err = 0;
|
||||
for (int i = 0; i < n; i++)
|
||||
if (a[i] != i + b[i] + k)
|
||||
err++;
|
||||
|
||||
// CHECK: PASS
|
||||
if (err == 0)
|
||||
printf("PASS\n");
|
||||
return err;
|
||||
}
|
Loading…
Reference in New Issue
Block a user