[OpenMP][libomptarget] Enable parallel copies via multiple SDMA engines (#71801)
This enables the AMDGPU plugin to use a new ROCm 5.7 interface to dispatch asynchronous data transfers across multiple SDMA engines. The default behavior is unchanged: all data transfers are enqueued into an H2D queue or a D2H queue, depending on the transfer direction, via the previously used HSA interface. The new interface can be enabled via the environment variable `LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES=true` when libomptarget is built against a sufficiently recent ROCm version (5.7 or later). As of now, requests are distributed across the available SDMA engines in a round-robin fashion.
parent a7bbcc4690
commit e876250b63
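For illustration, the round-robin scheme the patch uses can be seen in isolation. The following standalone sketch (not part of the commit) cycles a one-hot engine mask through 0x1, 0x2, 0x4; the `% 7` wraps 0x8 back to 0x1:

    #include <cstdio>

    int main() {
      int Engine = 1; // start at SDMA engine mask 0x1
      for (int I = 0; I < 6; ++I) {
        std::printf("dispatch %d -> SDMA engine mask 0x%x\n", I, Engine);
        // Shift to the next one-hot mask; 0x8 % 7 == 0x1 closes the cycle.
        Engine = (Engine << 1) % 7;
      }
      return 0;
    }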
@@ -130,6 +130,45 @@ Error iterateAgentMemoryPools(hsa_agent_t Agent, CallbackTy Cb) {
                        "Error in hsa_amd_agent_iterate_memory_pools: %s");
 }
 
+/// Dispatches an asynchronous memory copy.
+/// Enables different SDMA engines for the dispatch in a round-robin fashion.
+Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
+                   const void *Src, hsa_agent_t SrcAgent, size_t Size,
+                   uint32_t NumDepSignals, const hsa_signal_t *DepSignals,
+                   hsa_signal_t CompletionSignal) {
+  if (!UseMultipleSdmaEngines) {
+    hsa_status_t S =
+        hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, Size,
+                                  NumDepSignals, DepSignals, CompletionSignal);
+    return Plugin::check(S, "Error in hsa_amd_memory_async_copy: %s");
+  }
+
+// This compile-time guard is probably not the best solution.
+#if !(HSA_AMD_INTERFACE_VERSION_MAJOR >= 1 && \
+      HSA_AMD_INTERFACE_VERSION_MINOR >= 2)
+  return Plugin::error("Async copy on selected SDMA requires ROCm 5.7");
+#else
+  static std::atomic<int> SdmaEngine{1};
+
+  // This atomics solution is probably not the best, but should be sufficient
+  // for now.
+  // In a worst-case scenario, in which threads read the same value, they will
+  // dispatch to the same SDMA engine. This may result in sub-optimal
+  // performance. However, the probability of that should be fairly low.
+  int LocalSdmaEngine = SdmaEngine.load(std::memory_order_acquire);
+  // This call is only available in ROCm >= 5.7.
+  hsa_status_t S = hsa_amd_memory_async_copy_on_engine(
+      Dst, DstAgent, Src, SrcAgent, Size, NumDepSignals, DepSignals,
+      CompletionSignal, (hsa_amd_sdma_engine_id_t)LocalSdmaEngine,
+      /*force_copy_on_sdma=*/true);
+  // Rotate to use one of three SDMA engines: 0x1, 0x2, 0x4.
+  LocalSdmaEngine = (LocalSdmaEngine << 1) % 7;
+  SdmaEngine.store(LocalSdmaEngine, std::memory_order_relaxed);
+
+  return Plugin::check(S, "Error in hsa_amd_memory_async_copy_on_engine: %s");
+#endif
+}
+
 } // namespace utils
 
 /// Utility class representing generic resource references to AMDGPU resources.
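As the in-code comment concedes, the acquire-load/relaxed-store pair is not atomic as a whole: two threads can read the same value and pick the same engine. A lock-free alternative, sketched here under the same three-engine assumption (not part of the commit; DispatchCount and nextSdmaEngineMask are hypothetical names), folds the rotation into a single fetch_add so concurrent dispatches always get distinct round-robin slots:

    #include <atomic>
    #include <cstdint>

    // Hypothetical drop-in for the load/shift/store sequence above.
    static std::atomic<uint32_t> DispatchCount{0};

    inline int nextSdmaEngineMask() {
      // fetch_add hands every caller a unique counter value, so the resulting
      // masks 0x1, 0x2, 0x4 rotate strictly even under concurrency.
      uint32_t N = DispatchCount.fetch_add(1, std::memory_order_relaxed);
      return 1 << (N % 3);
    }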
@@ -945,6 +984,9 @@ private:
   /// Timeout hint for HSA actively waiting for signal value to change
   const uint64_t StreamBusyWaitMicroseconds;
 
+  /// Indicates whether to spread data transfers across all available SDMA engines.
+  bool UseMultipleSdmaEngines;
+
   /// Return the current number of asynchronous operations on the stream.
   uint32_t size() const { return NextSlot; }
 
@@ -1170,15 +1212,15 @@ public:
       InputSignal = nullptr;
 
     // Issue the async memory copy.
-    hsa_status_t Status;
     if (InputSignal) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
-      Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 1,
-                                         &InputSignalRaw, OutputSignal->get());
-    } else
-      Status = hsa_amd_memory_async_copy(Dst, Agent, Src, Agent, CopySize, 0,
-                                         nullptr, OutputSignal->get());
-    return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
+      return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
+                                 CopySize, 1, &InputSignalRaw,
+                                 OutputSignal->get());
+    }
+
+    return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Src, Agent,
+                               CopySize, 0, nullptr, OutputSignal->get());
   }
 
   /// Push an asynchronous memory copy device-to-host involving an unpinned
@@ -1214,21 +1256,19 @@ public:
 
     // Issue the first step: device to host transfer. Avoid defining the input
     // dependency if already satisfied.
-    hsa_status_t Status;
     if (InputSignal) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
-      Status =
-          hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 1,
-                                    &InputSignalRaw, OutputSignals[0]->get());
+      if (auto Err = utils::asyncMemCopy(
+              UseMultipleSdmaEngines, Inter, Agent, Src, Agent, CopySize, 1,
+              &InputSignalRaw, OutputSignals[0]->get()))
+        return Err;
     } else {
-      Status = hsa_amd_memory_async_copy(Inter, Agent, Src, Agent, CopySize, 0,
-                                         nullptr, OutputSignals[0]->get());
+      if (auto Err = utils::asyncMemCopy(UseMultipleSdmaEngines, Inter, Agent,
+                                         Src, Agent, CopySize, 0, nullptr,
+                                         OutputSignals[0]->get()))
+        return Err;
     }
 
-    if (auto Err =
-            Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
-      return Err;
-
     // Consume another stream slot and compute dependencies.
     std::tie(Curr, InputSignal) = consume(OutputSignals[1]);
     assert(InputSignal && "Invalid input signal");
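The rewritten call sites lean on llvm::Error's explicit bool conversion: an Error holding a failure evaluates to true, so `if (auto Err = ...) return Err;` propagates the failure and otherwise falls through to the next step. A minimal standalone illustration of the idiom; mayFail and doTwoSteps are hypothetical stand-ins for Error-returning calls such as utils::asyncMemCopy:

    #include "llvm/Support/Error.h"

    // Hypothetical operation that fails on demand.
    static llvm::Error mayFail(bool Fail) {
      if (Fail)
        return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                       "async copy failed");
      return llvm::Error::success();
    }

    // Mirrors the two-step copy above: each step propagates its Error, and
    // the second step only runs if the first succeeded.
    static llvm::Error doTwoSteps() {
      if (auto Err = mayFail(/*Fail=*/false)) // step one
        return Err;
      if (auto Err = mayFail(/*Fail=*/false)) // step two
        return Err;
      return llvm::Error::success();
    }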
@@ -1242,7 +1282,7 @@ public:
     std::atomic_thread_fence(std::memory_order_release);
 
     // Issue the second step: host to host transfer.
-    Status = hsa_amd_signal_async_handler(
+    hsa_status_t Status = hsa_amd_signal_async_handler(
         InputSignal->get(), HSA_SIGNAL_CONDITION_EQ, 0, asyncActionCallback,
         (void *)&Slots[Curr]);
 
@@ -1318,16 +1358,14 @@ public:
 
     // Issue the second step: host to device transfer. Avoid defining the input
     // dependency if already satisfied.
-    hsa_status_t Status;
     if (InputSignal && InputSignal->load()) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
-      Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 1,
-                                         &InputSignalRaw, OutputSignal->get());
-    } else
-      Status = hsa_amd_memory_async_copy(Dst, Agent, Inter, Agent, CopySize, 0,
-                                         nullptr, OutputSignal->get());
-
-    return Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s");
+      return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter,
+                                 Agent, CopySize, 1, &InputSignalRaw,
+                                 OutputSignal->get());
+    }
+    return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, Agent, Inter, Agent,
+                               CopySize, 0, nullptr, OutputSignal->get());
   }
 
   // AMDGPUDeviceTy is incomplete here, passing the underlying agent instead
@@ -1353,17 +1391,15 @@ public:
     // allocated by this runtime or the caller made the appropriate
     // access calls.
 
-    hsa_status_t Status;
     if (InputSignal && InputSignal->load()) {
       hsa_signal_t InputSignalRaw = InputSignal->get();
-      Status =
-          hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize, 1,
-                                    &InputSignalRaw, OutputSignal->get());
-    } else
-      Status = hsa_amd_memory_async_copy(Dst, DstAgent, Src, SrcAgent, CopySize,
-                                         0, nullptr, OutputSignal->get());
-
-    return Plugin::check(Status, "Error in D2D hsa_amd_memory_async_copy: %s");
+      return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
+                                 SrcAgent, CopySize, 1, &InputSignalRaw,
+                                 OutputSignal->get());
+    }
+    return utils::asyncMemCopy(UseMultipleSdmaEngines, Dst, DstAgent, Src,
+                               SrcAgent, CopySize, 0, nullptr,
+                               OutputSignal->get());
   }
 
   /// Synchronize with the stream. The current thread waits until all operations
@@ -1788,6 +1824,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
         OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS",
                                64),
         OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
+        OMPX_UseMultipleSdmaEngines(
+            "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
         AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
         AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}
 
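The BoolEnvar above defaults to false, so multi-SDMA dispatch stays opt-in. As a rough sketch of how such a toggle can be consumed (a simplified stand-in, not libomptarget's actual BoolEnvar implementation; readBoolEnvar is a hypothetical helper):

    #include <cstdlib>
    #include <string>

    // Reads a boolean environment toggle, falling back to Default when unset.
    static bool readBoolEnvar(const char *Name, bool Default) {
      const char *Val = std::getenv(Name);
      if (!Val)
        return Default;
      std::string S(Val);
      return S == "1" || S == "true" || S == "TRUE";
    }

    // Usage mirroring the patch's default-off behavior:
    //   bool UseMultipleSdma =
    //       readBoolEnvar("LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false);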
@@ -2206,10 +2244,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (auto Err = Signal.init())
       return Err;
 
-    Status = hsa_amd_memory_async_copy(TgtPtr, Agent, PinnedPtr, Agent, Size,
-                                       0, nullptr, Signal.get());
-    if (auto Err =
-            Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
+    if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), TgtPtr,
+                                       Agent, PinnedPtr, Agent, Size, 0,
+                                       nullptr, Signal.get()))
       return Err;
 
     if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
@@ -2267,10 +2304,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (auto Err = Signal.init())
       return Err;
 
-    Status = hsa_amd_memory_async_copy(PinnedPtr, Agent, TgtPtr, Agent, Size,
-                                       0, nullptr, Signal.get());
-    if (auto Err =
-            Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s"))
+    if (auto Err = utils::asyncMemCopy(useMultipleSdmaEngines(), PinnedPtr,
+                                       Agent, TgtPtr, Agent, Size, 0, nullptr,
+                                       Signal.get()))
      return Err;
 
    if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds()))
@@ -2633,6 +2669,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     });
   }
 
+  bool useMultipleSdmaEngines() const { return OMPX_UseMultipleSdmaEngines; }
+
 private:
   using AMDGPUEventRef = AMDGPUResourceRef<AMDGPUEventTy>;
   using AMDGPUEventManagerTy = GenericDeviceResourceManagerTy<AMDGPUEventRef>;
@@ -2702,6 +2740,9 @@ private:
   /// are microseconds.
   UInt32Envar OMPX_StreamBusyWait;
 
+  /// Use the ROCm 5.7 interface for multiple SDMA engines.
+  BoolEnvar OMPX_UseMultipleSdmaEngines;
+
   /// Stream manager for AMDGPU streams.
   AMDGPUStreamManagerTy AMDGPUStreamManager;
 
@@ -2803,7 +2844,8 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device)
       SignalManager(Device.getSignalManager()), Device(Device),
       // Initialize the std::deque with some empty positions.
       Slots(32), NextSlot(0), SyncCycle(0), RPCServer(nullptr),
-      StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {}
+      StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()),
+      UseMultipleSdmaEngines(Device.useMultipleSdmaEngines()) {}
 
 /// Class implementing the AMDGPU-specific functionalities of the global
 /// handler.