mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2025-04-01 12:43:47 +00:00
[OpenMP][AMDGPU] Add Envar for controlling HSA busy queue tracking
If the Envar is set to true (default), busy HSA queues will be actively avoided when assigning a queue to a Stream. Otherwise, we will initialize a new HSA queue for each requested Stream, then default to round robin once the set maximum has been reached. Reviewed By: jdoerfert, kevinsala Differential Revision: https://reviews.llvm.org/D156996
This commit is contained in:
parent
999ac10d76
commit
7eba3e58d5
@ -1175,6 +1175,7 @@ There are several environment variables to change the behavior of the plugins:
|
||||
* ``LIBOMPTARGET_LOCK_MAPPED_HOST_BUFFERS``
|
||||
* ``LIBOMPTARGET_AMDGPU_NUM_HSA_QUEUES``
|
||||
* ``LIBOMPTARGET_AMDGPU_HSA_QUEUE_SIZE``
|
||||
* ``LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING``
|
||||
* ``LIBOMPTARGET_AMDGPU_TEAMS_PER_CU``
|
||||
* ``LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES``
|
||||
* ``LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS``
|
||||
@ -1231,6 +1232,17 @@ plugin. The size is the number of AQL packets an HSA queue is expected to hold.
|
||||
It is also the number of AQL packets that can be pushed into each queue without
|
||||
waiting for the driver to process them. The default value is ``512``.
|
||||
|
||||
LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING
|
||||
"""""""""""""""""""""""""""""""""""""""""""
|
||||
|
||||
This environment variable controls if idle HSA queues will be preferentially
|
||||
assigned to streams, for example when they are requested for a kernel launch.
|
||||
Should all queues be considered busy, a new queue is initialized and returned,
|
||||
until we reach the set maximum. Otherwise, we will select the least utilized
|
||||
queue. If this is disabled, each time a stream is requested a new HSA queue
|
||||
will be initialized, regardless of queue utilization. Additionally, queues will
|
||||
be selected using round robin selection. The default value is ``true``.
|
||||
|
||||
.. _libomptarget_amdgpu_teams_per_cu:
|
||||
|
||||
LIBOMPTARGET_AMDGPU_TEAMS_PER_CU
|
||||
|
@ -594,13 +594,16 @@ struct AMDGPUQueueTy {
|
||||
return Plugin::check(Status, "Error in hsa_queue_destroy: %s");
|
||||
}
|
||||
|
||||
/// Returns if this queue is considered busy
|
||||
bool isBusy() const { return NumUsers > 0; }
|
||||
/// Returns the number of streams this queue is currently assigned to.
|
||||
bool getUserCount() const { return NumUsers; }
|
||||
|
||||
/// Decrement user count of the queue object
|
||||
/// Returns if the underlying HSA queue is initialized.
|
||||
bool isInitialized() { return Queue != nullptr; }
|
||||
|
||||
/// Decrement user count of the queue object.
|
||||
void removeUser() { --NumUsers; }
|
||||
|
||||
/// Increase user count of the queue object
|
||||
/// Increase user count of the queue object.
|
||||
void addUser() { ++NumUsers; }
|
||||
|
||||
/// Push a kernel launch to the queue. The kernel launch requires an output
|
||||
@ -784,8 +787,9 @@ private:
|
||||
/// atomic operations. We can further investigate it if this is a bottleneck.
|
||||
std::mutex Mutex;
|
||||
|
||||
/// Indicates that the queue is busy when > 0
|
||||
int NumUsers;
|
||||
/// The number of streams this queue is currently assigned to. A queue is
|
||||
/// considered idle when this is zero, otherwise: busy.
|
||||
uint32_t NumUsers;
|
||||
};
|
||||
|
||||
/// Struct that implements a stream of asynchronous operations for AMDGPU
|
||||
@ -1451,7 +1455,9 @@ struct AMDGPUStreamManagerTy final
|
||||
using ResourcePoolTy = GenericDeviceResourceManagerTy<ResourceRef>;
|
||||
|
||||
AMDGPUStreamManagerTy(GenericDeviceTy &Device, hsa_agent_t HSAAgent)
|
||||
: GenericDeviceResourceManagerTy(Device), NextQueue(0), Agent(HSAAgent) {}
|
||||
: GenericDeviceResourceManagerTy(Device),
|
||||
OMPX_QueueTracking("LIBOMPTARGET_AMDGPU_HSA_QUEUE_BUSY_TRACKING", true),
|
||||
NextQueue(0), Agent(HSAAgent) {}
|
||||
|
||||
Error init(uint32_t InitialSize, int NumHSAQueues, int HSAQueueSize) {
|
||||
Queues = std::vector<AMDGPUQueueTy>(NumHSAQueues);
|
||||
@ -1493,35 +1499,39 @@ struct AMDGPUStreamManagerTy final
|
||||
|
||||
private:
|
||||
/// Search for and assign a preferably idle queue to the given Stream. If
|
||||
/// there is no queue without current users, resort to round robin selection.
|
||||
/// there is no queue without current users, choose the queue with the lowest
|
||||
/// user count. If utilization is ignored: use round robin selection.
|
||||
inline Error assignNextQueue(AMDGPUStreamTy *Stream) {
|
||||
uint32_t StartIndex = NextQueue % MaxNumQueues;
|
||||
AMDGPUQueueTy *Q = nullptr;
|
||||
// Start from zero when tracking utilization, otherwise: round robin policy.
|
||||
uint32_t Index = OMPX_QueueTracking ? 0 : NextQueue++ % MaxNumQueues;
|
||||
|
||||
for (int i = 0; i < MaxNumQueues; ++i) {
|
||||
Q = &Queues[StartIndex++];
|
||||
if (StartIndex == MaxNumQueues)
|
||||
StartIndex = 0;
|
||||
if (OMPX_QueueTracking) {
|
||||
// Find the least used queue.
|
||||
for (uint32_t I = 0; I < MaxNumQueues; ++I) {
|
||||
// Early exit when an initialized queue is idle.
|
||||
if (Queues[I].isInitialized() && Queues[I].getUserCount() == 0) {
|
||||
Index = I;
|
||||
break;
|
||||
}
|
||||
|
||||
if (Q->isBusy())
|
||||
continue;
|
||||
else {
|
||||
if (auto Err = Q->init(Agent, QueueSize))
|
||||
return Err;
|
||||
|
||||
Q->addUser();
|
||||
Stream->Queue = Q;
|
||||
return Plugin::success();
|
||||
// Update the least used queue.
|
||||
if (Queues[Index].getUserCount() > Queues[I].getUserCount())
|
||||
Index = I;
|
||||
}
|
||||
}
|
||||
|
||||
// All queues busy: Round robin (StartIndex has the initial value again)
|
||||
Queues[StartIndex].addUser();
|
||||
Stream->Queue = &Queues[StartIndex];
|
||||
++NextQueue;
|
||||
// Make sure the queue is initialized, then add user & assign.
|
||||
if (auto Err = Queues[Index].init(Agent, QueueSize))
|
||||
return Err;
|
||||
Queues[Index].addUser();
|
||||
Stream->Queue = &Queues[Index];
|
||||
|
||||
return Plugin::success();
|
||||
}
|
||||
|
||||
/// Envar for controlling the tracking of busy HSA queues.
|
||||
BoolEnvar OMPX_QueueTracking;
|
||||
|
||||
/// The next queue index to use for round robin selection.
|
||||
uint32_t NextQueue;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user