[OpenMP][libomptarget][AMDGPU] Update print launch info

Clean up for the AMD-specific kernel launch info in the NextGen Plugins. - Fixes a mistake introduced with the initial commit that added printing of an AMD-only property. - Removes another AMD-only property (not clear on upstream status) - Adds some more comment to what info is printed. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D145924
2025-02-20 18:27:20 +00:00 · 2023-03-15 05:58:01 -04:00 · 2023-03-15 05:58:01 -04:00 · 13a0b48f37
commit 13a0b48f37
parent 70562607ab
3 changed files with 47 additions and 15 deletions
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@ -2614,33 +2614,39 @@ Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
    return Plugin::success();

  // General Info
-  auto ConstWGSize = getDefaultNumThreads(GenericDevice);
  auto NumGroups = NumBlocks;
-  auto ThreadsPerGroup = getDefaultNumThreads(GenericDevice);
-  auto NumTeams = KernelArgs.NumTeams[0];       // Only first dimension
-  auto ThreadLimit = KernelArgs.ThreadLimit[0]; // Only first dimension
+  auto ThreadsPerGroup = NumThreads;

  // Kernel Arguments Info
  auto ArgNum = KernelArgs.NumArgs;
  auto LoopTripCount = KernelArgs.Tripcount;

-  // Details for AMDGPU kernels
+  // Details for AMDGPU kernels (read from image)
+  // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata
  auto GroupSegmentSize = (*KernelInfo).GroupSegmentList;
  auto SGPRCount = (*KernelInfo).SGPRCount;
  auto VGPRCount = (*KernelInfo).VGPRCount;
  auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount;
  auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount;
+  auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;

-  // TODO set correctly once host services available
-  auto HostCallRequired = false;
+  // Prints additional launch info that contains the following.
+  // Num Args: The number of kernel arguments
+  // Teams x Thrds: The number of teams and the number of threads actually
+  // running.
+  // MaxFlatWorkgroupSize: Maximum flat work-group size supported by the
+  // kernel in work-items
+  // LDS Usage: Amount of bytes used in LDS storage
+  // S/VGPR Count: the number of S/V GPRs occupied by the kernel
+  // S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel
+  // Tripcount: loop tripcount for the kernel
  INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
-       "SGN:%s ConstWGSize:%d args:%d teamsXthrds:(%4luX%4d) "
-       "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
-       "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d n:%s\n",
-       getExecutionModeName(), ConstWGSize, ArgNum, NumGroups, ThreadsPerGroup,
-       NumTeams, ThreadLimit, GroupSegmentSize, SGPRCount, VGPRCount,
-       SGPRSpillCount, VGPRSpillCount, LoopTripCount, HostCallRequired,
-       getName());
+       "#Args: %d Teams x Thrds: %4lux%4u (MaxFlatWorkGroupSize: %u) LDS "
+       "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: "
+       "%lu\n",
+       ArgNum, NumGroups, ThreadsPerGroup, MaxFlatWorkgroupSize,
+       GroupSegmentSize, SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount,
+       LoopTripCount);

  return Plugin::success();
 }
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@ -146,6 +146,10 @@ struct KernelMetaDataTy {
  uint32_t KernelSegmentSize;
  uint32_t ExplicitArgumentCount;
  uint32_t ImplicitArgumentCount;
+  uint32_t RequestedWorkgroupSize[3];
+  uint32_t WorkgroupSizeHint[3];
+  uint32_t WavefronSize;
+  uint32_t MaxFlatWorkgroupSize;
 };
 namespace {

@ -194,6 +198,19 @@ private:
      return DK.getString() == SK;
    };

+    const auto getSequenceOfThreeInts = [](msgpack::DocNode &DN,
+                                           uint32_t *Vals) {
+      assert(DN.isArray() && "MsgPack DocNode is an array node");
+      auto DNA = DN.getArray();
+      assert(DNA.size() == 3 && "ArrayNode has at most three elements");
+
+      int i = 0;
+      for (auto DNABegin = DNA.begin(), DNAEnd = DNA.end(); DNABegin != DNAEnd;
+           ++DNABegin) {
+        Vals[i++] = DNABegin->getUInt();
+      }
+    };
+
    if (isKey(V.first, ".name")) {
      KernelName = V.second.toString();
    } else if (isKey(V.first, ".sgpr_count")) {
@ -208,6 +225,14 @@ private:
      KernelData.PrivateSegmentSize = V.second.getUInt();
    } else if (isKey(V.first, ".group_segement_fixed_size")) {
      KernelData.GroupSegmentList = V.second.getUInt();
+    } else if (isKey(V.first, ".reqd_workgroup_size")) {
+      getSequenceOfThreeInts(V.second, KernelData.RequestedWorkgroupSize);
+    } else if (isKey(V.first, ".workgroup_size_hint")) {
+      getSequenceOfThreeInts(V.second, KernelData.WorkgroupSizeHint);
+    } else if (isKey(V.first, ".wavefront_size")) {
+      KernelData.WavefronSize = V.second.getUInt();
+    } else if (isKey(V.first, ".max_flat_workgroup_size")) {
+      KernelData.MaxFlatWorkgroupSize = V.second.getUInt();
    }

    return Error::success();
@ -295,6 +320,7 @@ Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,

  return Error::success();
 }
+
 } // namespace utils
 } // namespace plugin
 } // namespace target
--- a/openmp/libomptarget/test/offloading/info.c
+++ b/openmp/libomptarget/test/offloading/info.c
@ -39,7 +39,7 @@ int main() {
 // INFO: info: Entering OpenMP kernel at info.c:{{[0-9]+}}:{{[0-9]+}} with 1 arguments:
 // INFO: info: firstprivate(val)[4]
 // INFO: info: Launching kernel __omp_offloading_{{.*}}main{{.*}} with {{[0-9]+}} blocks and {{[0-9]+}} threads in Generic mode
-// AMDGPU: AMDGPU device {{[0-9]}} info: SGN:Generic ConstWGSize:{{[0-9]+}} args:{{[0-9]}} teamsXthrds:({{   [0-9]+}}X {{[0-9]+}}) reqd:(   {{[0-9]+}}X   {{[0-9]+}}) lds_usage:{{[0-9]+}}B sgpr_count:{{[0-9]+}} vgpr_count:{{[0-9]+}} sgpr_spill_count:{{[0-9]+}} vgpr_spill_count:{{[0-9]+}} tripcount:{{[0-9]+}} rpc:0 n:__omp_offloading_{{.*}}main{{.*}}
+// AMDGPU: AMDGPU device {{[0-9]}} info: #Args: {{[0-9]}} Teams x Thrds: {{[0-9]+}}x {{[0-9]+}} (MaxFlatWorkGroupSize: {{[0-9]+}}) LDS Usage: {{[0-9]+}}B #SGPRs/VGPRs: {{[0-9]+}}/{{[0-9]+}} #SGPR/VGPR Spills: {{[0-9]+}}/{{[0-9]+}} Tripcount: {{[0-9]+}}
 // INFO: info: OpenMP Host-Device pointer mappings after block at info.c:{{[0-9]+}}:{{[0-9]+}}:
 // INFO: info: Host Ptr           Target Ptr         Size (B) DynRefCount HoldRefCount Declaration
 // INFO: info: {{.*}}             {{.*}}             256      1           0            C[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}}