From 13a0b48f37250ad2f8cb72c04e72600874112037 Mon Sep 17 00:00:00 2001
From: JP Lehr <JanPatrick.Lehr@amd.com>
Date: Wed, 15 Mar 2023 05:58:01 -0400
Subject: [PATCH] [OpenMP][libomptarget][AMDGPU] Update print launch info

Clean up for the AMD-specific kernel launch info in the NextGen Plugins.
- Fixes a mistake introduced with the initial commit that added printing
  of an AMD-only property.
- Removes another AMD-only property (not clear on upstream status)
- Adds some more comment to what info is printed.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D145924
---
 .../plugins-nextgen/amdgpu/src/rtl.cpp        | 34 +++++++++++--------
 .../amdgpu/utils/UtilitiesRTL.h               | 26 ++++++++++++++
 openmp/libomptarget/test/offloading/info.c    |  2 +-
 3 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 99b45ad386d8..e03825651286 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2614,33 +2614,39 @@ Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
     return Plugin::success();
 
   // General Info
-  auto ConstWGSize = getDefaultNumThreads(GenericDevice);
   auto NumGroups = NumBlocks;
-  auto ThreadsPerGroup = getDefaultNumThreads(GenericDevice);
-  auto NumTeams = KernelArgs.NumTeams[0];       // Only first dimension
-  auto ThreadLimit = KernelArgs.ThreadLimit[0]; // Only first dimension
+  auto ThreadsPerGroup = NumThreads;
 
   // Kernel Arguments Info
   auto ArgNum = KernelArgs.NumArgs;
   auto LoopTripCount = KernelArgs.Tripcount;
 
-  // Details for AMDGPU kernels
+  // Details for AMDGPU kernels (read from image)
+  // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata
   auto GroupSegmentSize = (*KernelInfo).GroupSegmentList;
   auto SGPRCount = (*KernelInfo).SGPRCount;
   auto VGPRCount = (*KernelInfo).VGPRCount;
   auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount;
   auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount;
+  auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize;
 
-  // TODO set correctly once host services available
-  auto HostCallRequired = false;
+  // Prints additional launch info that contains the following.
+  // Num Args: The number of kernel arguments
+  // Teams x Thrds: The number of teams and the number of threads actually
+  // running.
+  // MaxFlatWorkgroupSize: Maximum flat work-group size supported by the
+  // kernel in work-items
+  // LDS Usage: Amount of bytes used in LDS storage
+  // S/VGPR Count: the number of S/V GPRs occupied by the kernel
+  // S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel
+  // Tripcount: loop tripcount for the kernel
   INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(),
-       "SGN:%s ConstWGSize:%d args:%d teamsXthrds:(%4luX%4d) "
-       "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
-       "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d n:%s\n",
-       getExecutionModeName(), ConstWGSize, ArgNum, NumGroups, ThreadsPerGroup,
-       NumTeams, ThreadLimit, GroupSegmentSize, SGPRCount, VGPRCount,
-       SGPRSpillCount, VGPRSpillCount, LoopTripCount, HostCallRequired,
-       getName());
+       "#Args: %d Teams x Thrds: %4lux%4u (MaxFlatWorkGroupSize: %u) LDS "
+       "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: "
+       "%lu\n",
+       ArgNum, NumGroups, ThreadsPerGroup, MaxFlatWorkgroupSize,
+       GroupSegmentSize, SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount,
+       LoopTripCount);
 
   return Plugin::success();
 }
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
index 1c549189b5c4..cdf1d10980e9 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h
@@ -146,6 +146,10 @@ struct KernelMetaDataTy {
   uint32_t KernelSegmentSize;
   uint32_t ExplicitArgumentCount;
   uint32_t ImplicitArgumentCount;
+  uint32_t RequestedWorkgroupSize[3];
+  uint32_t WorkgroupSizeHint[3];
+  uint32_t WavefronSize;
+  uint32_t MaxFlatWorkgroupSize;
 };
 namespace {
 
@@ -194,6 +198,19 @@ private:
       return DK.getString() == SK;
     };
 
+    const auto getSequenceOfThreeInts = [](msgpack::DocNode &DN,
+                                           uint32_t *Vals) {
+      assert(DN.isArray() && "MsgPack DocNode is an array node");
+      auto DNA = DN.getArray();
+      assert(DNA.size() == 3 && "ArrayNode has at most three elements");
+
+      int i = 0;
+      for (auto DNABegin = DNA.begin(), DNAEnd = DNA.end(); DNABegin != DNAEnd;
+           ++DNABegin) {
+        Vals[i++] = DNABegin->getUInt();
+      }
+    };
+
     if (isKey(V.first, ".name")) {
       KernelName = V.second.toString();
     } else if (isKey(V.first, ".sgpr_count")) {
@@ -208,6 +225,14 @@ private:
       KernelData.PrivateSegmentSize = V.second.getUInt();
     } else if (isKey(V.first, ".group_segement_fixed_size")) {
       KernelData.GroupSegmentList = V.second.getUInt();
+    } else if (isKey(V.first, ".reqd_workgroup_size")) {
+      getSequenceOfThreeInts(V.second, KernelData.RequestedWorkgroupSize);
+    } else if (isKey(V.first, ".workgroup_size_hint")) {
+      getSequenceOfThreeInts(V.second, KernelData.WorkgroupSizeHint);
+    } else if (isKey(V.first, ".wavefront_size")) {
+      KernelData.WavefronSize = V.second.getUInt();
+    } else if (isKey(V.first, ".max_flat_workgroup_size")) {
+      KernelData.MaxFlatWorkgroupSize = V.second.getUInt();
     }
 
     return Error::success();
@@ -295,6 +320,7 @@ Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
 
   return Error::success();
 }
+
 } // namespace utils
 } // namespace plugin
 } // namespace target
diff --git a/openmp/libomptarget/test/offloading/info.c b/openmp/libomptarget/test/offloading/info.c
index f24727400bcd..c2f845006410 100644
--- a/openmp/libomptarget/test/offloading/info.c
+++ b/openmp/libomptarget/test/offloading/info.c
@@ -39,7 +39,7 @@ int main() {
 // INFO: info: Entering OpenMP kernel at info.c:{{[0-9]+}}:{{[0-9]+}} with 1 arguments:
 // INFO: info: firstprivate(val)[4]
 // INFO: info: Launching kernel __omp_offloading_{{.*}}main{{.*}} with {{[0-9]+}} blocks and {{[0-9]+}} threads in Generic mode
-// AMDGPU: AMDGPU device {{[0-9]}} info: SGN:Generic ConstWGSize:{{[0-9]+}} args:{{[0-9]}} teamsXthrds:({{   [0-9]+}}X {{[0-9]+}}) reqd:(   {{[0-9]+}}X   {{[0-9]+}}) lds_usage:{{[0-9]+}}B sgpr_count:{{[0-9]+}} vgpr_count:{{[0-9]+}} sgpr_spill_count:{{[0-9]+}} vgpr_spill_count:{{[0-9]+}} tripcount:{{[0-9]+}} rpc:0 n:__omp_offloading_{{.*}}main{{.*}}
+// AMDGPU: AMDGPU device {{[0-9]}} info: #Args: {{[0-9]}} Teams x Thrds: {{[0-9]+}}x {{[0-9]+}} (MaxFlatWorkGroupSize: {{[0-9]+}}) LDS Usage: {{[0-9]+}}B #SGPRs/VGPRs: {{[0-9]+}}/{{[0-9]+}} #SGPR/VGPR Spills: {{[0-9]+}}/{{[0-9]+}} Tripcount: {{[0-9]+}}
 // INFO: info: OpenMP Host-Device pointer mappings after block at info.c:{{[0-9]+}}:{{[0-9]+}}:
 // INFO: info: Host Ptr           Target Ptr         Size (B) DynRefCount HoldRefCount Declaration
 // INFO: info: {{.*}}             {{.*}}             256      1           0            C[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}}