From 13a0b48f37250ad2f8cb72c04e72600874112037 Mon Sep 17 00:00:00 2001 From: JP Lehr Date: Wed, 15 Mar 2023 05:58:01 -0400 Subject: [PATCH] [OpenMP][libomptarget][AMDGPU] Update print launch info Clean up for the AMD-specific kernel launch info in the NextGen Plugins. - Fixes a mistake introduced with the initial commit that added printing of an AMD-only property. - Removes another AMD-only property (not clear on upstream status) - Adds some more comment to what info is printed. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D145924 --- .../plugins-nextgen/amdgpu/src/rtl.cpp | 34 +++++++++++-------- .../amdgpu/utils/UtilitiesRTL.h | 26 ++++++++++++++ openmp/libomptarget/test/offloading/info.c | 2 +- 3 files changed, 47 insertions(+), 15 deletions(-) diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index 99b45ad386d8..e03825651286 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -2614,33 +2614,39 @@ Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice, return Plugin::success(); // General Info - auto ConstWGSize = getDefaultNumThreads(GenericDevice); auto NumGroups = NumBlocks; - auto ThreadsPerGroup = getDefaultNumThreads(GenericDevice); - auto NumTeams = KernelArgs.NumTeams[0]; // Only first dimension - auto ThreadLimit = KernelArgs.ThreadLimit[0]; // Only first dimension + auto ThreadsPerGroup = NumThreads; // Kernel Arguments Info auto ArgNum = KernelArgs.NumArgs; auto LoopTripCount = KernelArgs.Tripcount; - // Details for AMDGPU kernels + // Details for AMDGPU kernels (read from image) + // https://www.llvm.org/docs/AMDGPUUsage.html#code-object-v4-metadata auto GroupSegmentSize = (*KernelInfo).GroupSegmentList; auto SGPRCount = (*KernelInfo).SGPRCount; auto VGPRCount = (*KernelInfo).VGPRCount; auto SGPRSpillCount = (*KernelInfo).SGPRSpillCount; auto VGPRSpillCount = (*KernelInfo).VGPRSpillCount; + auto MaxFlatWorkgroupSize = (*KernelInfo).MaxFlatWorkgroupSize; - // TODO set correctly once host services available - auto HostCallRequired = false; + // Prints additional launch info that contains the following. + // Num Args: The number of kernel arguments + // Teams x Thrds: The number of teams and the number of threads actually + // running. + // MaxFlatWorkgroupSize: Maximum flat work-group size supported by the + // kernel in work-items + // LDS Usage: Amount of bytes used in LDS storage + // S/VGPR Count: the number of S/V GPRs occupied by the kernel + // S/VGPR Spill Count: how many S/VGPRs are spilled by the kernel + // Tripcount: loop tripcount for the kernel INFO(OMP_INFOTYPE_PLUGIN_KERNEL, GenericDevice.getDeviceId(), - "SGN:%s ConstWGSize:%d args:%d teamsXthrds:(%4luX%4d) " - "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " - "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu rpc:%d n:%s\n", - getExecutionModeName(), ConstWGSize, ArgNum, NumGroups, ThreadsPerGroup, - NumTeams, ThreadLimit, GroupSegmentSize, SGPRCount, VGPRCount, - SGPRSpillCount, VGPRSpillCount, LoopTripCount, HostCallRequired, - getName()); + "#Args: %d Teams x Thrds: %4lux%4u (MaxFlatWorkGroupSize: %u) LDS " + "Usage: %uB #SGPRs/VGPRs: %u/%u #SGPR/VGPR Spills: %u/%u Tripcount: " + "%lu\n", + ArgNum, NumGroups, ThreadsPerGroup, MaxFlatWorkgroupSize, + GroupSegmentSize, SGPRCount, VGPRCount, SGPRSpillCount, VGPRSpillCount, + LoopTripCount); return Plugin::success(); } diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h index 1c549189b5c4..cdf1d10980e9 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h @@ -146,6 +146,10 @@ struct KernelMetaDataTy { uint32_t KernelSegmentSize; uint32_t ExplicitArgumentCount; uint32_t ImplicitArgumentCount; + uint32_t RequestedWorkgroupSize[3]; + uint32_t WorkgroupSizeHint[3]; + uint32_t WavefronSize; + uint32_t MaxFlatWorkgroupSize; }; namespace { @@ -194,6 +198,19 @@ private: return DK.getString() == SK; }; + const auto getSequenceOfThreeInts = [](msgpack::DocNode &DN, + uint32_t *Vals) { + assert(DN.isArray() && "MsgPack DocNode is an array node"); + auto DNA = DN.getArray(); + assert(DNA.size() == 3 && "ArrayNode has at most three elements"); + + int i = 0; + for (auto DNABegin = DNA.begin(), DNAEnd = DNA.end(); DNABegin != DNAEnd; + ++DNABegin) { + Vals[i++] = DNABegin->getUInt(); + } + }; + if (isKey(V.first, ".name")) { KernelName = V.second.toString(); } else if (isKey(V.first, ".sgpr_count")) { @@ -208,6 +225,14 @@ private: KernelData.PrivateSegmentSize = V.second.getUInt(); } else if (isKey(V.first, ".group_segement_fixed_size")) { KernelData.GroupSegmentList = V.second.getUInt(); + } else if (isKey(V.first, ".reqd_workgroup_size")) { + getSequenceOfThreeInts(V.second, KernelData.RequestedWorkgroupSize); + } else if (isKey(V.first, ".workgroup_size_hint")) { + getSequenceOfThreeInts(V.second, KernelData.WorkgroupSizeHint); + } else if (isKey(V.first, ".wavefront_size")) { + KernelData.WavefronSize = V.second.getUInt(); + } else if (isKey(V.first, ".max_flat_workgroup_size")) { + KernelData.MaxFlatWorkgroupSize = V.second.getUInt(); } return Error::success(); @@ -295,6 +320,7 @@ Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer, return Error::success(); } + } // namespace utils } // namespace plugin } // namespace target diff --git a/openmp/libomptarget/test/offloading/info.c b/openmp/libomptarget/test/offloading/info.c index f24727400bcd..c2f845006410 100644 --- a/openmp/libomptarget/test/offloading/info.c +++ b/openmp/libomptarget/test/offloading/info.c @@ -39,7 +39,7 @@ int main() { // INFO: info: Entering OpenMP kernel at info.c:{{[0-9]+}}:{{[0-9]+}} with 1 arguments: // INFO: info: firstprivate(val)[4] // INFO: info: Launching kernel __omp_offloading_{{.*}}main{{.*}} with {{[0-9]+}} blocks and {{[0-9]+}} threads in Generic mode -// AMDGPU: AMDGPU device {{[0-9]}} info: SGN:Generic ConstWGSize:{{[0-9]+}} args:{{[0-9]}} teamsXthrds:({{ [0-9]+}}X {{[0-9]+}}) reqd:( {{[0-9]+}}X {{[0-9]+}}) lds_usage:{{[0-9]+}}B sgpr_count:{{[0-9]+}} vgpr_count:{{[0-9]+}} sgpr_spill_count:{{[0-9]+}} vgpr_spill_count:{{[0-9]+}} tripcount:{{[0-9]+}} rpc:0 n:__omp_offloading_{{.*}}main{{.*}} +// AMDGPU: AMDGPU device {{[0-9]}} info: #Args: {{[0-9]}} Teams x Thrds: {{[0-9]+}}x {{[0-9]+}} (MaxFlatWorkGroupSize: {{[0-9]+}}) LDS Usage: {{[0-9]+}}B #SGPRs/VGPRs: {{[0-9]+}}/{{[0-9]+}} #SGPR/VGPR Spills: {{[0-9]+}}/{{[0-9]+}} Tripcount: {{[0-9]+}} // INFO: info: OpenMP Host-Device pointer mappings after block at info.c:{{[0-9]+}}:{{[0-9]+}}: // INFO: info: Host Ptr Target Ptr Size (B) DynRefCount HoldRefCount Declaration // INFO: info: {{.*}} {{.*}} 256 1 0 C[0:64] at info.c:{{[0-9]+}}:{{[0-9]+}}