Update *_TMPRING_SIZE.WAVESIZE for GFX11

The encoding of COMPUTE_TMPRING_SIZE.WAVESIZE and SPI_TMPRING_SIZE.WAVESIZE has changed in GFX11: it is now in units of 64 dwords instead of 256 dwords, and the field has been widened from 13 bits to 15 bits. Depends on D126989 Reviewed By: rampitec, arsenm, #amdgpu Differential Revision: https://reviews.llvm.org/D127248
2025-02-16 07:49:43 +00:00 · 2022-04-05 13:38:57 +01:00 · 2022-04-05 13:38:57 +01:00 · ff85d61a6e
commit ff85d61a6e
parent ed0288f7c4
4 changed files with 36 additions and 17 deletions
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@ -695,7 +695,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
  ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;

  const uint64_t MaxScratchPerWorkitem =
-      GCNSubtarget::MaxWaveScratchSize / STM.getWavefrontSize();
+      STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
  if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
    DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
                                          ProgInfo.ScratchSize,
@ -879,15 +879,14 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
  ProgInfo.LDSBlocks =
      alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;

-  // Scratch is allocated in 256 dword blocks.
-  unsigned ScratchAlignShift = 10;
+  // Scratch is allocated in 64-dword or 256-dword blocks.
+  unsigned ScratchAlignShift =
+      STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
  // We need to program the hardware with the amount of scratch memory that
  // is used by the entire wave.  ProgInfo.ScratchSize is the amount of
  // scratch memory used per thread.
-  ProgInfo.ScratchBlocks =
-      alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
-              1ULL << ScratchAlignShift) >>
-      ScratchAlignShift;
+  ProgInfo.ScratchBlocks = divideCeil(
+      ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);

  if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
    ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
@ -946,6 +945,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
 void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
                                         const SIProgramInfo &CurrentProgramInfo) {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
  unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());

  if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
@ -957,7 +957,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
    OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2);

    OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
-    OutStreamer->emitInt32(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks));
+    OutStreamer->emitInt32(
+        STM.getGeneration() >= AMDGPUSubtarget::GFX11
+            ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
+            : S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));

    // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
    // 0" comment but I don't see a corresponding field in the register spec.
@ -966,8 +969,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
    OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
                              S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
    OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
-    OutStreamer->emitIntValue(
-        S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
+    OutStreamer->emitInt32(
+        STM.getGeneration() >= AMDGPUSubtarget::GFX11
+            ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
+            : S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
  }

  if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@ -201,9 +201,6 @@ private:
  SIFrameLowering FrameLowering;

 public:
-  // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword.
-  static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);
-
  GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
               const GCNTargetMachine &TM);
  ~GCNSubtarget() override;
@ -266,9 +263,19 @@ public:
    return (Generation)Gen;
  }

+  unsigned getMaxWaveScratchSize() const {
+    // See COMPUTE_TMPRING_SIZE.WAVESIZE.
+    if (getGeneration() < GFX11) {
+      // 13-bit field in units of 256-dword.
+      return (256 * 4) * ((1 << 13) - 1);
+    }
+    // 15-bit field in units of 64-dword.
+    return (64 * 4) * ((1 << 15) - 1);
+  }
+
  /// Return the number of high bits known to be zero for a frame index.
  unsigned getKnownHighZeroBitsForFrameIndex() const {
-    return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
+    return countLeadingZeros(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
  }

  int getLDSBankCount() const {
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@ -1036,10 +1036,12 @@ enum Offset_COV5 : unsigned {
 #define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6)

 #define R_00B860_COMPUTE_TMPRING_SIZE                                   0x00B860
-#define   S_00B860_WAVESIZE(x)                                        (((x) & 0x1FFF) << 12)
+#define   S_00B860_WAVESIZE_PreGFX11(x)                               (((x) & 0x1FFF) << 12)
+#define   S_00B860_WAVESIZE_GFX11Plus(x)                              (((x) & 0x7FFF) << 12)

 #define R_0286E8_SPI_TMPRING_SIZE                                       0x0286E8
-#define   S_0286E8_WAVESIZE(x)                                        (((x) & 0x1FFF) << 12)
+#define   S_0286E8_WAVESIZE_PreGFX11(x)                               (((x) & 0x1FFF) << 12)
+#define   S_0286E8_WAVESIZE_GFX11Plus(x)                              (((x) & 0x7FFF) << 12)

 #define R_028B54_VGT_SHADER_STAGES_EN                                 0x028B54
 #define   S_028B54_HS_W32_EN(x)                                       (((x) & 0x1) << 21)
--- a/llvm/test/CodeGen/AMDGPU/mesa3d.ll
+++ b/llvm/test/CodeGen/AMDGPU/mesa3d.ll
@ -1,10 +1,15 @@
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s

-; Check SPI_TMPRING_SIZE.WAVESIZE = 5
+; SPI_TMPRING_SIZE.WAVESIZE = 5
 ; GFX10: .long 165608
 ; GFX10-NEXT: .long 20480

+; SPI_TMPRING_SIZE.WAVESIZE = 17
+; GFX11: .long 165608
+; GFX11-NEXT: .long 69632
+
 ; GCN-LABEL: {{^}}scratch_ps:
 ; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0{{$}}
 ; GCN-DAG: s_mov_b32 s6, -1{{$}}