Update *_TMPRING_SIZE.WAVESIZE for GFX11

The encoding of COMPUTE_TMPRING_SIZE.WAVESIZE and
SPI_TMPRING_SIZE.WAVESIZE has changed in GFX11: it is now in units
of 64 dwords instead of 256 dwords, and the field has been widened
from 13 bits to 15 bits.

Depends on D126989

Reviewed By: rampitec, arsenm, #amdgpu

Differential Revision: https://reviews.llvm.org/D127248
This commit is contained in:
Jay Foad 2022-04-05 13:38:57 +01:00 committed by Joe Nash
parent ed0288f7c4
commit ff85d61a6e
4 changed files with 36 additions and 17 deletions

View File

@ -695,7 +695,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
const uint64_t MaxScratchPerWorkitem =
GCNSubtarget::MaxWaveScratchSize / STM.getWavefrontSize();
STM.getMaxWaveScratchSize() / STM.getWavefrontSize();
if (ProgInfo.ScratchSize > MaxScratchPerWorkitem) {
DiagnosticInfoStackSize DiagStackSize(MF.getFunction(),
ProgInfo.ScratchSize,
@ -879,15 +879,14 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.LDSBlocks =
alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
// Scratch is allocated in 256 dword blocks.
unsigned ScratchAlignShift = 10;
// Scratch is allocated in 64-dword or 256-dword blocks.
unsigned ScratchAlignShift =
STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 8 : 10;
// We need to program the hardware with the amount of scratch memory that
// is used by the entire wave. ProgInfo.ScratchSize is the amount of
// scratch memory used per thread.
ProgInfo.ScratchBlocks =
alignTo(ProgInfo.ScratchSize * STM.getWavefrontSize(),
1ULL << ScratchAlignShift) >>
ScratchAlignShift;
ProgInfo.ScratchBlocks = divideCeil(
ProgInfo.ScratchSize * STM.getWavefrontSize(), 1ULL << ScratchAlignShift);
if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
@ -946,6 +945,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
const SIProgramInfo &CurrentProgramInfo) {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
@ -957,7 +957,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2);
OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
OutStreamer->emitInt32(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks));
OutStreamer->emitInt32(
STM.getGeneration() >= AMDGPUSubtarget::GFX11
? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
: S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
// 0" comment but I don't see a corresponding field in the register spec.
@ -966,8 +969,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
OutStreamer->emitIntValue(
S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
OutStreamer->emitInt32(
STM.getGeneration() >= AMDGPUSubtarget::GFX11
? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
: S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
}
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {

View File

@ -201,9 +201,6 @@ private:
SIFrameLowering FrameLowering;
public:
// See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword.
static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
const GCNTargetMachine &TM);
~GCNSubtarget() override;
@ -266,9 +263,19 @@ public:
return (Generation)Gen;
}
/// Largest per-wave scratch size that COMPUTE_TMPRING_SIZE.WAVESIZE can
/// encode for this subtarget's generation.
///
/// Before GFX11 the field is 13 bits wide and counts 256-dword blocks; from
/// GFX11 onwards it is 15 bits wide and counts 64-dword blocks. The result is
/// the block size (dword count scaled by 4) times the largest value the field
/// can hold.
unsigned getMaxWaveScratchSize() const {
  const bool IsGFX11Plus = !(getGeneration() < GFX11);
  // Width of the WAVESIZE field in bits.
  const int FieldBits = IsGFX11Plus ? 15 : 13;
  // Allocation granularity of the WAVESIZE field, in dwords.
  const int DwordsPerBlock = IsGFX11Plus ? 64 : 256;
  const int MaxFieldValue = (1 << FieldBits) - 1;
  return (DwordsPerBlock * 4) * MaxFieldValue;
}
/// Return the number of high bits known to be zero for a frame index.
unsigned getKnownHighZeroBitsForFrameIndex() const {
return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
return countLeadingZeros(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
}
int getLDSBankCount() const {

View File

@ -1036,10 +1036,12 @@ enum Offset_COV5 : unsigned {
#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6)
#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860
#define S_00B860_WAVESIZE(x) (((x) & 0x1FFF) << 12)
#define S_00B860_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12)
#define S_00B860_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12)
#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8
#define S_0286E8_WAVESIZE(x) (((x) & 0x1FFF) << 12)
#define S_0286E8_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12)
#define S_0286E8_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12)
#define R_028B54_VGT_SHADER_STAGES_EN 0x028B54
#define S_028B54_HS_W32_EN(x) (((x) & 0x1) << 21)

View File

@ -1,10 +1,15 @@
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
; Check SPI_TMPRING_SIZE.WAVESIZE = 5
; SPI_TMPRING_SIZE.WAVESIZE = 5
; GFX10: .long 165608
; GFX10-NEXT: .long 20480
; SPI_TMPRING_SIZE.WAVESIZE = 17
; GFX11: .long 165608
; GFX11-NEXT: .long 69632
; GCN-LABEL: {{^}}scratch_ps:
; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0{{$}}
; GCN-DAG: s_mov_b32 s6, -1{{$}}