AMDGPU: Rework how private buffer is passed for HSA

If we know we have stack objects, we reserve the registers
in which the private buffer resource and wave offset are passed
and use them directly.

If not, reserve the last 5 SGPRs just in case we need to spill.
After register allocation, try to pick the next available registers
instead of the last SGPRs, and then insert copies from the inputs
to the reserved registers in the prologue.

This also only selectively enables all of the input registers
which are really required instead of always enabling them.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254331 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Matt Arsenault 2015-11-30 21:16:03 +00:00
parent d4a0a430cc
commit 0f1b95f818
21 changed files with 1001 additions and 298 deletions

View File

@ -452,18 +452,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
S_00B848_FLOAT_MODE(ProgInfo.FloatMode) |
S_00B848_PRIV(ProgInfo.Priv) |
S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) |
S_00B848_IEEE_MODE(ProgInfo.DebugMode) |
S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
S_00B848_IEEE_MODE(ProgInfo.IEEEMode);
// 0 = X, 1 = XY, 2 = XYZ
unsigned TIDIGCompCnt = 0;
if (MFI->hasWorkItemIDZ())
TIDIGCompCnt = 2;
else if (MFI->hasWorkItemIDY())
TIDIGCompCnt = 1;
ProgInfo.ComputePGMRSrc2 =
S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
S_00B84C_USER_SGPR(MFI->NumUserSGPRs) |
S_00B84C_TGID_X_EN(1) |
S_00B84C_TGID_Y_EN(1) |
S_00B84C_TGID_Z_EN(1) |
S_00B84C_TG_SIZE_EN(1) |
S_00B84C_TIDIG_COMP_CNT(2) |
S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks);
S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
S_00B84C_EXCP_EN_MSB(0) |
S_00B84C_LDS_SIZE(ProgInfo.LDSBlocks) |
S_00B84C_EXCP_EN(0);
}
static unsigned getRsrcReg(unsigned ShaderType) {
@ -524,9 +533,44 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
header.compute_pgm_resource_registers =
KernelInfo.ComputePGMRSrc1 |
(KernelInfo.ComputePGMRSrc2 << 32);
header.code_properties =
AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR |
AMD_CODE_PROPERTY_IS_PTR64;
header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
if (MFI->hasPrivateSegmentBuffer()) {
header.code_properties |=
AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
}
if (MFI->hasDispatchPtr())
header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
if (MFI->hasQueuePtr())
header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
if (MFI->hasKernargSegmentPtr())
header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
if (MFI->hasDispatchID())
header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
if (MFI->hasFlatScratchInit())
header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
// TODO: Private segment size
if (MFI->hasGridWorkgroupCountX()) {
header.code_properties |=
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
}
if (MFI->hasGridWorkgroupCountY()) {
header.code_properties |=
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
}
if (MFI->hasGridWorkgroupCountZ()) {
header.code_properties |=
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
}
if (MFI->hasDispatchPtr())
header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;

View File

@ -1062,14 +1062,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
SDLoc DL(Addr);
MachineFunction &MF = CurDAG->getMachineFunction();
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
unsigned ScratchOffsetReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
SOffset = CurDAG->getRegister(ScratchOffsetReg, MVT::i32);
SOffset = CurDAG->getRegister(Info->getScratchWaveOffsetReg(), MVT::i32);
// (add n0, c1)
if (CurDAG->isBaseWithConstantOffset(Addr)) {

View File

@ -36,6 +36,16 @@ static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
return true;
}
/// Returns an array view spanning every register of the SReg_128 register
/// class, in the order the class enumerates them.
static ArrayRef<MCPhysReg> getAllSGPR128() {
  const auto &RC = AMDGPU::SReg_128RegClass;
  return makeArrayRef(RC.begin(), RC.getNumRegs());
}
/// Returns an array view spanning every register of the SGPR_32 register
/// class, in the order the class enumerates them.
static ArrayRef<MCPhysReg> getAllSGPRs() {
  const auto &RC = AMDGPU::SGPR_32RegClass;
  return makeArrayRef(RC.begin(), RC.getNumRegs());
}
void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
if (!MF.getFrameInfo()->hasStackObjects())
@ -43,7 +53,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// If we only have SGPR spills, we won't actually be using scratch memory
// since these spill to VGPRs.
@ -56,31 +66,159 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
const SIInstrInfo *TII =
static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
// We need to insert initialization of the scratch resource descriptor.
unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
assert(ScratchRsrcReg != AMDGPU::NoRegister);
uint64_t Rsrc23 = TII->getScratchRsrcWords23();
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
assert(ScratchWaveOffsetReg != AMDGPU::NoRegister);
unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
if (ST.isAmdHsaOS()) {
PreloadedPrivateBufferReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
}
// If we reserved the original input registers, we don't need to copy to the
// reserved registers.
if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
// We should always reserve these 5 registers at the same time.
assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg &&
"scratch wave offset and private segment buffer inconsistent");
return;
}
// We added live-ins during argument lowering, but since they were not used
// they were deleted. We're adding the uses now, so add them back.
MachineRegisterInfo &MRI = MF.getRegInfo();
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
if (ST.isAmdHsaOS()) {
MRI.addLiveIn(PreloadedPrivateBufferReg);
MBB.addLiveIn(PreloadedPrivateBufferReg);
}
// We reserved the last registers for this. Shift it down to the end of those
// which were actually used.
//
// FIXME: It might be safer to use a pseudoregister before replacement.
// FIXME: We should be able to eliminate unused input registers. We only
// cannot do this for the resources required for scratch access. For now we
// skip over user SGPRs and may leave unused holes.
// We find the resource first because it has an alignment requirement.
if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
// Skip the last 2 elements because the last one is reserved for VCC, and
// this is the 2nd to last element already.
for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
// Pick the first unallocated one. Make sure we don't clobber the other
// reserved input we needed.
if (!MRI.isPhysRegUsed(Reg)) {
assert(MRI.isAllocatable(Reg));
MRI.replaceRegWith(ScratchRsrcReg, Reg);
ScratchRsrcReg = Reg;
MFI->setScratchRSrcReg(ScratchRsrcReg);
break;
}
}
}
if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
MachineRegisterInfo &MRI = MF.getRegInfo();
// Skip the trailing SGPRs that are reserved (the last ones are reserved for
// VCC). NOTE(review): the loop below drops the last 6 elements, not 2 as in
// the 128-bit search above; confirm the intended count.
unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) {
// Pick the first unallocated SGPR. Be careful not to pick an alias of the
// scratch descriptor, since we haven't added its uses yet.
if (!MRI.isPhysRegUsed(Reg)) {
assert(MRI.isAllocatable(Reg) &&
!TRI->isSubRegisterEq(ScratchRsrcReg, Reg));
MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
ScratchWaveOffsetReg = Reg;
MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
break;
}
}
}
assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
MachineBasicBlock::iterator I = MBB.begin();
DebugLoc DL;
unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
// Make sure we emit the copy for the offset first. We may have chosen to copy
// the buffer resource into a register that aliases the input offset register.
BuildMI(MBB, I, DL, SMovB32, ScratchWaveOffsetReg)
.addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
}
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0)
.addExternalSymbol("SCRATCH_RSRC_DWORD0");
if (ST.isAmdHsaOS()) {
// Insert copies from argument register.
assert(
!TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) &&
!TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg));
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1)
.addExternalSymbol("SCRATCH_RSRC_DWORD1");
unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
unsigned Rsrc23 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2_sub3);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2)
.addImm(Rsrc23 & 0xffffffff);
unsigned Lo = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub0_sub1);
unsigned Hi = TRI->getSubReg(PreloadedPrivateBufferReg, AMDGPU::sub2_sub3);
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3)
.addImm(Rsrc23 >> 32);
const MCInstrDesc &SMovB64 = TII->get(AMDGPU::S_MOV_B64);
BuildMI(MBB, I, DL, SMovB64, Rsrc01)
.addReg(Lo, RegState::Kill);
BuildMI(MBB, I, DL, SMovB64, Rsrc23)
.addReg(Hi, RegState::Kill);
} else {
unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
// Use relocations to get the pointer, and setup the other bits manually.
uint64_t Rsrc23 = TII->getScratchRsrcWords23();
BuildMI(MBB, I, DL, SMovB32, Rsrc0)
.addExternalSymbol("SCRATCH_RSRC_DWORD0")
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
BuildMI(MBB, I, DL, SMovB32, Rsrc1)
.addExternalSymbol("SCRATCH_RSRC_DWORD1")
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
BuildMI(MBB, I, DL, SMovB32, Rsrc2)
.addImm(Rsrc23 & 0xffffffff)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
BuildMI(MBB, I, DL, SMovB32, Rsrc3)
.addImm(Rsrc23 >> 32)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
}
// Make the register selected live throughout the function.
for (MachineBasicBlock &OtherBB : MF) {
if (&OtherBB == &MBB)
continue;
OtherBB.addLiveIn(ScratchRsrcReg);
OtherBB.addLiveIn(ScratchWaveOffsetReg);
}
}
void SIFrameLowering::processFunctionBeforeFrameFinalized(

View File

@ -542,6 +542,11 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
Align); // Alignment
}
/// Returns an array view spanning every register of the SGPR_32 register
/// class, in the order the class enumerates them.
static ArrayRef<MCPhysReg> getAllSGPRs() {
  const auto &RC = AMDGPU::SGPR_32RegClass;
  return makeArrayRef(RC.begin(), RC.getNumRegs());
}
SDValue SITargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
@ -619,39 +624,30 @@ SDValue SITargetLowering::LowerFormalArguments(
CCInfo.AllocateReg(AMDGPU::VGPR1);
}
// The pointer to the list of arguments is stored in SGPR0, SGPR1
// The pointer to the scratch buffer is stored in SGPR2, SGPR3
if (Info->getShaderType() == ShaderType::COMPUTE) {
if (Subtarget->isAmdHsaOS())
Info->NumUserSGPRs += 4; // FIXME: Need to support scratch buffers.
else
Info->NumUserSGPRs += 4;
unsigned InputPtrReg =
TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
unsigned InputPtrRegLo =
TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 0);
unsigned InputPtrRegHi =
TRI->getPhysRegSubReg(InputPtrReg, &AMDGPU::SReg_32RegClass, 1);
CCInfo.AllocateReg(InputPtrRegLo);
CCInfo.AllocateReg(InputPtrRegHi);
MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
if (MFI->hasDispatchPtr()) {
unsigned DispatchPtrReg
= TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR);
MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
}
}
if (Info->getShaderType() == ShaderType::COMPUTE) {
getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
Splits);
}
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
if (Info->hasPrivateSegmentBuffer()) {
unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
CCInfo.AllocateReg(PrivateSegmentBufferReg);
}
if (Info->hasDispatchPtr()) {
unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
CCInfo.AllocateReg(DispatchPtrReg);
}
if (Info->hasKernargSegmentPtr()) {
unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
CCInfo.AllocateReg(InputPtrReg);
}
AnalyzeFormalArguments(CCInfo, Splits);
SmallVector<SDValue, 16> Chains;
@ -739,14 +735,114 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Val);
}
if (Info->getShaderType() != ShaderType::COMPUTE) {
unsigned ScratchIdx = CCInfo.getFirstUnallocated(makeArrayRef(
AMDGPU::SGPR_32RegClass.begin(), AMDGPU::SGPR_32RegClass.getNumRegs()));
Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
// TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
// these from the dispatch pointer.
// Start adding system SGPRs.
if (Info->hasWorkGroupIDX()) {
unsigned Reg = Info->addWorkGroupIDX();
MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
CCInfo.AllocateReg(Reg);
} else
llvm_unreachable("work group id x is always enabled");
if (Info->hasWorkGroupIDY()) {
unsigned Reg = Info->addWorkGroupIDY();
MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (MF.getFrameInfo()->hasStackObjects() || ST.isVGPRSpillingEnabled(Info))
Info->setScratchRSrcReg(TRI);
if (Info->hasWorkGroupIDZ()) {
unsigned Reg = Info->addWorkGroupIDZ();
MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info->hasWorkGroupInfo()) {
unsigned Reg = Info->addWorkGroupInfo();
MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info->hasPrivateSegmentWaveByteOffset()) {
// Scratch wave offset passed in system SGPR.
unsigned PrivateSegmentWaveByteOffsetReg
= Info->addPrivateSegmentWaveByteOffset();
MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
}
// Now that we've figured out where the scratch register inputs are, see if
// we should reserve the arguments and use them directly.
bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
if (ST.isAmdHsaOS()) {
// TODO: Assume we will spill without optimizations.
if (HasStackObjects) {
// If we have stack objects, we unquestionably need the private buffer
// resource. For the HSA ABI, this will be the first 4 user SGPR
// inputs. We can reserve those and use them directly.
unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
Info->setScratchRSrcReg(PrivateSegmentBufferReg);
unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
} else {
unsigned ReservedBufferReg
= TRI->reservedPrivateSegmentBufferReg(MF);
unsigned ReservedOffsetReg
= TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
// We tentatively reserve the last registers (skipping the last two
// which may contain VCC). After register allocation, we'll replace
// these with the ones immediately after those which were really
// allocated. In the prologue copies will be inserted from the argument
// to these reserved registers.
Info->setScratchRSrcReg(ReservedBufferReg);
Info->setScratchWaveOffsetReg(ReservedOffsetReg);
}
} else {
unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);
// Without HSA, relocations are used for the scratch pointer and the
// buffer resource setup is always inserted in the prologue. Scratch wave
// offset is still in an input SGPR.
Info->setScratchRSrcReg(ReservedBufferReg);
if (HasStackObjects) {
unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
} else {
unsigned ReservedOffsetReg
= TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
Info->setScratchWaveOffsetReg(ReservedOffsetReg);
}
}
if (Info->hasWorkItemIDX()) {
unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
CCInfo.AllocateReg(Reg);
} else
llvm_unreachable("workitem id x should always be enabled");
if (Info->hasWorkItemIDY()) {
unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info->hasWorkItemIDZ()) {
unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Chains.empty())
return Chain;

View File

@ -551,16 +551,13 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
unsigned ScratchOffsetPreloadReg = RI.getPreloadedValue(
*MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
MFI->setHasSpilledVGPRs();
BuildMI(MBB, MI, DL, get(Opcode))
.addReg(SrcReg) // src
.addFrameIndex(FrameIndex) // frame_idx
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(ScratchOffsetPreloadReg) // scratch_offset
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
.addMemOperand(MMO);
}
@ -638,14 +635,11 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
unsigned ScratchOffsetPreloadReg = RI.getPreloadedValue(
*MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addFrameIndex(FrameIndex) // frame_idx
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(ScratchOffsetPreloadReg) // scratch_offset
.addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
.addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
.addMemOperand(MMO);
}

View File

@ -30,15 +30,33 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
TIDReg(AMDGPU::NoRegister),
ScratchRSrcReg(AMDGPU::NoRegister),
ScratchWaveOffsetReg(AMDGPU::NoRegister),
PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister),
DispatchPtrUserSGPR(AMDGPU::NoRegister),
QueuePtrUserSGPR(AMDGPU::NoRegister),
KernargSegmentPtrUserSGPR(AMDGPU::NoRegister),
DispatchIDUserSGPR(AMDGPU::NoRegister),
FlatScratchInitUserSGPR(AMDGPU::NoRegister),
PrivateSegmentSizeUserSGPR(AMDGPU::NoRegister),
GridWorkGroupCountXUserSGPR(AMDGPU::NoRegister),
GridWorkGroupCountYUserSGPR(AMDGPU::NoRegister),
GridWorkGroupCountZUserSGPR(AMDGPU::NoRegister),
WorkGroupIDXSystemSGPR(AMDGPU::NoRegister),
WorkGroupIDYSystemSGPR(AMDGPU::NoRegister),
WorkGroupIDZSystemSGPR(AMDGPU::NoRegister),
WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
LDSWaveSpillSize(0),
PSInputAddr(0),
NumUserSGPRs(0),
NumSystemSGPRs(0),
HasSpilledSGPRs(false),
HasSpilledVGPRs(false),
PrivateSegmentBuffer(false),
DispatchPtr(false),
QueuePtr(false),
DispatchID(false),
KernargSegmentPtr(true),
KernargSegmentPtr(false),
FlatScratchInit(false),
GridWorkgroupCountX(false),
GridWorkgroupCountY(false),
@ -47,13 +65,17 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkGroupIDY(false),
WorkGroupIDZ(false),
WorkGroupInfo(false),
PrivateSegmentWaveByteOffset(false),
WorkItemIDX(true),
WorkItemIDY(false),
WorkItemIDZ(false) {
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
const Function *F = MF.getFunction();
if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
DispatchPtr = true;
const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
if (getShaderType() == ShaderType::COMPUTE)
KernargSegmentPtr = true;
if (F->hasFnAttribute("amdgpu-work-group-id-y"))
WorkGroupIDY = true;
@ -66,14 +88,54 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (F->hasFnAttribute("amdgpu-work-item-id-z"))
WorkItemIDZ = true;
bool MaySpill = ST.isVGPRSpillingEnabled(this);
bool HasStackObjects = FrameInfo->hasStackObjects();
if (HasStackObjects || MaySpill)
PrivateSegmentWaveByteOffset = true;
if (ST.isAmdHsaOS()) {
if (HasStackObjects || MaySpill)
PrivateSegmentBuffer = true;
if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
DispatchPtr = true;
}
// X, XY, and XYZ are the only supported combinations, so make sure Y is
// enabled if Z is.
if (WorkItemIDZ)
WorkItemIDY = true;
}
void SIMachineFunctionInfo::setScratchRSrcReg(const SIRegisterInfo *TRI) {
// We need to round up to next multiple of 4.
unsigned NextSReg128 = RoundUpToAlignment(NumUserSGPRs + 5, 4);
unsigned RegSub0 = AMDGPU::SReg_32RegClass.getRegister(NextSReg128);
ScratchRSrcReg = TRI->getMatchingSuperReg(RegSub0, AMDGPU::sub0,
&AMDGPU::SReg_128RegClass);
/// Records the private segment buffer input register and returns it.
///
/// The register is the SReg_128 super-register whose sub0 is the next free
/// user SGPR; the user SGPR count is then advanced by 4.
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
    const SIRegisterInfo &TRI) {
  const unsigned FirstSGPR = getNextUserSGPR();
  PrivateSegmentBufferUserSGPR = TRI.getMatchingSuperReg(
      FirstSGPR, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
  NumUserSGPRs += 4;
  return PrivateSegmentBufferUserSGPR;
}
/// Records the dispatch pointer input register and returns it.
///
/// The register is the SReg_64 super-register whose sub0 is the next free
/// user SGPR; the user SGPR count is then advanced by 2.
unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
  const unsigned FirstSGPR = getNextUserSGPR();
  DispatchPtrUserSGPR = TRI.getMatchingSuperReg(FirstSGPR, AMDGPU::sub0,
                                                &AMDGPU::SReg_64RegClass);
  NumUserSGPRs += 2;
  return DispatchPtrUserSGPR;
}
/// Records the queue pointer input register and returns it.
///
/// The register is the SReg_64 super-register whose sub0 is the next free
/// user SGPR; the user SGPR count is then advanced by 2.
unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
  const unsigned FirstSGPR = getNextUserSGPR();
  QueuePtrUserSGPR = TRI.getMatchingSuperReg(FirstSGPR, AMDGPU::sub0,
                                             &AMDGPU::SReg_64RegClass);
  NumUserSGPRs += 2;
  return QueuePtrUserSGPR;
}
/// Records the kernarg segment pointer input register and returns it.
///
/// The register is the SReg_64 super-register whose sub0 is the next free
/// user SGPR; the user SGPR count is then advanced by 2.
unsigned SIMachineFunctionInfo::addKernargSegmentPtr(
    const SIRegisterInfo &TRI) {
  const unsigned FirstSGPR = getNextUserSGPR();
  KernargSegmentPtrUserSGPR = TRI.getMatchingSuperReg(
      FirstSGPR, AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
  NumUserSGPRs += 2;
  return KernargSegmentPtrUserSGPR;
}
SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(

View File

@ -26,10 +26,36 @@ class MachineRegisterInfo;
/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
/// tells the hardware which interpolation parameters to load.
class SIMachineFunctionInfo : public AMDGPUMachineFunction {
// FIXME: This should be removed and getPreloadedValue moved here.
friend struct SIRegisterInfo;
void anchor() override;
unsigned TIDReg;
// Registers that may be reserved for spilling purposes. These may be the same
// as the input registers.
unsigned ScratchRSrcReg;
unsigned ScratchWaveOffsetReg;
// Input registers setup for the HSA ABI.
// User SGPRs in allocation order.
unsigned PrivateSegmentBufferUserSGPR;
unsigned DispatchPtrUserSGPR;
unsigned QueuePtrUserSGPR;
unsigned KernargSegmentPtrUserSGPR;
unsigned DispatchIDUserSGPR;
unsigned FlatScratchInitUserSGPR;
unsigned PrivateSegmentSizeUserSGPR;
unsigned GridWorkGroupCountXUserSGPR;
unsigned GridWorkGroupCountYUserSGPR;
unsigned GridWorkGroupCountZUserSGPR;
// System SGPRs in allocation order.
unsigned WorkGroupIDXSystemSGPR;
unsigned WorkGroupIDYSystemSGPR;
unsigned WorkGroupIDZSystemSGPR;
unsigned WorkGroupInfoSystemSGPR;
unsigned PrivateSegmentWaveByteOffsetSystemSGPR;
public:
// FIXME: Make private
@ -38,12 +64,14 @@ public:
std::map<unsigned, unsigned> LaneVGPRs;
unsigned ScratchOffsetReg;
unsigned NumUserSGPRs;
unsigned NumSystemSGPRs;
private:
bool HasSpilledSGPRs;
bool HasSpilledVGPRs;
// Feature bits required for inputs passed in user / system SGPRs.
// Feature bits required for inputs passed in user SGPRs.
bool PrivateSegmentBuffer : 1;
bool DispatchPtr : 1;
bool QueuePtr : 1;
bool DispatchID : 1;
@ -53,15 +81,27 @@ private:
bool GridWorkgroupCountY : 1;
bool GridWorkgroupCountZ : 1;
// Feature bits required for inputs passed in system SGPRs.
bool WorkGroupIDX : 1; // Always initialized.
bool WorkGroupIDY : 1;
bool WorkGroupIDZ : 1;
bool WorkGroupInfo : 1;
bool PrivateSegmentWaveByteOffset : 1;
bool WorkItemIDX : 1; // Always initialized.
bool WorkItemIDY : 1;
bool WorkItemIDZ : 1;
/// Returns the SGPR that the next user input would occupy: SGPR0 offset by
/// the number of user SGPRs added so far. Asserts that no system SGPRs have
/// been added yet, since system SGPRs are numbered after the user SGPRs.
MCPhysReg getNextUserSGPR() const {
assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
return AMDGPU::SGPR0 + NumUserSGPRs;
}
/// Returns the SGPR that the next system input would occupy: SGPR0 offset by
/// the combined count of user and system SGPRs added so far.
MCPhysReg getNextSystemSGPR() const {
return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}
public:
struct SpilledReg {
unsigned VGPR;
@ -80,6 +120,47 @@ public:
unsigned getTIDReg() const { return TIDReg; };
void setTIDReg(unsigned Reg) { TIDReg = Reg; }
// Add user SGPRs.
unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
unsigned addDispatchPtr(const SIRegisterInfo &TRI);
unsigned addQueuePtr(const SIRegisterInfo &TRI);
unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
// Add system SGPRs.
/// Assigns the next system SGPR to the work group ID X input, bumps the
/// system SGPR count, and returns the assigned register.
unsigned addWorkGroupIDX() {
  const unsigned Reg = getNextSystemSGPR();
  WorkGroupIDXSystemSGPR = Reg;
  ++NumSystemSGPRs;
  return Reg;
}
/// Assigns the next system SGPR to the work group ID Y input, bumps the
/// system SGPR count, and returns the assigned register.
unsigned addWorkGroupIDY() {
  const unsigned Reg = getNextSystemSGPR();
  WorkGroupIDYSystemSGPR = Reg;
  ++NumSystemSGPRs;
  return Reg;
}
/// Assigns the next system SGPR to the work group ID Z input, bumps the
/// system SGPR count, and returns the assigned register.
unsigned addWorkGroupIDZ() {
  const unsigned Reg = getNextSystemSGPR();
  WorkGroupIDZSystemSGPR = Reg;
  ++NumSystemSGPRs;
  return Reg;
}
/// Assigns the next system SGPR to the work group info input, bumps the
/// system SGPR count, and returns the assigned register.
unsigned addWorkGroupInfo() {
  const unsigned Reg = getNextSystemSGPR();
  WorkGroupInfoSystemSGPR = Reg;
  ++NumSystemSGPRs;
  return Reg;
}
/// Assigns the next system SGPR to the private segment wave byte offset
/// input, bumps the system SGPR count, and returns the assigned register.
unsigned addPrivateSegmentWaveByteOffset() {
  const unsigned Reg = getNextSystemSGPR();
  PrivateSegmentWaveByteOffsetSystemSGPR = Reg;
  ++NumSystemSGPRs;
  return Reg;
}
/// Returns whether the private segment buffer input is enabled for this
/// function (the PrivateSegmentBuffer feature bit).
bool hasPrivateSegmentBuffer() const {
return PrivateSegmentBuffer;
}
bool hasDispatchPtr() const {
return DispatchPtr;
}
@ -128,6 +209,10 @@ public:
return WorkGroupInfo;
}
/// Returns whether the private segment wave byte offset input is enabled for
/// this function (the PrivateSegmentWaveByteOffset feature bit).
bool hasPrivateSegmentWaveByteOffset() const {
return PrivateSegmentWaveByteOffset;
}
bool hasWorkItemIDX() const {
return WorkItemIDX;
}
@ -140,13 +225,37 @@ public:
return WorkItemIDZ;
}
/// Returns the number of user SGPRs added so far.
unsigned getNumUserSGPRs() const {
return NumUserSGPRs;
}
/// Returns the combined number of user and system SGPRs added so far.
unsigned getNumPreloadedSGPRs() const {
return NumUserSGPRs + NumSystemSGPRs;
}
/// Returns the system SGPR recorded for the private segment wave byte offset
/// input; this is AMDGPU::NoRegister until addPrivateSegmentWaveByteOffset()
/// has been called.
unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const {
return PrivateSegmentWaveByteOffsetSystemSGPR;
}
/// \brief Returns the physical register reserved for use as the resource
/// descriptor for scratch accesses.
unsigned getScratchRSrcReg() const {
return ScratchRSrcReg;
}
void setScratchRSrcReg(const SIRegisterInfo *TRI);
/// Sets the physical register used as the resource descriptor for scratch
/// accesses. \p Reg must not be AMDGPU::NoRegister (checked by assert).
void setScratchRSrcReg(unsigned Reg) {
assert(Reg != AMDGPU::NoRegister && "Should never be unset");
ScratchRSrcReg = Reg;
}
/// Returns the register currently recorded for the scratch wave offset.
unsigned getScratchWaveOffsetReg() const {
return ScratchWaveOffsetReg;
}
/// Sets the register used for the scratch wave offset. \p Reg must not be
/// AMDGPU::NoRegister (checked by assert).
void setScratchWaveOffsetReg(unsigned Reg) {
assert(Reg != AMDGPU::NoRegister && "Should never be unset");
ScratchWaveOffsetReg = Reg;
}
bool hasSpilledSGPRs() const {
return HasSpilledSGPRs;

View File

@ -32,6 +32,40 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co
Reserved.set(*R);
}
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
if (ST.hasSGPRInitBug()) {
unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 4;
unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
}
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
// 98/99 need to be reserved for flat_scr, and 100/101 for vcc. This is the
// next sgpr128 down.
return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
}
return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99;
}
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const {
const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
if (ST.hasSGPRInitBug()) {
unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 4 - 5;
return AMDGPU::SGPR_32RegClass.getRegister(Idx);
}
if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
// Next register before reservations for flat_scr and vcc.
return AMDGPU::SGPR97;
}
return AMDGPU::SGPR95;
}
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
@ -69,19 +103,20 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
// Reserve 1 SGPR for scratch wave offset in case we need to spill.
reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
}
unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
if (ScratchRSrcReg != AMDGPU::NoRegister) {
unsigned ScratchOffsetPreloadReg
= getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
// We will need to use this user SGPR argument for spilling, and thus never
// want it to be spilled.
reserveRegisterTuples(Reserved, ScratchOffsetPreloadReg);
// Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
// to spill.
// TODO: May need to reserve a VGPR if doing LDS spilling.
reserveRegisterTuples(Reserved, ScratchRSrcReg);
assert(!isSubRegister(ScratchRSrcReg, ScratchOffsetPreloadReg));
assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
}
return Reserved;
@ -204,11 +239,10 @@ void SIRegisterInfo::buildScratchLoadStore(MachineBasicBlock::iterator MI,
unsigned SubReg = NumSubRegs > 1 ?
getPhysRegSubReg(Value, &AMDGPU::VGPR_32RegClass, i) :
Value;
bool IsKill = (i == e - 1);
BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
.addReg(SubReg, getDefRegState(IsLoad))
.addReg(ScratchRsrcReg, getKillRegState(IsKill))
.addReg(ScratchRsrcReg)
.addReg(SOffset)
.addImm(Offset)
.addImm(0) // glc
@ -526,6 +560,9 @@ bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
return OpType == AMDGPU::OPERAND_REG_INLINE_C;
}
// FIXME: Most of these are flexible with HSA and we don't need to reserve them
// as input registers if unused. Whether the dispatch ptr is necessary should be
// easy to detect from used intrinsics. Scratch setup is harder to know.
unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
enum PreloadedValue Value) const {
@ -533,29 +570,36 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
switch (Value) {
case SIRegisterInfo::WORKGROUP_ID_X:
return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 0);
assert(MFI->hasWorkGroupIDX());
return MFI->WorkGroupIDXSystemSGPR;
case SIRegisterInfo::WORKGROUP_ID_Y:
return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 1);
assert(MFI->hasWorkGroupIDY());
return MFI->WorkGroupIDYSystemSGPR;
case SIRegisterInfo::WORKGROUP_ID_Z:
return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 2);
assert(MFI->hasWorkGroupIDZ());
return MFI->WorkGroupIDZSystemSGPR;
case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
if (MFI->getShaderType() != ShaderType::COMPUTE)
return MFI->ScratchOffsetReg;
return AMDGPU::SReg_32RegClass.getRegister(MFI->NumUserSGPRs + 4);
return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
llvm_unreachable("currently unused");
assert(ST.isAmdHsaOS() && "Non-HSA ABI currently uses relocations");
assert(MFI->hasPrivateSegmentBuffer());
return MFI->PrivateSegmentBufferUserSGPR;
case SIRegisterInfo::KERNARG_SEGMENT_PTR:
return ST.isAmdHsaOS() ? AMDGPU::SGPR2_SGPR3 : AMDGPU::SGPR0_SGPR1;
assert(MFI->hasKernargSegmentPtr());
return MFI->KernargSegmentPtrUserSGPR;
case SIRegisterInfo::DISPATCH_PTR:
assert(MFI->hasDispatchPtr());
return AMDGPU::SGPR0_SGPR1;
return MFI->DispatchPtrUserSGPR;
case SIRegisterInfo::QUEUE_PTR:
llvm_unreachable("not implemented");
case SIRegisterInfo::WORKITEM_ID_X:
assert(MFI->hasWorkItemIDX());
return AMDGPU::VGPR0;
case SIRegisterInfo::WORKITEM_ID_Y:
assert(MFI->hasWorkItemIDY());
return AMDGPU::VGPR1;
case SIRegisterInfo::WORKITEM_ID_Z:
assert(MFI->hasWorkItemIDZ());
return AMDGPU::VGPR2;
}
llvm_unreachable("unexpected preloaded value type");

View File

@ -29,6 +29,15 @@ private:
public:
SIRegisterInfo();
/// Return the end register initially reserved for the scratch buffer in case
/// spilling is needed.
unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
/// Return the end register initially reserved for the scratch wave offset in
/// case spilling is needed.
unsigned reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const;
BitVector getReservedRegs(const MachineFunction &MF) const override;
unsigned getRegPressureSetLimit(const MachineFunction &MF,

View File

@ -38,8 +38,10 @@
; HSA: .amdgpu_hsa_kernel simple
; HSA: {{^}}simple:
; HSA: .amd_kernel_code_t
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_kernarg_segment_ptr = 1
; HSA: .end_amd_kernel_code_t
; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x0
; HSA: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
; Make sure we are setting the ATC bit:
; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000

View File

@ -1,31 +1,46 @@
; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
; XUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
; XUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
; FIXME: align on alloca seems to be ignored for private_segment_alignment
; ALL-LABEL: {{^}}large_alloca_compute_shader:
; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN: s_mov_b32 s14, -1
; CI: s_mov_b32 s15, 0x80f000
; VI: s_mov_b32 s15, 0x800000
; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GCN: s_mov_b32 s10, -1
; CI: s_mov_b32 s11, 0x80f000
; VI: s_mov_b32 s11, 0x800000
; GCNHSA: .amd_kernel_code_t
; GCNHSA: compute_pgm_rsrc2_scratch_en = 1
; GCNHSA: compute_pgm_rsrc2_user_sgpr = 6
; GCNHSA: compute_pgm_rsrc2_tgid_x_en = 1
; GCNHSA: compute_pgm_rsrc2_tgid_y_en = 0
; GCNHSA: compute_pgm_rsrc2_tgid_z_en = 0
; GCNHSA: compute_pgm_rsrc2_tg_size_en = 0
; GCNHSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
; GCNHSA: enable_sgpr_private_segment_buffer = 1
; GCNHSA: enable_sgpr_dispatch_ptr = 0
; GCNHSA: enable_sgpr_queue_ptr = 0
; GCNHSA: enable_sgpr_kernarg_segment_ptr = 1
; GCNHSA: enable_sgpr_dispatch_id = 0
; GCNHSA: enable_sgpr_flat_scratch_init = 0
; GCNHSA: enable_sgpr_private_segment_size = 0
; GCNHSA: enable_sgpr_grid_workgroup_count_x = 0
; GCNHSA: enable_sgpr_grid_workgroup_count_y = 0
; GCNHSA: enable_sgpr_grid_workgroup_count_z = 0
; GCNHSA: workitem_private_segment_byte_size = 0
; GCNHSA: private_segment_alignment = 4
; GCNHSA: .end_amd_kernel_code_t
; GCNHSA: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GCNHSA: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GCNHSA: s_mov_b32 s10, -1
; CIHSA: s_mov_b32 s11, 0x180f000
; VIHSA: s_mov_b32 s11, 0x11800000
; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen
; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen
; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen
; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen
; Scratch size = alloca size + emergency stack slot
; ALL: ; ScratchSize: 32772

View File

@ -8,8 +8,8 @@
; CI: s_mov_b32 s11, 0x80f000
; VI: s_mov_b32 s11, 0x800000
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
; ALL: ; ScratchSize: 32772
define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 {
@ -29,8 +29,8 @@ define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 {
; CI: s_mov_b32 s11, 0x80f000
; VI: s_mov_b32 s11, 0x800000
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
; ALL: ; ScratchSize: 32772
define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 {

View File

@ -0,0 +1,37 @@
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}read_workdim:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[2].Z
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
; Kernel under test. Calls the llvm.AMDGPU.read.workdim intrinsic and stores
; the returned i32 unmodified through the addrspace(1) pointer %out.
define void @read_workdim(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.AMDGPU.read.workdim() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}read_workdim_known_bits:
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
; GCN-NOT: 0xff
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
; Kernel under test. The shl/lshr pair by 24 clears the upper 24 bits of the
; work dimension, which is the same as masking it with 0xff. The check lines
; preceding this function expect that no 0xff constant appears in the output
; before the value is moved to a VGPR and stored.
define void @read_workdim_known_bits(i32 addrspace(1)* %out) {
entry:
%dim = call i32 @llvm.AMDGPU.read.workdim() #0
%shl = shl i32 %dim, 24
%shr = lshr i32 %shl, 24
store i32 %shr, i32 addrspace(1)* %out
ret void
}
declare i32 @llvm.AMDGPU.read.workdim() #0
attributes #0 = { readnone }

View File

@ -2,7 +2,7 @@
; GCN-LABEL: {{^}}test:
; GCN: enable_sgpr_dispatch_ptr = 1
; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0
; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
define void @test(i32 addrspace(1)* %out) {
%dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
%header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*

View File

@ -1,8 +1,8 @@
; RUN: llc -O0 -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck %s
; CHECK-LABEL: {{^}}test_debug_value:
; CHECK: s_load_dwordx2
; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR0_SGPR1
; CHECK: s_load_dwordx2 s[4:5]
; CHECK: DEBUG_VALUE: test_debug_value:globalptr_arg <- %SGPR4_SGPR5
; CHECK: buffer_store_dword
; CHECK: s_endpgm
define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 {

View File

@ -0,0 +1,184 @@
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}local_size_x:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[1].Z
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
; CI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x1
; VI-HSA: s_load_dword [[XY:s[0-9]+]], s[4:5], 0x4
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
; Kernel under test. Calls llvm.r600.read.local.size.x and stores the returned
; i32 unmodified through the addrspace(1) pointer %out.
; NOTE(review) the CI-HSA and VI-HSA check lines just before this function use
; prefixes that none of the RUN lines visible at the top of this test enable,
; so they look inactive - confirm whether an amdhsa RUN line is missing.
define void @local_size_x(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.local.size.x() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}local_size_y:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[1].W
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
; Kernel under test. Calls llvm.r600.read.local.size.y and stores the returned
; i32 unmodified through the addrspace(1) pointer %out.
define void @local_size_y(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.local.size.y() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}local_size_z:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[2].X
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
; Kernel under test. Calls llvm.r600.read.local.size.z and stores the returned
; i32 unmodified through the addrspace(1) pointer %out.
define void @local_size_z(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.local.size.z() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}local_size_xy:
; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7
; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c
; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]]
; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VY]]
; GCN: buffer_store_dword [[VAL]]
; Kernel under test. Reads the x and y local sizes, multiplies them and stores
; the product through %out. The check lines preceding this function expect the
; multiply to be emitted as v_mul_u32_u24.
define void @local_size_xy(i32 addrspace(1)* %out) {
entry:
%x = call i32 @llvm.r600.read.local.size.x() #0
%y = call i32 @llvm.r600.read.local.size.y() #0
%val = mul i32 %x, %y
store i32 %val, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}local_size_xz:
; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
; HSA-DAG: s_and_b32 [[X:s[0-9]+]], [[XY]], 0xffff
; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]]
; GCN: buffer_store_dword [[VAL]]
; Kernel under test. Reads the x and z local sizes, multiplies them and stores
; the product through %out. The check lines preceding this function expect the
; multiply to be emitted as v_mul_u32_u24.
define void @local_size_xz(i32 addrspace(1)* %out) {
entry:
%x = call i32 @llvm.r600.read.local.size.x() #0
%z = call i32 @llvm.r600.read.local.size.z() #0
%val = mul i32 %x, %z
store i32 %val, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}local_size_yz:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 1
; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7
; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c
; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]]
; GCN: buffer_store_dword [[VAL]]
; Kernel under test. Reads the y and z local sizes, multiplies them and stores
; the product through %out. The check lines preceding this function expect the
; multiply to be emitted as v_mul_u32_u24.
define void @local_size_yz(i32 addrspace(1)* %out) {
entry:
%y = call i32 @llvm.r600.read.local.size.y() #0
%z = call i32 @llvm.r600.read.local.size.z() #0
%val = mul i32 %y, %z
store i32 %val, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}local_size_xyz:
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 1
; SI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x6
; SI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x7
; SI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x8
; VI-NOHSA-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x18
; VI-NOHSA-DAG: s_load_dword [[Y:s[0-9]+]], s[0:1], 0x1c
; VI-NOHSA-DAG: s_load_dword [[Z:s[0-9]+]], s[0:1], 0x20
; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]]
; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
; GCN: v_mad_u32_u24 [[VAL:v[0-9]+]], [[X]], [[VY]], [[VZ]]
; GCN: buffer_store_dword [[VAL]]
; Kernel under test. Reads all three local sizes and stores x * y + z through
; %out. The check lines preceding this function expect the multiply and add to
; be emitted together as v_mad_u32_u24.
define void @local_size_xyz(i32 addrspace(1)* %out) {
entry:
%x = call i32 @llvm.r600.read.local.size.x() #0
%y = call i32 @llvm.r600.read.local.size.y() #0
%z = call i32 @llvm.r600.read.local.size.z() #0
%xy = mul i32 %x, %y
%xyz = add i32 %xy, %z
store i32 %xyz, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}local_size_x_known_bits:
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
; GCN-NOT: 0xffff
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NEXT: buffer_store_dword [[VVAL]]
; Kernel under test. The shl/lshr pair by 16 clears the upper 16 bits of the
; x local size, which is the same as masking it with 0xffff. The check lines
; preceding this function expect that no 0xffff constant appears and that the
; store directly follows the move of the loaded value into a VGPR.
define void @local_size_x_known_bits(i32 addrspace(1)* %out) {
entry:
%size = call i32 @llvm.r600.read.local.size.x() #0
%shl = shl i32 %size, 16
%shr = lshr i32 %shl, 16
store i32 %shr, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}local_size_y_known_bits:
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
; GCN-NOT: 0xffff
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NEXT: buffer_store_dword [[VVAL]]
; Kernel under test. The shl/lshr pair by 16 clears the upper 16 bits of the
; y local size, which is the same as masking it with 0xffff. The check lines
; preceding this function expect that no 0xffff constant appears and that the
; store directly follows the move of the loaded value into a VGPR.
define void @local_size_y_known_bits(i32 addrspace(1)* %out) {
entry:
%size = call i32 @llvm.r600.read.local.size.y() #0
%shl = shl i32 %size, 16
%shr = lshr i32 %shl, 16
store i32 %shr, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}local_size_z_known_bits:
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
; GCN-NOT: 0xffff
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NEXT: buffer_store_dword [[VVAL]]
; Kernel under test. The shl/lshr pair by 16 clears the upper 16 bits of the
; z local size, which is the same as masking it with 0xffff. The check lines
; preceding this function expect that no 0xffff constant appears and that the
; store directly follows the move of the loaded value into a VGPR.
define void @local_size_z_known_bits(i32 addrspace(1)* %out) {
entry:
%size = call i32 @llvm.r600.read.local.size.z() #0
%shl = shl i32 %size, 16
%shr = lshr i32 %shl, 16
store i32 %shr, i32 addrspace(1)* %out
ret void
}
declare i32 @llvm.r600.read.local.size.x() #0
declare i32 @llvm.r600.read.local.size.y() #0
declare i32 @llvm.r600.read.local.size.z() #0
attributes #0 = { nounwind readnone }

View File

@ -10,7 +10,7 @@
; EG: .long 166120
; EG-NEXT: .long 8
; GCN: .long 47180
; GCN-NEXT: .long 38792
; GCN-NEXT: .long 32900
; EG: {{^}}local_memory_two_objects:

View File

@ -9,9 +9,9 @@
; EG: .long 166120
; EG-NEXT: .long 128
; SI: .long 47180
; SI-NEXT: .long 71560
; SI-NEXT: .long 65668
; CI: .long 47180
; CI-NEXT: .long 38792
; CI-NEXT: .long 32900
; FUNC-LABEL: {{^}}local_memory:

View File

@ -17,16 +17,18 @@ declare i32 @llvm.r600.read.tgid.z() #1
; GCN-LABEL: {{^}}spill_vgpr_compute:
; GCN: s_mov_b32 s16, s3
; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-NEXT: s_mov_b32 s14, -1
; SI-NEXT: s_mov_b32 s15, 0x80f000
; VI-NEXT: s_mov_b32 s15, 0x800000
; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s8 offset:{{[0-9]+}} ; 4-byte Folded Spill
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}}
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}}
; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
; GCN: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s[12:15], s16 offen offset:{{[0-9]+}}
; GCN: NumVgprs: 256
; GCN: ScratchSize: 1024

View File

@ -11,14 +11,14 @@
; GCN-LABEL: {{^}}main:
; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GCN-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GCN-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s11, 0x80f000
; VI-NEXT: s_mov_b32 s11, 0x800000
; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-NEXT: s_mov_b32 s14, -1
; SI-NEXT: s_mov_b32 s15, 0x80f000
; VI-NEXT: s_mov_b32 s15, 0x800000
; s11 is offset user SGPR
; GCN: buffer_store_dword {{v[0-9]+}}, s[8:11], s12 offset:{{[0-9]+}} ; 4-byte Folded Spill
; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill
; GCN: NumVgprs: 256
; GCN: ScratchSize: 1024

View File

@ -1,5 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=HSA -check-prefix=CI-HSA -check-prefix=FUNC %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=HSA -check-prefix=VI-HSA -check-prefix=FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
@ -7,9 +9,26 @@
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[0].X
; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
; HSA: .amd_kernel_code_t
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0
; HSA: enable_sgpr_kernarg_segment_ptr = 1
; HSA: enable_sgpr_dispatch_id = 0
; HSA: enable_sgpr_flat_scratch_init = 0
; HSA: enable_sgpr_private_segment_size = 0
; HSA: enable_sgpr_grid_workgroup_count_x = 0
; HSA: enable_sgpr_grid_workgroup_count_y = 0
; HSA: enable_sgpr_grid_workgroup_count_z = 0
; HSA: .end_amd_kernel_code_t
; GCN-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
define void @ngroups_x (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.ngroups.x() #0
@ -21,10 +40,10 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[0].Y
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
define void @ngroups_y (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.ngroups.y() #0
@ -36,10 +55,10 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[0].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
define void @ngroups_z (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.ngroups.z() #0
@ -51,10 +70,10 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[0].W
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x3
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xc
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
define void @global_size_x (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.global.size.x() #0
@ -66,10 +85,10 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[1].X
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x4
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x10
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
define void @global_size_y (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.global.size.y() #0
@ -81,10 +100,10 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[1].Y
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
; SI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x5
; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x14
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NOHSA: buffer_store_dword [[VVAL]]
define void @global_size_z (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.global.size.z() #0
@ -92,74 +111,33 @@ entry:
ret void
}
; FUNC-LABEL: {{^}}local_size_x:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[1].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define void @local_size_x (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.local.size.x() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}local_size_y:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[1].W
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define void @local_size_y (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.local.size.y() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}local_size_z:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[2].X
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define void @local_size_z (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.local.size.z() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}get_work_dim:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV [[VAL]], KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define void @get_work_dim (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.AMDGPU.read.workdim() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
; The tgid values are stored in sgprs offset by the number of user sgprs.
; Currently we always use exactly 2 user sgprs for the pointer to the
; kernel arguments, but this may change in the future.
; The tgid values are stored in sgprs offset by the number of user
; sgprs.
; FUNC-LABEL: {{^}}tgid_x:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s4
; HSA: .amd_kernel_code_t
; HSA: compute_pgm_rsrc2_user_sgpr = 6
; HSA: compute_pgm_rsrc2_tgid_x_en = 1
; HSA: compute_pgm_rsrc2_tgid_y_en = 0
; HSA: compute_pgm_rsrc2_tgid_z_en = 0
; HSA: compute_pgm_rsrc2_tg_size_en = 0
; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
; HSA: enable_sgpr_grid_workgroup_count_x = 0
; HSA: enable_sgpr_grid_workgroup_count_y = 0
; HSA: enable_sgpr_grid_workgroup_count_z = 0
; HSA: .end_amd_kernel_code_t
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s2{{$}}
; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6{{$}}
; GCN: buffer_store_dword [[VVAL]]
define void @tgid_x (i32 addrspace(1)* %out) {
; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
define void @tgid_x(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.x() #0
store i32 %0, i32 addrspace(1)* %out
@ -167,9 +145,25 @@ entry:
}
; FUNC-LABEL: {{^}}tgid_y:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s5
; HSA: compute_pgm_rsrc2_user_sgpr = 6
; HSA: compute_pgm_rsrc2_tgid_x_en = 1
; HSA: compute_pgm_rsrc2_tgid_y_en = 1
; HSA: compute_pgm_rsrc2_tgid_z_en = 0
; HSA: compute_pgm_rsrc2_tg_size_en = 0
; HSA: enable_sgpr_grid_workgroup_count_x = 0
; HSA: enable_sgpr_grid_workgroup_count_y = 0
; HSA: enable_sgpr_grid_workgroup_count_z = 0
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3
; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7
; GCN: buffer_store_dword [[VVAL]]
define void @tgid_y (i32 addrspace(1)* %out) {
; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
define void @tgid_y(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.y() #0
store i32 %0, i32 addrspace(1)* %out
@ -177,102 +171,83 @@ entry:
}
; FUNC-LABEL: {{^}}tgid_z:
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], s6
; HSA: compute_pgm_rsrc2_user_sgpr = 6
; HSA: compute_pgm_rsrc2_tgid_x_en = 1
; HSA: compute_pgm_rsrc2_tgid_y_en = 0
; HSA: compute_pgm_rsrc2_tgid_z_en = 1
; HSA: compute_pgm_rsrc2_tg_size_en = 0
; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
; HSA: enable_sgpr_private_segment_buffer = 1
; HSA: enable_sgpr_dispatch_ptr = 0
; HSA: enable_sgpr_queue_ptr = 0
; HSA: enable_sgpr_kernarg_segment_ptr = 1
; HSA: enable_sgpr_dispatch_id = 0
; HSA: enable_sgpr_flat_scratch_init = 0
; HSA: enable_sgpr_private_segment_size = 0
; HSA: enable_sgpr_grid_workgroup_count_x = 0
; HSA: enable_sgpr_grid_workgroup_count_y = 0
; HSA: enable_sgpr_grid_workgroup_count_z = 0
; GCN-NOHSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s3{{$}}
; HSA: v_mov_b32_e32 [[VVAL:v[0-9]+]], s7{{$}}
; GCN: buffer_store_dword [[VVAL]]
define void @tgid_z (i32 addrspace(1)* %out) {
; HSA: COMPUTE_PGM_RSRC2:USER_SGPR: 6
; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
; GCN: COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
define void @tgid_z(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.z() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
; GCN-NOHSA: .section .AMDGPU.config
; GCN-NOHSA: .long 47180
; GCN-NOHSA-NEXT: .long 132{{$}}
; FUNC-LABEL: {{^}}tidig_x:
; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 0
; GCN: buffer_store_dword v0
define void @tidig_x (i32 addrspace(1)* %out) {
define void @tidig_x(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.x() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
; GCN-NOHSA: .section .AMDGPU.config
; GCN-NOHSA: .long 47180
; GCN-NOHSA-NEXT: .long 2180{{$}}
; FUNC-LABEL: {{^}}tidig_y:
; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 1
; GCN: buffer_store_dword v1
define void @tidig_y (i32 addrspace(1)* %out) {
define void @tidig_y(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.y() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
; GCN-NOHSA: .section .AMDGPU.config
; GCN-NOHSA: .long 47180
; GCN-NOHSA-NEXT: .long 4228{{$}}
; FUNC-LABEL: {{^}}tidig_z:
; HSA: compute_pgm_rsrc2_tidig_comp_cnt = 2
; GCN: buffer_store_dword v2
define void @tidig_z (i32 addrspace(1)* %out) {
define void @tidig_z(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.z() #0
store i32 %0, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}local_size_x_known_bits:
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x6
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x18
; GCN-NOT: 0xffff
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NEXT: buffer_store_dword [[VVAL]]
define void @local_size_x_known_bits(i32 addrspace(1)* %out) {
entry:
%size = call i32 @llvm.r600.read.local.size.x() #0
%shl = shl i32 %size, 16
%shr = lshr i32 %shl, 16
store i32 %shr, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}local_size_y_known_bits:
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x7
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
; GCN-NOT: 0xffff
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NEXT: buffer_store_dword [[VVAL]]
define void @local_size_y_known_bits(i32 addrspace(1)* %out) {
entry:
%size = call i32 @llvm.r600.read.local.size.y() #0
%shl = shl i32 %size, 16
%shr = lshr i32 %shl, 16
store i32 %shr, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}local_size_z_known_bits:
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x8
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
; GCN-NOT: 0xffff
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN-NEXT: buffer_store_dword [[VVAL]]
define void @local_size_z_known_bits(i32 addrspace(1)* %out) {
entry:
%size = call i32 @llvm.r600.read.local.size.z() #0
%shl = shl i32 %size, 16
%shr = lshr i32 %shl, 16
store i32 %shr, i32 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}get_work_dim_known_bits:
; SI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0xb
; VI: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x2c
; GCN-NOT: 0xff
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[VVAL]]
define void @get_work_dim_known_bits(i32 addrspace(1)* %out) {
entry:
%dim = call i32 @llvm.AMDGPU.read.workdim() #0
%shl = shl i32 %dim, 24
%shr = lshr i32 %shl, 24
store i32 %shr, i32 addrspace(1)* %out
ret void
}
declare i32 @llvm.r600.read.ngroups.x() #0
declare i32 @llvm.r600.read.ngroups.y() #0
declare i32 @llvm.r600.read.ngroups.z() #0
@ -281,10 +256,6 @@ declare i32 @llvm.r600.read.global.size.x() #0
declare i32 @llvm.r600.read.global.size.y() #0
declare i32 @llvm.r600.read.global.size.z() #0
declare i32 @llvm.r600.read.local.size.x() #0
declare i32 @llvm.r600.read.local.size.y() #0
declare i32 @llvm.r600.read.local.size.z() #0
declare i32 @llvm.r600.read.tgid.x() #0
declare i32 @llvm.r600.read.tgid.y() #0
declare i32 @llvm.r600.read.tgid.z() #0