AMDGPU: Activate all lanes when spilling CSR VGPR for SGPR spills

If some lanes weren't active on entry to the function, this could
clobber their VGPR values.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@361655 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Matt Arsenault 2019-05-24 18:18:51 +00:00
parent bbaa274fa9
commit 332260473f
8 changed files with 136 additions and 55 deletions

View File

@ -523,22 +523,20 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
// but we would then have to make sure that we were in fact saving at least one
// callee-save register in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock &MBB) {
MachineFunction *MF = MBB.getParent();
const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
static unsigned findScratchNonCalleeSaveRegister(MachineFunction &MF,
LivePhysRegs &LiveRegs,
const TargetRegisterClass &RC) {
const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo &TRI = *Subtarget.getRegisterInfo();
LivePhysRegs LiveRegs(TRI);
LiveRegs.addLiveIns(MBB);
// Mark callee saved registers as used so we will not choose them.
const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF);
const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
for (unsigned i = 0; CSRegs[i]; ++i)
LiveRegs.addReg(CSRegs[i]);
MachineRegisterInfo &MRI = MF->getRegInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
for (unsigned Reg : AMDGPU::SReg_32_XM0RegClass) {
for (unsigned Reg : RC) {
if (LiveRegs.available(MRI, Reg))
return Reg;
}
@ -561,6 +559,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
LivePhysRegs LiveRegs;
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc DL;
@ -578,7 +577,12 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
RoundedSize += Alignment;
unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(MBB);
LiveRegs.init(TRI);
LiveRegs.addLiveIns(MBB);
unsigned ScratchSPReg
= findScratchNonCalleeSaveRegister(MF, LiveRegs,
AMDGPU::SReg_32_XM0RegClass);
assert(ScratchSPReg != AMDGPU::NoRegister);
// s_add_u32 tmp_reg, s32, NumBytes
@ -609,13 +613,33 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
: FuncInfo->getSGPRSpillVGPRs()) {
if (!Reg.FI.hasValue())
continue;
TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
&TII->getRegisterInfo());
if (!FuncInfo->getSGPRSpillVGPRs().empty()) {
if (LiveRegs.empty()) {
LiveRegs.init(TRI);
LiveRegs.addLiveIns(MBB);
}
// To avoid clobbering VGPRs in lanes that weren't active on function entry,
// turn on all lanes before doing the spill to memory.
unsigned ScratchExecCopy
= findScratchNonCalleeSaveRegister(MF, LiveRegs,
AMDGPU::SReg_64_XEXECRegClass);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), ScratchExecCopy)
.addImm(-1);
for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
: FuncInfo->getSGPRSpillVGPRs()) {
if (!Reg.FI.hasValue())
continue;
TII->storeRegToStackSlot(MBB, MBBI, Reg.VGPR, true,
Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
&TII->getRegisterInfo());
}
// FIXME: Split block and make terminator.
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
.addReg(ScratchExecCopy);
}
}
@ -628,14 +652,32 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
DebugLoc DL;
for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
: FuncInfo->getSGPRSpillVGPRs()) {
if (!Reg.FI.hasValue())
continue;
TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
&TII->getRegisterInfo());
if (!FuncInfo->getSGPRSpillVGPRs().empty()) {
// See emitPrologue
LivePhysRegs LiveRegs(*ST.getRegisterInfo());
LiveRegs.addLiveIns(MBB);
unsigned ScratchExecCopy
= findScratchNonCalleeSaveRegister(MF, LiveRegs,
AMDGPU::SReg_64_XEXECRegClass);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), ScratchExecCopy)
.addImm(-1);
for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
: FuncInfo->getSGPRSpillVGPRs()) {
if (!Reg.FI.hasValue())
continue;
TII->loadRegFromStackSlot(MBB, MBBI, Reg.VGPR,
Reg.FI.getValue(), &AMDGPU::VGPR_32RegClass,
&TII->getRegisterInfo());
}
// FIXME: Split block and make terminator.
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
.addReg(ScratchExecCopy);
}
unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
@ -645,8 +687,6 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
const MachineFrameInfo &MFI = MF.getFrameInfo();
uint32_t NumBytes = MFI.getStackSize();
DebugLoc DL;
// FIXME: Clarify distinction between no set SP and SP. For callee functions,
// it's really whether we need SP to be accurate or not.

View File

@ -30,11 +30,11 @@ entry:
; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf:
; GCN: s_mov_b32 s5, s32
; GCN: s_add_u32 s32, s32, 0xc00{{$}}
; GCN-DAG: buffer_store_dword v32
; GCN-DAG: buffer_store_dword v33
; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32
; GCN-DAG: v_writelane_b32
; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}}
; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]]
; GCN-DAG: buffer_store_dword [[ADD0]], off, s[0:3], s5 offset:4{{$}}

View File

@ -38,8 +38,8 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_swappc_b64
; GCN-NEXT: s_mov_b32 s5, s33
; GCN: v_readlane_b32 s37, v32, 4
; GCN-DAG: s_mov_b32 s5, s33
; GCN-DAG: v_readlane_b32 s37, v32, 4
; GCN: v_readlane_b32 s36, v32, 3
; GCN: v_readlane_b32 s35, v32, 2
; GCN: v_readlane_b32 s34, v32, 1
@ -59,7 +59,7 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: s_mov_b32 s33, s5
; GCN-NEXT: s_swappc_b64
; GCN-NEXT: s_mov_b32 s5, s33
; GCN: s_mov_b32 s5, s33
define void @test_func_call_external_void_funcx2() #0 {
call void @external_void_func_void()
call void @external_void_func_void()
@ -175,7 +175,7 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_v32(i32 addrspace(
; GCN-NEXT: ; clobber
; GCN-NEXT: #ASMEND
; GCN-NEXT: v_readlane_b32 s33, v0, 0
; GCN-NEXT: s_setpc_b64
; GCN: s_setpc_b64
define hidden void @void_func_void_clobber_s33() #2 {
call void asm sideeffect "; clobber", "~{s33}"() #0
ret void

View File

@ -37,19 +37,19 @@ define void @callee_with_stack() #0 {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
; GCN: s_mov_b32 s5, s32
; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8
; GCN-DAG: v_writelane_b32 v32, s33,
; GCN-DAG: v_writelane_b32 v32, s34,
; GCN-DAG: v_writelane_b32 v32, s35,
; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}
; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
; GCN-DAG: buffer_store_dword v0, off, s[0:3], s5 offset:4{{$}}
; GCN-DAG: s_mov_b32 s33, s5
; GCN: s_swappc_b64
; GCN: s_mov_b32 s5, s33
; GCN-DAG: s_mov_b32 s5, s33
; GCN-DAG: v_readlane_b32 s35,
; GCN-DAG: v_readlane_b32 s34,
; GCN-DAG: v_readlane_b32 s33,
@ -72,7 +72,9 @@ define void @callee_with_stack_and_call() #0 {
; GCN-LABEL: {{^}}callee_no_stack_with_call:
; GCN: s_waitcnt
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN-DAG: v_writelane_b32 v32, s33, 0
; GCN-DAG: v_writelane_b32 v32, s34, 1
; GCN: s_mov_b32 s33, s5
@ -81,9 +83,12 @@ define void @callee_with_stack_and_call() #0 {
; GCN-DAG: v_readlane_b32 s34, v32, 1
; GCN-DAG: v_readlane_b32 s33, v32, 0
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
; GCN: s_sub_u32 s32, s32, 0x400
; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN: s_sub_u32 s32, s32, 0x400
; GCN: s_setpc_b64
define void @callee_no_stack_with_call() #0 {
call void @external_void_func_void()
@ -94,11 +99,18 @@ declare void @external_void_func_void() #0
; Make sure if a CSR vgpr is used for SGPR spilling, it is saved and restored
; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls:
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN: v_writelane_b32 v32
; GCN: ;;#ASMSTART
; GCN: v_readlane_b32 s{{[0-9]+}}, v32
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {

View File

@ -326,8 +326,8 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; Requires loading and storing to stack slot.
; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x:
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
; GCN: s_add_u32 s32, s32, 0x400{{$}}
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4{{$}}

View File

@ -28,10 +28,12 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s5, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: v_writelane_b32 v32, s33, 0
; GCN-NEXT: v_writelane_b32 v32, s34, 1
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: v_writelane_b32 v32, s35, 2
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, func_v2f32@rel32@lo+4
@ -39,12 +41,14 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
; GCN-NEXT: s_mov_b32 s33, s5
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
; GCN-NEXT: v_readlane_b32 s35, v32, 2
; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: v_readlane_b32 s34, v32, 1
; GCN-NEXT: v_readlane_b32 s33, v32, 0
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@ -62,10 +66,12 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s5, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: v_writelane_b32 v32, s33, 0
; GCN-NEXT: v_writelane_b32 v32, s34, 1
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: v_writelane_b32 v32, s35, 2
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, func_v3f32@rel32@lo+4
@ -73,12 +79,14 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
; GCN-NEXT: s_mov_b32 s33, s5
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
; GCN-NEXT: v_readlane_b32 s35, v32, 2
; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: v_readlane_b32 s34, v32, 1
; GCN-NEXT: v_readlane_b32 s33, v32, 0
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@ -96,10 +104,12 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s5, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: v_writelane_b32 v32, s33, 0
; GCN-NEXT: v_writelane_b32 v32, s34, 1
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: v_writelane_b32 v32, s35, 2
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, func_v4f16@rel32@lo+4
@ -107,12 +117,14 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
; GCN-NEXT: s_mov_b32 s33, s5
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
; GCN-NEXT: v_readlane_b32 s35, v32, 2
; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: v_readlane_b32 s34, v32, 1
; GCN-NEXT: v_readlane_b32 s33, v32, 0
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@ -130,10 +142,12 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_mov_b32 s5, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: v_writelane_b32 v32, s33, 0
; GCN-NEXT: v_writelane_b32 v32, s34, 1
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: v_writelane_b32 v32, s35, 2
; GCN-NEXT: s_getpc_b64 s[6:7]
; GCN-NEXT: s_add_u32 s6, s6, func_struct@rel32@lo+4
@ -141,13 +155,15 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN-NEXT: s_mov_b64 s[34:35], s[30:31]
; GCN-NEXT: s_mov_b32 s33, s5
; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7]
; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: s_mov_b64 s[30:31], s[34:35]
; GCN-NEXT: v_readlane_b32 s35, v32, 2
; GCN-NEXT: s_mov_b32 s5, s33
; GCN-NEXT: v_readlane_b32 s34, v32, 1
; GCN-NEXT: v_readlane_b32 s33, v32, 0
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: v_mov_b32_e32 v1, v4
; GCN-NEXT: v_readlane_b32 s33, v32, 0
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]

View File

@ -10,9 +10,12 @@ declare void @external_void_func_i32(i32) #0
; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm:
; GCN: s_waitcnt
; GCN: s_mov_b32 s5, s32
; Spill CSR VGPR used for SGPR spilling
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4
; GCN-DAG: s_add_u32 s32, s32, 0x400
; Spill CSR VGPR used for SGPR spilling
; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s5 offset:4
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN-DAG: v_writelane_b32 v32, s33, 0
; GCN-DAG: v_writelane_b32 v32, s34, 1
; GCN-DAG: v_writelane_b32 v32, s35, 2
@ -22,7 +25,10 @@ declare void @external_void_func_i32(i32) #0
; GCN: v_readlane_b32 s35, v32, 2
; GCN: v_readlane_b32 s34, v32, 1
; GCN: v_readlane_b32 s33, v32, 0
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s5 offset:4
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN: s_sub_u32 s32, s32, 0x400
; GCN: s_setpc_b64
define void @test_func_call_external_void_func_i32_imm() #0 {

View File

@ -207,13 +207,17 @@ entry:
; Have another non-tail in the function
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v34, off, s[0:3], s5 offset:12
; GCN: s_add_u32 s32, s32, 0x400
; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s5 offset:12
; GCN-NEXT: s_mov_b64 exec
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill
; GCN-DAG: v_writelane_b32 v34, s33, 0
; GCN-DAG: v_writelane_b32 v34, s34, 1
; GCN-DAG: v_writelane_b32 v34, s35, 2
; GCN-DAG: s_add_u32 s32, s32, 0x400
; GCN-DAG: s_getpc_b64
; GCN: s_swappc_b64
@ -228,7 +232,10 @@ entry:
; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:4
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:8
; GCN: buffer_load_dword v34, off, s[0:3], s5 offset:12
; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s5 offset:12
; GCN-NEXT: s_mov_b64 exec
; GCN: s_sub_u32 s32, s32, 0x400
; GCN: s_setpc_b64 s[6:7]
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {