AMDGPU: Fix not respecting byval alignment in call frame setup

This was hackily adding in the 4 bytes reserved for the callee's
emergency stack slot. Treat it like a normal stack allocation
so that we get the correct alignment padding behavior. This fixes
an inconsistency between the caller and the callee.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@340396 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Matt Arsenault 2018-08-22 11:09:45 +00:00
parent 4d6d5a12e8
commit ad45fb5af4
6 changed files with 174 additions and 41 deletions

View File

@ -4003,13 +4003,12 @@ SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
const SDLoc &SL,
SDValue Chain,
SDValue StackPtr,
SDValue ArgVal,
int64_t Offset) const {
MachineFunction &MF = DAG.getMachineFunction();
MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
SDValue Ptr = DAG.getObjectPtrOffset(SL, StackPtr, Offset);
SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
MachineMemOperand::MODereferenceable);
return Store;

View File

@ -287,7 +287,6 @@ public:
SDValue storeStackInputValue(SelectionDAG &DAG,
const SDLoc &SL,
SDValue Chain,
SDValue StackPtr,
SDValue ArgVal,
int64_t Offset) const;

View File

@ -2181,11 +2181,11 @@ SDValue SITargetLowering::LowerCallResult(
// from the explicit user arguments present in the IR.
void SITargetLowering::passSpecialInputs(
CallLoweringInfo &CLI,
CCState &CCInfo,
const SIMachineFunctionInfo &Info,
SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
SmallVectorImpl<SDValue> &MemOpChains,
SDValue Chain,
SDValue StackPtr) const {
SDValue Chain) const {
// If we don't have a call site, this was a call inserted by
// legalization. These can never use special inputs.
if (!CLI.CS)
@ -2253,9 +2253,9 @@ void SITargetLowering::passSpecialInputs(
if (OutgoingArg->isRegister()) {
RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
} else {
SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
InputReg,
OutgoingArg->getStackOffset());
unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
SpecialArgOffset);
MemOpChains.push_back(ArgStore);
}
}
@ -2401,8 +2401,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
// The first 4 bytes are reserved for the callee's emergency stack slot.
const unsigned CalleeUsableStackOffset = 4;
if (IsTailCall) {
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
@ -2441,6 +2439,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
// The first 4 bytes are reserved for the callee's emergency stack slot.
CCInfo.AllocateStack(4, 4);
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
// Get a count of how many bytes are to be pushed on the stack.
@ -2488,10 +2490,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
}
// Stack pointer relative accesses are done by changing the offset SGPR. This
// is just the VGPR offset component.
SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
SmallVector<SDValue, 8> MemOpChains;
MVT PtrVT = MVT::i32;
@ -2535,7 +2533,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
unsigned LocMemOffset = VA.getLocMemOffset();
int32_t Offset = LocMemOffset;
SDValue PtrOff = DAG.getObjectPtrOffset(DL, StackPtr, Offset);
SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
if (IsTailCall) {
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
@ -2545,8 +2543,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
Offset = Offset + FPDiff;
int FI = MFI.CreateFixedObject(OpSize, Offset, true);
DstAddr = DAG.getObjectPtrOffset(DL, DAG.getFrameIndex(FI, PtrVT),
StackPtr);
DstAddr = DAG.getFrameIndex(FI, PtrVT);
DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
// Make sure any stack arguments overlapping with where we're storing
@ -2581,7 +2578,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
// Copy special input registers after user input arguments.
passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

View File

@ -265,11 +265,11 @@ public:
void passSpecialInputs(
CallLoweringInfo &CLI,
CCState &CCInfo,
const SIMachineFunctionInfo &Info,
SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
SmallVectorImpl<SDValue> &MemOpChains,
SDValue Chain,
SDValue StackPtr) const;
SDValue Chain) const;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,

View File

@ -110,7 +110,7 @@ entry:
; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @call_void_func_byval_struct_func() #0 {
define void @call_void_func_byval_struct_func() #1 {
entry:
%arg0 = alloca %struct.ByValStruct, align 4, addrspace(5)
%arg1 = alloca %struct.ByValStruct, align 4, addrspace(5)
@ -163,7 +163,7 @@ entry:
; GCN: s_swappc_b64
; GCN-NOT: s_sub_u32 s32
; GCN: s_endpgm
define amdgpu_kernel void @call_void_func_byval_struct_kernel() #0 {
define amdgpu_kernel void @call_void_func_byval_struct_kernel() #1 {
entry:
%arg0 = alloca %struct.ByValStruct, align 4, addrspace(5)
%arg1 = alloca %struct.ByValStruct, align 4, addrspace(5)
@ -181,6 +181,146 @@ entry:
ret void
}
; GCN-LABEL: {{^}}void_func_byval_struct_align8:
; GCN: s_mov_b32 s5, s32
; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8{{$}}
; GCN-NOT: s32
; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s5 offset:8{{$}}
; GCN-NOT: s32
; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:24{{$}}
; GCN-NOT: s32
; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:24{{$}}
; GCN-NOT: s32
; Callee taking two byval struct arguments with align 8. All loads/stores are
; volatile so they survive optimization and produce the buffer_load/store
; instructions the GCN checks above match.
define void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg1) #1 {
entry:
; Read the first i32 of %arg0, add 1, and write it back in place.
%arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
%tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 8
%add = add nsw i32 %tmp, 1
store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 8
; Read the first i32 of %arg1, add 2, and write it back in place.
%arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
%tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 8
%add3 = add nsw i32 %tmp1, 2
store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 8
; Volatile store to a flat global null pointer; keeps an additional observable
; side effect in the function (deliberate in this test).
store volatile i32 9, i32 addrspace(1)* null, align 4
ret void
}
; Make sure the byval alignment is respected in the call frame setup
; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_kernel:
; GCN: s_mov_b32 s33, s7
; GCN: s_add_u32 s32, s33, 0xc00{{$}}
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s33 offset:8
; GCN: buffer_store_dword [[THIRTEEN]], off, s[0:3], s33 offset:24
; GCN-NOT: s_add_u32 s32, s32, 0x800
; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s33 offset:8
; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s33 offset:12
; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16
; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20
; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}}
; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:16
; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:20
; GCN-DAG: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s33 offset:24
; GCN-DAG: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s33 offset:28
; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:32
; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:36
; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32
; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36
; GCN: s_swappc_b64
; GCN-NOT: s_sub_u32 s32
; GCN: s_endpgm
; Kernel caller: allocates two align-8 structs, initializes the first i32 of
; each (9 and 13), then passes both as byval align 8 arguments. The GCN checks
; above verify the call-frame copy keeps the byval alignment (offsets 8 and 24
; relative to s32 rather than being packed without padding).
define amdgpu_kernel void @call_void_func_byval_struct_align8_kernel() #1 {
entry:
%arg0 = alloca %struct.ByValStruct, align 8, addrspace(5)
%arg1 = alloca %struct.ByValStruct, align 8, addrspace(5)
; Lifetime markers bound the allocas so the frame lowering can reuse space.
%tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)*
call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp)
%tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)*
call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1)
; Volatile initialization of each struct's first field (9 and 13) so the
; stores are not removed before the byval copy.
%arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
store volatile i32 9, i32 addrspace(5)* %arrayidx, align 8
%arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 8
call void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg1)
call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1)
call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp)
ret void
}
; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_func:
; GCN: s_mov_b32 s5, s32
; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}}
; GCN-DAG: v_writelane_b32
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8
; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24
; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8
; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12
; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16
; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20
; GCN-NOT: s_add_u32 s32, s32, 0x800
; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8{{$}}
; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:16
; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:20
; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24
; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28
; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32
; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36
; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:32
; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:36
; GCN: s_swappc_b64
; GCN-NOT: v_readlane_b32 s32
; GCN: v_readlane_b32
; GCN-NOT: v_readlane_b32 s32
; GCN-NOT: s_sub_u32 s32, s32, 0x800
; GCN: s_sub_u32 s32, s32, 0xc00{{$}}
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
; Same byval-align-8 call as the kernel variant above, but from a non-kernel
; function, so the caller itself runs with a stack pointer in s32 and a frame
; pointer in s5 (matched by the GCN checks above). Verifies the call frame
; setup honors the byval alignment for a function-to-function call.
define void @call_void_func_byval_struct_align8_func() #0 {
entry:
%arg0 = alloca %struct.ByValStruct, align 8, addrspace(5)
%arg1 = alloca %struct.ByValStruct, align 8, addrspace(5)
; Lifetime markers bound the allocas so the frame lowering can reuse space.
%tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)*
call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp)
%tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)*
call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1)
; Volatile initialization of each struct's first field (9 and 13) so the
; stores are not removed before the byval copy.
%arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
store volatile i32 9, i32 addrspace(5)* %arrayidx, align 8
%arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 8
call void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg1)
call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1)
call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp)
ret void
}
; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel_no_frame_pointer_elim:
define amdgpu_kernel void @call_void_func_byval_struct_kernel_no_frame_pointer_elim() #2 {
entry:

View File

@ -290,7 +290,7 @@ define void @too_many_args_use_workitem_id_x(
; GCN: s_mov_b32 s33, s7
; GCN: s_mov_b32 s32, s33
; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:4
; GCN: s_mov_b32 s4, s33
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
@ -308,7 +308,7 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 {
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x:
; GCN: s_mov_b32 s5, s32
; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:8
; GCN: buffer_store_dword v1, off, s[0:3], s32 offset:
; GCN: s_swappc_b64
define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
store volatile i32 %arg0, i32 addrspace(1)* undef
@ -330,7 +330,7 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; GCN: s_add_u32 s32, s32, 0x400{{$}}
; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:8{{$}}
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4{{$}}
; GCN: s_swappc_b64
@ -428,7 +428,7 @@ define void @too_many_args_use_workitem_id_x_byval(
; GCN-NOT: s32
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; GCN: buffer_store_dword [[K]], off, s[0:3], s33 offset:4
; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12
; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
@ -453,7 +453,7 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1
; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
; GCN: buffer_store_dword [[K]], off, s[0:3], s5 offset:4
; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:12
; GCN: buffer_store_dword v0, off, s[0:3], s32 offset:8
; GCN: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s5 offset:4
; GCN: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}}
@ -539,11 +539,10 @@ define void @too_many_args_use_workitem_id_xyz(
ret void
}
; frame[0] = kernel emergency stack slot
; frame[1] = callee emergency stack slot
; frame[2] = ID X
; frame[3] = ID Y
; frame[4] = ID Z
; frame[0] = callee emergency stack slot
; frame[1] = ID X
; frame[2] = ID Y
; frame[3] = ID Z
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz:
; GCN: enable_vgpr_workitem_id = 2
@ -551,9 +550,9 @@ define void @too_many_args_use_workitem_id_xyz(
; GCN: s_mov_b32 s33, s7
; GCN: s_mov_b32 s32, s33
; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:8
; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:12
; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:16
; GCN-DAG: buffer_store_dword v0, off, s[0:3], s32 offset:4
; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:8
; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:12
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
call void @too_many_args_use_workitem_id_xyz(
@ -635,10 +634,9 @@ define void @too_many_args_use_workitem_id_x_stack_yz(
ret void
}
; frame[0] = kernel emergency stack slot
; frame[1] = callee emergency stack slot
; frame[2] = ID Y
; frame[3] = ID Z
; frame[0] = callee emergency stack slot
; frame[1] = ID Y
; frame[2] = ID Z
; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz:
; GCN: enable_vgpr_workitem_id = 2
@ -647,8 +645,8 @@ define void @too_many_args_use_workitem_id_x_stack_yz(
; GCN: s_mov_b32 s32, s33
; GCN-DAG: v_mov_b32_e32 v31, v0
; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:8
; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:12
; GCN-DAG: buffer_store_dword v1, off, s[0:3], s32 offset:4
; GCN-DAG: buffer_store_dword v2, off, s[0:3], s32 offset:8
; GCN: s_swappc_b64
define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 {
call void @too_many_args_use_workitem_id_x_stack_yz(