AMDGPU: Insert moves of frame index to value operands

Strengthen tests of storing frame indices.

Right now this just creates irrelevant scheduling changes.

We don't want to have multiple frame index operands
on an instruction. There seem to be various assumptions
that at least the same frame index will not appear twice
in the LocalStackSlotAllocation pass.

There's no reason to have this happen, and it just
makes it easy to introduce bugs where the immediate
offset is applied to the storing instruction when it should
really be applied to the value being stored as a separate
add.

This might not be sufficient. It might still be problematic
to have an add fi, fi situation, but that's even less likely
to happen in real code.

llvm-svn: 264200
This commit is contained in:
Matt Arsenault 2016-03-23 21:49:25 +00:00
parent 4458d58ad9
commit 61a2a42381
3 changed files with 204 additions and 6 deletions

View File

@ -19,6 +19,7 @@
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
@ -1557,6 +1558,61 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
bool Modified = false;
MachineFrameInfo *MFI = CurDAG->getMachineFunction().getFrameInfo();
// Handle the perverse case where a frame index is being stored as a value.
// We don't want to see multiple frame index operands on the same instruction
// since it complicates things and violates some assumptions about frame index
// lowering.
//
// For every frame object, each memory node that uses the frame index as
// anything other than its pointer operand gets that operand replaced by a
// V_MOV_B32 of the frame index, so the memory node itself only ever sees a
// frame index in the address position.
for (int I = MFI->getObjectIndexBegin(), E = MFI->getObjectIndexEnd();
     I != E; ++I) {
  SDValue FI = CurDAG->getTargetFrameIndex(I, MVT::i32);

  // It's possible that we have a frame index defined in the function that
  // isn't used in this block.
  if (FI.use_empty())
    continue;

  // Skip over the AssertZext inserted during lowering: when it is the only
  // user of the frame index, walk the users of the AssertZext instead.
  SDValue EffectiveFI = FI;
  auto FirstUse = FI->use_begin();
  if (FirstUse->getOpcode() == ISD::AssertZext && FI->hasOneUse())
    EffectiveFI = SDValue(*FirstUse, 0);

  for (auto It = EffectiveFI->use_begin(); !It.atEnd(); ) {
    SDNode *User = It.getUse().getUser();
    unsigned OpIdx = It.getOperandNo();

    // Advance first; the operand update below rewrites the use we are
    // currently looking at.
    ++It;

    MemSDNode *M = dyn_cast<MemSDNode>(User);
    if (!M)
      continue;

    // The pointer is operand 2 of a store and operand 1 of every other memory
    // node. A frame index in the address position is what we want, so leave
    // it alone.
    unsigned PtrIdx = M->getOpcode() == ISD::STORE ? 2 : 1;
    if (OpIdx == PtrIdx)
      continue;

    unsigned OpN = M->getNumOperands();
    SDValue NewOps[8];
    assert(OpN < array_lengthof(NewOps) &&
           "memory node has more operands than expected");

    // Copy the operand list verbatim, then swap the frame index value operand
    // for a move of the frame index into a register.
    for (unsigned Op = 0; Op != OpN; ++Op)
      NewOps[Op] = M->getOperand(Op);

    MachineSDNode *Mov = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                                SDLoc(M), MVT::i32, FI);
    NewOps[OpIdx] = SDValue(Mov, 0);

    CurDAG->UpdateNodeOperands(M, makeArrayRef(NewOps, OpN));
    Modified = true;
  }
}
// XXX - Other targets seem to be able to do this without a worklist.
SmallVector<LoadSDNode *, 8> LoadsToReplace;
SmallVector<StoreSDNode *, 8> StoresToReplace;

View File

@ -0,0 +1,119 @@
; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; The address of a stack object (alloca) is stored as a *value* through an
; addrspace(3) pointer. The checks below expect two separate zero-valued
; VGPRs: one used as the address operand of the buffer store that initializes
; the alloca, and one used as the data operand of ds_write_b32 (the stored
; frame index value).
; GCN-LABEL: {{^}}stored_fi_to_lds:
; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
%tmp = alloca float
; Store to the alloca itself: the frame index is the address operand here.
store float 4.0, float *%tmp
; Store of the alloca's address: the frame index is the value operand here.
store float* %tmp, float* addrspace(3)* %ptr
ret void
}
; Offset is applied
; Same as stored_fi_to_lds, but with two stack objects whose addresses are
; both stored through the same addrspace(3) pointer. The checks expect the
; second object's frame index to appear as the constant 4, both as the address
; of its initializing buffer store and as the data operand of the second
; ds_write_b32, while the first object's frame index appears as 0.
; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects:
; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, [[FI1]]
; GCN-DAG: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
; GCN: ds_write_b32 [[VLDSPTR]], [[FI1]]
define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
%tmp0 = alloca float
%tmp1 = alloca float
store float 4.0, float *%tmp0
store float 4.0, float *%tmp1
; NOTE(review): volatile presumably keeps both pointer stores to %ptr alive
; even though the second overwrites the first - confirm.
store volatile float* %tmp0, float* addrspace(3)* %ptr
store volatile float* %tmp1, float* addrspace(3)* %ptr
ret void
}
; Same frame index is used multiple times in the store
; The second store uses %tmp both as the stored value (via the bitcast) and as
; the destination address. Only the function label is checked; no specific
; instructions are expected, so the test passes as long as llc completes with
; the options given on this file's llc command line.
; GCN-LABEL: {{^}}stored_fi_to_self:
define void @stored_fi_to_self() #0 {
%tmp = alloca i32*
; Avoid optimizing everything out
store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp
%bitcast = bitcast i32** %tmp to i32*
store volatile i32* %bitcast, i32** %tmp
ret void
}
; The address of one stack object is stored into a different stack object, in
; both directions. After the three initializing stores, the checks expect the
; frame indices to be materialized as the constants 4 and 8 and then used as
; the data and address operands of one buffer_store_dword, and with the roles
; swapped for the next one.
; GCN-LABEL: {{^}}stored_fi_to_fi:
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN: buffer_store_dword
; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
; GCN: buffer_store_dword [[FI1]], [[FI2]]
; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
; GCN: buffer_store_dword [[FI2]], [[FI1]]
define void @stored_fi_to_fi() #0 {
%tmp0 = alloca i32*
%tmp1 = alloca i32*
%tmp2 = alloca i32*
; Initialize all three objects so none of them is optimized away.
store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp0
store volatile i32* inttoptr (i32 5678 to i32*), i32** %tmp1
store volatile i32* inttoptr (i32 9999 to i32*), i32** %tmp2
%bitcast1 = bitcast i32** %tmp1 to i32*
%bitcast2 = bitcast i32** %tmp2 to i32* ; at offset 8
store volatile i32* %bitcast1, i32** %tmp2 ; store offset 4 at offset 8
store volatile i32* %bitcast2, i32** %tmp1 ; store offset 8 at offset 4
ret void
}
; The address of a stack object is stored as a value through an addrspace(1)
; pointer. The checks expect an `offen` buffer store first (the store to the
; alloca), then the frame index materialized as the constant 0 in its own VGPR
; and used as the data operand of a second buffer_store_dword.
; GCN-LABEL: {{^}}stored_fi_to_global:
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[FI]]
define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
%tmp = alloca float
store float 0.0, float *%tmp
store float* %tmp, float* addrspace(1)* %ptr
ret void
}
; Offset is applied
; Despite the name, three stack objects are allocated; only the addresses of
; the second and third are stored through the addrspace(1) pointer. The checks
; expect three `offen` buffer stores (the initializing stores), then the two
; stored frame indices materialized as the constants 4 and 8, each used as the
; data operand of its own buffer_store_dword.
; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects:
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
; GCN: buffer_store_dword [[FI1]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
; GCN: buffer_store_dword [[FI2]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
%tmp0 = alloca float
%tmp1 = alloca float
%tmp2 = alloca float
store volatile float 0.0, float *%tmp0
store volatile float 0.0, float *%tmp1
store volatile float 0.0, float *%tmp2
; Only %tmp1 and %tmp2 have their addresses stored; %tmp0 is just initialized.
store volatile float* %tmp1, float* addrspace(1)* %ptr
store volatile float* %tmp2, float* addrspace(1)* %ptr
ret void
}
attributes #0 = { nounwind }

View File

@ -1,4 +1,5 @@
; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; Pointer value is stored in a candidate for LDS usage.
@ -11,6 +12,18 @@ define void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 {
ret void
}
; Two stack objects whose addresses are both stored as values through an
; addrspace(1) pointer. The single check is deliberately loose: it only
; requires that a buffer_store_dword with a VGPR data operand is emitted.
; GCN-LABEL: {{^}}stored_lds_pointer_value_offset:
; GCN: buffer_store_dword v
define void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 {
%tmp0 = alloca float
%tmp1 = alloca float
store float 0.0, float *%tmp0
store float 0.0, float *%tmp1
store volatile float* %tmp0, float* addrspace(1)* %ptr
store volatile float* %tmp1, float* addrspace(1)* %ptr
ret void
}
; GCN-LABEL: {{^}}stored_lds_pointer_value_gep:
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
@ -36,17 +49,27 @@ bb:
; Fills a 4-element stack array with the constants 0..3, then stores the
; address of the element selected by %index as a value through the
; addrspace(1) pointer %out.
define void @stored_vector_pointer_value(i32* addrspace(1)* %out, i32 %index) {
entry:
%tmp0 = alloca [4 x i32]
%x = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 0
%y = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 1
%z = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 2
%w = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 3
%x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 0
%y = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 1
%z = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 2
%w = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 3
store i32 0, i32* %x
store i32 1, i32* %y
store i32 2, i32* %z
store i32 3, i32* %w
%tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 %index
%tmp1 = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 %index
store i32* %tmp1, i32* addrspace(1)* %out
ret void
}
; A stack object's own address is stored into that same object, so the frame
; index is both the stored value and the destination address. The negative
; check requires that no `ds_` instruction appears in the generated code for
; this function.
; GCN-LABEL: {{^}}stored_fi_to_self:
; GCN-NOT: ds_
define void @stored_fi_to_self() #0 {
%tmp = alloca i32*
; First store gives the object an initial value; volatile keeps it in place.
store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp
%bitcast = bitcast i32** %tmp to i32*
store volatile i32* %bitcast, i32** %tmp
ret void
}
attributes #0 = { nounwind }