AMDGPU: Insert moves of frame index to value operands

Strengthen tests of storing frame indices. Right now this just creates irrelevant scheduling changes. We don't want to have multiple frame index operands on an instruction. There seem to be various assumptions that at least the same frame index will not appear twice in the LocalStackSlotAllocation pass. There's no reason to have this happen, and it just makes it easy to introduce bugs where the immediate offset is applied to the storing instruction when it should really be applied to the value being stored as a separate add. This might not be sufficient. It might still be problematic to have an add fi, fi situation, but that's even less likely to happen in real code. llvm-svn: 264200
2024-12-02 08:26:29 +00:00 · 2016-03-23 21:49:25 +00:00 · 2016-03-23 21:49:25 +00:00 · 61a2a42381
commit 61a2a42381
parent 4458d58ad9
3 changed files with 204 additions and 6 deletions
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@ -19,6 +19,7 @@
 #include "SIISelLowering.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
@ -1557,6 +1558,61 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
 void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
  bool Modified = false;

+  MachineFrameInfo *MFI = CurDAG->getMachineFunction().getFrameInfo();
+
+  // Handle the perverse case where a frame index is being stored. We don't
+  // want to see multiple frame index operands on the same instruction since
+  // it complicates things and violates some assumptions about frame index
+  // lowering.
+  for (int I = MFI->getObjectIndexBegin(), E = MFI->getObjectIndexEnd();
+       I != E; ++I) {
+    SDValue FI = CurDAG->getTargetFrameIndex(I, MVT::i32);
+
+    // It's possible that we have a frame index defined in the function that
+    // isn't used in this block.
+    if (FI.use_empty())
+      continue;
+
+    // Skip over the AssertZext inserted during lowering.
+    SDValue EffectiveFI = FI;
+    auto It = FI->use_begin();
+    if (It->getOpcode() == ISD::AssertZext && FI->hasOneUse()) {
+      EffectiveFI = SDValue(*It, 0);
+      It = EffectiveFI->use_begin();
+    }
+
+    for (auto It = EffectiveFI->use_begin(); !It.atEnd(); ) {
+      SDUse &Use = It.getUse();
+      SDNode *User = Use.getUser();
+      unsigned OpIdx = It.getOperandNo();
+      ++It;
+
+      if (MemSDNode *M = dyn_cast<MemSDNode>(User)) {
+        unsigned PtrIdx = M->getOpcode() == ISD::STORE ? 2 : 1;
+        if (OpIdx == PtrIdx)
+          continue;
+
+        unsigned OpN = OpN = M->getNumOperands();
+        SDValue NewOps[8];
+
+        assert(OpN < array_lengthof(NewOps));
+        for (unsigned Op = 0; Op != OpN; ++Op) {
+          if (Op != OpIdx) {
+            NewOps[Op] = M->getOperand(Op);
+            continue;
+          }
+
+          MachineSDNode *Mov = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+                                                      SDLoc(M), MVT::i32, FI);
+          NewOps[Op] = SDValue(Mov, 0);
+        }
+
+        CurDAG->UpdateNodeOperands(M, makeArrayRef(NewOps, OpN));
+        Modified = true;
+      }
+    }
+  }
+
  // XXX - Other targets seem to be able to do this without a worklist.
  SmallVector<LoadSDNode *, 8> LoadsToReplace;
  SmallVector<StoreSDNode *, 8> StoresToReplace;
--- a/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/test/CodeGen/AMDGPU/captured-frame-index.ll
@ -0,0 +1,119 @@
+; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}stored_fi_to_lds:
+; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
+
+; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
+
+; GCN: ds_write_b32  [[VLDSPTR]], [[ZERO0]]
+define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
+  %tmp = alloca float
+  store float 4.0, float *%tmp
+  store float* %tmp, float* addrspace(3)* %ptr
+  ret void
+}
+
+; Offset is applied
+; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects:
+; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
+
+; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[FI1]]
+
+
+; GCN-DAG: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
+; GCN: ds_write_b32  [[VLDSPTR]], [[ZERO0]]
+
+; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN: ds_write_b32  [[VLDSPTR]], [[FI1]]
+define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
+  %tmp0 = alloca float
+  %tmp1 = alloca float
+  store float 4.0, float *%tmp0
+  store float 4.0, float *%tmp1
+  store volatile float* %tmp0, float* addrspace(3)* %ptr
+  store volatile float* %tmp1, float* addrspace(3)* %ptr
+  ret void
+}
+
+; Same frame index is used multiple times in the store
+; GCN-LABEL: {{^}}stored_fi_to_self:
+define void @stored_fi_to_self() #0 {
+  %tmp = alloca i32*
+
+  ; Avoid optimizing everything out
+  store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp
+  %bitcast = bitcast i32** %tmp to i32*
+  store volatile i32* %bitcast, i32** %tmp
+  ret void
+}
+
+; GCN-LABEL: {{^}}stored_fi_to_fi:
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+
+; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
+; GCN: buffer_store_dword [[FI1]], [[FI2]]
+
+; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
+; GCN: buffer_store_dword [[FI2]], [[FI1]]
+define void @stored_fi_to_fi() #0 {
+  %tmp0 = alloca i32*
+  %tmp1 = alloca i32*
+  %tmp2 = alloca i32*
+  store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp0
+  store volatile i32* inttoptr (i32 5678 to i32*), i32** %tmp1
+  store volatile i32* inttoptr (i32 9999 to i32*), i32** %tmp2
+
+  %bitcast1 = bitcast i32** %tmp1 to i32*
+  %bitcast2 = bitcast i32** %tmp2 to i32* ;  at offset 8
+
+  store volatile i32* %bitcast1, i32** %tmp2 ; store offset 4 at offset 8
+  store volatile i32* %bitcast2, i32** %tmp1 ; store offset 8 at offset 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}stored_fi_to_global:
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[FI]]
+define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
+  %tmp = alloca float
+  store float 0.0, float *%tmp
+  store float* %tmp, float* addrspace(1)* %ptr
+  ret void
+}
+
+; Offset is applied
+; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects:
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+
+; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN: buffer_store_dword [[FI1]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+
+; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
+; GCN: buffer_store_dword [[FI2]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
+  %tmp0 = alloca float
+  %tmp1 = alloca float
+  %tmp2 = alloca float
+  store volatile float 0.0, float *%tmp0
+  store volatile float 0.0, float *%tmp1
+  store volatile float 0.0, float *%tmp2
+  store volatile float* %tmp1, float* addrspace(1)* %ptr
+  store volatile float* %tmp2, float* addrspace(1)* %ptr
+  ret void
+}
+
+attributes #0 = { nounwind }
--- a/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
@ -1,4 +1,5 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

 ; Pointer value is stored in a candidate for LDS usage.

@ -11,6 +12,18 @@ define void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 {
  ret void
 }

+; GCN-LABEL: {{^}}stored_lds_pointer_value_offset:
+; GCN: buffer_store_dword v
+define void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 {
+  %tmp0 = alloca float
+  %tmp1 = alloca float
+  store float 0.0, float *%tmp0
+  store float 0.0, float *%tmp1
+  store volatile float* %tmp0, float* addrspace(1)* %ptr
+  store volatile float* %tmp1, float* addrspace(1)* %ptr
+  ret void
+}
+
 ; GCN-LABEL: {{^}}stored_lds_pointer_value_gep:
 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
@ -36,17 +49,27 @@ bb:
 define void @stored_vector_pointer_value(i32* addrspace(1)* %out, i32 %index) {
 entry:
  %tmp0 = alloca [4 x i32]
-  %x = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 0
-  %y = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 1
-  %z = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 2
-  %w = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 3
+  %x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 0
+  %y = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 1
+  %z = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 2
+  %w = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 3
  store i32 0, i32* %x
  store i32 1, i32* %y
  store i32 2, i32* %z
  store i32 3, i32* %w
-  %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 %index
+  %tmp1 = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 %index
  store i32* %tmp1, i32* addrspace(1)* %out
  ret void
 }

+; GCN-LABEL: {{^}}stored_fi_to_self:
+; GCN-NOT: ds_
+define void @stored_fi_to_self() #0 {
+  %tmp = alloca i32*
+  store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp
+  %bitcast = bitcast i32** %tmp to i32*
+  store volatile i32* %bitcast, i32** %tmp
+  ret void
+}
+
 attributes #0 = { nounwind }