diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 35d33cb60bc4..36af767a70b0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -166,6 +166,9 @@ extern char &SILowerI1CopiesID; void initializeAMDGPUGlobalISelDivergenceLoweringPass(PassRegistry &); extern char &AMDGPUGlobalISelDivergenceLoweringID; +void initializeAMDGPUMarkLastScratchLoadPass(PassRegistry &); +extern char &AMDGPUMarkLastScratchLoadID; + void initializeSILowerSGPRSpillsPass(PassRegistry &); extern char &SILowerSGPRSpillsID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp new file mode 100644 index 000000000000..0692a12a4061 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp @@ -0,0 +1,142 @@ +//===-- AMDGPUMarkLastScratchLoad.cpp -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Mark scratch load/spill instructions which are guaranteed to be the last time +// this scratch slot is used so it can be evicted from caches. +// +// TODO: Handle general stack accesses not just spilling. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveStacks.h"
+#include "llvm/CodeGen/MachineOperand.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-mark-last-scratch-load"
+
+namespace {
+
+class AMDGPUMarkLastScratchLoad : public MachineFunctionPass {
+private:
+  LiveStacks *LS = nullptr;
+  LiveIntervals *LIS = nullptr;
+  SlotIndexes *SI = nullptr;
+  const SIInstrInfo *SII = nullptr;
+
+public:
+  static char ID;
+
+  AMDGPUMarkLastScratchLoad() : MachineFunctionPass(ID) {
+    initializeAMDGPUMarkLastScratchLoadPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    // Template arguments restored: this pass consumes stack-slot liveness
+    // (LiveStacks), interval/slot-index information (LiveIntervals,
+    // SlotIndexes) and modifies no CFG or register state, hence preserves all.
+    AU.addRequired<LiveStacks>();
+    AU.addRequired<LiveIntervals>();
+    AU.addRequired<SlotIndexes>();
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override {
+    return "AMDGPU Mark Last Scratch Load";
+  }
+};
+
+} // end anonymous namespace
+
+bool AMDGPUMarkLastScratchLoad::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (ST.getGeneration() < AMDGPUSubtarget::GFX12)
+    return false;
+
+  LS = &getAnalysis<LiveStacks>();
+  LIS = &getAnalysis<LiveIntervals>();
+  SI = &getAnalysis<SlotIndexes>();
+  SII = ST.getInstrInfo();
+  SlotIndexes &Slots = *LIS->getSlotIndexes();
+
+  const unsigned NumSlots = LS->getNumIntervals();
+  if (NumSlots == 0) {
+    LLVM_DEBUG(dbgs() << "No live slots, skipping\n");
+    return false;
+  }
+
+  LLVM_DEBUG(dbgs() << LS->getNumIntervals() << " intervals\n");
+
+  bool Changed = false;
+
+  for (auto &[SS, LI] : *LS) {
+    for (const LiveRange::Segment &Segment : LI.segments) {
+
+      // Ignore segments that run to the end of basic block because in this case
+      // slot is still live at the end of it.
+ if (Segment.end.isBlock()) + continue; + + const int FrameIndex = Register::stackSlot2Index(LI.reg()); + MachineInstr *LastLoad = nullptr; + + MachineInstr *MISegmentEnd = SI->getInstructionFromIndex(Segment.end); + + // If there is no instruction at this slot because it was deleted take the + // instruction from the next slot. + if (!MISegmentEnd) { + SlotIndex NextSlot = Slots.getNextNonNullIndex(Segment.end); + MISegmentEnd = SI->getInstructionFromIndex(NextSlot); + } + + MachineInstr *MISegmentStart = SI->getInstructionFromIndex(Segment.start); + MachineBasicBlock *BB = MISegmentEnd->getParent(); + + // Start iteration backwards from segment end until the start of basic + // block or start of segment if it is in the same basic block. + auto End = BB->rend(); + if (MISegmentStart && MISegmentStart->getParent() == BB) + End = MISegmentStart->getReverseIterator(); + + for (auto MI = MISegmentEnd->getReverseIterator(); MI != End; ++MI) { + int LoadFI = 0; + + if (SII->isLoadFromStackSlot(*MI, LoadFI) && LoadFI == FrameIndex) { + LastLoad = &*MI; + break; + } + } + + if (LastLoad && !LastLoad->memoperands_empty()) { + MachineMemOperand *MMO = *LastLoad->memoperands_begin(); + MMO->setFlags(MOLastUse); + Changed = true; + LLVM_DEBUG(dbgs() << " Found last load: " << *LastLoad); + } + } + } + + return Changed; +} + +char AMDGPUMarkLastScratchLoad::ID = 0; + +char &llvm::AMDGPUMarkLastScratchLoadID = AMDGPUMarkLastScratchLoad::ID; + +INITIALIZE_PASS_BEGIN(AMDGPUMarkLastScratchLoad, DEBUG_TYPE, + "AMDGPU Mark last scratch load", false, false) +INITIALIZE_PASS_DEPENDENCY(SlotIndexes) +INITIALIZE_PASS_DEPENDENCY(LiveStacks) +INITIALIZE_PASS_END(AMDGPUMarkLastScratchLoad, DEBUG_TYPE, + "AMDGPU Mark last scratch load", false, false) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 0f3bb3e7b0d8..b8a7a5e20802 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -382,6 +382,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSILowerI1CopiesPass(*PR);
   initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
   initializeSILowerWWMCopiesPass(*PR);
+  initializeAMDGPUMarkLastScratchLoadPass(*PR);
   initializeSILowerSGPRSpillsPass(*PR);
   initializeSIFixSGPRCopiesPass(*PR);
   initializeSIFixVGPRCopiesPass(*PR);
@@ -1424,6 +1425,8 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
   addPreRewrite();
   addPass(&VirtRegRewriterID);
 
+  addPass(&AMDGPUMarkLastScratchLoadID);
+
   return true;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 2c92e7a07388..9a974eaf50d2 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -79,6 +79,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUMCInstLower.cpp
   AMDGPUIGroupLP.cpp
   AMDGPUInsertSingleUseVDST.cpp
+  AMDGPUMarkLastScratchLoad.cpp
   AMDGPUMIRFormatter.cpp
   AMDGPUOpenCLEnqueuedBlockLowering.cpp
   AMDGPUPerfHintAnalysis.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index aa98a4b860dd..93fdcefc0413 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -8756,6 +8756,7 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
   static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = {
       {MONoClobber, "amdgpu-noclobber"},
+      {MOLastUse, "amdgpu-last-use"},
   };
 
   return ArrayRef(TargetFlags);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index daef6031dd07..f3fe0a85e63f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -41,6 +41,10 @@ class ScheduleHazardRecognizer;
 static const MachineMemOperand::Flags MONoClobber =
     MachineMemOperand::MOTargetFlag1;
 
+/// Mark the MMO of a load as the last use.
+static const MachineMemOperand::Flags MOLastUse = + MachineMemOperand::MOTargetFlag2; + /// Utility to store machine instructions worklist. struct SIInstrWorklist { SIInstrWorklist() = default; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index a93cf5cad411..a2cacb5cbaa3 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1657,8 +1657,12 @@ void SIRegisterInfo::buildSpillLoadStore( } else { MIB.addReg(SOffset, SOffsetRegState); } - MIB.addImm(Offset + RegOffset) - .addImm(0); // cpol + + MIB.addImm(Offset + RegOffset); + + bool LastUse = MMO->getFlags() & MOLastUse; + MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol + if (!IsFlat) MIB.addImm(0); // swz MIB.addMemOperand(NewMMO); @@ -2241,6 +2245,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(), RS->isRegUsed(AMDGPU::SCC)); } + buildSpillLoadStore( *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg, TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 8b0b62638322..48f00a82e3e1 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -359,6 +359,7 @@ ; GCN-O1-NEXT: SI Lower WWM Copies ; GCN-O1-NEXT: GCN NSA Reassign ; GCN-O1-NEXT: Virtual Register Rewriter +; GCN-O1-NEXT: AMDGPU Mark Last Scratch Load ; GCN-O1-NEXT: Stack Slot Coloring ; GCN-O1-NEXT: Machine Copy Propagation Pass ; GCN-O1-NEXT: Machine Loop Invariant Code Motion @@ -655,6 +656,7 @@ ; GCN-O1-OPTS-NEXT: SI Lower WWM Copies ; GCN-O1-OPTS-NEXT: GCN NSA Reassign ; GCN-O1-OPTS-NEXT: Virtual Register Rewriter +; GCN-O1-OPTS-NEXT: AMDGPU Mark Last Scratch Load ; GCN-O1-OPTS-NEXT: Stack Slot Coloring ; GCN-O1-OPTS-NEXT: Machine Copy Propagation Pass ; GCN-O1-OPTS-NEXT: 
Machine Loop Invariant Code Motion @@ -957,6 +959,7 @@ ; GCN-O2-NEXT: SI Lower WWM Copies ; GCN-O2-NEXT: GCN NSA Reassign ; GCN-O2-NEXT: Virtual Register Rewriter +; GCN-O2-NEXT: AMDGPU Mark Last Scratch Load ; GCN-O2-NEXT: Stack Slot Coloring ; GCN-O2-NEXT: Machine Copy Propagation Pass ; GCN-O2-NEXT: Machine Loop Invariant Code Motion @@ -1271,6 +1274,7 @@ ; GCN-O3-NEXT: SI Lower WWM Copies ; GCN-O3-NEXT: GCN NSA Reassign ; GCN-O3-NEXT: Virtual Register Rewriter +; GCN-O3-NEXT: AMDGPU Mark Last Scratch Load ; GCN-O3-NEXT: Stack Slot Coloring ; GCN-O3-NEXT: Machine Copy Propagation Pass ; GCN-O3-NEXT: Machine Loop Invariant Code Motion diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll index 7c8507fe4559..17a19116735e 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll @@ -25,6 +25,7 @@ ; DEFAULT-NEXT: SI Lower WWM Copies ; DEFAULT-NEXT: GCN NSA Reassign ; DEFAULT-NEXT: Virtual Register Rewriter +; DEFAULT-NEXT: AMDGPU Mark Last Scratch Load ; DEFAULT-NEXT: Stack Slot Coloring ; O0: Fast Register Allocator @@ -61,6 +62,7 @@ ; BASIC-DEFAULT-NEXT: SI Lower WWM Copies ; BASIC-DEFAULT-NEXT: GCN NSA Reassign ; BASIC-DEFAULT-NEXT: Virtual Register Rewriter +; BASIC-DEFAULT-NEXT: AMDGPU Mark Last Scratch Load ; BASIC-DEFAULT-NEXT: Stack Slot Coloring @@ -75,6 +77,7 @@ ; DEFAULT-BASIC-NEXT: SI Lower WWM Copies ; DEFAULT-BASIC-NEXT: GCN NSA Reassign ; DEFAULT-BASIC-NEXT: Virtual Register Rewriter +; DEFAULT-BASIC-NEXT: AMDGPU Mark Last Scratch Load ; DEFAULT-BASIC-NEXT: Stack Slot Coloring @@ -95,6 +98,7 @@ ; BASIC-BASIC-NEXT: SI Lower WWM Copies ; BASIC-BASIC-NEXT: GCN NSA Reassign ; BASIC-BASIC-NEXT: Virtual Register Rewriter +; BASIC-BASIC-NEXT: AMDGPU Mark Last Scratch Load ; BASIC-BASIC-NEXT: Stack Slot Coloring diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll new file 
mode 100644 index 000000000000..ab112e606c0a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll @@ -0,0 +1,281 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -o - %s | FileCheck -check-prefix=CHECK %s + +define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" { +; CHECK-LABEL: max_6_vgprs: +; CHECK: ; %bb.0: +; CHECK-NEXT: global_load_b32 v2, v[0:1], off th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_lshlrev_b64_e32 v[2:3], 2, v[2:3] +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) +; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo +; CHECK-NEXT: global_load_b32 v5, v[0:1], off th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:160 th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: scratch_store_b32 off, v0, off offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: global_store_b32 v[0:1], v5, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: 
s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; CHECK-NEXT: s_endpgm + %tid = load volatile i32, ptr addrspace(1) undef + %p1 = getelementptr inbounds i32, ptr addrspace(1) %p, i32 %tid + %p2 = getelementptr inbounds i32, ptr addrspace(1) %p1, i32 4 + %p3 = getelementptr inbounds i32, ptr addrspace(1) %p2, i32 8 + %p4 = getelementptr inbounds i32, ptr addrspace(1) %p3, i32 12 + %p5 = getelementptr inbounds i32, ptr addrspace(1) %p4, i32 16 + %v1 = load volatile i32, ptr addrspace(1) %p1 + %v2 = load volatile i32, ptr addrspace(1) %p2 + %v3 = load volatile i32, ptr addrspace(1) %p3 + %v4 = load volatile i32, ptr addrspace(1) %p4 + %v5 = load volatile i32, ptr addrspace(1) %p5 + call void asm sideeffect "", "~{v[0:4]}" () + store volatile i32 %v1, ptr addrspace(1) undef + store volatile i32 %v2, ptr addrspace(1) undef + store volatile i32 %v3, ptr addrspace(1) undef + store volatile i32 %v4, ptr addrspace(1) undef + store volatile i32 %v5, ptr addrspace(1) undef + ret void +} + +define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgpu-num-vgpr"="11" { +; CHECK-LABEL: max_11_vgprs_branch: +; CHECK: ; 
%bb.0: ; %.entry +; CHECK-NEXT: global_load_b32 v3, v[0:1], off th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_mov_b32 s0, exec_lo +; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_lshlrev_b64_e32 v[3:4], 2, v[3:4] +; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) +; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v4, vcc_lo +; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:336 th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: scratch_store_b32 off, v3, off offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:448 th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: scratch_store_b32 off, v3, off offset:16 ; 4-byte Folded Spill +; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:576 th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: scratch_store_b32 off, v3, off offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: global_load_b32 v3, v[0:1], off offset:720 th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: scratch_store_b32 off, v3, off offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: v_cmpx_eq_u32_e32 0, v2 +; CHECK-NEXT: s_xor_b32 s0, exec_lo, s0 +; CHECK-NEXT: s_cbranch_execz .LBB1_2 +; CHECK-NEXT: ; %bb.1: ; %.false +; CHECK-NEXT: global_load_b32 v10, v[0:1], off th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: scratch_store_b32 
off, v2, off offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:160 th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:240 th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: scratch_store_b32 off, v0, off offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: global_store_b32 v[0:1], v10, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:24 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:28 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:32 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:36 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: 
scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; kill: killed $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: ; kill: killed $vgpr0 +; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1 +; CHECK-NEXT: .LBB1_2: ; %Flow +; CHECK-NEXT: s_and_not1_saveexec_b32 s0, s0 +; CHECK-NEXT: s_cbranch_execz .LBB1_4 +; CHECK-NEXT: ; %bb.3: ; %.true +; CHECK-NEXT: global_load_b32 v10, v[0:1], off th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:16 th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:20 ; 4-byte Folded Spill +; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:48 th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:24 ; 4-byte Folded Spill +; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:96 th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:28 ; 4-byte Folded Spill +; CHECK-NEXT: global_load_b32 v2, v[0:1], off offset:160 th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: scratch_store_b32 off, v2, off offset:32 ; 4-byte Folded Spill +; CHECK-NEXT: global_load_b32 v0, v[0:1], off offset:240 th:TH_LOAD_RT_NT +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: scratch_store_b32 off, v0, off offset:36 ; 4-byte Folded Spill +; CHECK-NEXT: ;;#ASMSTART +; CHECK-NEXT: ;;#ASMEND +; CHECK-NEXT: global_store_b32 v[0:1], v10, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; 
CHECK-NEXT: scratch_load_b32 v0, off, off offset:24 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:28 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:32 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:36 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: .LBB1_4: ; %.exit +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; CHECK-NEXT: s_endpgm +.entry: + %tid = load volatile i32, ptr addrspace(1) undef + %p1 = getelementptr inbounds i32, ptr addrspace(1) %p, i32 %tid + %p2 = getelementptr inbounds i32, ptr addrspace(1) %p1, i32 4 + %p3 = getelementptr inbounds i32, ptr addrspace(1) %p2, i32 8 + %p4 = getelementptr inbounds i32, ptr addrspace(1) %p3, i32 12 + %p5 = getelementptr inbounds i32, ptr addrspace(1) %p4, i32 16 + %p6 = getelementptr inbounds i32, ptr addrspace(1) %p5, i32 20 + %p7 = getelementptr inbounds i32, ptr addrspace(1) %p6, i32 24 + %p8 = getelementptr inbounds i32, ptr addrspace(1) %p7, i32 28 + %p9 = getelementptr inbounds i32, ptr addrspace(1) %p8, i32 32 + %p10 = getelementptr inbounds i32, ptr addrspace(1) %p9, i32 36 + %v7 = load volatile i32, ptr addrspace(1) %p7 + %v8 = load volatile i32, ptr addrspace(1) %p8 + %v9 = load volatile i32, ptr addrspace(1) %p9 + %v10 = load volatile i32, ptr addrspace(1) %p10 + %cmp = icmp ne i32 %tmp, 0 + br i1 %cmp, label %.true, label %.false + +.true: + %v1_t = load volatile i32, ptr addrspace(1) %p1 + %v2_t = load volatile i32, ptr addrspace(1) %p2 + %v3_t = load volatile i32, ptr addrspace(1) %p3 + %v4_t = load volatile i32, ptr addrspace(1) %p4 + %v5_t = load volatile i32, ptr addrspace(1) %p5 + %v6_t = load volatile i32, ptr addrspace(1) %p6 + call void asm sideeffect "", "~{v[0:9]}" () + store volatile i32 %v1_t, ptr addrspace(1) undef + store volatile i32 %v2_t, ptr addrspace(1) undef + store volatile i32 %v3_t, ptr addrspace(1) undef + store volatile i32 %v4_t, ptr addrspace(1) undef + store volatile i32 %v5_t, ptr addrspace(1) undef + store volatile i32 %v6_t, ptr addrspace(1) undef + store volatile i32 %v7, ptr addrspace(1) undef + store volatile i32 %v8, ptr addrspace(1) undef + + br label %.exit + +.false: + %v1_f = load volatile i32, ptr addrspace(1) %p1 + %v2_f = load volatile i32, ptr addrspace(1) %p2 + %v3_f = load volatile i32, ptr addrspace(1) %p3 + %v4_f = load volatile i32, ptr 
addrspace(1) %p4 + %v5_f = load volatile i32, ptr addrspace(1) %p5 + %v6_f = load volatile i32, ptr addrspace(1) %p6 + call void asm sideeffect "", "~{v[0:9]}" () + store volatile i32 %v1_f, ptr addrspace(1) undef + store volatile i32 %v2_f, ptr addrspace(1) undef + store volatile i32 %v3_f, ptr addrspace(1) undef + store volatile i32 %v4_f, ptr addrspace(1) undef + store volatile i32 %v5_f, ptr addrspace(1) undef + store volatile i32 %v6_f, ptr addrspace(1) undef + store volatile i32 %v7, ptr addrspace(1) undef + store volatile i32 %v8, ptr addrspace(1) undef + + br label %.exit + +.exit: + store volatile i32 %v9, ptr addrspace(1) undef + store volatile i32 %v10, ptr addrspace(1) undef + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.mir b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.mir new file mode 100644 index 000000000000..ab80b7d20984 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.mir @@ -0,0 +1,303 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -o - %s -run-pass=greedy -run-pass=amdgpu-mark-last-scratch-load | FileCheck -check-prefix=CHECK %s + +--- | + define amdgpu_cs void @test_spill_12x32() "amdgpu-num-vgpr"="12" { + ret void + } + define amdgpu_cs void @test_spill_384() "amdgpu-num-vgpr"="12" { + ret void + } + define amdgpu_ps void @test_loop_12() "amdgpu-num-vgpr"="12" { + ret void + } +... 
+--- +name: test_spill_12x32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + + ; CHECK-LABEL: name: test_spill_12x32 + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.0, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr1, %stack.1, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr2, %stack.2, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr3, %stack.3, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr4, %stack.4, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr5, %stack.5, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr6, %stack.6, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr7, %stack.7, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr8, %stack.8, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr9, %stack.9, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr10, %stack.10, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr11, %stack.11, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.0, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.1, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE3:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.3, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE4:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.4, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE5:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.5, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.5, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE6:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.6, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.6, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE7:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.7, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.7, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE8:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.8, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.8, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE9:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.9, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.9, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE10:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.10, $sp_reg, 0, implicit $exec :: 
("amdgpu-last-use" load (s32) from %stack.10, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE11:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.11, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.11, addrspace 5) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[SI_SPILL_V32_RESTORE]], implicit [[SI_SPILL_V32_RESTORE1]], implicit [[SI_SPILL_V32_RESTORE2]], implicit [[SI_SPILL_V32_RESTORE3]], implicit [[SI_SPILL_V32_RESTORE4]], implicit [[SI_SPILL_V32_RESTORE5]], implicit [[SI_SPILL_V32_RESTORE6]], implicit [[SI_SPILL_V32_RESTORE7]], implicit [[SI_SPILL_V32_RESTORE8]], implicit [[SI_SPILL_V32_RESTORE9]], implicit [[SI_SPILL_V32_RESTORE10]], implicit [[SI_SPILL_V32_RESTORE11]] + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = COPY $vgpr3 + %4:vgpr_32 = COPY $vgpr4 + %5:vgpr_32 = COPY $vgpr5 + %6:vgpr_32 = COPY $vgpr6 + %7:vgpr_32 = COPY $vgpr7 + %8:vgpr_32 = COPY $vgpr8 + %9:vgpr_32 = COPY $vgpr9 + %10:vgpr_32 = COPY $vgpr10 + %11:vgpr_32 = COPY $vgpr11 + INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 + S_ENDPGM 0, implicit %0, implicit %1, implicit %2, implicit %3, implicit %4, implicit %5, implicit %6, implicit %7, implicit %8, implicit %9, implicit %10, implicit %11 +... 
+ +--- +name: test_spill_384 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 + + ; CHECK-LABEL: name: test_spill_384 + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_SPILL_V384_SAVE $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, %stack.0, $sp_reg, 0, implicit $exec :: (store (s384) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 + ; CHECK-NEXT: [[SI_SPILL_V384_RESTORE:%[0-9]+]]:vreg_384 = SI_SPILL_V384_RESTORE %stack.0, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s384) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: S_ENDPGM 0, implicit [[SI_SPILL_V384_RESTORE]] + %0:vreg_384 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 + INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 + S_ENDPGM 0, implicit %0 +... 
+ +--- +name: test_loop_12 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: test_loop_12 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr12, %stack.0, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr11, %stack.1, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr10, %stack.2, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr9, %stack.3, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr8, %stack.4, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr7, %stack.5, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr6, %stack.6, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr5, %stack.7, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr4, %stack.8, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr3, %stack.9, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr2, %stack.10, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr1, %stack.11, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.12, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_MOV_B32_e32_3]], %stack.16, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5) + ; CHECK-NEXT: %res5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: %res6:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: %res7:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: %res8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: %res9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: %res10:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: %res11:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: %res12:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.12, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5) + ; CHECK-NEXT: %vcmp:sreg_32 = V_CMP_LT_I32_e64 0, [[SI_SPILL_V32_RESTORE]], implicit $exec + ; CHECK-NEXT: %mask:sreg_32 = COPY $exec_lo, implicit-def $exec_lo + ; CHECK-NEXT: %sand:sreg_32 = S_AND_B32 %mask, %vcmp, implicit-def dead $scc + ; CHECK-NEXT: $exec_lo = S_MOV_B32_term %sand + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %mask2:sgpr_32 = S_MOV_B32 0 + ; CHECK-NEXT: %count:sgpr_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = 
V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: %res5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: %res6:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: %res7:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: %res8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: %res9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: %res10:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: %res11:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: %res12:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.5(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, %mask, implicit-def $scc + ; CHECK-NEXT: S_BRANCH %bb.5 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: successors: %bb.4(0x04000000), %bb.3(0x7c000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_MOV_B32_e32_1]], %stack.14, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.11, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_MOV_B32_e32_]], 0, [[SI_SPILL_V32_RESTORE1]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.14, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.14, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_MOV_B32_e32_]], %stack.13, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.10, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5) + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_MOV_B32_e32_1]], 0, [[SI_SPILL_V32_RESTORE2]], 0, 0, implicit 
$mode, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE3:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.9, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) + ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_MOV_B32_e32_2]], 0, [[SI_SPILL_V32_RESTORE3]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_MOV_B32_e32_2]], %stack.15, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE4:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.8, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5) + ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_MOV_B32_e32_4]], 0, [[SI_SPILL_V32_RESTORE4]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE5:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.7, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5) + ; CHECK-NEXT: %res5:vgpr_32 = V_ADD_F32_e64 0, %res5, 0, [[SI_SPILL_V32_RESTORE5]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE6:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.6, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) + ; CHECK-NEXT: %res6:vgpr_32 = V_ADD_F32_e64 0, %res6, 0, [[SI_SPILL_V32_RESTORE6]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE7:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.5, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) + ; CHECK-NEXT: %res7:vgpr_32 = V_ADD_F32_e64 0, %res7, 0, [[SI_SPILL_V32_RESTORE7]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE8:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.4, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) + ; CHECK-NEXT: %res8:vgpr_32 = V_ADD_F32_e64 0, %res8, 0, [[SI_SPILL_V32_RESTORE8]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE9:%[0-9]+]]:vgpr_32 = 
SI_SPILL_V32_RESTORE %stack.3, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: %res9:vgpr_32 = V_ADD_F32_e64 0, %res9, 0, [[SI_SPILL_V32_RESTORE9]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE10:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: %res10:vgpr_32 = V_ADD_F32_e64 0, %res10, 0, [[SI_SPILL_V32_RESTORE10]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE11:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.1, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: %res11:vgpr_32 = V_ADD_F32_e64 0, %res11, 0, [[SI_SPILL_V32_RESTORE11]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE12:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.0, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: %res12:vgpr_32 = V_ADD_F32_e64 0, %res12, 0, [[SI_SPILL_V32_RESTORE12]], 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.13, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5) + ; CHECK-NEXT: %count:sgpr_32 = nuw nsw S_ADD_I32 %count, 1, implicit-def dead $scc + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE13:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.12, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5) + ; CHECK-NEXT: %vcmp2:sreg_32 = V_CMP_GE_I32_e64 %count, [[SI_SPILL_V32_RESTORE13]], implicit $exec + ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.15, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5) + ; CHECK-NEXT: %mask2:sgpr_32 = S_OR_B32 %vcmp2, %mask2, implicit-def $scc + ; CHECK-NEXT: $exec_lo = S_ANDN2_B32_term $exec_lo, %mask2, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.3, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.4 + ; 
CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.4: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, %mask2, implicit-def $scc + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_MOV_B32_e32_4]], %stack.16, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5) + ; CHECK-NEXT: S_BRANCH %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.5: + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE14:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.16, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.16, addrspace 5) + ; CHECK-NEXT: EXP_DONE 0, [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_2]], [[SI_SPILL_V32_RESTORE14]], -1, 0, 15, implicit $exec + ; CHECK-NEXT: EXP_DONE 0, %res5, %res6, %res7, %res8, -1, 0, 15, implicit $exec + ; CHECK-NEXT: EXP_DONE 0, %res9, %res10, %res11, %res12, -1, 0, 15, implicit $exec + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: ; entry + successors: %bb.1(0x40000000), %bb.2(0x40000000) + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12 + + %12:vgpr_32 = COPY $vgpr12 + %11:vgpr_32 = COPY $vgpr11 + %10:vgpr_32 = COPY $vgpr10 + %9:vgpr_32 = COPY $vgpr9 + %8:vgpr_32 = COPY $vgpr8 + %7:vgpr_32 = COPY $vgpr7 + %6:vgpr_32 = COPY $vgpr6 + %5:vgpr_32 = COPY $vgpr5 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %loop_end:vgpr_32 = COPY $vgpr0 + %res1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res6:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res7:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res10:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res11:vgpr_32 = V_MOV_B32_e32 
0, implicit $exec + %res12:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %vcmp:sreg_32 = V_CMP_LT_I32_e64 0, %loop_end, implicit $exec + %mask:sreg_32 = COPY $exec_lo, implicit-def $exec_lo + %sand:sreg_32 = S_AND_B32 %mask, %vcmp, implicit-def dead $scc + $exec_lo = S_MOV_B32_term %sand + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: ; loop preheader + successors: %bb.3(0x80000000) + + %mask2:sgpr_32 = S_MOV_B32 0 + %count:sgpr_32 = S_MOV_B32 0 + %res1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res4:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res6:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res7:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res9:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res10:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res11:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %res12:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + S_BRANCH %bb.3 + + bb.2: ; flow + successors: %bb.5(0x80000000) + + $exec_lo = S_OR_B32 $exec_lo, %mask, implicit-def $scc + S_BRANCH %bb.5 + + bb.3: ; loop + successors: %bb.4(0x04000000), %bb.3(0x7c000000) + + %res1:vgpr_32 = V_ADD_F32_e64 0, %res1, 0, %1, 0, 0, implicit $mode, implicit $exec + %res2:vgpr_32 = V_ADD_F32_e64 0, %res2, 0, %2, 0, 0, implicit $mode, implicit $exec + %res3:vgpr_32 = V_ADD_F32_e64 0, %res3, 0, %3, 0, 0, implicit $mode, implicit $exec + %res4:vgpr_32 = V_ADD_F32_e64 0, %res4, 0, %4, 0, 0, implicit $mode, implicit $exec + %res5:vgpr_32 = V_ADD_F32_e64 0, %res5, 0, %5, 0, 0, implicit $mode, implicit $exec + %res6:vgpr_32 = V_ADD_F32_e64 0, %res6, 0, %6, 0, 0, implicit $mode, implicit $exec + %res7:vgpr_32 = V_ADD_F32_e64 0, %res7, 0, %7, 0, 0, implicit $mode, implicit $exec + %res8:vgpr_32 = V_ADD_F32_e64 0, %res8, 0, %8, 0, 0, implicit $mode, implicit $exec + %res9:vgpr_32 = 
V_ADD_F32_e64 0, %res9, 0, %9, 0, 0, implicit $mode, implicit $exec + %res10:vgpr_32 = V_ADD_F32_e64 0, %res10, 0, %10, 0, 0, implicit $mode, implicit $exec + %res11:vgpr_32 = V_ADD_F32_e64 0, %res11, 0, %11, 0, 0, implicit $mode, implicit $exec + %res12:vgpr_32 = V_ADD_F32_e64 0, %res12, 0, %12, 0, 0, implicit $mode, implicit $exec + %count:sgpr_32 = nuw nsw S_ADD_I32 %count, 1, implicit-def dead $scc + %vcmp2:sreg_32 = V_CMP_GE_I32_e64 %count, %loop_end, implicit $exec + %mask2:sgpr_32 = S_OR_B32 %vcmp2, %mask2, implicit-def $scc + $exec_lo = S_ANDN2_B32_term $exec_lo, %mask2, implicit-def $scc + S_CBRANCH_EXECNZ %bb.3, implicit $exec + S_BRANCH %bb.4 + + bb.4: ; flow + successors: %bb.2(0x80000000) + + $exec_lo = S_OR_B32 $exec_lo, %mask2, implicit-def $scc + S_BRANCH %bb.2 + + bb.5: ; exit + EXP_DONE 0, %res1, %res2, %res3, %res4, -1, 0, 15, implicit $exec + EXP_DONE 0, %res5, %res6, %res7, %res8, -1, 0, 15, implicit $exec + EXP_DONE 0, %res9, %res10, %res11, %res12, -1, 0, 15, implicit $exec + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/target-memoperands.mir b/llvm/test/CodeGen/MIR/AMDGPU/target-memoperands.mir index b705506af045..82140c815684 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/target-memoperands.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/target-memoperands.mir @@ -2,7 +2,7 @@ # RUN: llc -mtriple=amdgcn -run-pass none -o - %s | FileCheck %s --- -name: target_memoperands +name: target_memoperands_noclobber body: | bb.0: liveins: $sgpr0_sgpr1 @@ -12,3 +12,15 @@ body: | %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: ("amdgpu-noclobber" load (s32)) ... + +--- +name: target_memoperands_last_use +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: target_memoperands + ; CHECK: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1 + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p4) :: ("amdgpu-last-use" load (s32)) + %0:_(p4) = COPY $sgpr0_sgpr1 + %1:_(s32) = G_LOAD %0 :: ("amdgpu-last-use" load (s32)) +...