diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 59dfacb8c9a..2f79687de85 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -221,6 +221,9 @@ ModulePass *createAMDGPUOpenCLEnqueuedBlockLoweringPass(); void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &); extern char &AMDGPUOpenCLEnqueuedBlockLoweringID; +void initializeGCNNSAReassignPass(PassRegistry &); +extern char &GCNNSAReassignID; + Target &getTheAMDGPUTarget(); Target &getTheGCNTarget(); diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index f2408819ce9..3c41ecdf597 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -149,6 +149,12 @@ static cl::opt EnableLowerKernelArguments( cl::init(true), cl::Hidden); +static cl::opt EnableRegReassign( + "amdgpu-reassign-regs", + cl::desc("Enable register reassign optimizations on gfx10+"), + cl::init(true), + cl::Hidden); + // Enable atomic optimization static cl::opt EnableAtomicOptimizations( "amdgpu-atomic-optimizations", @@ -228,6 +234,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeAMDGPUUseNativeCallsPass(*PR); initializeAMDGPUSimplifyLibCallsPass(*PR); initializeAMDGPUInlinerPass(*PR); + initializeGCNNSAReassignPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -605,6 +612,7 @@ public: void addFastRegAlloc() override; void addOptimizedRegAlloc() override; void addPreRegAlloc() override; + bool addPreRewrite() override; void addPostRegAlloc() override; void addPreSched2() override; void addPreEmitPass() override; @@ -926,6 +934,13 @@ void GCNPassConfig::addOptimizedRegAlloc() { TargetPassConfig::addOptimizedRegAlloc(); } +bool GCNPassConfig::addPreRewrite() { + if (EnableRegReassign) { + addPass(&GCNNSAReassignID); + } + return true; +} + void GCNPassConfig::addPostRegAlloc() { addPass(&SIFixVGPRCopiesID); if (getOptLevel() > CodeGenOpt::None) diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 019bf85e5d8..08a64cd9512 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -116,6 +116,7 @@ add_llvm_target(AMDGPUCodeGen SIShrinkInstructions.cpp SIWholeQuadMode.cpp GCNILPSched.cpp + GCNNSAReassign.cpp GCNDPPCombine.cpp SIModeRegister.cpp ) diff --git a/lib/Target/AMDGPU/GCNNSAReassign.cpp b/lib/Target/AMDGPU/GCNNSAReassign.cpp new file mode 100644 index 00000000000..0511c3a18de --- /dev/null +++ b/lib/Target/AMDGPU/GCNNSAReassign.cpp @@ -0,0 +1,343 @@ +//===-- GCNNSAReassign.cpp - Reassign registers in NSA unstructions -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Try to reassign registers on GFX10+ from non-sequential to sequential +/// in NSA image instructions. Later SIShrinkInstructions pass will relace NSA +/// with sequential versions where possible. +/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/LiveRegMatrix.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/VirtRegMap.h" +#include "llvm/Support/MathExtras.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-nsa-reassign" + +STATISTIC(NumNSAInstructions, + "Number of NSA instructions with non-sequential address found"); +STATISTIC(NumNSAConverted, + "Number of NSA instructions changed to sequential"); + +namespace { + +class GCNNSAReassign : public MachineFunctionPass { +public: + static char ID; + + GCNNSAReassign() : MachineFunctionPass(ID) { + initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "GCN NSA Reassign"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + typedef enum { + NOT_NSA, // Not an NSA instruction + FIXED, // NSA which we cannot modify + NON_CONTIGUOUS, // NSA with non-sequential address which we can try + // to optimize. + CONTIGUOUS // NSA with all sequential address registers + } NSA_Status; + + const GCNSubtarget *ST; + + const MachineRegisterInfo *MRI; + + const SIRegisterInfo *TRI; + + VirtRegMap *VRM; + + LiveRegMatrix *LRM; + + LiveIntervals *LIS; + + unsigned MaxNumVGPRs; + + const MCPhysReg *CSRegs; + + NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const; + + bool tryAssignRegisters(SmallVectorImpl &Intervals, + unsigned StartReg) const; + + bool canAssign(unsigned StartReg, unsigned NumRegs) const; + + bool scavengeRegs(SmallVectorImpl &Intervals) const; +}; + +} // End anonymous namespace. + +INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign", + false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_DEPENDENCY(VirtRegMap) +INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix) +INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign", + false, false) + + +char GCNNSAReassign::ID = 0; + +char &llvm::GCNNSAReassignID = GCNNSAReassign::ID; + +bool +GCNNSAReassign::tryAssignRegisters(SmallVectorImpl &Intervals, + unsigned StartReg) const { + unsigned NumRegs = Intervals.size(); + + for (unsigned N = 0; N < NumRegs; ++N) + if (VRM->hasPhys(Intervals[N]->reg)) + LRM->unassign(*Intervals[N]); + + for (unsigned N = 0; N < NumRegs; ++N) + if (LRM->checkInterference(*Intervals[N], StartReg + N)) + return false; + + for (unsigned N = 0; N < NumRegs; ++N) + LRM->assign(*Intervals[N], StartReg + N); + + return true; +} + +bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const { + for (unsigned N = 0; N < NumRegs; ++N) { + unsigned Reg = StartReg + N; + if (!MRI->isAllocatable(Reg)) + return false; + + for (unsigned I = 0; CSRegs[I]; ++I) + if (TRI->isSubRegisterEq(Reg, CSRegs[I]) && + !LRM->isPhysRegUsed(CSRegs[I])) + return false; + } + + return true; +} + +bool +GCNNSAReassign::scavengeRegs(SmallVectorImpl &Intervals) const { + unsigned NumRegs = Intervals.size(); + + if (NumRegs > MaxNumVGPRs) + return false; + unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0; + + for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) { + if (!canAssign(Reg, NumRegs)) + continue; + + if (tryAssignRegisters(Intervals, Reg)) + return true; + } + + return false; +} + +GCNNSAReassign::NSA_Status +GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const { + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode()); + if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA) + return NSA_Status::NOT_NSA; + + int VAddr0Idx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0); + + unsigned VgprBase = 0; + bool NSA = false; + for (unsigned I = 0; I < Info->VAddrDwords; ++I) { + const MachineOperand &Op = MI.getOperand(VAddr0Idx + I); + unsigned Reg = Op.getReg(); + if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg)) + return NSA_Status::FIXED; + + unsigned PhysReg = VRM->getPhys(Reg); + + if (!Fast) { + if (!PhysReg) + return NSA_Status::FIXED; + + // Bail if address is not a VGPR32. That should be possible to extend the + // optimization to work with subregs of a wider register tuples, but the + // logic to find free registers will be much more complicated with much + // less chances for success. That seems reasonable to assume that in most + // cases a tuple is used because a vector variable contains different + // parts of an address and it is either already consequitive or cannot + // be reassigned if not. If needed it is better to rely on register + // coalescer to process such address tuples. + if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg()) + return NSA_Status::FIXED; + + const MachineInstr *Def = MRI->getUniqueVRegDef(Reg); + + if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg) + return NSA_Status::FIXED; + + for (auto U : MRI->use_nodbg_operands(Reg)) { + if (U.isImplicit()) + return NSA_Status::FIXED; + const MachineInstr *UseInst = U.getParent(); + if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg) + return NSA_Status::FIXED; + } + + if (!LIS->hasInterval(Reg)) + return NSA_Status::FIXED; + } + + if (I == 0) + VgprBase = PhysReg; + else if (VgprBase + I != PhysReg) + NSA = true; + } + + return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS; +} + +bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) { + ST = &MF.getSubtarget(); + if (ST->getGeneration() < GCNSubtarget::GFX10) + return false; + + MRI = &MF.getRegInfo(); + TRI = ST->getRegisterInfo(); + VRM = &getAnalysis(); + LRM = &getAnalysis(); + LIS = &getAnalysis(); + + const SIMachineFunctionInfo *MFI = MF.getInfo(); + MaxNumVGPRs = ST->getMaxNumVGPRs(MF); + MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs); + CSRegs = TRI->getCalleeSavedRegs(&MF); + + using Candidate = std::pair; + SmallVector Candidates; + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + switch (CheckNSA(MI)) { + default: + continue; + case NSA_Status::CONTIGUOUS: + Candidates.push_back(std::make_pair(&MI, true)); + break; + case NSA_Status::NON_CONTIGUOUS: + Candidates.push_back(std::make_pair(&MI, false)); + ++NumNSAInstructions; + break; + } + } + } + + bool Changed = false; + for (auto &C : Candidates) { + if (C.second) + continue; + + const MachineInstr *MI = C.first; + if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) { + // Already happen to be fixed. + C.second = true; + ++NumNSAConverted; + continue; + } + + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode()); + int VAddr0Idx = + AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0); + + SmallVector Intervals; + SmallVector OrigRegs; + SlotIndex MinInd, MaxInd; + for (unsigned I = 0; I < Info->VAddrDwords; ++I) { + const MachineOperand &Op = MI->getOperand(VAddr0Idx + I); + unsigned Reg = Op.getReg(); + LiveInterval *LI = &LIS->getInterval(Reg); + if (llvm::find(Intervals, LI) != Intervals.end()) { + // Same register used, unable to make sequential + Intervals.clear(); + break; + } + Intervals.push_back(LI); + OrigRegs.push_back(VRM->getPhys(Reg)); + MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex(); + MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex(); + } + + if (Intervals.empty()) + continue; + + LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI + << "\tOriginal allocation:\t"; + for(auto *LI : Intervals) + dbgs() << " " << llvm::printReg((VRM->getPhys(LI->reg)), TRI); + dbgs() << '\n'); + + bool Success = scavengeRegs(Intervals); + if (!Success) { + LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n"); + if (VRM->hasPhys(Intervals.back()->reg)) // Did not change allocation. + continue; + } else { + // Check we did not make it worse for other instructions. + auto I = std::lower_bound(Candidates.begin(), &C, MinInd, + [this](const Candidate &C, SlotIndex I) { + return LIS->getInstructionIndex(*C.first) < I; + }); + for (auto E = Candidates.end(); Success && I != E && + LIS->getInstructionIndex(*I->first) < MaxInd; ++I) { + if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) { + Success = false; + LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first); + } + } + } + + if (!Success) { + for (unsigned I = 0; I < Info->VAddrDwords; ++I) + if (VRM->hasPhys(Intervals[I]->reg)) + LRM->unassign(*Intervals[I]); + + for (unsigned I = 0; I < Info->VAddrDwords; ++I) + LRM->assign(*Intervals[I], OrigRegs[I]); + + continue; + } + + C.second = true; + ++NumNSAConverted; + LLVM_DEBUG(dbgs() << "\tNew allocation:\t\t [" + << llvm::printReg((VRM->getPhys(Intervals.front()->reg)), TRI) + << " : " + << llvm::printReg((VRM->getPhys(Intervals.back()->reg)), TRI) + << "]\n"); + Changed = true; + } + + return Changed; +} diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll index 6084789e59e..c57c27d0ccf 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -1,10 +1,12 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SIVI,PRT %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SIVI,PRT %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,PRT %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6789,SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6789,GFX8910,SIVI,PRT %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6789,PRT %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-prt-strict-null -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,NOPRT %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; GCN-LABEL: {{^}}load_1d: -; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm{{$}} +; GFX6789: image_load v[0:3], v0, s[0:7] dmask:0xf unorm{{$}} +; GFX10: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) { main_body: %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -22,7 +24,8 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf unorm tfe{{$}} +; GFX6789: image_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf unorm tfe{{$}} +; GFX10: image_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe ; ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { @@ -45,7 +48,8 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf unorm lwe{{$}} +; GFX6789: image_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf unorm lwe{{$}} +; GFX10: image_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { @@ -58,7 +62,8 @@ main_body: } ; GCN-LABEL: {{^}}load_2d: -; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}} +; GFX6789: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}} +; GFX10: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { main_body: %v = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -76,7 +81,8 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}} +; GFX6789: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}} +; GFX10: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) { @@ -89,7 +95,8 @@ main_body: } ; GCN-LABEL: {{^}}load_3d: -; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}} +; GFX6789: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}} +; GFX10: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm ; define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) { main_body: %v = call <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) @@ -107,7 +114,8 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}} +; GFX6789: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}} +; GFX10: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) { @@ -120,7 +128,8 @@ main_body: } ; GCN-LABEL: {{^}}load_cube: -; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +; GFX6789: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +; GFX10: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm ; define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) { main_body: %v = call <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -138,7 +147,8 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}} +; GFX6789: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}} +; GFX10: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) { @@ -151,7 +161,8 @@ main_body: } ; GCN-LABEL: {{^}}load_1darray: -; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da{{$}} +; GFX6789: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da{{$}} +; GFX10: image_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm ; define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) { main_body: %v = call <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -169,7 +180,8 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}} +; GFX6789: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}} +; GFX10: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe ; ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %slice) { @@ -182,7 +194,8 @@ main_body: } ; GCN-LABEL: {{^}}load_2darray: -; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +; GFX6789: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +; GFX10: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm ; define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) { main_body: %v = call <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -200,7 +213,8 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}} +; GFX6789: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}} +; GFX10: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) { @@ -213,7 +227,8 @@ main_body: } ; GCN-LABEL: {{^}}load_2dmsaa: -; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}} +; GFX6789: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}} +; GFX10: image_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm ; define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) { main_body: %v = call <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -231,7 +246,8 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}} +; GFX6789: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}} +; GFX10: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) { @@ -244,7 +260,8 @@ main_body: } ; GCN-LABEL: {{^}}load_2darraymsaa: -; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +; GFX6789: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +; GFX10: image_load v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm ; define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) { main_body: %v = call <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -262,7 +279,8 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}} +; GFX6789: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}} +; GFX10: image_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) { @@ -275,7 +293,8 @@ main_body: } ; GCN-LABEL: {{^}}load_mip_1d: -; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}} +; GFX6789: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}} +; GFX10: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, i32 %s, i32 %mip) { main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -293,7 +312,8 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load_mip v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe{{$}} +; GFX6789: image_load_mip v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe{{$}} +; GFX10: image_load_mip v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %mip) { @@ -306,7 +326,8 @@ main_body: } ; GCN-LABEL: {{^}}load_mip_2d: -; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}} +; GFX6789: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}} +; GFX10: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) { main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -324,7 +345,8 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 ; NOPRT-NOT: v_mov_b32_e32 v3 -; GCN: image_load_mip v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}} +; GFX6789: image_load_mip v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}} +; GFX10: image_load_mip v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; ; SIVI: buffer_store_dword v4, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %mip) { @@ -432,7 +454,8 @@ main_body: ; NOPRT-NOT: v_mov_b32_e32 v0 ; NOPRT-NOT: v_mov_b32_e32 v1 ; NOPRT-NOT: v_mov_b32_e32 v2 -; GCN: image_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x7 unorm tfe{{$}} +; GFX6789: image_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x7 unorm tfe{{$}} +; GFX10: image_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe ; ; SIVI: buffer_store_dword v3, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v3 define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { @@ -451,7 +474,8 @@ main_body: ; NOPRT: v_mov_b32_e32 v2, 0 ; NOPRT-NOT: v_mov_b32_e32 v0 ; NOPRT-NOT: v_mov_b32_e32 v1 -; GCN: image_load v[0:2], v{{[0-9]+}}, s[0:7] dmask:0x6 unorm tfe{{$}} +; GFX6789: image_load v[0:2], v{{[0-9]+}}, s[0:7] dmask:0x6 unorm tfe{{$}} +; GFX10: image_load v[0:2], v{{[0-9]+}}, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm tfe ; ; SIVI: buffer_store_dword v2, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v2 define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { @@ -468,7 +492,8 @@ main_body: ; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 ; NOPRT: v_mov_b32_e32 v1, 0 ; NOPRT-NOT: v_mov_b32_e32 v0 -; GCN: image_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 unorm tfe{{$}} +; GFX6789: image_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 unorm tfe{{$}} +; GFX10: image_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; ; SIVI: buffer_store_dword v1, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v1 define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { @@ -485,7 +510,8 @@ main_body: ; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 ; NOPRT: v_mov_b32_e32 v1, 0 ; NOPRT-NOT: v_mov_b32_e32 v0 -; GCN: image_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 unorm tfe{{$}} +; GFX6789: image_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 unorm tfe{{$}} +; GFX10: image_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; ; SIVI: buffer_store_dword v1, off, s[8:11], 0 ; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v1 define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { @@ -499,7 +525,8 @@ main_body: ; GCN-LABEL: {{^}}load_mip_3d: -; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}} +; GFX6789: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}} +; GFX10: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm ; define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r, i32 %mip) { main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -507,7 +534,8 @@ main_body: } ; GCN-LABEL: {{^}}load_mip_cube: -; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +; GFX6789: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +; GFX10: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm ; define amdgpu_ps <4 x float> @load_mip_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) { main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -515,7 +543,8 @@ main_body: } ; GCN-LABEL: {{^}}load_mip_1darray: -; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +; GFX6789: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +; GFX10: image_load_mip v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm ; define amdgpu_ps <4 x float> @load_mip_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice, i32 %mip) { main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -523,7 +552,8 @@ main_body: } ; GCN-LABEL: {{^}}load_mip_2darray: -; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +; GFX6789: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} +; GFX10: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm ; define amdgpu_ps <4 x float> @load_mip_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %mip) { main_body: %v = call <4 x float> @llvm.amdgcn.image.load.mip.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -531,7 +561,8 @@ main_body: } ; GCN-LABEL: {{^}}store_1d: -; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm{{$}} +; GFX6789: image_store v[0:3], v4, s[0:7] dmask:0xf unorm{{$}} +; GFX10: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; define amdgpu_ps void @store_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) { main_body: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -539,7 +570,8 @@ main_body: } ; GCN-LABEL: {{^}}store_2d: -; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm{{$}} +; GFX6789: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm{{$}} +; GFX10: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; define amdgpu_ps void @store_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t) { main_body: call void @llvm.amdgcn.image.store.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -547,7 +579,8 @@ main_body: } ; GCN-LABEL: {{^}}store_3d: -; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}} +; GFX6789: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}} +; GFX10: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm ; define amdgpu_ps void @store_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %r) { main_body: call void @llvm.amdgcn.image.store.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) @@ -555,7 +588,8 @@ main_body: } ; GCN-LABEL: {{^}}store_cube: -; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}} +; GFX6789: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}} +; GFX10: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm ; define amdgpu_ps void @store_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice) { main_body: call void @llvm.amdgcn.image.store.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -563,7 +597,8 @@ main_body: } ; GCN-LABEL: {{^}}store_1darray: -; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm da{{$}} +; GFX6789: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm da{{$}} +; GFX10: image_store v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm ; define amdgpu_ps void @store_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %slice) { main_body: call void @llvm.amdgcn.image.store.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -571,7 +606,8 @@ main_body: } ; GCN-LABEL: {{^}}store_2darray: -; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}} +; GFX6789: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}} +; GFX10: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm ; define amdgpu_ps void @store_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice) { main_body: call void @llvm.amdgcn.image.store.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) @@ -579,7 +615,8 @@ main_body: } ; GCN-LABEL: {{^}}store_2dmsaa: -; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}} +; GFX6789: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}} +; GFX10: image_store v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm ; define amdgpu_ps void @store_2dmsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %fragid) { main_body: call void @llvm.amdgcn.image.store.2dmsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -587,7 +624,8 @@ main_body: } ; GCN-LABEL: {{^}}store_2darraymsaa: -; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}} +; GFX6789: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}} +; GFX10: image_store v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm ; define amdgpu_ps void @store_2darraymsaa(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %fragid) { main_body: call void @llvm.amdgcn.image.store.2darraymsaa.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) @@ -595,7 +633,8 @@ main_body: } ; GCN-LABEL: {{^}}store_mip_1d: -; GCN: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm{{$}} +; GFX6789: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf unorm{{$}} +; GFX10: image_store_mip v[0:3], v[4:5], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; define amdgpu_ps void @store_mip_1d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %mip) { main_body: call void @llvm.amdgcn.image.store.mip.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -603,7 +642,8 @@ main_body: } ; GCN-LABEL: {{^}}store_mip_2d: -; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}} +; GFX6789: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}} +; GFX10: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; define amdgpu_ps void @store_mip_2d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %mip) { main_body: call void @llvm.amdgcn.image.store.mip.2d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -611,7 +651,8 @@ main_body: } ; GCN-LABEL: {{^}}store_mip_3d: -; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}} +; GFX6789: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm{{$}} +; GFX10: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm ; define amdgpu_ps void @store_mip_3d(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %r, i32 %mip) { main_body: call void @llvm.amdgcn.image.store.mip.3d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %r, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -619,7 +660,8 @@ main_body: } ; GCN-LABEL: {{^}}store_mip_cube: -; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}} +; GFX6789: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}} +; GFX10: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm ; define amdgpu_ps void @store_mip_cube(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %mip) { main_body: call void @llvm.amdgcn.image.store.mip.cube.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -627,7 +669,8 @@ main_body: } ; GCN-LABEL: {{^}}store_mip_1darray: -; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}} +; GFX6789: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}} +; GFX10: image_store_mip v[0:3], v[4:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm ; define amdgpu_ps void @store_mip_1darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %slice, i32 %mip) { main_body: call void @llvm.amdgcn.image.store.mip.1darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -635,7 +678,8 @@ main_body: } ; GCN-LABEL: {{^}}store_mip_2darray: -; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}} +; GFX6789: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm da{{$}} +; GFX10: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm ; define amdgpu_ps void @store_mip_2darray(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s, i32 %t, i32 %slice, i32 %mip) { main_body: call void @llvm.amdgcn.image.store.mip.2darray.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, i32 %t, i32 %slice, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -643,7 +687,8 @@ main_body: } ; GCN-LABEL: {{^}}getresinfo_1d: -; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}} +; GFX6789: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}} +; GFX10: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; define amdgpu_ps <4 x float> @getresinfo_1d(<8 x i32> inreg %rsrc, i32 %mip) { main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -651,7 +696,8 @@ main_body: } ; GCN-LABEL: {{^}}getresinfo_2d: -; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}} +; GFX6789: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}} +; GFX10: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; define amdgpu_ps <4 x float> @getresinfo_2d(<8 x i32> inreg %rsrc, i32 %mip) { main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -659,7 +705,8 @@ main_body: } ; GCN-LABEL: {{^}}getresinfo_3d: -; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}} +; GFX6789: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}} +; GFX10: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm ; define amdgpu_ps <4 x float> @getresinfo_3d(<8 x i32> inreg %rsrc, i32 %mip) { main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.3d.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -667,7 +714,8 @@ main_body: } ; GCN-LABEL: {{^}}getresinfo_cube: -; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}} +; GFX6789: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}} +; GFX10: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm ; define amdgpu_ps <4 x float> @getresinfo_cube(<8 x i32> inreg %rsrc, i32 %mip) { main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.cube.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -675,7 +723,8 @@ main_body: } ; GCN-LABEL: {{^}}getresinfo_1darray: -; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}} +; GFX6789: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}} +; GFX10: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm ; define amdgpu_ps <4 x float> @getresinfo_1darray(<8 x i32> inreg %rsrc, i32 %mip) { main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.1darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -683,7 +732,8 @@ main_body: } ; GCN-LABEL: {{^}}getresinfo_2darray: -; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}} +; GFX6789: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}} +; GFX10: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm ; define amdgpu_ps <4 x float> @getresinfo_2darray(<8 x i32> inreg %rsrc, i32 %mip) { main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darray.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -691,7 +741,8 @@ main_body: } ; GCN-LABEL: {{^}}getresinfo_2dmsaa: -; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}} +; GFX6789: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm{{$}} +; GFX10: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm ; define amdgpu_ps <4 x float> @getresinfo_2dmsaa(<8 x i32> inreg %rsrc, i32 %mip) { main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2dmsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -699,7 +750,8 @@ main_body: } ; GCN-LABEL: {{^}}getresinfo_2darraymsaa: -; GCN: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}} +; GFX6789: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf unorm da{{$}} +; GFX10: image_get_resinfo v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm ; define amdgpu_ps <4 x float> @getresinfo_2darraymsaa(<8 x i32> inreg %rsrc, i32 %mip) { main_body: %v = call <4 x float> @llvm.amdgcn.image.getresinfo.2darraymsaa.v4f32.i32(i32 15, i32 %mip, <8 x i32> %rsrc, i32 0, i32 0) @@ -707,7 +759,8 @@ main_body: } ; GCN-LABEL: {{^}}load_1d_V1: -; GCN: image_load v0, v0, s[0:7] dmask:0x8 unorm{{$}} +; GFX6789: image_load v0, v0, s[0:7] dmask:0x8 unorm{{$}} +; GFX10: image_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm ; define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, i32 %s) { main_body: %v = call float @llvm.amdgcn.image.load.1d.f32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -715,7 +768,8 @@ main_body: } ; GCN-LABEL: {{^}}load_1d_V2: -; GCN: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm{{$}} +; GFX6789: image_load v[0:1], v0, s[0:7] dmask:0x9 unorm{{$}} +; GFX10: image_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D unorm ; define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, i32 %s) { main_body: %v = call <2 x float> @llvm.amdgcn.image.load.1d.v2f32.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -723,7 +777,8 @@ main_body: } ; GCN-LABEL: {{^}}store_1d_V1: -; GCN: image_store v0, v1, s[0:7] dmask:0x2 unorm{{$}} +; GFX6789: image_store v0, v1, s[0:7] dmask:0x2 unorm{{$}} +; GFX10: image_store v0, v1, s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_1D unorm ; define amdgpu_ps void @store_1d_V1(<8 x i32> inreg %rsrc, float %vdata, i32 %s) { main_body: call void @llvm.amdgcn.image.store.1d.f32.i32(float %vdata, i32 2, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -731,7 +786,8 @@ main_body: } ; GCN-LABEL: {{^}}store_1d_V2: -; GCN: image_store v[0:1], v2, s[0:7] dmask:0xc unorm{{$}} +; GFX6789: image_store v[0:1], v2, s[0:7] dmask:0xc unorm{{$}} +; GFX10: image_store v[0:1], v2, s[0:7] dmask:0xc dim:SQ_RSRC_IMG_1D unorm ; define amdgpu_ps void @store_1d_V2(<8 x i32> inreg %rsrc, <2 x float> %vdata, i32 %s) { main_body: call void @llvm.amdgcn.image.store.1d.v2f32.i32(<2 x float> %vdata, i32 12, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -739,7 +795,8 @@ main_body: } ; GCN-LABEL: {{^}}load_1d_glc: -; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc{{$}} +; GFX6789: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc{{$}} +; GFX10: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc ; define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, i32 %s) { main_body: %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1) @@ -747,7 +804,8 @@ main_body: } ; GCN-LABEL: {{^}}load_1d_slc: -; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc{{$}} +; GFX6789: image_load v[0:3], v0, s[0:7] dmask:0xf unorm slc{{$}} +; GFX10: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc ; define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, i32 %s) { main_body: %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) @@ -755,7 +813,8 @@ main_body: } ; GCN-LABEL: {{^}}load_1d_glc_slc: -; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc{{$}} +; GFX6789: image_load v[0:3], v0, s[0:7] dmask:0xf unorm glc slc{{$}} +; GFX10: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc ; define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, i32 %s) { main_body: %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3) @@ -763,7 +822,8 @@ main_body: } ; GCN-LABEL: {{^}}store_1d_glc: -; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc{{$}} +; GFX6789: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc{{$}} +; GFX10: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc ; define amdgpu_ps void @store_1d_glc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) { main_body: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1) @@ -771,7 +831,8 @@ main_body: } ; GCN-LABEL: {{^}}store_1d_slc: -; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc{{$}} +; GFX6789: image_store v[0:3], v4, s[0:7] dmask:0xf unorm slc{{$}} +; GFX10: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc ; define amdgpu_ps void @store_1d_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) { main_body: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) @@ -779,7 +840,8 @@ main_body: } ; GCN-LABEL: {{^}}store_1d_glc_slc: -; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc{{$}} +; GFX6789: image_store v[0:3], v4, s[0:7] dmask:0xf unorm glc slc{{$}} +; GFX10: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc ; define amdgpu_ps void @store_1d_glc_slc(<8 x i32> inreg %rsrc, <4 x float> %vdata, i32 %s) { main_body: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %vdata, i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3) @@ -798,11 +860,11 @@ main_body: ; Ideally, the register allocator would avoid the wait here ; ; GCN-LABEL: {{^}}image_store_wait: -; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm +; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf ; SI: s_waitcnt expcnt(0) -; GCN: image_load v[0:3], v4, s[8:15] dmask:0xf unorm +; GCN: image_load v[0:3], v4, s[8:15] dmask:0xf ; GCN: s_waitcnt vmcnt(0) -; GCN: image_store v[0:3], v4, s[16:23] dmask:0xf unorm +; GCN: image_store v[0:3], v4, s[16:23] dmask:0xf define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg %arg1, <8 x i32> inreg %arg2, <4 x float> %arg3, i32 %arg4) #0 { main_body: call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %arg3, i32 15, i32 %arg4, <8 x i32> %arg, i32 0, i32 0) @@ -812,10 +874,10 @@ main_body: } ; SI won't merge ds memory operations, because of the signed offset bug, so -; we only have check lines for VI. -; VI-LABEL: image_load_mmo -; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 +; we only have check lines for VI+. +; GFX8910-LABEL: image_load_mmo +; GFX8910: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; GFX8910: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4 define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, float addrspace(3)* %lds, <2 x i32> %c) #0 { store float 0.000000e+00, float addrspace(3)* %lds %c0 = extractelement <2 x i32> %c, i32 0 diff --git a/test/CodeGen/AMDGPU/nsa-reassign.ll b/test/CodeGen/AMDGPU/nsa-reassign.ll new file mode 100644 index 00000000000..a668a19c2f0 --- /dev/null +++ b/test/CodeGen/AMDGPU/nsa-reassign.ll @@ -0,0 +1,102 @@ +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}sample_contig_nsa: +; GCN-DAG: image_sample_c_l v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], +; GCN-DAG: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], +define amdgpu_ps <2 x float> @sample_contig_nsa(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) { +main_body: + %zcompare.1 = fadd float %zcompare, 1.0 + %s1.1 = fadd float %s1, 1.0 + %t1.1 = fadd float %t1, 1.0 + %r1.1 = fadd float %r1, 1.0 + %s2.1 = fadd float %s2, 1.0 + %t2.1 = fadd float %t2, 1.0 + %r2.1 = fadd float %r2, 1.0 + %lod.1 = fadd float %lod, 1.0 + %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare.1, float %s1.1, float %t1.1, float %r1.1, float %lod.1, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v2 = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2.1, float %t2.1, float %r2.1, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %r.0 = insertelement <2 x float> undef, float %v1, i32 0 + %r = insertelement <2 x float> %r.0, float %v2, i32 1 + ret <2 x float> %r +} + +; GCN-LABEL: {{^}}sample_contig_nsa_10vgprs: +; GCN-DAG: image_sample_c_l v{{[0-9]+}}, [{{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}], +; GCN-DAG: image_sample v{{[0-9]+}}, [{{v[0-9]+, v[0-9]+, v[0-9]+}}], +define amdgpu_ps <2 x float> @sample_contig_nsa_10vgprs(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) #0 { +main_body: + %zcompare.1 = fadd float %zcompare, 1.0 + %s1.1 = fadd float %s1, 1.0 + %t1.1 = fadd float %t1, 1.0 + %r1.1 = fadd float %r1, 1.0 + %s2.1 = fadd float %s2, 1.0 + %t2.1 = fadd float %t2, 1.0 + %r2.1 = fadd float %r2, 1.0 + %lod.1 = fadd float %lod, 1.0 + %v1 = call float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32 1, float %zcompare.1, float %s1.1, float %t1.1, float %r1.1, float %lod.1, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v2 = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2.1, float %t2.1, float %r2.1, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %r.0 = insertelement <2 x float> undef, float %v1, i32 0 + %r = insertelement <2 x float> %r.0, float %v2, i32 1 + ret <2 x float> %r +} + +; GCN-LABEL: {{^}}sample_contig_nsa_conflict: +; GCN-DAG: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], +; GCN-DAG: image_sample v{{[0-9]+}}, [{{v[0-9]+, v[0-9]+, v[0-9]+}}], +define amdgpu_ps <2 x float> @sample_contig_nsa_conflict(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) { +main_body: + %zcompare.1 = fadd float %zcompare, 1.0 + %s1.1 = fadd float %s1, 1.0 + %t1.1 = fadd float %t1, 1.0 + %r1.1 = fadd float %r1, 1.0 + %s2.1 = fadd float %s2, 1.0 + %t2.1 = fadd float %t2, 1.0 + %r2.1 = fadd float %r2, 1.0 + %lod.1 = fadd float %lod, 1.0 + %v2 = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2.1, float %t2.1, float %r2.1, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %v1 = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %t2.1, float %s2.1, float %r2.1, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %r.0 = insertelement <2 x float> undef, float %v1, i32 0 + %r = insertelement <2 x float> %r.0, float %v2, i32 1 + ret <2 x float> %r +} + +; GCN-LABEL: {{^}}sample_contig_nsa_same_addr: +; GCN-DAG: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], +; GCN-DAG: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], +define amdgpu_ps <2 x float> @sample_contig_nsa_same_addr(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) { +main_body: + %zcompare.1 = fadd float %zcompare, 1.0 + %s1.1 = fadd float %s1, 1.0 + %t1.1 = fadd float %t1, 1.0 + %r1.1 = fadd float %r1, 1.0 + %s2.1 = fadd float %s2, 1.0 + %t2.1 = fadd float %t2, 1.0 + %r2.1 = fadd float %r2, 1.0 + %lod.1 = fadd float %lod, 1.0 + %v2 = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2.1, float %t2.1, float %r2.1, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 1) + %v1 = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %s2.1, float %t2.1, float %r2.1, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + %r.0 = insertelement <2 x float> undef, float %v1, i32 0 + %r = insertelement <2 x float> %r.0, float %v2, i32 1 + ret <2 x float> %r +} + +; GCN-LABEL: {{^}}sample_contig_nsa_same_reg: +; GCN-DAG: image_sample v{{[0-9]+}}, [{{v[0-9]+, v[0-9]+, v[0-9]+}}], +define amdgpu_ps float @sample_contig_nsa_same_reg(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %s1, float %t1, float %r1, float %lod, float %r2, float %s2, float %t2) { +main_body: + %zcompare.1 = fadd float %zcompare, 1.0 + %s1.1 = fadd float %s1, 1.0 + %t1.1 = fadd float %t1, 1.0 + %r1.1 = fadd float %r1, 1.0 + %s2.1 = fadd float %s2, 1.0 + %t2.1 = fadd float %t2, 1.0 + %r2.1 = fadd float %r2, 1.0 + %lod.1 = fadd float %lod, 1.0 + %v = call float @llvm.amdgcn.image.sample.3d.f32.f32(i32 1, float %t2.1, float %t2.1, float %r2.1, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0) + ret float %v +} + +declare float @llvm.amdgcn.image.sample.3d.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) +declare float @llvm.amdgcn.image.sample.c.l.3d.f32.f32(i32, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) + +attributes #0 = {"amdgpu-num-vgpr"="10"}