mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-02-03 19:02:35 +00:00
Add NVPTXPeephole pass to reduce unnecessary address cast
Summary: This patch first changes the register that holds the local address for the stack frame to %SPL. Then the new NVPTXPeephole pass will try to scan the following pattern %vreg0<def> = LEA_ADDRi64 <fi#0>, 4 %vreg1<def> = cvta_to_local %vreg0 and transform it into %vreg1<def> = LEA_ADDRi64 %VRFrameLocal, 4 Patched by Xuetian Weng Test Plan: test/CodeGen/NVPTX/local-stack-frame.ll Reviewers: jholewinski, jingyue Reviewed By: jingyue Subscribers: eliben, jholewinski, llvm-commits Differential Revision: http://reviews.llvm.org/D10549 llvm-svn: 240587
This commit is contained in:
parent
2663b7baad
commit
7ff7eb51d2
@ -22,6 +22,7 @@ set(NVPTXCodeGen_sources
|
|||||||
NVPTXLowerAggrCopies.cpp
|
NVPTXLowerAggrCopies.cpp
|
||||||
NVPTXLowerKernelArgs.cpp
|
NVPTXLowerKernelArgs.cpp
|
||||||
NVPTXLowerAlloca.cpp
|
NVPTXLowerAlloca.cpp
|
||||||
|
NVPTXPeephole.cpp
|
||||||
NVPTXMCExpr.cpp
|
NVPTXMCExpr.cpp
|
||||||
NVPTXPrologEpilogPass.cpp
|
NVPTXPrologEpilogPass.cpp
|
||||||
NVPTXRegisterInfo.cpp
|
NVPTXRegisterInfo.cpp
|
||||||
|
@ -71,6 +71,7 @@ MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
|
|||||||
FunctionPass *createNVPTXImageOptimizerPass();
|
FunctionPass *createNVPTXImageOptimizerPass();
|
||||||
FunctionPass *createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM);
|
FunctionPass *createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM);
|
||||||
BasicBlockPass *createNVPTXLowerAllocaPass();
|
BasicBlockPass *createNVPTXLowerAllocaPass();
|
||||||
|
MachineFunctionPass *createNVPTXPeephole();
|
||||||
|
|
||||||
bool isImageOrSamplerVal(const Value *, const Module *);
|
bool isImageOrSamplerVal(const Value *, const Module *);
|
||||||
|
|
||||||
|
@ -36,33 +36,40 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF,
|
|||||||
if (MF.getFrameInfo()->hasStackObjects()) {
|
if (MF.getFrameInfo()->hasStackObjects()) {
|
||||||
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
|
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
|
||||||
// Insert "mov.u32 %SP, %Depot"
|
// Insert "mov.u32 %SP, %Depot"
|
||||||
MachineBasicBlock::iterator MBBI = MBB.begin();
|
MachineInstr *MI = MBB.begin();
|
||||||
|
MachineRegisterInfo &MR = MF.getRegInfo();
|
||||||
|
|
||||||
// This instruction really occurs before first instruction
|
// This instruction really occurs before first instruction
|
||||||
// in the BB, so giving it no debug location.
|
// in the BB, so giving it no debug location.
|
||||||
DebugLoc dl = DebugLoc();
|
DebugLoc dl = DebugLoc();
|
||||||
|
|
||||||
MachineRegisterInfo &MRI = MF.getRegInfo();
|
|
||||||
|
|
||||||
// mov %SPL, %depot;
|
// mov %SPL, %depot;
|
||||||
// cvta.local %SP, %SPL;
|
// cvta.local %SP, %SPL;
|
||||||
if (static_cast<const NVPTXTargetMachine &>(MF.getTarget()).is64Bit()) {
|
if (static_cast<const NVPTXTargetMachine &>(MF.getTarget()).is64Bit()) {
|
||||||
unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int64RegsRegClass);
|
// Check if %SP is actually used
|
||||||
MachineInstr *MI =
|
if (MR.hasOneNonDBGUse(NVPTX::VRFrame)) {
|
||||||
BuildMI(MBB, MBBI, dl, MF.getSubtarget().getInstrInfo()->get(
|
MI = BuildMI(MBB, MI, dl, MF.getSubtarget().getInstrInfo()->get(
|
||||||
NVPTX::cvta_local_yes_64),
|
NVPTX::cvta_local_yes_64),
|
||||||
NVPTX::VRFrame).addReg(LocalReg);
|
NVPTX::VRFrame)
|
||||||
|
.addReg(NVPTX::VRFrameLocal);
|
||||||
|
}
|
||||||
|
|
||||||
BuildMI(MBB, MI, dl,
|
BuildMI(MBB, MI, dl,
|
||||||
MF.getSubtarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64),
|
MF.getSubtarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64),
|
||||||
LocalReg).addImm(MF.getFunctionNumber());
|
NVPTX::VRFrameLocal)
|
||||||
|
.addImm(MF.getFunctionNumber());
|
||||||
} else {
|
} else {
|
||||||
unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int32RegsRegClass);
|
// Check if %SP is actually used
|
||||||
MachineInstr *MI =
|
if (MR.hasOneNonDBGUse(NVPTX::VRFrame)) {
|
||||||
BuildMI(MBB, MBBI, dl,
|
MI = BuildMI(MBB, MI, dl, MF.getSubtarget().getInstrInfo()->get(
|
||||||
MF.getSubtarget().getInstrInfo()->get(NVPTX::cvta_local_yes),
|
NVPTX::cvta_local_yes),
|
||||||
NVPTX::VRFrame).addReg(LocalReg);
|
NVPTX::VRFrame)
|
||||||
|
.addReg(NVPTX::VRFrameLocal);
|
||||||
|
}
|
||||||
BuildMI(MBB, MI, dl,
|
BuildMI(MBB, MI, dl,
|
||||||
MF.getSubtarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR),
|
MF.getSubtarget().getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR),
|
||||||
LocalReg).addImm(MF.getFunctionNumber());
|
NVPTX::VRFrameLocal)
|
||||||
|
.addImm(MF.getFunctionNumber());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
151
lib/Target/NVPTX/NVPTXPeephole.cpp
Normal file
151
lib/Target/NVPTX/NVPTXPeephole.cpp
Normal file
@ -0,0 +1,151 @@
|
|||||||
|
//===-- NVPTXPeephole.cpp - NVPTX Peephole Optimizations ------------------===//
|
||||||
|
//
|
||||||
|
// The LLVM Compiler Infrastructure
|
||||||
|
//
|
||||||
|
// This file is distributed under the University of Illinois Open Source
|
||||||
|
// License. See LICENSE.TXT for details.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
// In NVPTX, NVPTXFrameLowering will emit following instruction at the beginning
|
||||||
|
// of a MachineFunction.
|
||||||
|
//
|
||||||
|
// mov %SPL, %depot
|
||||||
|
// cvta.local %SP, %SPL
|
||||||
|
//
|
||||||
|
// Because Frame Index is a generic address and alloca can only return generic
|
||||||
|
// pointer, without this pass the instructions producing alloca'ed address will
|
||||||
|
// be based on %SP. NVPTXLowerAlloca tends to help replace store and load on
|
||||||
|
// this address with their .local versions, but this may introduce a lot of
|
||||||
|
// cvta.to.local instructions. Performance can be improved if we avoid casting
|
||||||
|
// address back and forth and directly calculate local address based on %SPL.
|
||||||
|
// This peephole pass optimizes these cases, for example
|
||||||
|
//
|
||||||
|
// It will transform the following pattern
|
||||||
|
// %vreg0<def> = LEA_ADDRi64 <fi#0>, 4
|
||||||
|
// %vreg1<def> = cvta_to_local_yes_64 %vreg0
|
||||||
|
//
|
||||||
|
// into
|
||||||
|
// %vreg1<def> = LEA_ADDRi64 %VRFrameLocal, 4
|
||||||
|
//
|
||||||
|
// %VRFrameLocal is the virtual register name of %SPL
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include "NVPTX.h"
|
||||||
|
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||||
|
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||||
|
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||||
|
#include "llvm/CodeGen/MachineFrameInfo.h"
|
||||||
|
#include "llvm/Target/TargetRegisterInfo.h"
|
||||||
|
#include "llvm/Target/TargetInstrInfo.h"
|
||||||
|
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
|
#define DEBUG_TYPE "nvptx-peephole"
|
||||||
|
|
||||||
|
namespace llvm {
|
||||||
|
void initializeNVPTXPeepholePass(PassRegistry &);
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
struct NVPTXPeephole : public MachineFunctionPass {
|
||||||
|
public:
|
||||||
|
static char ID;
|
||||||
|
NVPTXPeephole() : MachineFunctionPass(ID) {
|
||||||
|
initializeNVPTXPeepholePass(*PassRegistry::getPassRegistry());
|
||||||
|
}
|
||||||
|
|
||||||
|
bool runOnMachineFunction(MachineFunction &MF) override;
|
||||||
|
|
||||||
|
const char *getPassName() const override {
|
||||||
|
return "NVPTX optimize redundant cvta.to.local instruction";
|
||||||
|
}
|
||||||
|
|
||||||
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||||
|
MachineFunctionPass::getAnalysisUsage(AU);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
char NVPTXPeephole::ID = 0;
|
||||||
|
|
||||||
|
INITIALIZE_PASS(NVPTXPeephole, "nvptx-peephole", "NVPTX Peephole", false, false)
|
||||||
|
|
||||||
|
static bool isCVTAToLocalCombinationCandidate(MachineInstr &Root) {
|
||||||
|
auto &MBB = *Root.getParent();
|
||||||
|
auto &MF = *MBB.getParent();
|
||||||
|
// Check current instruction is cvta.to.local
|
||||||
|
if (Root.getOpcode() != NVPTX::cvta_to_local_yes_64 &&
|
||||||
|
Root.getOpcode() != NVPTX::cvta_to_local_yes)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
auto &Op = Root.getOperand(1);
|
||||||
|
const auto &MRI = MF.getRegInfo();
|
||||||
|
MachineInstr *GenericAddrDef = nullptr;
|
||||||
|
if (Op.isReg() && TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
|
||||||
|
GenericAddrDef = MRI.getUniqueVRegDef(Op.getReg());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check the register operand is uniquely defined by LEA_ADDRi instruction
|
||||||
|
if (!GenericAddrDef || GenericAddrDef->getParent() != &MBB ||
|
||||||
|
(GenericAddrDef->getOpcode() != NVPTX::LEA_ADDRi64 &&
|
||||||
|
GenericAddrDef->getOpcode() != NVPTX::LEA_ADDRi)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check the LEA_ADDRi operand is Frame index
|
||||||
|
auto &BaseAddrOp = GenericAddrDef->getOperand(1);
|
||||||
|
if (BaseAddrOp.getType() == MachineOperand::MO_FrameIndex) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void CombineCVTAToLocal(MachineInstr &Root) {
|
||||||
|
auto &MBB = *Root.getParent();
|
||||||
|
auto &MF = *MBB.getParent();
|
||||||
|
const auto &MRI = MF.getRegInfo();
|
||||||
|
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
|
||||||
|
auto &Prev = *MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
|
||||||
|
|
||||||
|
// Get the correct offset
|
||||||
|
int FrameIndex = Prev.getOperand(1).getIndex();
|
||||||
|
int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
|
||||||
|
Prev.getOperand(2).getImm();
|
||||||
|
|
||||||
|
MachineInstrBuilder MIB =
|
||||||
|
BuildMI(MF, Root.getDebugLoc(), TII->get(Prev.getOpcode()),
|
||||||
|
Root.getOperand(0).getReg())
|
||||||
|
.addReg(NVPTX::VRFrameLocal)
|
||||||
|
.addOperand(MachineOperand::CreateImm(Offset));
|
||||||
|
|
||||||
|
MBB.insert((MachineBasicBlock::iterator)&Root, MIB);
|
||||||
|
|
||||||
|
// Check if MRI has only one non dbg use, which is Root
|
||||||
|
if (MRI.hasOneNonDBGUse(Prev.getOperand(0).getReg())) {
|
||||||
|
Prev.eraseFromParentAndMarkDBGValuesForRemoval();
|
||||||
|
}
|
||||||
|
Root.eraseFromParentAndMarkDBGValuesForRemoval();
|
||||||
|
}
|
||||||
|
|
||||||
|
bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) {
|
||||||
|
bool Changed = false;
|
||||||
|
// Loop over all of the basic blocks.
|
||||||
|
for (auto &MBB : MF) {
|
||||||
|
// Traverse the basic block.
|
||||||
|
auto BlockIter = MBB.begin();
|
||||||
|
|
||||||
|
while (BlockIter != MBB.end()) {
|
||||||
|
auto &MI = *BlockIter++;
|
||||||
|
if (isCVTAToLocalCombinationCandidate(MI)) {
|
||||||
|
CombineCVTAToLocal(MI);
|
||||||
|
Changed = true;
|
||||||
|
}
|
||||||
|
} // Instruction
|
||||||
|
} // Basic Block
|
||||||
|
return Changed;
|
||||||
|
}
|
||||||
|
|
||||||
|
MachineFunctionPass *llvm::createNVPTXPeephole() { return new NVPTXPeephole(); }
|
@ -65,5 +65,5 @@ def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 4))>;
|
|||||||
def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>;
|
def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>;
|
||||||
|
|
||||||
// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
|
// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
|
||||||
def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot,
|
def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRFrameLocal, VRDepot,
|
||||||
(sequence "ENVREG%u", 0, 31))>;
|
(sequence "ENVREG%u", 0, 31))>;
|
||||||
|
@ -205,6 +205,8 @@ bool NVPTXPassConfig::addInstSelector() {
|
|||||||
if (!ST.hasImageHandles())
|
if (!ST.hasImageHandles())
|
||||||
addPass(createNVPTXReplaceImageHandlesPass());
|
addPass(createNVPTXReplaceImageHandlesPass());
|
||||||
|
|
||||||
|
addPass(createNVPTXPeephole());
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -20,8 +20,7 @@ entry:
|
|||||||
%buf = alloca [16 x i8], align 4
|
%buf = alloca [16 x i8], align 4
|
||||||
|
|
||||||
; CHECK: .local .align 4 .b8 __local_depot0[16]
|
; CHECK: .local .align 4 .b8 __local_depot0[16]
|
||||||
; CHECK: mov.u64 %rd[[BUF_REG:[0-9]+]]
|
; CHECK: mov.u64 %SPL
|
||||||
; CHECK: cvta.local.u64 %SP, %rd[[BUF_REG]]
|
|
||||||
|
|
||||||
; CHECK: ld.param.u64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0]
|
; CHECK: ld.param.u64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0]
|
||||||
; CHECK: cvta.to.global.u64 %rd[[A1_REG:[0-9]+]], %rd[[A_REG]]
|
; CHECK: cvta.to.global.u64 %rd[[A1_REG:[0-9]+]], %rd[[A_REG]]
|
||||||
|
@ -3,12 +3,12 @@
|
|||||||
|
|
||||||
; Ensure we access the local stack properly
|
; Ensure we access the local stack properly
|
||||||
|
|
||||||
; PTX32: mov.u32 %r{{[0-9]+}}, __local_depot{{[0-9]+}};
|
; PTX32: mov.u32 %SPL, __local_depot{{[0-9]+}};
|
||||||
; PTX32: cvta.local.u32 %SP, %r{{[0-9]+}};
|
; PTX32: cvta.local.u32 %SP, %SPL;
|
||||||
; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
|
; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
|
||||||
; PTX32: st.volatile.u32 [%SP+0], %r{{[0-9]+}};
|
; PTX32: st.volatile.u32 [%SP+0], %r{{[0-9]+}};
|
||||||
; PTX64: mov.u64 %rd{{[0-9]+}}, __local_depot{{[0-9]+}};
|
; PTX64: mov.u64 %SPL, __local_depot{{[0-9]+}};
|
||||||
; PTX64: cvta.local.u64 %SP, %rd{{[0-9]+}};
|
; PTX64: cvta.local.u64 %SP, %SPL;
|
||||||
; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
|
; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo_param_0];
|
||||||
; PTX64: st.volatile.u32 [%SP+0], %r{{[0-9]+}};
|
; PTX64: st.volatile.u32 [%SP+0], %r{{[0-9]+}};
|
||||||
define void @foo(i32 %a) {
|
define void @foo(i32 %a) {
|
||||||
@ -16,3 +16,43 @@ define void @foo(i32 %a) {
|
|||||||
store volatile i32 %a, i32* %local
|
store volatile i32 %a, i32* %local
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
; PTX32: mov.u32 %SPL, __local_depot{{[0-9]+}};
|
||||||
|
; PTX32: cvta.local.u32 %SP, %SPL;
|
||||||
|
; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo2_param_0];
|
||||||
|
; PTX32: add.u32 %r[[SP_REG:[0-9]+]], %SPL, 0;
|
||||||
|
; PTX32: st.local.u32 [%r[[SP_REG]]], %r{{[0-9]+}};
|
||||||
|
; PTX64: mov.u64 %SPL, __local_depot{{[0-9]+}};
|
||||||
|
; PTX64: cvta.local.u64 %SP, %SPL;
|
||||||
|
; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo2_param_0];
|
||||||
|
; PTX64: add.u64 %rd[[SP_REG:[0-9]+]], %SPL, 0;
|
||||||
|
; PTX64: st.local.u32 [%rd[[SP_REG]]], %r{{[0-9]+}};
|
||||||
|
define void @foo2(i32 %a) {
|
||||||
|
%local = alloca i32, align 4
|
||||||
|
store i32 %a, i32* %local
|
||||||
|
call void @bar(i32* %local)
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
declare void @bar(i32* %a)
|
||||||
|
|
||||||
|
!nvvm.annotations = !{!0}
|
||||||
|
!0 = !{void (i32)* @foo2, !"kernel", i32 1}
|
||||||
|
|
||||||
|
; PTX32: mov.u32 %SPL, __local_depot{{[0-9]+}};
|
||||||
|
; PTX32-NOT: cvta.local.u32 %SP, %SPL;
|
||||||
|
; PTX32: ld.param.u32 %r{{[0-9]+}}, [foo3_param_0];
|
||||||
|
; PTX32: add.u32 %r{{[0-9]+}}, %SPL, 0;
|
||||||
|
; PTX32: st.local.u32 [%r{{[0-9]+}}], %r{{[0-9]+}};
|
||||||
|
; PTX64: mov.u64 %SPL, __local_depot{{[0-9]+}};
|
||||||
|
; PTX64-NOT: cvta.local.u64 %SP, %SPL;
|
||||||
|
; PTX64: ld.param.u32 %r{{[0-9]+}}, [foo3_param_0];
|
||||||
|
; PTX64: add.u64 %rd{{[0-9]+}}, %SPL, 0;
|
||||||
|
; PTX64: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}};
|
||||||
|
define void @foo3(i32 %a) {
|
||||||
|
%local = alloca [3 x i32], align 4
|
||||||
|
%1 = bitcast [3 x i32]* %local to i32*
|
||||||
|
%2 = getelementptr inbounds i32, i32* %1, i32 %a
|
||||||
|
store i32 %a, i32* %2
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user