MachinePipeliner pass that implements Swing Modulo Scheduling

Software pipelining is an optimization for improving ILP by
overlapping loop iterations. Swing Modulo Scheduling (SMS) is
an implementation of software pipelining that attempts to
reduce register pressure and generate efficient pipelines with
a low compile-time cost.

This implementation of SMS is a target-independent back-end pass.
When enabled, the pass should run just prior to the register
allocation pass, while the machine IR is in SSA form. If the pass
is successful, then the original loop is replaced by the optimized
loop. The optimized loop contains one or more prolog blocks, the
pipelined kernel, and one or more epilog blocks.

This pass is enabled for Hexagon only. To enable for other targets,
a couple of target specific hooks must be implemented, and the
pass needs to be called from the target's TargetMachine
implementation.

Differential Review: http://reviews.llvm.org/D16829


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@277169 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Brendon Cahoon 2016-07-29 16:44:44 +00:00
parent a6ad276d07
commit c1359c9fbb
21 changed files with 4583 additions and 7 deletions

View File

@ -374,6 +374,9 @@ namespace llvm {
/// and propagates register usage information of callee to caller
/// if available with PysicalRegisterUsageInfo pass.
FunctionPass *createRegUsageInfoPropPass();
/// This pass performs software pipelining on machine instructions.
extern char &MachinePipelinerID;
} // End llvm namespace
/// Target machine pass initializer for passes with dependencies. Use with

View File

@ -223,6 +223,7 @@ void initializeMachineLegalizePassPass(PassRegistry&);
void initializeMachineLICMPass(PassRegistry&);
void initializeMachineLoopInfoPass(PassRegistry&);
void initializeMachineModuleInfoPass(PassRegistry&);
void initializeMachinePipelinerPass(PassRegistry&);
void initializeMachinePostDominatorTreePass(PassRegistry&);
void initializeMachineRegionInfoPassPass(PassRegistry&);
void initializeMachineSchedulerPass(PassRegistry&);

View File

@ -18,6 +18,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Target/TargetRegisterInfo.h"
@ -551,6 +552,26 @@ public:
llvm_unreachable("Target didn't implement TargetInstrInfo::InsertBranch!");
}
/// Analyze the loop code, return true if it cannot be understood. Upon
/// success, this function returns false and returns information about the
/// induction variable and compare instruction used at the end.
/// The default implementation always returns true (loop not understood) and
/// leaves \p IndVarInst and \p CmpInst unmodified.
virtual bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
MachineInstr *&CmpInst) const {
return true;
}
/// Generate code to reduce the loop iteration by one and check if the loop is
/// finished. Return the value/register of the new loop count. We need
/// this function when peeling off one or more iterations of a loop. This
/// function assumes the nth iteration is peeled first.
/// The default implementation is not callable: it aborts through
/// llvm_unreachable, so a target must override it before this hook is used.
virtual unsigned reduceLoopCount(MachineBasicBlock &MBB,
MachineInstr *IndVar, MachineInstr *Cmp,
SmallVectorImpl<MachineOperand> &Cond,
SmallVectorImpl<MachineInstr *> &PrevInsts,
unsigned Iter, unsigned MaxIter) const {
llvm_unreachable("Target didn't implement ReduceLoopCount");
}
/// Delete the instruction OldInst and everything after it, replacing it with
/// an unconditional branch to NewDest. This is used by the tail merging pass.
virtual void ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
@ -1009,6 +1030,20 @@ public:
return false;
}
/// Return true if the instruction contains a base register and offset. If
/// true, the function also sets the operand position in the instruction
/// for the base register and offset.
/// The default implementation returns false and writes neither \p BasePos
/// nor \p OffsetPos.
virtual bool getBaseAndOffsetPosition(const MachineInstr *MI,
unsigned &BasePos,
unsigned &OffsetPos) const {
return false;
}
/// If the instruction is an increment of a constant value, return the amount.
/// The amount is passed back through \p Value; the boolean result tells the
/// caller whether \p Value was set. The default implementation returns false
/// and leaves \p Value unmodified.
virtual bool getIncrementValue(const MachineInstr *MI, int &Value) const {
return false;
}
/// Target hook whose default implementation returns false.
/// NOTE(review): presumably tells the scheduler whether to cluster
/// neighboring loads -- confirm against the users of this hook.
virtual bool enableClusterLoads() const { return false; }
/// Target hook whose default implementation returns false.
/// NOTE(review): presumably tells the scheduler whether to cluster
/// neighboring stores -- confirm against the users of this hook.
virtual bool enableClusterStores() const { return false; }
@ -1041,6 +1076,10 @@ public:
/// Return the noop instruction to use for a noop.
virtual void getNoopForMachoTarget(MCInst &NopInst) const;
/// Return true for post-incremented instructions.
/// The default implementation returns false for every instruction; targets
/// that have post-increment instructions override it.
virtual bool isPostIncrement(const MachineInstr* MI) const {
return false;
}
/// Returns true if the instruction is already predicated.
virtual bool isPredicated(const MachineInstr &MI) const {

View File

@ -70,6 +70,7 @@ add_llvm_library(LLVMCodeGen
MachineModuleInfo.cpp
MachineModuleInfoImpls.cpp
MachinePassRegistry.cpp
MachinePipeliner.cpp
MachinePostDominators.cpp
MachineRegionInfo.cpp
MachineRegisterInfo.cpp

View File

@ -53,6 +53,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeMachineLICMPass(Registry);
initializeMachineLoopInfoPass(Registry);
initializeMachineModuleInfoPass(Registry);
initializeMachinePipelinerPass(Registry);
initializeMachinePostDominatorTreePass(Registry);
initializeMachineSchedulerPass(Registry);
initializeMachineSinkingPass(Registry);

File diff suppressed because it is too large Load Diff

View File

@ -660,6 +660,85 @@ unsigned HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB,
return 2;
}
/// Try to describe loop \p L by its induction variable and the compare that
/// terminates it. Only hardware loops are handled at present: when the first
/// terminator of the loop's bottom block is an ENDLOOP instruction,
/// \p CmpInst is set to that instruction, \p IndVarInst is set to null, and
/// false is returned. For every other loop the function returns true, which
/// means the loop could not be analyzed; the out-parameters are then left
/// untouched.
bool HexagonInstrInfo::analyzeLoop(MachineLoop &L,
                                   MachineInstr *&IndVarInst,
                                   MachineInstr *&CmpInst) const {
  MachineBasicBlock *Bottom = L.getBottomBlock();
  MachineBasicBlock::iterator Term = Bottom->getFirstTerminator();

  // Anything that does not end in an ENDLOOP is reported as not analyzable.
  if (Term == Bottom->end() || !isEndLoopN(Term->getOpcode()))
    return true;

  // Hardware loop: there is no explicit induction variable instruction.
  IndVarInst = nullptr;
  CmpInst = &*Term;
  return false;
}
/// Generate code to reduce the loop iteration by one and check if the loop is
/// finished. Return the value/register of the new loop count, or 0 when no
/// loop setup instruction can be found. This function assumes the nth
/// iteration is peeled first.
///
/// For a compile-time trip count the immediate on the loop instruction is
/// decremented in place (the instruction is erased once the count is 1 or
/// less). For a run-time trip count a compare and an add are appended to
/// \p MBB, \p PrevInsts is rewritten to use the decremented count, and
/// \p Cond receives the branch condition built from the new compare.
unsigned HexagonInstrInfo::reduceLoopCount(MachineBasicBlock &MBB,
      MachineInstr *IndVar, MachineInstr *Cmp,
      SmallVectorImpl<MachineOperand> &Cond,
      SmallVectorImpl<MachineInstr *> &PrevInsts,
      unsigned Iter, unsigned MaxIter) const {
  // Only hardware loops are expected here: a null induction variable with
  // the ENDLOOP instruction acting as the compare.
  assert((!IndVar) && isEndLoopN(Cmp->getOpcode())
                   && "Expecting a hardware loop");
  MachineFunction *MF = MBB.getParent();
  DebugLoc DL = Cmp->getDebugLoc();
  SmallPtrSet<MachineBasicBlock *, 8> Seen;
  MachineInstr *LoopSetup = findLoopInstr(&MBB, Cmp->getOpcode(), Seen);
  if (!LoopSetup)
    return 0;

  const unsigned SetupOpc = LoopSetup->getOpcode();

  // Compile-time trip count: patch the immediate instead of emitting code.
  if (SetupOpc == Hexagon::J2_loop0i || SetupOpc == Hexagon::J2_loop1i) {
    int64_t TripCount = LoopSetup->getOperand(1).getImm();
    if (TripCount <= 1)
      LoopSetup->eraseFromParent();
    else
      LoopSetup->getOperand(1).setImm(TripCount - 1);
    return TripCount - 1;
  }

  // Run-time trip count: emit code that subtracts one from the count and
  // update the loop instruction.
  assert(SetupOpc == Hexagon::J2_loop0r && "Unexpected instruction");
  unsigned OldCount = LoopSetup->getOperand(1).getReg();

  // Predicate that is set while the current count is greater than one.
  unsigned DonePred = createVR(MF, MVT::i1);
  MachineInstr *CmpMI = BuildMI(&MBB, DL, get(Hexagon::C2_cmpgtui), DonePred).
    addReg(OldCount).addImm(1);
  unsigned NewCount = createVR(MF, MVT::i32);
  MachineInstr *AddMI = BuildMI(&MBB, DL, get(Hexagon::A2_addi), NewCount).
    addReg(OldCount).addImm(-1);

  // Make the instructions produced by the previous call use the decremented
  // counter, then remember the ones created by this call.
  for (MachineInstr *Prev : PrevInsts)
    Prev->substituteRegister(OldCount, NewCount, 0, getRegisterInfo());
  PrevInsts.clear();
  PrevInsts.push_back(CmpMI);
  PrevInsts.push_back(AddMI);

  // The last decrement also emits the replacement loop instruction.
  if (Iter == MaxIter)
    BuildMI(&MBB, DL, get(Hexagon::J2_loop0r)).
      addMBB(LoopSetup->getOperand(0).getMBB()).addReg(NewCount);

  // The original loop instruction is removed on iteration zero.
  if (Iter == 0)
    LoopSetup->eraseFromParent();

  Cond.push_back(MachineOperand::CreateImm(Hexagon::J2_jumpf));
  Cond.push_back(CmpMI->getOperand(0));
  return NewCount;
}
bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
unsigned NumCycles, unsigned ExtraPredCycles,
@ -1592,6 +1671,22 @@ bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint(
}
/// Report the constant amount an instruction increments by through \p Value.
/// For a post-increment instruction the work is delegated to
/// getBaseAndOffset, and the result is true when that call returns a
/// non-zero value. For A2_addi the amount is the immediate in operand 2.
/// All other instructions yield false and leave \p Value untouched.
bool HexagonInstrInfo::getIncrementValue(const MachineInstr *MI,
                                         int &Value) const {
  if (!isPostIncrement(MI)) {
    if (MI->getOpcode() != Hexagon::A2_addi)
      return false;
    Value = MI->getOperand(2).getImm();
    return true;
  }

  // The access size is required by the callee but is not needed here.
  unsigned Size;
  const unsigned Base = getBaseAndOffset(MI, Value, Size);
  return Base != 0;
}
unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const {
MachineRegisterInfo &MRI = MF->getRegInfo();
const TargetRegisterClass *TRC;
@ -2878,6 +2973,18 @@ bool HexagonInstrInfo::addLatencyToSchedule(const MachineInstr *MI1,
}
/// \brief Get the base register and byte offset of a load/store instr.
/// Both values come from getBaseAndOffset: \p BaseReg receives its return
/// value and \p Offset the widened offset. Returns true when the base
/// register is non-zero. \p TRI is not used.
bool HexagonInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt,
      unsigned &BaseReg, int64_t &Offset, const TargetRegisterInfo *TRI)
      const {
  int ByteOffset = 0;
  unsigned Size = 0;
  const unsigned Reg = getBaseAndOffset(&LdSt, ByteOffset, Size);
  BaseReg = Reg;
  Offset = ByteOffset;
  return Reg != 0;
}
/// \brief Can these instructions execute at the same time in a bundle.
bool HexagonInstrInfo::canExecuteInBundle(const MachineInstr *First,
const MachineInstr *Second) const {

View File

@ -103,6 +103,22 @@ public:
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
const DebugLoc &DL) const override;
/// Analyze the loop code, return true if it cannot be understood. Upon
/// success, this function returns false and returns information about the
/// induction variable and compare instruction used at the end.
bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
MachineInstr *&CmpInst) const override;
/// Generate code to reduce the loop iteration by one and check if the loop is
/// finished. Return the value/register of the new loop count. We need
/// this function when peeling off one or more iterations of a loop. This
/// function assumes the nth iteration is peeled first.
unsigned reduceLoopCount(MachineBasicBlock &MBB,
MachineInstr *IndVar, MachineInstr *Cmp,
SmallVectorImpl<MachineOperand> &Cond,
SmallVectorImpl<MachineInstr *> &PrevInsts,
unsigned Iter, unsigned MaxIter) const override;
/// Return true if it's profitable to predicate
/// instructions with accumulated instruction latency of "NumCycles"
/// of the specified basic block, where the probability of the instructions
@ -172,6 +188,11 @@ public:
/// anything was changed.
bool expandPostRAPseudo(MachineInstr &MI) const override;
/// \brief Get the base register and byte offset of a load/store instr.
bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
int64_t &Offset,
const TargetRegisterInfo *TRI) const override;
/// Reverses the branch condition of the specified condition list,
/// returning false on success and true if it cannot be reversed.
bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
@ -248,6 +269,14 @@ public:
areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
AliasAnalysis *AA = nullptr) const override;
/// For instructions with a base and offset, return the position of the
/// base register and offset operands.
bool getBaseAndOffsetPosition(const MachineInstr *MI, unsigned &BasePos,
unsigned &OffsetPos) const override;
/// If the instruction is an increment of a constant value, return the amount.
bool getIncrementValue(const MachineInstr *MI, int &Value) const override;
/// HexagonInstrInfo specifics.
///
@ -297,7 +326,7 @@ public:
bool isNewValueStore(const MachineInstr* MI) const;
bool isNewValueStore(unsigned Opcode) const;
bool isOperandExtended(const MachineInstr *MI, unsigned OperandNum) const;
bool isPostIncrement(const MachineInstr* MI) const;
bool isPostIncrement(const MachineInstr* MI) const override;
bool isPredicatedNew(const MachineInstr &MI) const;
bool isPredicatedNew(unsigned Opcode) const;
bool isPredicatedTrue(const MachineInstr &MI) const;
@ -348,8 +377,6 @@ public:
unsigned getAddrMode(const MachineInstr* MI) const;
unsigned getBaseAndOffset(const MachineInstr *MI, int &Offset,
unsigned &AccessSize) const;
bool getBaseAndOffsetPosition(const MachineInstr *MI, unsigned &BasePos,
unsigned &OffsetPos) const;
short getBaseWithLongOffset(short Opcode) const;
short getBaseWithLongOffset(const MachineInstr *MI) const;
short getBaseWithRegOffset(const MachineInstr *MI) const;

View File

@ -27,7 +27,6 @@
using namespace llvm;
static cl::opt<bool> EnableRDFOpt("rdf-opt", cl::Hidden, cl::ZeroOrMore,
cl::init(true), cl::desc("Enable RDF-based optimizations"));
@ -292,6 +291,8 @@ void HexagonPassConfig::addPreRegAlloc() {
if (!DisableHardwareLoops)
addPass(createHexagonHardwareLoops(), false);
}
if (TM->getOptLevel() >= CodeGenOpt::Default)
addPass(&MachinePipelinerID);
}
void HexagonPassConfig::addPostRegAlloc() {

View File

@ -1,4 +1,4 @@
; RUN: llc -march=hexagon -disable-hsdr < %s | FileCheck %s
; RUN: llc -march=hexagon -disable-hsdr -hexagon-subreg-liveness < %s | FileCheck %s
; Check that we don't generate any bitwise operations.
; CHECK-NOT: = or(

View File

@ -1,4 +1,4 @@
; RUN: llc -march=hexagon < %s | FileCheck %s
; RUN: llc -march=hexagon -enable-pipeliner=false < %s | FileCheck %s
; Check that we generate hardware loop instructions.
; Case 1 : Loop with a constant number of iterations.

View File

@ -0,0 +1,51 @@
; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -verify-machineinstrs < %s | FileCheck %s
; If the trip count is a compile-time constant, then decrement it instead
; of computing a new LC0 value.
; CHECK-LABEL: @test
; CHECK: loop0(.LBB0_1, #998)
; Sums the i32 values read through a pointer that starts at %A and advances
; by one element per iteration. The counter starts at 0 and the loop exits
; once the incremented counter equals the constant 1000. %B and %count are
; not used.
define i32 @test(i32* %A, i32* %B, i32 %count) {
entry:
br label %for.body
for.body:
%sum.02 = phi i32 [ 0, %entry ], [ %add, %for.body ]
%arrayidx.phi = phi i32* [ %A, %entry ], [ %arrayidx.inc, %for.body ]
%i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%0 = load i32, i32* %arrayidx.phi, align 4
%add = add nsw i32 %0, %sum.02
%inc = add nsw i32 %i.01, 1
%exitcond = icmp eq i32 %inc, 1000
%arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret i32 %add
}
; The constant trip count is small enough that the kernel is not executed.
; CHECK-LABEL: @test1
; CHECK-NOT: loop0(
; Same loop body as @test, but the exit compare uses the constant 1, so the
; loop leaves after its first iteration. %B and %count are not used.
define i32 @test1(i32* %A, i32* %B, i32 %count) {
entry:
br label %for.body
for.body:
%sum.02 = phi i32 [ 0, %entry ], [ %add, %for.body ]
%arrayidx.phi = phi i32* [ %A, %entry ], [ %arrayidx.inc, %for.body ]
%i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%0 = load i32, i32* %arrayidx.phi, align 4
%add = add nsw i32 %0, %sum.02
%inc = add nsw i32 %i.01, 1
%exitcond = icmp eq i32 %inc, 1
%arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret i32 %add
}

View File

@ -0,0 +1,42 @@
; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 < %s
; REQUIRES: asserts
; This test checks that a dependence is created between a Phi and its uses.
; An assert occurs if the Phi dependences are not correct.
; Down-counting loop over %f2. The index starts at %nc - 1 and the loop
; continues while the decremented index is greater than 1. Each iteration
; loads %f2[%i.0.in13 - 3], subtracts it from %sr17 and stores the result to
; %f2[%i.014].
define void @test1(i32* %f2, i32 %nc) {
entry:
%i.011 = add i32 %nc, -1
%cmp12 = icmp sgt i32 %i.011, 1
br i1 %cmp12, label %for.body.preheader, label %for.end
for.body.preheader:
%0 = add i32 %nc, -2
%scevgep = getelementptr i32, i32* %f2, i32 %0
%sri = load i32, i32* %scevgep, align 4
%scevgep15 = getelementptr i32, i32* %f2, i32 %i.011
%sri16 = load i32, i32* %scevgep15, align 4
br label %for.body
for.body:
%i.014 = phi i32 [ %i.0, %for.body ], [ %i.011, %for.body.preheader ]
%i.0.in13 = phi i32 [ %i.014, %for.body ], [ %nc, %for.body.preheader ]
; %sr takes the value loaded in the previous iteration and %sr17 takes the
; previous %sr, so the two phis form a chain.
%sr = phi i32 [ %1, %for.body ], [ %sri, %for.body.preheader ]
%sr17 = phi i32 [ %sr, %for.body ], [ %sri16, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %f2, i32 %i.014
%sub1 = add nsw i32 %i.0.in13, -3
%arrayidx2 = getelementptr inbounds i32, i32* %f2, i32 %sub1
%1 = load i32, i32* %arrayidx2, align 4
%sub3 = sub nsw i32 %sr17, %1
store i32 %sub3, i32* %arrayidx, align 4
%i.0 = add nsw i32 %i.014, -1
%cmp = icmp sgt i32 %i.0, 1
br i1 %cmp, label %for.body, label %for.end.loopexit
for.end.loopexit:
br label %for.end
for.end:
ret void
}

View File

@ -0,0 +1,65 @@
; RUN: llc -fp-contract=fast -O3 -march=hexagon -mcpu=hexagonv5 < %s
; REQUIRES: asserts
; Test that the pipeliner doesn't ICE because the PHI generation
; code in the epilog does not attempt to reuse an existing PHI.
; Reduced test case with many undef operands. The outer loop (%for.body to
; %while.end712) encloses the single-block inner loop %while.body661.ur.
; That inner loop carries several float values and two pointers across
; iterations through phis, and performs one load and one store per iteration.
define void @test(float* noalias %srcImg, i32 %width, float* noalias %dstImg) {
entry.split:
%shr = lshr i32 %width, 1
%incdec.ptr253 = getelementptr inbounds float, float* %dstImg, i32 2
br i1 undef, label %for.body, label %for.end
for.body:
%dst.21518.reg2mem.0 = phi float* [ null, %while.end712 ], [ %incdec.ptr253, %entry.split ]
%dstEnd.01519 = phi float* [ %add.ptr725, %while.end712 ], [ undef, %entry.split ]
%add.ptr367 = getelementptr inbounds float, float* %srcImg, i32 undef
%dst.31487 = getelementptr inbounds float, float* %dst.21518.reg2mem.0, i32 1
br i1 undef, label %while.body661.preheader, label %while.end712
while.body661.preheader:
%scevgep1941 = getelementptr float, float* %add.ptr367, i32 1
br label %while.body661.ur
while.body661.ur:
; %col1 takes the previous iteration's %col3, so these phis chain together.
%lsr.iv1942 = phi float* [ %scevgep1941, %while.body661.preheader ], [ undef, %while.body661.ur ]
%col1.31508.reg2mem.0.ur = phi float [ %col3.31506.reg2mem.0.ur, %while.body661.ur ], [ undef, %while.body661.preheader ]
%col4.31507.reg2mem.0.ur = phi float [ %add710.ur, %while.body661.ur ], [ 0.000000e+00, %while.body661.preheader ]
%col3.31506.reg2mem.0.ur = phi float [ %add689.ur, %while.body661.ur ], [ undef, %while.body661.preheader ]
%dst.41511.ur = phi float* [ %incdec.ptr674.ur, %while.body661.ur ], [ %dst.31487, %while.body661.preheader ]
%mul662.ur = fmul float %col1.31508.reg2mem.0.ur, 4.000000e+00
%add663.ur = fadd float undef, %mul662.ur
%add665.ur = fadd float %add663.ur, undef
%add667.ur = fadd float undef, %add665.ur
%add669.ur = fadd float undef, %add667.ur
%add670.ur = fadd float %col4.31507.reg2mem.0.ur, %add669.ur
%conv673.ur = fmul float %add670.ur, 3.906250e-03
%incdec.ptr674.ur = getelementptr inbounds float, float* %dst.41511.ur, i32 1
store float %conv673.ur, float* %dst.41511.ur, align 4
%scevgep1959 = getelementptr float, float* %lsr.iv1942, i32 -1
%0 = load float, float* %scevgep1959, align 4
%mul680.ur = fmul float %0, 4.000000e+00
%add681.ur = fadd float undef, %mul680.ur
%add684.ur = fadd float undef, %add681.ur
%add687.ur = fadd float undef, %add684.ur
%add689.ur = fadd float undef, %add687.ur
%add699.ur = fadd float undef, undef
%add703.ur = fadd float undef, %add699.ur
%add707.ur = fadd float undef, %add703.ur
%add710.ur = fadd float undef, %add707.ur
%cmp660.ur = icmp ult float* %incdec.ptr674.ur, %dstEnd.01519
br i1 %cmp660.ur, label %while.body661.ur, label %while.end712
while.end712:
%dst.4.lcssa.reg2mem.0 = phi float* [ %dst.31487, %for.body ], [ undef, %while.body661.ur ]
%conv721 = fpext float undef to double
%mul722 = fmul double %conv721, 0x3F7111112119E8FB
%conv723 = fptrunc double %mul722 to float
store float %conv723, float* %dst.4.lcssa.reg2mem.0, align 4
%add.ptr725 = getelementptr inbounds float, float* %dstEnd.01519, i32 %shr
%cmp259 = icmp ult i32 undef, undef
br i1 %cmp259, label %for.body, label %for.end
for.end:
ret void
}

View File

@ -0,0 +1,75 @@
; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-bsb-sched=0 -enable-pipeliner < %s | FileCheck %s
; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
; From coremark. Test that we pipeline the matrix multiplication bitextract
; function. The pipelined code should have two packets.
; CHECK: loop0(.LBB0_[[LOOP:.]],
; CHECK: .LBB0_[[LOOP]]:
; CHECK: = extractu([[REG2:(r[0-9]+)]],
; CHECK: = extractu([[REG2]],
; CHECK: [[REG0:(r[0-9]+)]] = memh
; CHECK: [[REG1:(r[0-9]+)]] = memh
; CHECK: += mpyi
; CHECK: [[REG2]] = mpyi([[REG0]], [[REG1]])
; CHECK: endloop0
%union_h2_sem_t = type { i32 }
@sem_i = common global [0 x %union_h2_sem_t] zeroinitializer, align 4
; Triple-nested loop over %N. The innermost loop (%for_body6_us_us) loads an
; i16 from %A through an incrementing pointer and an i16 from %B at index
; k*N + j, multiplies the sign-extended values, pulls two bit fields out of
; the product (bits shifted by 2 masked with 15, and bits shifted by 5 masked
; with 127), multiplies those fields and accumulates the result. The
; accumulated value is stored to %C at index i*N + j once the innermost loop
; finishes.
define void @matrix_mul_matrix_bitextract(i32 %N, i32* %C, i16* %A, i16* %B) {
entry:
%cmp53 = icmp eq i32 %N, 0
br i1 %cmp53, label %for_end27, label %for_body3_lr_ph_us
for_body3_lr_ph_us:
%i_054_us = phi i32 [ %inc26_us, %for_cond1_for_inc25_crit_edge_us ], [ 0, %entry ]
%0 = mul i32 %i_054_us, %N
%arrayidx9_us_us_gep = getelementptr i16, i16* %A, i32 %0
br label %for_body3_us_us
for_cond1_for_inc25_crit_edge_us:
%inc26_us = add i32 %i_054_us, 1
%exitcond89 = icmp eq i32 %inc26_us, %N
br i1 %exitcond89, label %for_end27, label %for_body3_lr_ph_us
for_body3_us_us:
%j_052_us_us = phi i32 [ %inc23_us_us, %for_cond4_for_inc22_crit_edge_us_us ], [ 0, %for_body3_lr_ph_us ]
%add_us_us = add i32 %j_052_us_us, %0
%arrayidx_us_us = getelementptr inbounds i32, i32* %C, i32 %add_us_us
store i32 0, i32* %arrayidx_us_us, align 4
br label %for_body6_us_us
for_cond4_for_inc22_crit_edge_us_us:
store i32 %add21_us_us, i32* %arrayidx_us_us, align 4
%inc23_us_us = add i32 %j_052_us_us, 1
%exitcond88 = icmp eq i32 %inc23_us_us, %N
br i1 %exitcond88, label %for_cond1_for_inc25_crit_edge_us, label %for_body3_us_us
for_body6_us_us:
%1 = phi i32 [ 0, %for_body3_us_us ], [ %add21_us_us, %for_body6_us_us ]
%arrayidx9_us_us_phi = phi i16* [ %arrayidx9_us_us_gep, %for_body3_us_us ], [ %arrayidx9_us_us_inc, %for_body6_us_us ]
%k_050_us_us = phi i32 [ 0, %for_body3_us_us ], [ %inc_us_us, %for_body6_us_us ]
%2 = load i16, i16* %arrayidx9_us_us_phi, align 2
%conv_us_us = sext i16 %2 to i32
%mul10_us_us = mul i32 %k_050_us_us, %N
%add11_us_us = add i32 %mul10_us_us, %j_052_us_us
%arrayidx12_us_us = getelementptr inbounds i16, i16* %B, i32 %add11_us_us
%3 = load i16, i16* %arrayidx12_us_us, align 2
%conv13_us_us = sext i16 %3 to i32
%mul14_us_us = mul nsw i32 %conv13_us_us, %conv_us_us
%shr47_us_us = lshr i32 %mul14_us_us, 2
%and_us_us = and i32 %shr47_us_us, 15
%shr1548_us_us = lshr i32 %mul14_us_us, 5
%and16_us_us = and i32 %shr1548_us_us, 127
%mul17_us_us = mul i32 %and_us_us, %and16_us_us
%add21_us_us = add i32 %mul17_us_us, %1
%inc_us_us = add i32 %k_050_us_us, 1
%exitcond87 = icmp eq i32 %inc_us_us, %N
%arrayidx9_us_us_inc = getelementptr i16, i16* %arrayidx9_us_us_phi, i32 1
br i1 %exitcond87, label %for_cond4_for_inc22_crit_edge_us_us, label %for_body6_us_us
for_end27:
ret void
}

View File

@ -0,0 +1,42 @@
; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner \
; RUN: -pipeliner-max-stages=2 < %s | FileCheck %s
@A = global [8 x i32] [i32 4, i32 -3, i32 5, i32 -2, i32 -1, i32 2, i32 6, i32 -2], align 8
; Walks the global array @A downward starting at index (%Left + %Right) / 2
; and stops once the decremented index is less than %Left. It keeps a running
; sum of the loaded elements and the maximum value that sum has reached; the
; maximum is returned (0 when the loop is skipped).
define i32 @test(i32 %Left, i32 %Right) {
entry:
%add = add nsw i32 %Right, %Left
%div = sdiv i32 %add, 2
%cmp9 = icmp slt i32 %div, %Left
br i1 %cmp9, label %for.end, label %for.body.preheader
for.body.preheader:
br label %for.body
; CHECK: loop0(.LBB0_[[LOOP:.]],
; CHECK: .LBB0_[[LOOP]]:
; CHECK: [[REG1:(r[0-9]+)]] = max(r{{[0-9]+}}, [[REG1]])
; CHECK: [[REG0:(r[0-9]+)]] = add([[REG2:(r[0-9]+)]], [[REG0]])
; CHECK: [[REG2]] = memw
; CHECK: endloop0
for.body:
%MaxLeftBorderSum.012 = phi i32 [ %MaxLeftBorderSum.1, %for.body ], [ 0, %for.body.preheader ]
%i.011 = phi i32 [ %dec, %for.body ], [ %div, %for.body.preheader ]
%LeftBorderSum.010 = phi i32 [ %add1, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* @A, i32 0, i32 %i.011
%0 = load i32, i32* %arrayidx, align 4
%add1 = add nsw i32 %0, %LeftBorderSum.010
; The compare and select together keep the larger of the new sum and the
; previous maximum.
%cmp2 = icmp sgt i32 %add1, %MaxLeftBorderSum.012
%MaxLeftBorderSum.1 = select i1 %cmp2, i32 %add1, i32 %MaxLeftBorderSum.012
%dec = add nsw i32 %i.011, -1
%cmp = icmp slt i32 %dec, %Left
br i1 %cmp, label %for.end.loopexit, label %for.body
for.end.loopexit:
br label %for.end
for.end:
%MaxLeftBorderSum.0.lcssa = phi i32 [ 0, %entry ], [ %MaxLeftBorderSum.1, %for.end.loopexit ]
ret i32 %MaxLeftBorderSum.0.lcssa
}

View File

@ -0,0 +1,75 @@
; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
; Make sure we attempt to pipeline all innermost loops.
; Check if the first loop is pipelined.
; CHECK: loop0(.LBB0_[[LOOP:.]],
; CHECK: .LBB0_[[LOOP]]:
; CHECK: add(r{{[0-9]+}}, r{{[0-9]+}})
; CHECK-NEXT: memw(r{{[0-9]+}}{{.*}}++{{.*}}#4)
; CHECK-NEXT: endloop0
; Check if the second loop is pipelined.
; CHECK: loop0(.LBB0_[[LOOP:.]],
; CHECK: .LBB0_[[LOOP]]:
; CHECK: add(r{{[0-9]+}}, r{{[0-9]+}})
; CHECK-NEXT: memw(r{{[0-9]+}}{{.*}}++{{.*}}#4)
; CHECK-NEXT: endloop0
; An outer loop that runs %n times and contains two separate inner loops
; (%for.body3 and %for.body6) with a call to @bar between them. Each inner
; loop adds %n elements read from %a into its own accumulator, and both
; accumulators are carried across outer iterations. The function returns the
; sum of the two accumulators. %l is not used.
define i32 @test(i32* %a, i32 %n, i32 %l) {
entry:
%cmp23 = icmp sgt i32 %n, 0
br i1 %cmp23, label %for.body3.lr.ph.preheader, label %for.end14
for.body3.lr.ph.preheader:
br label %for.body3.lr.ph
for.body3.lr.ph:
%sum1.026 = phi i32 [ %add8, %for.inc12 ], [ 0, %for.body3.lr.ph.preheader ]
%sum.025 = phi i32 [ %add, %for.inc12 ], [ 0, %for.body3.lr.ph.preheader ]
%j.024 = phi i32 [ %inc13, %for.inc12 ], [ 0, %for.body3.lr.ph.preheader ]
br label %for.body3
for.body3:
%sum.118 = phi i32 [ %sum.025, %for.body3.lr.ph ], [ %add, %for.body3 ]
%arrayidx.phi = phi i32* [ %a, %for.body3.lr.ph ], [ %arrayidx.inc, %for.body3 ]
%i.017 = phi i32 [ 0, %for.body3.lr.ph ], [ %inc, %for.body3 ]
%0 = load i32, i32* %arrayidx.phi, align 4
%add = add nsw i32 %0, %sum.118
%inc = add nsw i32 %i.017, 1
%exitcond = icmp eq i32 %inc, %n
%arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
br i1 %exitcond, label %for.end, label %for.body3
for.end:
tail call void @bar(i32* %a) #2
br label %for.body6
for.body6:
%sum1.121 = phi i32 [ %sum1.026, %for.end ], [ %add8, %for.body6 ]
%arrayidx7.phi = phi i32* [ %a, %for.end ], [ %arrayidx7.inc, %for.body6 ]
%i.120 = phi i32 [ 0, %for.end ], [ %inc10, %for.body6 ]
%1 = load i32, i32* %arrayidx7.phi, align 4
%add8 = add nsw i32 %1, %sum1.121
%inc10 = add nsw i32 %i.120, 1
%exitcond29 = icmp eq i32 %inc10, %n
%arrayidx7.inc = getelementptr i32, i32* %arrayidx7.phi, i32 1
br i1 %exitcond29, label %for.inc12, label %for.body6
for.inc12:
%inc13 = add nsw i32 %j.024, 1
%exitcond30 = icmp eq i32 %inc13, %n
br i1 %exitcond30, label %for.end14.loopexit, label %for.body3.lr.ph
for.end14.loopexit:
br label %for.end14
for.end14:
%sum1.0.lcssa = phi i32 [ 0, %entry ], [ %add8, %for.end14.loopexit ]
%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.end14.loopexit ]
%add15 = add nsw i32 %sum1.0.lcssa, %sum.0.lcssa
ret i32 %add15
}
declare void @bar(i32*)

View File

@ -0,0 +1,41 @@
; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
; RUN: llc -march=hexagon -mcpu=hexagonv5 -O2 < %s | FileCheck %s
; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 < %s | FileCheck %s
;
; Check that we pipeline a vectorized dot product in a single packet.
;
; CHECK: {
; CHECK: += mpyi
; CHECK: += mpyi
; CHECK: memd
; CHECK: memd
; CHECK: } :endloop0
@a = common global [5000 x i32] zeroinitializer, align 8
@b = common global [5000 x i32] zeroinitializer, align 8
; Dot product of the globals @a and @b computed two i32 lanes at a time. The
; index advances by 2 and the loop continues while the next index is less
; than 5000. After the loop the two lanes of the vector accumulator are added
; together and returned.
define i32 @vecMultGlobal() {
entry:
br label %polly.loop_body
polly.loop_after:
%0 = extractelement <2 x i32> %addp_vec, i32 0
%1 = extractelement <2 x i32> %addp_vec, i32 1
%add_sum = add i32 %0, %1
ret i32 %add_sum
polly.loop_body:
%polly.loopiv13 = phi i32 [ 0, %entry ], [ %polly.next_loopiv, %polly.loop_body ]
%reduction.012 = phi <2 x i32> [ zeroinitializer, %entry ], [ %addp_vec, %polly.loop_body ]
%polly.next_loopiv = add nsw i32 %polly.loopiv13, 2
%p_arrayidx1 = getelementptr [5000 x i32], [5000 x i32]* @b, i32 0, i32 %polly.loopiv13
%p_arrayidx = getelementptr [5000 x i32], [5000 x i32]* @a, i32 0, i32 %polly.loopiv13
%vector_ptr = bitcast i32* %p_arrayidx1 to <2 x i32>*
%_p_vec_full = load <2 x i32>, <2 x i32>* %vector_ptr, align 8
%vector_ptr7 = bitcast i32* %p_arrayidx to <2 x i32>*
%_p_vec_full8 = load <2 x i32>, <2 x i32>* %vector_ptr7, align 8
%mulp_vec = mul <2 x i32> %_p_vec_full8, %_p_vec_full
%addp_vec = add <2 x i32> %mulp_vec, %reduction.012
%2 = icmp slt i32 %polly.next_loopiv, 5000
br i1 %2, label %polly.loop_body, label %polly.loop_after
}

View File

@ -0,0 +1,33 @@
; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 < %s | FileCheck %s
; Multiply and accumulate
; CHECK: mpyi([[REG0:r([0-9]+)]], [[REG1:r([0-9]+)]])
; CHECK-NEXT: add(r{{[0-9]+}}, #4)
; CHECK-NEXT: [[REG0]] = memw(r{{[0-9]+}} + r{{[0-9]+}}<<#0)
; CHECK-NEXT: [[REG1]] = memw(r{{[0-9]+}} + r{{[0-9]+}}<<#0)
; CHECK-NEXT: endloop0
; Multiply-accumulate over two arrays: each iteration loads one i32 from %a
; and one from %b through pointers that advance by one element, multiplies
; them and adds the product to the running sum. The loop exits once the
; incremented counter equals 10000. %n is not used.
define i32 @foo(i32* %a, i32* %b, i32 %n) {
entry:
br label %for.body
for.body:
%sum.03 = phi i32 [ 0, %entry ], [ %add, %for.body ]
%arrayidx.phi = phi i32* [ %a, %entry ], [ %arrayidx.inc, %for.body ]
%arrayidx1.phi = phi i32* [ %b, %entry ], [ %arrayidx1.inc, %for.body ]
%i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%0 = load i32, i32* %arrayidx.phi, align 4
%1 = load i32, i32* %arrayidx1.phi, align 4
%mul = mul nsw i32 %1, %0
%add = add nsw i32 %mul, %sum.03
%inc = add nsw i32 %i.02, 1
%exitcond = icmp eq i32 %inc, 10000
%arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
%arrayidx1.inc = getelementptr i32, i32* %arrayidx1.phi, i32 1
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret i32 %add
}

View File

@ -0,0 +1,29 @@
; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 < %s | FileCheck %s
; Simple vector total.
; CHECK: loop0(.LBB0_[[LOOP:.]],
; CHECK: .LBB0_[[LOOP]]:
; CHECK: add([[REG:r([0-9]+)]], r{{[0-9]+}})
; CHECK-NEXT: add(r{{[0-9]+}}, #4)
; CHECK-NEXT: [[REG]] = memw(r{{[0-9]+}} + r{{[0-9]+}}<<#0)
; CHECK-NEXT: endloop0
; Sums the i32 values read through a pointer that starts at %a and advances
; by one element per iteration. The loop exits once the incremented counter
; equals 10000. %n is not used.
define i32 @foo(i32* %a, i32 %n) {
entry:
br label %for.body
for.body:
%sum.02 = phi i32 [ 0, %entry ], [ %add, %for.body ]
%arrayidx.phi = phi i32* [ %a, %entry ], [ %arrayidx.inc, %for.body ]
%i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%0 = load i32, i32* %arrayidx.phi, align 4
%add = add nsw i32 %0, %sum.02
%inc = add nsw i32 %i.01, 1
%exitcond = icmp eq i32 %inc, 10000
%arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
br i1 %exitcond, label %for.end, label %for.body
for.end:
ret i32 %add
}

View File

@ -1,4 +1,4 @@
; RUN: llc -march=hexagon < %s | FileCheck %s
; RUN: llc -march=hexagon -enable-pipeliner=false < %s | FileCheck %s
; Test that we generate a .cur