MachinePipeliner pass that implements Swing Modulo Scheduling

Software pipelining is an optimization for improving ILP by overlapping loop iterations. Swing Modulo Scheduling (SMS) is an implementation of software pipelining that attempts to reduce register pressure and generate efficient pipelines with a low compile-time cost. This implementation of SMS is a target-independent back-end pass. When enabled, the pass should run just prior to the register allocation pass, while the machine IR is in SSA form. If the pass is successful, then the original loop is replaced by the optimized loop. The optimized loop contains one or more prolog blocks, the pipelined kernel, and one or more epilog blocks. This pass is enabled for Hexagon only. To enable it for other targets, a couple of target-specific hooks must be implemented, and the pass needs to be called from the target's TargetMachine implementation. Differential Review: http://reviews.llvm.org/D16829 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@277169 91177308-0d34-0410-b5e6-96231b3b80d8
2025-01-10 22:46:25 +00:00 · 2016-07-29 16:44:44 +00:00 · 2016-07-29 16:44:44 +00:00 · c1359c9fbb
commit c1359c9fbb
parent a6ad276d07
21 changed files with 4583 additions and 7 deletions
--- a/include/llvm/CodeGen/Passes.h
+++ b/include/llvm/CodeGen/Passes.h
@ -374,6 +374,9 @@ namespace llvm {
  /// and propagates register usage information of callee to caller
  /// if available with PysicalRegisterUsageInfo pass.
  FunctionPass *createRegUsageInfoPropPass();
+
+  /// This pass performs software pipelining on machine instructions.
+  extern char &MachinePipelinerID;
 } // End llvm namespace

 /// Target machine pass initializer for passes with dependencies. Use with
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@ -223,6 +223,7 @@ void initializeMachineLegalizePassPass(PassRegistry&);
 void initializeMachineLICMPass(PassRegistry&);
 void initializeMachineLoopInfoPass(PassRegistry&);
 void initializeMachineModuleInfoPass(PassRegistry&);
+void initializeMachinePipelinerPass(PassRegistry&);
 void initializeMachinePostDominatorTreePass(PassRegistry&);
 void initializeMachineRegionInfoPassPass(PassRegistry&);
 void initializeMachineSchedulerPass(PassRegistry&);
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@ -18,6 +18,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/CodeGen/MachineCombinerPattern.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/Support/BranchProbability.h"
 #include "llvm/Target/TargetRegisterInfo.h"
@ -551,6 +552,26 @@ public:
    llvm_unreachable("Target didn't implement TargetInstrInfo::InsertBranch!");
  }

+  /// Analyze the loop code, return true if it cannot be understood. Upon
+  /// success, this function returns false and returns information about the
+  /// induction variable and compare instruction used at the end.
+  virtual bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
+                           MachineInstr *&CmpInst) const {
+    return true;
+  }
+
+  /// Generate code to reduce the loop iteration by one and check if the loop is
+  /// finished.  Return the value/register of the new loop count.  We need
+  /// this function when peeling off one or more iterations of a loop. This
+  /// function assumes the nth iteration is peeled first.
+  virtual unsigned reduceLoopCount(MachineBasicBlock &MBB,
+                                   MachineInstr *IndVar, MachineInstr *Cmp,
+                                   SmallVectorImpl<MachineOperand> &Cond,
+                                   SmallVectorImpl<MachineInstr *> &PrevInsts,
+                                   unsigned Iter, unsigned MaxIter) const {
+    llvm_unreachable("Target didn't implement ReduceLoopCount");
+  }
+
  /// Delete the instruction OldInst and everything after it, replacing it with
  /// an unconditional branch to NewDest. This is used by the tail merging pass.
  virtual void ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
@ -1009,6 +1030,20 @@ public:
    return false;
  }

+  /// Return true if the instruction contains a base register and offset. If
+  /// true, the function also sets the operand position in the instruction
+  /// for the base register and offset.
+  virtual bool getBaseAndOffsetPosition(const MachineInstr *MI,
+                                        unsigned &BasePos,
+                                        unsigned &OffsetPos) const {
+    return false;
+  }
+
+  /// If the instruction is an increment of a constant value, return the amount.
+  virtual bool getIncrementValue(const MachineInstr *MI, int &Value) const {
+    return false;
+  }
+
  virtual bool enableClusterLoads() const { return false; }

  virtual bool enableClusterStores() const { return false; }
@ -1041,6 +1076,10 @@ public:
  /// Return the noop instruction to use for a noop.
  virtual void getNoopForMachoTarget(MCInst &NopInst) const;

+  /// Return true for post-incremented instructions.
+  virtual bool isPostIncrement(const MachineInstr* MI) const {
+    return false;
+  }

  /// Returns true if the instruction is already predicated.
  virtual bool isPredicated(const MachineInstr &MI) const {
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@ -70,6 +70,7 @@ add_llvm_library(LLVMCodeGen
  MachineModuleInfo.cpp
  MachineModuleInfoImpls.cpp
  MachinePassRegistry.cpp
+  MachinePipeliner.cpp
  MachinePostDominators.cpp
  MachineRegionInfo.cpp
  MachineRegisterInfo.cpp
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@ -53,6 +53,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
  initializeMachineLICMPass(Registry);
  initializeMachineLoopInfoPass(Registry);
  initializeMachineModuleInfoPass(Registry);
+  initializeMachinePipelinerPass(Registry);
  initializeMachinePostDominatorTreePass(Registry);
  initializeMachineSchedulerPass(Registry);
  initializeMachineSinkingPass(Registry);
--- a/lib/CodeGen/MachinePipeliner.cpp
+++ b/lib/CodeGen/MachinePipeliner.cpp
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@ -660,6 +660,85 @@ unsigned HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB,
  return 2;
 }

+/// Analyze the loop code to find the loop induction variable and compare used
+/// to compute the number of iterations. Currently, we analyze loops that are
+/// controlled using hardware loops.  In this case, the induction variable
+/// instruction is null.  For all other cases, this function returns true, which
+/// means we're unable to analyze it.
+bool HexagonInstrInfo::analyzeLoop(MachineLoop &L,
+                                   MachineInstr *&IndVarInst,
+                                   MachineInstr *&CmpInst) const {
+
+  MachineBasicBlock *LoopEnd = L.getBottomBlock();
+  MachineBasicBlock::iterator I = LoopEnd->getFirstTerminator();
+  // We really "analyze" only hardware loops right now.
+  if (I != LoopEnd->end() && isEndLoopN(I->getOpcode())) {
+    IndVarInst = nullptr;
+    CmpInst = &*I;
+    return false;
+  }
+  return true;
+}
+
+/// Generate code to reduce the loop iteration by one and check if the loop is
+/// finished. Return the value/register of the new loop count. This function
+/// assumes the nth iteration is peeled first.
+unsigned HexagonInstrInfo::reduceLoopCount(MachineBasicBlock &MBB,
+      MachineInstr *IndVar, MachineInstr *Cmp,
+      SmallVectorImpl<MachineOperand> &Cond,
+      SmallVectorImpl<MachineInstr *> &PrevInsts,
+      unsigned Iter, unsigned MaxIter) const {
+  // We expect a hardware loop currently. This means that IndVar is set
+  // to null, and the compare is the ENDLOOP instruction.
+  assert((!IndVar) && isEndLoopN(Cmp->getOpcode())
+                   && "Expecting a hardware loop");
+  MachineFunction *MF = MBB.getParent();
+  DebugLoc DL = Cmp->getDebugLoc();
+  SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
+  MachineInstr *Loop = findLoopInstr(&MBB, Cmp->getOpcode(), VisitedBBs);
+  if (!Loop)
+    return 0;
+  // If the loop trip count is a compile-time value, then just change the
+  // value.
+  if (Loop->getOpcode() == Hexagon::J2_loop0i ||
+      Loop->getOpcode() == Hexagon::J2_loop1i) {
+    int64_t Offset = Loop->getOperand(1).getImm();
+    if (Offset <= 1)
+      Loop->eraseFromParent();
+    else
+      Loop->getOperand(1).setImm(Offset - 1);
+    return Offset - 1;
+  }
+  // The loop trip count is a run-time value. We generate code to subtract
+  // one from the trip count, and update the loop instruction.
+  assert(Loop->getOpcode() == Hexagon::J2_loop0r && "Unexpected instruction");
+  unsigned LoopCount = Loop->getOperand(1).getReg();
+  // Check if we're done with the loop.
+  unsigned LoopEnd = createVR(MF, MVT::i1);
+  MachineInstr *NewCmp = BuildMI(&MBB, DL, get(Hexagon::C2_cmpgtui), LoopEnd).
+    addReg(LoopCount).addImm(1);
+  unsigned NewLoopCount = createVR(MF, MVT::i32);
+  MachineInstr *NewAdd = BuildMI(&MBB, DL, get(Hexagon::A2_addi), NewLoopCount).
+    addReg(LoopCount).addImm(-1);
+  // Update the previously generated instructions with the new loop counter.
+  for (SmallVectorImpl<MachineInstr *>::iterator I = PrevInsts.begin(),
+         E = PrevInsts.end(); I != E; ++I)
+    (*I)->substituteRegister(LoopCount, NewLoopCount, 0, getRegisterInfo());
+  PrevInsts.clear();
+  PrevInsts.push_back(NewCmp);
+  PrevInsts.push_back(NewAdd);
+  // Insert the new loop instruction if this is the last time the loop is
+  // decremented.
+  if (Iter == MaxIter)
+    BuildMI(&MBB, DL, get(Hexagon::J2_loop0r)).
+      addMBB(Loop->getOperand(0).getMBB()).addReg(NewLoopCount);
+  // Delete the old loop instruction.
+  if (Iter == 0)
+    Loop->eraseFromParent();
+  Cond.push_back(MachineOperand::CreateImm(Hexagon::J2_jumpf));
+  Cond.push_back(NewCmp->getOperand(0));
+  return NewLoopCount;
+}

 bool HexagonInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
      unsigned NumCycles, unsigned ExtraPredCycles,
@ -1592,6 +1671,22 @@ bool HexagonInstrInfo::areMemAccessesTriviallyDisjoint(
 }


+/// If the instruction is an increment of a constant value, return the amount.
+bool HexagonInstrInfo::getIncrementValue(const MachineInstr *MI,
+      int &Value) const {
+  if (isPostIncrement(MI)) {
+    unsigned AccessSize;
+    return getBaseAndOffset(MI, Value, AccessSize);
+  }
+  if (MI->getOpcode() == Hexagon::A2_addi) {
+    Value = MI->getOperand(2).getImm();
+    return true;
+  }
+
+  return false;
+}
+
+
 unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const {
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const TargetRegisterClass *TRC;
@ -2878,6 +2973,18 @@ bool HexagonInstrInfo::addLatencyToSchedule(const MachineInstr *MI1,
 }


+/// \brief Get the base register and byte offset of a load/store instr.
+bool HexagonInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt,
+      unsigned &BaseReg, int64_t &Offset, const TargetRegisterInfo *TRI)
+      const {
+  unsigned AccessSize = 0;
+  int OffsetVal = 0;
+  BaseReg = getBaseAndOffset(&LdSt, OffsetVal, AccessSize);
+  Offset = OffsetVal;
+  return BaseReg != 0;
+}
+
+
 /// \brief Can these instructions execute at the same time in a bundle.
 bool HexagonInstrInfo::canExecuteInBundle(const MachineInstr *First,
      const MachineInstr *Second) const {
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@ -103,6 +103,22 @@ public:
                        MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                        const DebugLoc &DL) const override;

+  /// Analyze the loop code, return true if it cannot be understood. Upon
+  /// success, this function returns false and returns information about the
+  /// induction variable and compare instruction used at the end.
+  bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
+                   MachineInstr *&CmpInst) const override;
+
+  /// Generate code to reduce the loop iteration by one and check if the loop is
+  /// finished.  Return the value/register of the new loop count.  We need
+  /// this function when peeling off one or more iterations of a loop. This
+  /// function assumes the nth iteration is peeled first.
+  unsigned reduceLoopCount(MachineBasicBlock &MBB,
+                           MachineInstr *IndVar, MachineInstr *Cmp,
+                           SmallVectorImpl<MachineOperand> &Cond,
+                           SmallVectorImpl<MachineInstr *> &PrevInsts,
+                           unsigned Iter, unsigned MaxIter) const override;
+
  /// Return true if it's profitable to predicate
  /// instructions with accumulated instruction latency of "NumCycles"
  /// of the specified basic block, where the probability of the instructions
@ -172,6 +188,11 @@ public:
  /// anything was changed.
  bool expandPostRAPseudo(MachineInstr &MI) const override;

+  /// \brief Get the base register and byte offset of a load/store instr.
+  bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+                             int64_t &Offset,
+                             const TargetRegisterInfo *TRI) const override;
+
  /// Reverses the branch condition of the specified condition list,
  /// returning false on success and true if it cannot be reversed.
  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
@ -248,6 +269,14 @@ public:
  areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
                                  AliasAnalysis *AA = nullptr) const override;

+  /// For instructions with a base and offset, return the position of the
+  /// base register and offset operands.
+  bool getBaseAndOffsetPosition(const MachineInstr *MI, unsigned &BasePos,
+                                unsigned &OffsetPos) const override;
+
+  /// If the instruction is an increment of a constant value, return the amount.
+  bool getIncrementValue(const MachineInstr *MI, int &Value) const override;
+
  /// HexagonInstrInfo specifics.
  ///

@ -297,7 +326,7 @@ public:
  bool isNewValueStore(const MachineInstr* MI) const;
  bool isNewValueStore(unsigned Opcode) const;
  bool isOperandExtended(const MachineInstr *MI, unsigned OperandNum) const;
-  bool isPostIncrement(const MachineInstr* MI) const;
+  bool isPostIncrement(const MachineInstr* MI) const override;
  bool isPredicatedNew(const MachineInstr &MI) const;
  bool isPredicatedNew(unsigned Opcode) const;
  bool isPredicatedTrue(const MachineInstr &MI) const;
@ -348,8 +377,6 @@ public:
  unsigned getAddrMode(const MachineInstr* MI) const;
  unsigned getBaseAndOffset(const MachineInstr *MI, int &Offset,
                            unsigned &AccessSize) const;
-  bool getBaseAndOffsetPosition(const MachineInstr *MI, unsigned &BasePos,
-                                unsigned &OffsetPos) const;
  short getBaseWithLongOffset(short Opcode) const;
  short getBaseWithLongOffset(const MachineInstr *MI) const;
  short getBaseWithRegOffset(const MachineInstr *MI) const;
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@ -27,7 +27,6 @@

 using namespace llvm;

-
 static cl::opt<bool> EnableRDFOpt("rdf-opt", cl::Hidden, cl::ZeroOrMore,
  cl::init(true), cl::desc("Enable RDF-based optimizations"));

@ -292,6 +291,8 @@ void HexagonPassConfig::addPreRegAlloc() {
    if (!DisableHardwareLoops)
      addPass(createHexagonHardwareLoops(), false);
  }
+  if (TM->getOptLevel() >= CodeGenOpt::Default)
+    addPass(&MachinePipelinerID);
 }

 void HexagonPassConfig::addPostRegAlloc() {
--- a/test/CodeGen/Hexagon/bit-gen-rseq.ll
+++ b/test/CodeGen/Hexagon/bit-gen-rseq.ll
@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -disable-hsdr < %s | FileCheck %s
+; RUN: llc -march=hexagon -disable-hsdr -hexagon-subreg-liveness < %s | FileCheck %s
 ; Check that we don't generate any bitwise operations.

 ; CHECK-NOT: = or(
--- a/test/CodeGen/Hexagon/hwloop1.ll
+++ b/test/CodeGen/Hexagon/hwloop1.ll
@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon -enable-pipeliner=false < %s | FileCheck %s
 ; Check that we generate hardware loop instructions.

 ; Case 1 : Loop with a constant number of iterations.
--- a/test/CodeGen/Hexagon/swp-const-tc.ll
+++ b/test/CodeGen/Hexagon/swp-const-tc.ll
@ -0,0 +1,51 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -verify-machineinstrs < %s | FileCheck %s
+
+; If the trip count is a compile-time constant, then decrement it instead
+; of computing a new LC0 value.
+
+; CHECK-LABEL: @test
+; CHECK: loop0(.LBB0_1, #998)
+
+define i32 @test(i32* %A, i32* %B, i32 %count) {
+entry:
+  br label %for.body
+
+for.body:
+  %sum.02 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx.phi = phi i32* [ %A, %entry ], [ %arrayidx.inc, %for.body ]
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = load i32, i32* %arrayidx.phi, align 4
+  %add = add nsw i32 %0, %sum.02
+  %inc = add nsw i32 %i.01, 1
+  %exitcond = icmp eq i32 %inc, 1000
+  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %add
+}
+
+; The constant trip count is small enough that the kernel is not executed.
+
+; CHECK-LABEL: @test1
+; CHECK-NOT: loop0(
+
+define i32 @test1(i32* %A, i32* %B, i32 %count) {
+entry:
+  br label %for.body
+
+for.body:
+  %sum.02 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx.phi = phi i32* [ %A, %entry ], [ %arrayidx.inc, %for.body ]
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = load i32, i32* %arrayidx.phi, align 4
+  %add = add nsw i32 %0, %sum.02
+  %inc = add nsw i32 %i.01, 1
+  %exitcond = icmp eq i32 %inc, 1
+  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %add
+}
+
--- a/test/CodeGen/Hexagon/swp-dag-phi.ll
+++ b/test/CodeGen/Hexagon/swp-dag-phi.ll
@ -0,0 +1,42 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 < %s
+; REQUIRES: asserts
+
+; This test checks that a dependence is created between a Phi and its uses.
+; An assert occurs if the Phi dependences are not correct.
+
+define void @test1(i32* %f2, i32 %nc) {
+entry:
+  %i.011 = add i32 %nc, -1
+  %cmp12 = icmp sgt i32 %i.011, 1
+  br i1 %cmp12, label %for.body.preheader, label %for.end
+
+for.body.preheader:
+  %0 = add i32 %nc, -2
+  %scevgep = getelementptr i32, i32* %f2, i32 %0
+  %sri = load i32, i32* %scevgep, align 4
+  %scevgep15 = getelementptr i32, i32* %f2, i32 %i.011
+  %sri16 = load i32, i32* %scevgep15, align 4
+  br label %for.body
+
+for.body:
+  %i.014 = phi i32 [ %i.0, %for.body ], [ %i.011, %for.body.preheader ]
+  %i.0.in13 = phi i32 [ %i.014, %for.body ], [ %nc, %for.body.preheader ]
+  %sr = phi i32 [ %1, %for.body ], [ %sri, %for.body.preheader ]
+  %sr17 = phi i32 [ %sr, %for.body ], [ %sri16, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %f2, i32 %i.014
+  %sub1 = add nsw i32 %i.0.in13, -3
+  %arrayidx2 = getelementptr inbounds i32, i32* %f2, i32 %sub1
+  %1 = load i32, i32* %arrayidx2, align 4
+  %sub3 = sub nsw i32 %sr17, %1
+  store i32 %sub3, i32* %arrayidx, align 4
+  %i.0 = add nsw i32 %i.014, -1
+  %cmp = icmp sgt i32 %i.0, 1
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
--- a/test/CodeGen/Hexagon/swp-epilog-reuse.ll
+++ b/test/CodeGen/Hexagon/swp-epilog-reuse.ll
@ -0,0 +1,65 @@
+; RUN: llc -fp-contract=fast -O3 -march=hexagon -mcpu=hexagonv5 < %s
+; REQUIRES: asserts
+
+; Test that the pipeliner doesn't ICE because the PHI generation
+; code in the epilog does not attempt to reuse an existing PHI.
+
+define void @test(float* noalias %srcImg, i32 %width, float* noalias %dstImg) {
+entry.split:
+  %shr = lshr i32 %width, 1
+  %incdec.ptr253 = getelementptr inbounds float, float* %dstImg, i32 2
+  br i1 undef, label %for.body, label %for.end
+
+for.body:
+  %dst.21518.reg2mem.0 = phi float* [ null, %while.end712 ], [ %incdec.ptr253, %entry.split ]
+  %dstEnd.01519 = phi float* [ %add.ptr725, %while.end712 ], [ undef, %entry.split ]
+  %add.ptr367 = getelementptr inbounds float, float* %srcImg, i32 undef
+  %dst.31487 = getelementptr inbounds float, float* %dst.21518.reg2mem.0, i32 1
+  br i1 undef, label %while.body661.preheader, label %while.end712
+
+while.body661.preheader:
+  %scevgep1941 = getelementptr float, float* %add.ptr367, i32 1
+  br label %while.body661.ur
+
+while.body661.ur:
+  %lsr.iv1942 = phi float* [ %scevgep1941, %while.body661.preheader ], [ undef, %while.body661.ur ]
+  %col1.31508.reg2mem.0.ur = phi float [ %col3.31506.reg2mem.0.ur, %while.body661.ur ], [ undef, %while.body661.preheader ]
+  %col4.31507.reg2mem.0.ur = phi float [ %add710.ur, %while.body661.ur ], [ 0.000000e+00, %while.body661.preheader ]
+  %col3.31506.reg2mem.0.ur = phi float [ %add689.ur, %while.body661.ur ], [ undef, %while.body661.preheader ]
+  %dst.41511.ur = phi float* [ %incdec.ptr674.ur, %while.body661.ur ], [ %dst.31487, %while.body661.preheader ]
+  %mul662.ur = fmul float %col1.31508.reg2mem.0.ur, 4.000000e+00
+  %add663.ur = fadd float undef, %mul662.ur
+  %add665.ur = fadd float %add663.ur, undef
+  %add667.ur = fadd float undef, %add665.ur
+  %add669.ur = fadd float undef, %add667.ur
+  %add670.ur = fadd float %col4.31507.reg2mem.0.ur, %add669.ur
+  %conv673.ur = fmul float %add670.ur, 3.906250e-03
+  %incdec.ptr674.ur = getelementptr inbounds float, float* %dst.41511.ur, i32 1
+  store float %conv673.ur, float* %dst.41511.ur, align 4
+  %scevgep1959 = getelementptr float, float* %lsr.iv1942, i32 -1
+  %0 = load float, float* %scevgep1959, align 4
+  %mul680.ur = fmul float %0, 4.000000e+00
+  %add681.ur = fadd float undef, %mul680.ur
+  %add684.ur = fadd float undef, %add681.ur
+  %add687.ur = fadd float undef, %add684.ur
+  %add689.ur = fadd float undef, %add687.ur
+  %add699.ur = fadd float undef, undef
+  %add703.ur = fadd float undef, %add699.ur
+  %add707.ur = fadd float undef, %add703.ur
+  %add710.ur = fadd float undef, %add707.ur
+  %cmp660.ur = icmp ult float* %incdec.ptr674.ur, %dstEnd.01519
+  br i1 %cmp660.ur, label %while.body661.ur, label %while.end712
+
+while.end712:
+  %dst.4.lcssa.reg2mem.0 = phi float* [ %dst.31487, %for.body ], [ undef, %while.body661.ur ]
+  %conv721 = fpext float undef to double
+  %mul722 = fmul double %conv721, 0x3F7111112119E8FB
+  %conv723 = fptrunc double %mul722 to float
+  store float %conv723, float* %dst.4.lcssa.reg2mem.0, align 4
+  %add.ptr725 = getelementptr inbounds float, float* %dstEnd.01519, i32 %shr
+  %cmp259 = icmp ult i32 undef, undef
+  br i1 %cmp259, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
--- a/test/CodeGen/Hexagon/swp-matmul-bitext.ll
+++ b/test/CodeGen/Hexagon/swp-matmul-bitext.ll
@ -0,0 +1,75 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-bsb-sched=0 -enable-pipeliner < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
+
+; From coremark. Test that we pipeline the matrix multiplication bitextract
+; function. The pipelined code should have two packets.
+
+; CHECK: loop0(.LBB0_[[LOOP:.]],
+; CHECK: .LBB0_[[LOOP]]:
+; CHECK: = extractu([[REG2:(r[0-9]+)]],
+; CHECK: = extractu([[REG2]],
+; CHECK: [[REG0:(r[0-9]+)]] = memh
+; CHECK: [[REG1:(r[0-9]+)]] = memh
+; CHECK: += mpyi
+; CHECK: [[REG2]] = mpyi([[REG0]], [[REG1]])
+; CHECK: endloop0
+
+%union_h2_sem_t = type { i32 }
+
+@sem_i = common global [0 x %union_h2_sem_t] zeroinitializer, align 4
+
+define void @matrix_mul_matrix_bitextract(i32 %N, i32* %C, i16* %A, i16* %B) {
+entry:
+  %cmp53 = icmp eq i32 %N, 0
+  br i1 %cmp53, label %for_end27, label %for_body3_lr_ph_us
+
+for_body3_lr_ph_us:
+  %i_054_us = phi i32 [ %inc26_us, %for_cond1_for_inc25_crit_edge_us ], [ 0, %entry ]
+  %0 = mul i32 %i_054_us, %N
+  %arrayidx9_us_us_gep = getelementptr i16, i16* %A, i32 %0
+  br label %for_body3_us_us
+
+for_cond1_for_inc25_crit_edge_us:
+  %inc26_us = add i32 %i_054_us, 1
+  %exitcond89 = icmp eq i32 %inc26_us, %N
+  br i1 %exitcond89, label %for_end27, label %for_body3_lr_ph_us
+
+for_body3_us_us:
+  %j_052_us_us = phi i32 [ %inc23_us_us, %for_cond4_for_inc22_crit_edge_us_us ], [ 0, %for_body3_lr_ph_us ]
+  %add_us_us = add i32 %j_052_us_us, %0
+  %arrayidx_us_us = getelementptr inbounds i32, i32* %C, i32 %add_us_us
+  store i32 0, i32* %arrayidx_us_us, align 4
+  br label %for_body6_us_us
+
+for_cond4_for_inc22_crit_edge_us_us:
+  store i32 %add21_us_us, i32* %arrayidx_us_us, align 4
+  %inc23_us_us = add i32 %j_052_us_us, 1
+  %exitcond88 = icmp eq i32 %inc23_us_us, %N
+  br i1 %exitcond88, label %for_cond1_for_inc25_crit_edge_us, label %for_body3_us_us
+
+for_body6_us_us:
+  %1 = phi i32 [ 0, %for_body3_us_us ], [ %add21_us_us, %for_body6_us_us ]
+  %arrayidx9_us_us_phi = phi i16* [ %arrayidx9_us_us_gep, %for_body3_us_us ], [ %arrayidx9_us_us_inc, %for_body6_us_us ]
+  %k_050_us_us = phi i32 [ 0, %for_body3_us_us ], [ %inc_us_us, %for_body6_us_us ]
+  %2 = load i16, i16* %arrayidx9_us_us_phi, align 2
+  %conv_us_us = sext i16 %2 to i32
+  %mul10_us_us = mul i32 %k_050_us_us, %N
+  %add11_us_us = add i32 %mul10_us_us, %j_052_us_us
+  %arrayidx12_us_us = getelementptr inbounds i16, i16* %B, i32 %add11_us_us
+  %3 = load i16, i16* %arrayidx12_us_us, align 2
+  %conv13_us_us = sext i16 %3 to i32
+  %mul14_us_us = mul nsw i32 %conv13_us_us, %conv_us_us
+  %shr47_us_us = lshr i32 %mul14_us_us, 2
+  %and_us_us = and i32 %shr47_us_us, 15
+  %shr1548_us_us = lshr i32 %mul14_us_us, 5
+  %and16_us_us = and i32 %shr1548_us_us, 127
+  %mul17_us_us = mul i32 %and_us_us, %and16_us_us
+  %add21_us_us = add i32 %mul17_us_us, %1
+  %inc_us_us = add i32 %k_050_us_us, 1
+  %exitcond87 = icmp eq i32 %inc_us_us, %N
+  %arrayidx9_us_us_inc = getelementptr i16, i16* %arrayidx9_us_us_phi, i32 1
+  br i1 %exitcond87, label %for_cond4_for_inc22_crit_edge_us_us, label %for_body6_us_us
+
+for_end27:
+  ret void
+}
--- a/test/CodeGen/Hexagon/swp-max.ll
+++ b/test/CodeGen/Hexagon/swp-max.ll
@ -0,0 +1,42 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner \
+; RUN:     -pipeliner-max-stages=2 < %s | FileCheck %s
+
+@A = global [8 x i32] [i32 4, i32 -3, i32 5, i32 -2, i32 -1, i32 2, i32 6, i32 -2], align 8
+
+define i32 @test(i32 %Left, i32 %Right) {
+entry:
+  %add = add nsw i32 %Right, %Left
+  %div = sdiv i32 %add, 2
+  %cmp9 = icmp slt i32 %div, %Left
+  br i1 %cmp9, label %for.end, label %for.body.preheader
+
+for.body.preheader:
+  br label %for.body
+
+; CHECK: loop0(.LBB0_[[LOOP:.]],
+; CHECK: .LBB0_[[LOOP]]:
+; CHECK: [[REG1:(r[0-9]+)]] = max(r{{[0-9]+}}, [[REG1]])
+; CHECK: [[REG0:(r[0-9]+)]] = add([[REG2:(r[0-9]+)]], [[REG0]])
+; CHECK: [[REG2]] = memw
+; CHECK: endloop0
+
+for.body:
+  %MaxLeftBorderSum.012 = phi i32 [ %MaxLeftBorderSum.1, %for.body ], [ 0, %for.body.preheader ]
+  %i.011 = phi i32 [ %dec, %for.body ], [ %div, %for.body.preheader ]
+  %LeftBorderSum.010 = phi i32 [ %add1, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* @A, i32 0, i32 %i.011
+  %0 = load i32, i32* %arrayidx, align 4
+  %add1 = add nsw i32 %0, %LeftBorderSum.010
+  %cmp2 = icmp sgt i32 %add1, %MaxLeftBorderSum.012
+  %MaxLeftBorderSum.1 = select i1 %cmp2, i32 %add1, i32 %MaxLeftBorderSum.012
+  %dec = add nsw i32 %i.011, -1
+  %cmp = icmp slt i32 %dec, %Left
+  br i1 %cmp, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  %MaxLeftBorderSum.0.lcssa = phi i32 [ 0, %entry ], [ %MaxLeftBorderSum.1, %for.end.loopexit ]
+  ret i32 %MaxLeftBorderSum.0.lcssa
+}
--- a/test/CodeGen/Hexagon/swp-multi-loops.ll
+++ b/test/CodeGen/Hexagon/swp-multi-loops.ll
@ -0,0 +1,75 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
+
+; Make sure we attempt to pipeline all innermost loops.
+
+; Check if the first loop is pipelined.
+; CHECK: loop0(.LBB0_[[LOOP:.]],
+; CHECK: .LBB0_[[LOOP]]:
+; CHECK: add(r{{[0-9]+}}, r{{[0-9]+}})
+; CHECK-NEXT: memw(r{{[0-9]+}}{{.*}}++{{.*}}#4)
+; CHECK-NEXT: endloop0
+
+; Check if the second loop is pipelined.
+; CHECK: loop0(.LBB0_[[LOOP:.]],
+; CHECK: .LBB0_[[LOOP]]:
+; CHECK: add(r{{[0-9]+}}, r{{[0-9]+}})
+; CHECK-NEXT: memw(r{{[0-9]+}}{{.*}}++{{.*}}#4)
+; CHECK-NEXT: endloop0
+
+define i32 @test(i32* %a, i32 %n, i32 %l) {
+entry:
+  %cmp23 = icmp sgt i32 %n, 0
+  br i1 %cmp23, label %for.body3.lr.ph.preheader, label %for.end14
+
+for.body3.lr.ph.preheader:
+  br label %for.body3.lr.ph
+
+for.body3.lr.ph:
+  %sum1.026 = phi i32 [ %add8, %for.inc12 ], [ 0, %for.body3.lr.ph.preheader ]
+  %sum.025 = phi i32 [ %add, %for.inc12 ], [ 0, %for.body3.lr.ph.preheader ]
+  %j.024 = phi i32 [ %inc13, %for.inc12 ], [ 0, %for.body3.lr.ph.preheader ]
+  br label %for.body3
+
+for.body3:
+  %sum.118 = phi i32 [ %sum.025, %for.body3.lr.ph ], [ %add, %for.body3 ]
+  %arrayidx.phi = phi i32* [ %a, %for.body3.lr.ph ], [ %arrayidx.inc, %for.body3 ]
+  %i.017 = phi i32 [ 0, %for.body3.lr.ph ], [ %inc, %for.body3 ]
+  %0 = load i32, i32* %arrayidx.phi, align 4
+  %add = add nsw i32 %0, %sum.118
+  %inc = add nsw i32 %i.017, 1
+  %exitcond = icmp eq i32 %inc, %n
+  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
+  br i1 %exitcond, label %for.end, label %for.body3
+
+for.end:
+  tail call void @bar(i32* %a) #2
+  br label %for.body6
+
+for.body6:
+  %sum1.121 = phi i32 [ %sum1.026, %for.end ], [ %add8, %for.body6 ]
+  %arrayidx7.phi = phi i32* [ %a, %for.end ], [ %arrayidx7.inc, %for.body6 ]
+  %i.120 = phi i32 [ 0, %for.end ], [ %inc10, %for.body6 ]
+  %1 = load i32, i32* %arrayidx7.phi, align 4
+  %add8 = add nsw i32 %1, %sum1.121
+  %inc10 = add nsw i32 %i.120, 1
+  %exitcond29 = icmp eq i32 %inc10, %n
+  %arrayidx7.inc = getelementptr i32, i32* %arrayidx7.phi, i32 1
+  br i1 %exitcond29, label %for.inc12, label %for.body6
+
+for.inc12:
+  %inc13 = add nsw i32 %j.024, 1
+  %exitcond30 = icmp eq i32 %inc13, %n
+  br i1 %exitcond30, label %for.end14.loopexit, label %for.body3.lr.ph
+
+for.end14.loopexit:
+  br label %for.end14
+
+for.end14:
+  %sum1.0.lcssa = phi i32 [ 0, %entry ], [ %add8, %for.end14.loopexit ]
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.end14.loopexit ]
+  %add15 = add nsw i32 %sum1.0.lcssa, %sum.0.lcssa
+  ret i32 %add15
+}
+
+declare void @bar(i32*)
+
--- a/test/CodeGen/Hexagon/swp-vect-dotprod.ll
+++ b/test/CodeGen/Hexagon/swp-vect-dotprod.ll
@ -0,0 +1,41 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -O2 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 < %s | FileCheck %s
+;
+; Check that we pipeline a vectorized dot product in a single packet.
+;
+; CHECK: {
+; CHECK: += mpyi
+; CHECK: += mpyi
+; CHECK: memd
+; CHECK: memd
+; CHECK: }      :endloop0
+
+@a = common global [5000 x i32] zeroinitializer, align 8
+@b = common global [5000 x i32] zeroinitializer, align 8
+
+define i32 @vecMultGlobal() {
+entry:
+  br label %polly.loop_body
+
+polly.loop_after:
+  %0 = extractelement <2 x i32> %addp_vec, i32 0
+  %1 = extractelement <2 x i32> %addp_vec, i32 1
+  %add_sum = add i32 %0, %1
+  ret i32 %add_sum
+
+polly.loop_body:
+  %polly.loopiv13 = phi i32 [ 0, %entry ], [ %polly.next_loopiv, %polly.loop_body ]
+  %reduction.012 = phi <2 x i32> [ zeroinitializer, %entry ], [ %addp_vec, %polly.loop_body ]
+  %polly.next_loopiv = add nsw i32 %polly.loopiv13, 2
+  %p_arrayidx1 = getelementptr [5000 x i32], [5000 x i32]* @b, i32 0, i32 %polly.loopiv13
+  %p_arrayidx = getelementptr [5000 x i32], [5000 x i32]* @a, i32 0, i32 %polly.loopiv13
+  %vector_ptr = bitcast i32* %p_arrayidx1 to <2 x i32>*
+  %_p_vec_full = load <2 x i32>, <2 x i32>* %vector_ptr, align 8
+  %vector_ptr7 = bitcast i32* %p_arrayidx to <2 x i32>*
+  %_p_vec_full8 = load <2 x i32>, <2 x i32>* %vector_ptr7, align 8
+  %mulp_vec = mul <2 x i32> %_p_vec_full8, %_p_vec_full
+  %addp_vec = add <2 x i32> %mulp_vec, %reduction.012
+  %2 = icmp slt i32 %polly.next_loopiv, 5000
+  br i1 %2, label %polly.loop_body, label %polly.loop_after
+}
--- a/test/CodeGen/Hexagon/swp-vmult.ll
+++ b/test/CodeGen/Hexagon/swp-vmult.ll
@ -0,0 +1,33 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 < %s | FileCheck %s
+
+; Multiply and accumulate
+; CHECK: mpyi([[REG0:r([0-9]+)]], [[REG1:r([0-9]+)]])
+; CHECK-NEXT: add(r{{[0-9]+}}, #4)
+; CHECK-NEXT: [[REG0]] = memw(r{{[0-9]+}} + r{{[0-9]+}}<<#0)
+; CHECK-NEXT: [[REG1]] = memw(r{{[0-9]+}} + r{{[0-9]+}}<<#0)
+; CHECK-NEXT: endloop0
+
+define i32 @foo(i32* %a, i32* %b, i32 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %sum.03 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx.phi = phi i32* [ %a, %entry ], [ %arrayidx.inc, %for.body ]
+  %arrayidx1.phi = phi i32* [ %b, %entry ], [ %arrayidx1.inc, %for.body ]
+  %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = load i32, i32* %arrayidx.phi, align 4
+  %1 = load i32, i32* %arrayidx1.phi, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, %sum.03
+  %inc = add nsw i32 %i.02, 1
+  %exitcond = icmp eq i32 %inc, 10000
+  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
+  %arrayidx1.inc = getelementptr i32, i32* %arrayidx1.phi, i32 1
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %add
+}
+
--- a/test/CodeGen/Hexagon/swp-vsum.ll
+++ b/test/CodeGen/Hexagon/swp-vsum.ll
@ -0,0 +1,29 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 < %s | FileCheck %s
+
+; Simple vector total.
+; CHECK: loop0(.LBB0_[[LOOP:.]],
+; CHECK: .LBB0_[[LOOP]]:
+; CHECK: add([[REG:r([0-9]+)]], r{{[0-9]+}})
+; CHECK-NEXT: add(r{{[0-9]+}}, #4)
+; CHECK-NEXT: [[REG]] = memw(r{{[0-9]+}} + r{{[0-9]+}}<<#0)
+; CHECK-NEXT: endloop0
+
+define i32 @foo(i32* %a, i32 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %sum.02 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx.phi = phi i32* [ %a, %entry ], [ %arrayidx.inc, %for.body ]
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %0 = load i32, i32* %arrayidx.phi, align 4
+  %add = add nsw i32 %0, %sum.02
+  %inc = add nsw i32 %i.01, 1
+  %exitcond = icmp eq i32 %inc, 10000
+  %arrayidx.inc = getelementptr i32, i32* %arrayidx.phi, i32 1
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %add
+}
--- a/test/CodeGen/Hexagon/v60-cur.ll
+++ b/test/CodeGen/Hexagon/v60-cur.ll
@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon -enable-pipeliner=false < %s | FileCheck %s

 ; Test that we generate a .cur