Vectorizer: Add support for loop reductions.

For example: for (i=0; i<n; i++) sum += A[i] + B[i] + i; git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166351 91177308-0d34-0410-b5e6-96231b3b80d8
2025-02-18 17:59:45 +00:00 · 2012-10-19 23:05:40 +00:00 · 2012-10-19 23:05:40 +00:00 · 5dbe64e2bc
commit 5dbe64e2bc
parent cfc6cb0c61
4 changed files with 518 additions and 114 deletions
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -10,6 +10,8 @@
 // This is a simple loop vectorizer. We currently only support single block
 // loops. We have a very simple and restrictive legality check: we need to read
 // and write from disjoint memory locations. We still don't have a cost model.
+// We do support integer reductions.
+//
 // This pass has three parts:
 // 1. The main loop pass that drives the different parts.
 // 2. LoopVectorizationLegality - A helper class that checks for the legality
@ -54,9 +56,11 @@ static cl::opt<unsigned>
 DefaultVectorizationFactor("default-loop-vectorize-width",
                          cl::init(4), cl::Hidden,
                          cl::desc("Set the default loop vectorization width"));
-
 namespace {

+// Forward declaration.
+class LoopVectorizationLegality;
+
 /// Vectorize a simple loop. This class performs the widening of simple single
 /// basic block loops into vectors. It does not perform any
 /// vectorization-legality checks, and just does it.  It widens the vectors
@ -67,23 +71,28 @@ public:
  SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li,
                            LPPassManager *Lpm, unsigned VecWidth):
  Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
-  Builder(Se->getContext()), Induction(0), OldInduction(0) { }
+   Builder(0), Induction(0), OldInduction(0) { }
+
+  ~SingleBlockLoopVectorizer() {
+    delete Builder;
+  }

  // Perform the actual loop widening (vectorization).
-  void vectorize() {
+  void vectorize(LoopVectorizationLegality *Legal) {
    ///Create a new empty loop. Unlink the old loop and connect the new one.
    createEmptyLoop();
    /// Widen each instruction in the old loop to a new one in the new loop.
-    vectorizeLoop();
+    /// Use the Legality module to find the induction and reduction variables.
+   vectorizeLoop(Legal);
    // register the new loop.
    cleanup();
-  }
+ }

 private:
  /// Create an empty loop, based on the loop ranges of the old loop.
  void createEmptyLoop();
  /// Copy and widen the instructions from the old loop.
-  void vectorizeLoop();
+  void vectorizeLoop(LoopVectorizationLegality *Legal);
  /// Insert the new loop to the loop hierarchy and pass manager.
  void cleanup();

@ -113,6 +122,10 @@ private:
  /// broadcast them into a vector.
  Value *getVectorValue(Value *V);

+  /// Get a uniform vector of constant integers. We use this to get
+  /// vectors of ones and zeros for the reduction code.
+  Constant* getUniformVector(unsigned Val, Type* ScalarTy);
+
  typedef DenseMap<Value*, Value*> ValueMap;

  /// The original loop.
@ -127,10 +140,21 @@ private:
  unsigned VF;

  // The builder that we use
-  IRBuilder<> Builder;
+  IRBuilder<> *Builder;

  // --- Vectorization state ---

+  /// Middle Block between the vector and the scalar.
+  BasicBlock *LoopMiddleBlock;
+  ///The ExitBlock of the scalar loop.
+  BasicBlock *LoopExitBlock;
+  ///The vector loop body.
+  BasicBlock *LoopVectorBody;
+  ///The scalar loop body.
+  BasicBlock *LoopScalarBody;
+  ///The first bypass block.
+  BasicBlock *LoopBypassBlock;
+
  /// The new Induction variable which was added to the new block.
  PHINode *Induction;
  /// The induction variable of the old basic block.
@ -146,7 +170,23 @@ private:
 class LoopVectorizationLegality {
 public:
  LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl):
-  TheLoop(Lp), SE(Se), DL(Dl) { }
+  TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
+
+  /// This represents the kinds of reductions that we support.
+  enum ReductionKind {
+    IntegerAdd, /// Sum of numbers.
+    IntegerMult, /// Product of numbers.
+    NoReduction /// Not a reduction.
+  };
+
+  // Holds a pairing of reduction instruction and the reduction kind.
+  typedef std::pair<Instruction*, ReductionKind> ReductionPair;
+
+  /// ReductionList contains the reduction variables
+  /// as well as a single EXIT (from the block) value and the kind of
+  /// reduction variable..
+  /// Notice that the EXIT instruction can also be the PHI itself.
+  typedef DenseMap<PHINode*, ReductionPair> ReductionList;

  /// Returns the maximum vectorization factor that we *can* use to vectorize
  /// this loop. This does not mean that it is profitable to vectorize this
@ -154,6 +194,12 @@ public:
  /// can vectorize to any SIMD width below this number.
  unsigned getLoopMaxVF();

+  /// Returns the Induction variable.
+  PHINode *getInduction() {return Induction;}
+
+  /// Returns the reduction variables found in the loop.
+  ReductionList *getReductionVars() { return &Reductions; }
+
 private:
  /// Check if a single basic block loop is vectorizable.
  /// At this point we know that this is a loop with a constant trip count
@ -164,12 +210,32 @@ private:
  // Example: Alloca, Global, NoAlias.
  bool isIdentifiedSafeObject(Value* Val);

+  /// Returns True, if 'Phi' is the kind of reduction variable for type
+  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
+  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
+  /// Checks if a constant matches the reduction kind.
+  /// Sums starts with zero. Products start at one.
+  bool isReductionConstant(Value *V, ReductionKind Kind);
+  /// Returns true if the instruction I can be a reduction variable of type
+  /// 'Kind'.
+  bool isReductionInstr(Instruction *I, ReductionKind Kind);
+
  /// The loop that we evaluate.
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
  /// DataLayout analysis.
  DataLayout *DL;
+
+  //  ---  vectorization state --- //
+
+  /// Holds the induction variable.
+  PHINode *Induction;
+  /// Holds the reduction variables.
+  ReductionList Reductions;
+  /// Allowed outside users. This holds the reduction
+  /// vars which can be accessed from outside the loop.
+  SmallPtrSet<Value*, 4> AllowedExit;
 };

 struct LoopVectorize : public LoopPass {
@ -184,6 +250,7 @@ struct LoopVectorize : public LoopPass {
  LoopInfo *LI;

  virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
+
    // Only vectorize innermost loops.
    if (!L->empty())
      return false;
@ -209,7 +276,7 @@ struct LoopVectorize : public LoopPass {

    // If we decided that is is *legal* to vectorizer the loop. Do it.
    SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor);
-    LB.vectorize();
+    LB.vectorize(&LVL);

    DEBUG(verifyFunction(*L->getHeader()->getParent()));
    return true;
@ -218,6 +285,7 @@ struct LoopVectorize : public LoopPass {
  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
    LoopPass::getAnalysisUsage(AU);
    AU.addRequiredID(LoopSimplifyID);
+    AU.addRequiredID(LCSSAID);
    AU.addRequired<LoopInfo>();
    AU.addRequired<ScalarEvolution>();
  }
@ -237,10 +305,10 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
  Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
  Value *UndefVal = UndefValue::get(VTy);
  // Insert the value into a new vector.
-  Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
+  Value *SingleElem = Builder->CreateInsertElement(UndefVal, V, Zero);
  // Broadcast the scalar into all locations in the vector.
-  Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
-                                            "broadcast");
+  Value *Shuf = Builder->CreateShuffleVector(SingleElem, UndefVal, Zeros,
+                                             "broadcast");
  // We are accessing the induction variable. Make sure to promote the
  // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
  if (V == Induction)
@ -265,7 +333,7 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
  // Add the consecutive indices to the vector value.
  Constant *Cv = ConstantVector::get(Indices);
  assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
-  return Builder.CreateAdd(Val, Cv, "induction");
+  return Builder->CreateAdd(Val, Cv, "induction");
 }


@ -297,10 +365,11 @@ bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) {
 }

 Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
+  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
  // If we saved a vectorized copy of V, use it.
  ValueMap::iterator it = WidenMap.find(V);
  if (it != WidenMap.end())
-    return it->second;
+     return it->second;

  // Broadcast V and save the value for future uses.
  Value *B = getBroadcastInstrs(V);
@ -308,6 +377,17 @@ Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
  return B;
 }

+Constant*
+SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
+  SmallVector<Constant*, 8> Indices;
+  // Create a vector of consecutive numbers from zero to VF.
+  for (unsigned i = 0; i < VF; ++i)
+    Indices.push_back(ConstantInt::get(ScalarTy, Val));
+
+  // Add the consecutive indices to the vector value.
+  return ConstantVector::get(Indices);
+}
+
 void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
  // Holds vector parameters or scalars, in case of uniform vals.
@ -360,18 +440,18 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
      Value *Op = Params[op];
      // Param is a vector. Need to extract the right lane.
      if (Op->getType()->isVectorTy())
-        Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
+        Op = Builder->CreateExtractElement(Op, Builder->getInt32(i));
      Cloned->setOperand(op, Op);
    }

    // Place the cloned scalar in the new loop.
-    Builder.Insert(Cloned);
+    Builder->Insert(Cloned);

    // If the original scalar returns a value we need to place it in a vector
    // so that future users will be able to use it.
    if (!IsVoidRetTy)
-      VecResults = Builder.CreateInsertElement(VecResults, Cloned,
-                                               Builder.getInt32(i));
+      VecResults = Builder->CreateInsertElement(VecResults, Cloned,
+                                               Builder->getInt32(i));
  }

  if (!IsVoidRetTy)
@ -417,16 +497,15 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
  assert(BypassBlock && "Invalid loop structure");

  BasicBlock *VectorPH =
-    BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
+      BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
  BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
-                                                  "vector.body");
+                                                 "vector.body");

  BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
-                                                     "middle.block");
+                                                  "middle.block");
  BasicBlock *ScalarPH =
-    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
-                                 "scalar.preheader");
-
+          MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
+                                       "scalar.preheader");
  // Find the induction variable.
  BasicBlock *OldBasicBlock = Orig->getHeader();
  OldInduction = dyn_cast<PHINode>(OldBasicBlock->begin());
@ -435,10 +514,11 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {

  // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
  // inside the loop.
-  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
+  Builder = new IRBuilder<>(VecBody);
+  Builder->SetInsertPoint(VecBody->getFirstInsertionPt());

  // Generate the induction variable.
-  Induction = Builder.CreatePHI(IdxTy, 2, "index");
+  Induction = Builder->CreatePHI(IdxTy, 2, "index");
  Constant *Zero = ConstantInt::get(IdxTy, 0);
  Constant *Step = ConstantInt::get(IdxTy, VF);

@ -489,12 +569,12 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
  MiddleBlock->getTerminator()->eraseFromParent();

  // Create i+1 and fill the PHINode.
-  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
+  Value *NextIdx = Builder->CreateAdd(Induction, Step, "index.next");
  Induction->addIncoming(Zero, VectorPH);
  Induction->addIncoming(NextIdx, VecBody);
  // Create the compare.
-  Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown);
-  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
+  Value *ICmp = Builder->CreateICmpEQ(NextIdx, CountRoundDown);
+  Builder->CreateCondBr(ICmp, MiddleBlock, VecBody);

  // Now we have two terminators. Remove the old one from the block.
  VecBody->getTerminator()->eraseFromParent();
@ -504,7 +584,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
  OldInduction->setIncomingValue(BlockIdx, CountRoundDown);

  // Get ready to start creating new instructions into the vectorized body.
-  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
+  Builder->SetInsertPoint(VecBody->getFirstInsertionPt());

  // Register the new loop.
  Loop* Lp = new Loop();
@ -518,22 +598,52 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
  }
+
+  // Save the state.
+  LoopMiddleBlock = MiddleBlock;
+  LoopExitBlock = ExitBlock;
+  LoopVectorBody = VecBody;
+  LoopScalarBody = OldBasicBlock;
+  LoopBypassBlock = BypassBlock;
 }

-void SingleBlockLoopVectorizer::vectorizeLoop() {
+void
+SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
+  typedef SmallVector<PHINode*, 4> PhiVector;
  BasicBlock &BB = *Orig->getHeader();

+  // In order to support reduction variables we need to be able to vectorize
+  // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
+  // steages. First, we create a new vector PHI node with no incoming edges.
+  // We use this value when we vectorize all of the instructions that use the
+  // PHI. Next, after all of the instructions in the block are complete we
+  // add the new incoming edges to the PHI. At this point all of the
+  // instructions in the basic block are vectorized, so we can use them to
+  // construct the PHI.
+  PhiVector PHIsToFix;
+
  // For each instruction in the old loop.
  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
    Instruction *Inst = it;

    switch (Inst->getOpcode()) {
-      case Instruction::PHI:
      case Instruction::Br:
        // Nothing to do for PHIs and BR, since we already took care of the
        // loop control flow instructions.
        continue;
-
+      case Instruction::PHI:{
+        PHINode* P = cast<PHINode>(Inst);
+        // Special handling for the induction var.
+        if (OldInduction == Inst)
+          continue;
+        // This is phase I of vectorizing PHIs.
+        // This has to be a reduction variable.
+        assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
+        Type *VecTy = VectorType::get(Inst->getType(), VF);
+        WidenMap[Inst] = Builder->CreatePHI(VecTy, 2, "vec.phi");
+        PHIsToFix.push_back(P);
+        continue;
+      }
      case Instruction::Add:
      case Instruction::FAdd:
      case Instruction::Sub:
@ -557,15 +667,17 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
        Value *A = getVectorValue(Inst->getOperand(0));
        Value *B = getVectorValue(Inst->getOperand(1));
        // Use this vector value for all users of the original instruction.
-        WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+        WidenMap[Inst] = Builder->CreateBinOp(BinOp->getOpcode(), A, B);
        break;
      }
      case Instruction::Select: {
        // Widen selects.
+        // TODO: If the selector is loop invariant we can issue a select
+        // instruction with a scalar condition.
        Value *A = getVectorValue(Inst->getOperand(0));
        Value *B = getVectorValue(Inst->getOperand(1));
        Value *C = getVectorValue(Inst->getOperand(2));
-        WidenMap[Inst] = Builder.CreateSelect(A, B, C);
+        WidenMap[Inst] = Builder->CreateSelect(A, B, C);
        break;
      }

@ -577,9 +689,9 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
        Value *A = getVectorValue(Inst->getOperand(0));
        Value *B = getVectorValue(Inst->getOperand(1));
        if (FCmp)
-          WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
+          WidenMap[Inst] = Builder->CreateFCmp(Cmp->getPredicate(), A, B);
        else
-          WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
+          WidenMap[Inst] = Builder->CreateICmp(Cmp->getPredicate(), A, B);
        break;
      }

@ -600,10 +712,10 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
        unsigned NumOperands = Gep->getNumOperands();
        Gep2->setOperand(NumOperands - 1, Induction);
-        Ptr = Builder.Insert(Gep2);
-        Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
+        Ptr = Builder->Insert(Gep2);
+        Ptr = Builder->CreateBitCast(Ptr, StTy->getPointerTo());
        Value *Val = getVectorValue(SI->getValueOperand());
-        Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
+        Builder->CreateStore(Val, Ptr)->setAlignment(Alignment);
        break;
      }
      case Instruction::Load: {
@ -624,9 +736,9 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
        unsigned NumOperands = Gep->getNumOperands();
        Gep2->setOperand(NumOperands - 1, Induction);
-        Ptr = Builder.Insert(Gep2);
-        Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
-        LI = Builder.CreateLoad(Ptr);
+        Ptr = Builder->Insert(Gep2);
+        Ptr = Builder->CreateBitCast(Ptr, RetTy->getPointerTo());
+        LI = Builder->CreateLoad(Ptr);
        LI->setAlignment(Alignment);
        // Use this vector value for all users of the load.
        WidenMap[Inst] = LI;
@ -648,7 +760,7 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
        CastInst *CI = dyn_cast<CastInst>(Inst);
        Value *A = getVectorValue(Inst->getOperand(0));
        Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
-        WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+        WidenMap[Inst] = Builder->CreateCast(CI->getOpcode(), A, DestTy);
        break;
      }

@ -658,6 +770,102 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
        break;
    }// end of switch.
  }// end of for_each instr.
+
+  // At this point every instruction in the original loop is widended to
+  // a vector form. We are almost done. Now, we need to fix the PHI nodes
+  // that we vectorized. The PHI nodes are currently empty because we did
+  // not want to introduce cycles. Notice that the remaining PHI nodes
+  // that we need to fix are reduction variables.
+
+  // Create the 'reduced' values for each of the induction vars.
+  // The reduced values are the vector values that we scalarize and combine
+  // after the loop is finished.
+  for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
+       it != e; ++it) {
+    PHINode *RdxPhi = *it;
+    PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
+    assert(RdxPhi && "Unable to recover vectorized PHI");
+
+    // Find the reduction variable.
+    assert(Legal->getReductionVars()->count(RdxPhi) &&
+           "Unable to find the reduction variable");
+    LoopVectorizationLegality::ReductionPair ReductionVar =
+      (*Legal->getReductionVars())[RdxPhi];
+
+    // This is the vector-clone of the value that leaves the loop.
+    Value *VectorExit = getVectorValue(ReductionVar.first);
+    Type *VecTy = VectorExit->getType();
+
+    // This is the kind of reduction.
+    LoopVectorizationLegality::ReductionKind RdxKind = ReductionVar.second;
+    // Find the reduction identity variable.
+    // Zero for addition. One for Multiplication.
+    unsigned IdentitySclr =
+      (RdxKind == LoopVectorizationLegality::IntegerAdd ? 0 : 1);
+    Constant *Identity = getUniformVector(IdentitySclr, VecTy->getScalarType());
+
+    // Fix the vector-loop phi.
+    // We created the induction variable so we know that the
+    // preheader is the first entry.
+    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
+    VecRdxPhi->addIncoming(Identity, VecPreheader);
+    unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
+    Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
+    VecRdxPhi->addIncoming(Val, LoopVectorBody);
+
+    // Before each round, move the insertion point right between
+    // the PHIs and the values we are going to write.
+    // This allows us to write both PHINodes and the extractelement
+    // instructions.
+    Builder->SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
+
+    // This PHINode contains the vectorized reduction variable, or
+    // the identity vector, if we bypass the vector loop.
+    PHINode *NewPhi = Builder->CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
+    NewPhi->addIncoming(Identity, LoopBypassBlock);
+    NewPhi->addIncoming(getVectorValue(ReductionVar.first), LoopVectorBody);
+
+    // Extract the first scalar.
+    Value *Scalar0 =
+      Builder->CreateExtractElement(NewPhi, Builder->getInt32(0));
+    // Extract and sum the remaining vector elements.
+    for (unsigned i=1; i < VF; ++i) {
+      Value *Scalar1 =
+        Builder->CreateExtractElement(NewPhi, Builder->getInt32(i));
+      if (RdxKind == LoopVectorizationLegality::IntegerAdd) {
+        Scalar0 = Builder->CreateAdd(Scalar0, Scalar1);
+      } else {
+        Scalar0 = Builder->CreateMul(Scalar0, Scalar1);
+      }
+    }
+
+    // Now, we need to fix the users of the reduction variable
+    // inside and outside of the scalar remainder loop.
+    // We know that the loop is in LCSSA form. We need to update the
+    // PHI nodes in the exit blocks.
+    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
+         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
+      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
+      if (!LCSSAPhi) continue;
+
+      // All PHINodes need to have a single entry edge, or two if we already fixed them.
+      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
+
+      // We found our reduction value exit-PHI. Update it with the incoming bypass edge.
+      if (LCSSAPhi->getIncomingValue(0) == ReductionVar.first) {
+        // Add an edge coming from the bypass.
+        LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
+        break;
+      }
+    }// end of the LCSSA phi scan.
+
+    // Fix the scalar loop reduction variable with the incoming reduction sum
+    // from the vector body and from the backedge value.
+    int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
+    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
+    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
+    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, ReductionVar.first);
+  }// end of for each redux variable.
 }

 void SingleBlockLoopVectorizer::cleanup() {
@ -710,31 +918,35 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
  ValueVector Reads;
  ValueVector Writes;

-  SmallPtrSet<Value*, 16> AnalyzedPtrs;
-  unsigned NumPhis = 0;
  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
    Instruction *I = it;

    PHINode *Phi = dyn_cast<PHINode>(I);
    if (Phi) {
-      NumPhis++;
-      // We only look at integer phi nodes.
-      if (!Phi->getType()->isIntegerTy()) {
-        DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
-        return false;
-      }
-
-      // If we found an induction variable.
-      if (NumPhis > 1) {
-        DEBUG(dbgs() << "LV: Found more than one PHI.\n");
-        return false;
-      }
-
      // This should not happen because the loop should be normalized.
      if (Phi->getNumIncomingValues() != 2) {
        DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
        return false;
      }
+      // We only look at integer phi nodes.
+      if (!Phi->getType()->isIntegerTy()) {
+        DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
+        return false;
+      }
+      if (AddReductionVar(Phi, IntegerAdd)) {
+        DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
+        continue;
+      }
+      if (AddReductionVar(Phi, IntegerMult)) {
+        DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n");
+        continue;
+      }
+      if (Induction) {
+        DEBUG(dbgs() << "LV: Found too many PHIs.\n");
+        return false;
+      }
+      // Found the induction variable.
+      Induction = Phi;

      // Check that the PHI is consecutive and starts at zero.
      const SCEV *PhiScev = SE->getSCEV(Phi);
@ -751,7 +963,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
        DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n");
        return false;
      }
-    }
+    }// end of PHI handling

    // If this is a load, record its pointer. If it is not a load, abort.
    // Notice that we don't handle function calls that read or write.
@ -764,8 +976,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
      }

      Value* Ptr = Ld->getPointerOperand();
-      if (AnalyzedPtrs.insert(Ptr))
-        GetUnderlyingObjects(Ptr, Reads, DL);
+      GetUnderlyingObjects(Ptr, Reads, DL);
    }

    // Record store pointers. Abort on all other instructions that write to
@ -779,8 +990,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
      }

      Value* Ptr = St->getPointerOperand();
-      if (AnalyzedPtrs.insert(Ptr))
-        GetUnderlyingObjects(St->getPointerOperand(), Writes, DL);
+      GetUnderlyingObjects(Ptr, Writes, DL);
    }

    // We still don't handle functions.
@ -797,21 +1007,26 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
      DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
      return false;
    }
-    //Check that all of the users of the loop are inside the BB.
-    for (Value::use_iterator it = I->use_begin(), e = I->use_end();
-         it != e; ++it) {
-      Instruction *U = cast<Instruction>(*it);
-      BasicBlock *Parent = U->getParent();
-      if (Parent != &BB) {
-        DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
-        return false;
-      }
+
+    // Reduction instructions are allowed to have exit users.
+    // All other instructions must not have external users.
+    if (!AllowedExit.count(I))
+      //Check that all of the users of the loop are inside the BB.
+      for (Value::use_iterator it = I->use_begin(), e = I->use_end();
+           it != e; ++it) {
+        Instruction *U = cast<Instruction>(*it);
+        // This user may be a reduction exit value.
+        BasicBlock *Parent = U->getParent();
+        if (Parent != &BB) {
+          DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+          return false;
+        }
    }
  } // next instr.

-  if (NumPhis != 1) {
-    DEBUG(dbgs() << "LV: Did not find a Phi node.\n");
-    return false;
+  if (!Induction) {
+      DEBUG(dbgs() << "LV: Did not find an induction var.\n");
+      return false;
  }

  // Check that the underlying objects of the reads and writes are either
@ -866,6 +1081,110 @@ bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) {
  return A->hasNoAliasAttr();
 }

+bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
+                                                    ReductionKind Kind) {
+  if (Phi->getNumIncomingValues() != 2)
+    return false;
+
+  // Find the possible incoming reduction variable.
+  BasicBlock *BB = Phi->getParent();
+  int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
+  int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
+  Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);
+
+  // We must have a constant that starts the reduction.
+  if (!isReductionConstant(RdxStart, Kind))
+    return false;
+
+  // ExitInstruction is the single value which is used outside the loop.
+  // We only allow for a single reduction value to be used outside the loop.
+  // This includes users of the reduction, variables (which form a cycle
+  // which ends in the phi node).
+  Instruction *ExitInstruction = 0;
+
+  // Iter is our iterator. We start with the PHI node and scan for all of the
+  // users of this instruction. All users must be instructions which can be
+  // used as reduction variables (such as ADD). We may have a single
+  // out-of-block user. They cycle must end with the original PHI.
+  // Also, we can't have multiple block-local users.
+  Instruction *Iter = Phi;
+  while (true) {
+    // Any reduction instr must be of one of the allowed kinds.
+    if (!isReductionInstr(Iter, Kind))
+      return false;
+
+    // Did we found a user inside this block ?
+    bool FoundInBlockUser = false;
+    // Did we reach the initial PHI node ?
+    bool FoundStartPHI = false;
+    // For each of the *users* of iter.
+    for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
+         it != e; ++it) {
+      Instruction *U = cast<Instruction>(*it);
+      // We already know that the PHI is a user.
+      if (U == Phi) {
+        FoundStartPHI = true;
+        continue;
+      }
+      // Check if we found the exit user.
+      BasicBlock *Parent = U->getParent();
+      if (Parent != BB) {
+        // We must have a single exit instruction.
+        if (ExitInstruction != 0)
+          return false;
+        ExitInstruction = Iter;
+      }
+      // We can't have multiple inside users.
+      if (FoundInBlockUser)
+        return false;
+      FoundInBlockUser = true;
+      Iter = U;
+    }
+
+    // We found a reduction var if we have reached the original
+    // phi node and we only have a single instruction with out-of-loop
+    // users.
+   if (FoundStartPHI && ExitInstruction) {
+     // This instruction is allowed to have out-of-loop users.
+     AllowedExit.insert(ExitInstruction);
+     // Mark this as a reduction var.
+     Reductions[Phi] = std::make_pair(ExitInstruction, Kind);
+     return true;
+   }
+  }
+}
+
+bool
+LoopVectorizationLegality::isReductionConstant(Value *V, ReductionKind Kind) {
+  ConstantInt *CI = dyn_cast<ConstantInt>(V);
+  if (!CI)
+    return false;
+  if (Kind == IntegerMult && CI->isOne())
+    return true;
+  if (Kind == IntegerAdd && CI->isZero())
+    return true;
+  return false;
+}
+
+bool
+LoopVectorizationLegality::isReductionInstr(Instruction *I,
+                                            ReductionKind Kind) {
+    switch (I->getOpcode()) {
+    default:
+      return false;
+    case Instruction::PHI:
+      // possibly.
+      return true;
+    case Instruction::Add:
+    case Instruction::Sub:
+      return Kind == IntegerAdd;
+    case Instruction::Mul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+      return Kind == IntegerMult;
+    }
+}
+
 } // namespace

 char LoopVectorize::ID = 0;
@ -880,6 +1199,5 @@ namespace llvm {
  Pass *createLoopVectorizePass() {
    return new LoopVectorize();
  }
-
 }

--- a/test/Transforms/LoopVectorize/gcc-examples.ll
+++ b/test/Transforms/LoopVectorize/gcc-examples.ll
@ -202,9 +202,8 @@ define void @example8(i32 %x) nounwind uwtable ssp {
  ret void
 }

-; We can't vectorize because it has a reduction variable.
 ;CHECK: @example9
-;CHECK-NOT: <4 x i32>
+;CHECK: phi <4 x i32>
 ;CHECK: ret i32
 define i32 @example9() nounwind uwtable readonly ssp {
  br label %1
--- a/test/Transforms/LoopVectorize/increment.ll
+++ b/test/Transforms/LoopVectorize/increment.ll
@ -1,35 +0,0 @@
-; RUN: opt < %s  -loop-vectorize -dce -instcombine -licm -S | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.8.0"
-
-@a = common global [2048 x i32] zeroinitializer, align 16
-
-; This is the loop.
-;  for (i=0; i<n; i++){
-;    a[i] += i;
-;  }
-;CHECK: @inc
-;CHECK: load <4 x i32>
-;CHECK: add <4 x i32>
-;CHECK: store <4 x i32>
-;CHECK: ret void
-define void @inc(i32 %n) nounwind uwtable noinline ssp {
-  %1 = icmp sgt i32 %n, 0
-  br i1 %1, label %.lr.ph, label %._crit_edge
-
-.lr.ph:                                           ; preds = %0, %.lr.ph
-  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
-  %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
-  %3 = load i32* %2, align 4
-  %4 = trunc i64 %indvars.iv to i32
-  %5 = add nsw i32 %3, %4
-  store i32 %5, i32* %2, align 4
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %._crit_edge, label %.lr.ph
-
-._crit_edge:                                      ; preds = %.lr.ph, %0
-  ret void
-}
--- a/test/Transforms/LoopVectorize/reduction.ll
+++ b/test/Transforms/LoopVectorize/reduction.ll
@ -0,0 +1,122 @@
+; RUN: opt < %s  -loop-vectorize -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @reduction_sum
+;CHECK: phi <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = trunc i64 %indvars.iv to i32
+  %7 = add i32 %sum.02, %6
+  %8 = add i32 %7, %3
+  %9 = add i32 %8, %5
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}
+
+;CHECK: @reduction_prod
+;CHECK: phi <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: mul <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %prod.02 = phi i32 [ %9, %.lr.ph ], [ 1, %0 ]
+  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = trunc i64 %indvars.iv to i32
+  %7 = mul i32 %prod.02, %6
+  %8 = mul i32 %7, %3
+  %9 = mul i32 %8, %5
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %prod.0.lcssa = phi i32 [ 1, %0 ], [ %9, %.lr.ph ]
+  ret i32 %prod.0.lcssa
+}
+
+;CHECK: @reduction_mix
+;CHECK: phi <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: mul <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = mul nsw i32 %5, %3
+  %7 = trunc i64 %indvars.iv to i32
+  %8 = add i32 %sum.02, %7
+  %9 = add i32 %8, %6
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}
+
+;CHECK: @reduction_bad
+;CHECK-NOT: <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_bad(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = trunc i64 %indvars.iv to i32
+  %7 = add i32 %3, %6
+  %8 = add i32 %7, %5
+  %9 = mul i32 %8, %sum.02
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+  ret i32 %sum.0.lcssa
+}