From 618d8d058bf2a2f15e8d04994412088ca1a797a9 Mon Sep 17 00:00:00 2001
From: James Molloy <james.molloy@arm.com>
Date: Wed, 2 Sep 2015 10:15:39 +0000
Subject: [PATCH] [LV] Don't bail to MiddleBlock if a runtime check fails, bail
 to ScalarPH instead

We were bailing to two places if our runtime checks failed. If the initial overflow check failed, we'd go to ScalarPH. If any other check failed, we'd go to MiddleBlock. This caused us to have to have an extra PHI per induction and reduction as the vector loop's exit block was not dominated by its latch.

There's no need to have this behavior - if we just always go to ScalarPH we can get rid of a bunch of complexity.

llvm-svn: 246637
---
 lib/Transforms/Vectorize/LoopVectorize.cpp    | 87 ++++++-------------
 test/Transforms/LoopVectorize/debugloc.ll     |  2 +-
 test/Transforms/LoopVectorize/induction.ll    |  4 +-
 test/Transforms/LoopVectorize/reduction.ll    |  2 +-
 .../Transforms/LoopVectorize/runtime-check.ll |  4 +-
 5 files changed, 33 insertions(+), 66 deletions(-)

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0975b03da17..dd9edf442a5 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2785,13 +2785,13 @@ void InnerLoopVectorizer::createEmptyLoop() {
   |  /  |
   | /   v
   ||   [ ]     <-- vector pre header.
-  ||    |
-  ||    v
-  ||   [  ] \
-  ||   [  ]_|   <-- vector loop.
-  ||    |
-  | \   v
-  |   >[ ]   <--- middle-block.
+  |/    |
+  |     v
+  |    [  ] \
+  |    [  ]_|   <-- vector loop.
+  |     |
+  |     v
+  |   -[ ]   <--- middle-block.
   |  /  |
   | /   v
   -|- >[ ]     <--- new preheader.
@@ -2860,16 +2860,16 @@ void InnerLoopVectorizer::createEmptyLoop() {
   emitMinimumIterationCountCheck(Lp, ScalarPH);
   // Now, compare the new count to zero. If it is zero skip the vector loop and
   // jump to the scalar loop.
-  emitVectorLoopEnteredCheck(Lp, MiddleBlock);
+  emitVectorLoopEnteredCheck(Lp, ScalarPH);
   // Generate the code to check that the strides we assumed to be one are really
   // one. We want the new basic block to start at the first instruction in a
   // sequence of instructions that form a check.
-  emitStrideChecks(Lp, MiddleBlock);
+  emitStrideChecks(Lp, ScalarPH);
   // Generate the code that checks in runtime if arrays overlap. We put the
   // checks into a separate block to make the more common case of few elements
   // faster.
-  emitMemRuntimeChecks(Lp, MiddleBlock);
-
+  emitMemRuntimeChecks(Lp, ScalarPH);
+  
   // Generate the induction variable.
   // The loop step is equal to the vectorization factor (num of SIMD elements)
   // times the unroll factor (num of SIMD instructions).
@@ -2890,27 +2890,20 @@ void InnerLoopVectorizer::createEmptyLoop() {
   // This variable saves the new starting index for the scalar loop. It is used
   // to test if there are any tail iterations left once the vector loop has
   // completed.
-  PHINode *ResumeIndex = nullptr;
   LoopVectorizationLegality::InductionList::iterator I, E;
   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
   for (I = List->begin(), E = List->end(); I != E; ++I) {
     PHINode *OrigPhi = I->first;
     InductionDescriptor II = I->second;
 
-    PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
-                                         MiddleBlock->getTerminator());
     // Create phi nodes to merge from the  backedge-taken check block.
     PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3,
                                            "bc.resume.val",
                                            ScalarPH->getTerminator());
-    BCResumeVal->addIncoming(ResumeVal, MiddleBlock);
-
     Value *EndValue;
     if (OrigPhi == OldInduction) {
       // We know what the end value is.
       EndValue = CountRoundDown;
-      // We also know which PHI node holds it.
-      ResumeIndex = ResumeVal;
     } else {
       IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
       Value *CRD = B.CreateSExtOrTrunc(CountRoundDown,
@@ -2922,41 +2915,23 @@ void InnerLoopVectorizer::createEmptyLoop() {
 
     // The new PHI merges the original incoming value, in case of a bypass,
     // or the value at the end of the vectorized loop.
-    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
-      ResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]);
-    ResumeVal->addIncoming(EndValue, VecBody);
+    BCResumeVal->addIncoming(EndValue, MiddleBlock);
 
     // Fix the scalar body counter (PHI node).
     unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
 
     // The old induction's phi node in the scalar body needs the truncated
     // value.
-    BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[0]);
+    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+      BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]);
     OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
   }
 
-  // If we are generating a new induction variable then we also need to
-  // generate the code that calculates the exit value. This value is not
-  // simply the end of the counter because we may skip the vectorized body
-  // in case of a runtime check.
-  if (!OldInduction){
-    assert(!ResumeIndex && "Unexpected resume value found");
-    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
-                                  MiddleBlock->getTerminator());
-    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
-      ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
-    ResumeIndex->addIncoming(CountRoundDown, VecBody);
-  }
-
-  // Make sure that we found the index where scalar loop needs to continue.
-  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
-         "Invalid resume Index");
-
   // Add a check in the middle block to see if we have completed
   // all of the iterations in the first vector loop.
   // If (N - N%VF) == N, then we *don't* need to run the remainder.
   Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
-                                ResumeIndex, "cmp.n",
+                                CountRoundDown, "cmp.n",
                                 MiddleBlock->getTerminator());
   ReplaceInstWithInst(MiddleBlock->getTerminator(),
                       BranchInst::Create(ExitBlock, ScalarPH, CmpN));
@@ -3262,19 +3237,8 @@ void InnerLoopVectorizer::vectorizeLoop() {
     // instructions.
     Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
 
-    VectorParts RdxParts, &RdxExitVal = getVectorValue(LoopExitInst);
+    VectorParts RdxParts = getVectorValue(LoopExitInst);
     setDebugLocFromInst(Builder, LoopExitInst);
-    for (unsigned part = 0; part < UF; ++part) {
-      // This PHINode contains the vectorized reduction variable, or
-      // the initial value vector, if we bypass the vector loop.
-      PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
-      Value *StartVal = (part == 0) ? VectorStart : Identity;
-      for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
-        NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
-      NewPhi->addIncoming(RdxExitVal[part],
-                          LoopVectorBody.back());
-      RdxParts.push_back(NewPhi);
-    }
 
     // If the vector reduction can be performed in a smaller type, we truncate
     // then extend the loop exit value to enable InstCombine to evaluate the
@@ -3283,15 +3247,17 @@ void InnerLoopVectorizer::vectorizeLoop() {
       Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
       Builder.SetInsertPoint(LoopVectorBody.back()->getTerminator());
       for (unsigned part = 0; part < UF; ++part) {
-        Value *Trunc = Builder.CreateTrunc(RdxExitVal[part], RdxVecTy);
+        Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
         Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                           : Builder.CreateZExt(Trunc, VecTy);
-        for (Value::user_iterator UI = RdxExitVal[part]->user_begin();
-             UI != RdxExitVal[part]->user_end();)
-          if (*UI != Trunc)
-            (*UI++)->replaceUsesOfWith(RdxExitVal[part], Extnd);
-          else
+        for (Value::user_iterator UI = RdxParts[part]->user_begin();
+             UI != RdxParts[part]->user_end();)
+          if (*UI != Trunc) {
+            (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd);
+            RdxParts[part] = Extnd;
+          } else {
             ++UI;
+          }
       }
       Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
       for (unsigned part = 0; part < UF; ++part)
@@ -3362,7 +3328,8 @@ void InnerLoopVectorizer::vectorizeLoop() {
     // block and the middle block.
     PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
                                           LoopScalarPreHeader->getTerminator());
-    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]);
+    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+      BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
     BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
 
     // Now, we need to fix the users of the reduction variable
@@ -3850,7 +3817,7 @@ void InnerLoopVectorizer::updateAnalysis() {
     }
   }
 
-  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
+  DT->addNewBlock(LoopMiddleBlock, LoopVectorBody.back());
   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
diff --git a/test/Transforms/LoopVectorize/debugloc.ll b/test/Transforms/LoopVectorize/debugloc.ll
index d470a7a0846..c5277c36b3f 100644
--- a/test/Transforms/LoopVectorize/debugloc.ll
+++ b/test/Transforms/LoopVectorize/debugloc.ll
@@ -14,7 +14,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; CHECK:   add i64 %index, 2, !dbg ![[LOC]]
 ; CHECK:   icmp eq i64 %index.next, %n.vec, !dbg ![[LOC]]
 ; CHECK: middle.block
-; CHECK:   add <2 x i32> %rdx.vec.exit.phi, %rdx.shuf, !dbg ![[LOC2]]
+; CHECK:   add <2 x i32> %{{.*}}, %rdx.shuf, !dbg ![[LOC2]]
 ; CHECK:   extractelement <2 x i32> %bin.rdx, i32 0, !dbg ![[LOC2]]
 
 define i32 @f(i32* nocapture %a, i32 %size) #0 {
diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll
index f6072add3d1..59ee66a4a35 100644
--- a/test/Transforms/LoopVectorize/induction.ll
+++ b/test/Transforms/LoopVectorize/induction.ll
@@ -115,8 +115,8 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
 ; CHECK:  br i1 true, label %scalar.ph, label %min.iters.checked
 
 ; CHECK: scalar.ph:
-; CHECK:  %bc.resume.val = phi i32 [ %resume.val, %middle.block ], [ 0, %0 ]
-; CHECK:  %bc.merge.rdx = phi i32 [ 1, %0 ], [ %5, %middle.block ]
+; CHECK:  %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %0 ]
+; CHECK:  %bc.merge.rdx = phi i32 [ 1, %0 ], [ 1, %min.iters.checked ], [ %5, %middle.block ]
 
 define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable {
 
diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll
index 647e58a7e41..63b138f1d56 100644
--- a/test/Transforms/LoopVectorize/reduction.ll
+++ b/test/Transforms/LoopVectorize/reduction.ll
@@ -175,8 +175,8 @@ for.end:                                          ; preds = %for.body, %entry
 }
 
 ;CHECK-LABEL: @reduction_and(
-;CHECK: and <4 x i32>
 ;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
+;CHECK: and <4 x i32>
 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 ;CHECK: and <4 x i32>
 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll
index 9066fb5ff0d..3673b71db30 100644
--- a/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/test/Transforms/LoopVectorize/runtime-check.ll
@@ -11,9 +11,9 @@ target triple = "x86_64-apple-macosx10.9.0"
 
 ;CHECK-LABEL: define i32 @foo
 ;CHECK: for.body.preheader:
-;CHECK: br i1 %cmp.zero, label %middle.block, label %vector.memcheck, !dbg [[BODY_LOC:![0-9]+]]
+;CHECK: br i1 %cmp.zero, label %scalar.ph, label %vector.memcheck, !dbg [[BODY_LOC:![0-9]+]]
 ;CHECK: vector.memcheck:
-;CHECK: br i1 %memcheck.conflict, label %middle.block, label %vector.ph, !dbg [[BODY_LOC]]
+;CHECK: br i1 %memcheck.conflict, label %scalar.ph, label %vector.ph, !dbg [[BODY_LOC]]
 ;CHECK: load <4 x float>
 define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable ssp {
 entry: