[LV] Don't bail to MiddleBlock if a runtime check fails, bail to ScalarPH instead
We were bailing out to two places when our runtime checks failed: if the initial overflow check failed, we went to ScalarPH; if any other check failed, we went to MiddleBlock. This forced an extra PHI per induction and reduction, because the vector loop's exit block was not dominated by its latch. There is no need for this behavior: if we always go to ScalarPH we can get rid of a bunch of complexity.

llvm-svn: 246637
commit 618d8d058b
parent 18560aef4d
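The control-flow change is easiest to see in IR form. Below is a hand-written sketch of the skeleton the vectorizer produces after this patch; it is not taken from the patch itself, and the names, the VF of 4, and the assumption that %n is positive are illustrative only. The point is that every failed runtime check now branches to %scalar.ph, leaving %middle.block with a single predecessor (the vector loop latch), so it no longer needs resume PHIs.

; Hand-written sketch (not from this commit) of the post-patch skeleton.
; Assumes VF = 4 and %n > 0; loop bodies and the extra check blocks
; (min.iters.checked, vector.memcheck, ...) are elided for brevity.
define void @skeleton(i64 %n) {
entry:                                        ; a runtime check lives here
  %min.iters.check = icmp ult i64 %n, 4
  ; On failure we branch to %scalar.ph, never to %middle.block.
  br i1 %min.iters.check, label %scalar.ph, label %vector.ph

vector.ph:
  %n.vec = and i64 %n, -4
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %index.next = add i64 %index, 4
  %done = icmp eq i64 %index.next, %n.vec
  br i1 %done, label %middle.block, label %vector.body

middle.block:                                 ; single predecessor: %vector.body
  %cmp.n = icmp eq i64 %n.vec, %n             ; no resume PHIs needed here
  br i1 %cmp.n, label %exit, label %scalar.ph

scalar.ph:                                    ; every bypass edge merges here
  %bc.resume.val = phi i64 [ 0, %entry ], [ %n.vec, %middle.block ]
  br label %for.body

for.body:
  %iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %iv.next, %for.body ]
  %iv.next = add i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %n
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}

Before this patch, branches such as %cmp.zero and %memcheck.conflict (see the test updates below) targeted %middle.block instead, so %middle.block had multiple predecessors and needed a resume.val PHI per induction plus an rdx.vec.exit.phi per reduction.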
@@ -2785,13 +2785,13 @@ void InnerLoopVectorizer::createEmptyLoop() {
   |  /  |
   | /   v
   ||   [ ]     <-- vector pre header.
-  ||    |
-  ||    v
-  ||   [  ] \
-  ||   [  ]_|   <-- vector loop.
-  ||    |
-  | \   v
-  |   >[ ]   <--- middle-block.
+  |/    |
+  |     v
+  |    [  ] \
+  |    [  ]_|   <-- vector loop.
+  |     |
+  |     v
+  |   -[ ]   <--- middle-block.
   |  /  |
   | /   v
   -|- >[ ]     <--- new preheader.
@@ -2860,16 +2860,16 @@ void InnerLoopVectorizer::createEmptyLoop() {
   emitMinimumIterationCountCheck(Lp, ScalarPH);
   // Now, compare the new count to zero. If it is zero skip the vector loop and
   // jump to the scalar loop.
-  emitVectorLoopEnteredCheck(Lp, MiddleBlock);
+  emitVectorLoopEnteredCheck(Lp, ScalarPH);
   // Generate the code to check that the strides we assumed to be one are really
   // one. We want the new basic block to start at the first instruction in a
   // sequence of instructions that form a check.
-  emitStrideChecks(Lp, MiddleBlock);
+  emitStrideChecks(Lp, ScalarPH);
   // Generate the code that checks in runtime if arrays overlap. We put the
   // checks into a separate block to make the more common case of few elements
   // faster.
-  emitMemRuntimeChecks(Lp, MiddleBlock);
+  emitMemRuntimeChecks(Lp, ScalarPH);

   // Generate the induction variable.
   // The loop step is equal to the vectorization factor (num of SIMD elements)
   // times the unroll factor (num of SIMD instructions).
@@ -2890,27 +2890,20 @@ void InnerLoopVectorizer::createEmptyLoop() {
   // This variable saves the new starting index for the scalar loop. It is used
   // to test if there are any tail iterations left once the vector loop has
   // completed.
-  PHINode *ResumeIndex = nullptr;
   LoopVectorizationLegality::InductionList::iterator I, E;
   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
   for (I = List->begin(), E = List->end(); I != E; ++I) {
     PHINode *OrigPhi = I->first;
     InductionDescriptor II = I->second;

-    PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
-                                         MiddleBlock->getTerminator());
     // Create phi nodes to merge from the backedge-taken check block.
     PHINode *BCResumeVal = PHINode::Create(OrigPhi->getType(), 3,
                                            "bc.resume.val",
                                            ScalarPH->getTerminator());
-    BCResumeVal->addIncoming(ResumeVal, MiddleBlock);
-
     Value *EndValue;
     if (OrigPhi == OldInduction) {
       // We know what the end value is.
       EndValue = CountRoundDown;
-      // We also know which PHI node holds it.
-      ResumeIndex = ResumeVal;
     } else {
       IRBuilder<> B(LoopBypassBlocks.back()->getTerminator());
       Value *CRD = B.CreateSExtOrTrunc(CountRoundDown,
@@ -2922,41 +2915,23 @@ void InnerLoopVectorizer::createEmptyLoop() {
     // The new PHI merges the original incoming value, in case of a bypass,
     // or the value at the end of the vectorized loop.
-    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
-      ResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]);
-    ResumeVal->addIncoming(EndValue, VecBody);
+    BCResumeVal->addIncoming(EndValue, MiddleBlock);

     // Fix the scalar body counter (PHI node).
     unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);

     // The old induction's phi node in the scalar body needs the truncated
     // value.
-    BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[0]);
+    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+      BCResumeVal->addIncoming(II.getStartValue(), LoopBypassBlocks[I]);
     OrigPhi->setIncomingValue(BlockIdx, BCResumeVal);
   }

-  // If we are generating a new induction variable then we also need to
-  // generate the code that calculates the exit value. This value is not
-  // simply the end of the counter because we may skip the vectorized body
-  // in case of a runtime check.
-  if (!OldInduction){
-    assert(!ResumeIndex && "Unexpected resume value found");
-    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
-                                  MiddleBlock->getTerminator());
-    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
-      ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
-    ResumeIndex->addIncoming(CountRoundDown, VecBody);
-  }
-
-  // Make sure that we found the index where scalar loop needs to continue.
-  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
-         "Invalid resume Index");
-
   // Add a check in the middle block to see if we have completed
   // all of the iterations in the first vector loop.
   // If (N - N%VF) == N, then we *don't* need to run the remainder.
   Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
-                                ResumeIndex, "cmp.n",
+                                CountRoundDown, "cmp.n",
                                 MiddleBlock->getTerminator());
   ReplaceInstWithInst(MiddleBlock->getTerminator(),
                       BranchInst::Create(ExitBlock, ScalarPH, CmpN));
@@ -3262,19 +3237,8 @@ void InnerLoopVectorizer::vectorizeLoop() {
     // instructions.
     Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());

-    VectorParts RdxParts, &RdxExitVal = getVectorValue(LoopExitInst);
+    VectorParts RdxParts = getVectorValue(LoopExitInst);
     setDebugLocFromInst(Builder, LoopExitInst);
-    for (unsigned part = 0; part < UF; ++part) {
-      // This PHINode contains the vectorized reduction variable, or
-      // the initial value vector, if we bypass the vector loop.
-      PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
-      Value *StartVal = (part == 0) ? VectorStart : Identity;
-      for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
-        NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
-      NewPhi->addIncoming(RdxExitVal[part],
-                          LoopVectorBody.back());
-      RdxParts.push_back(NewPhi);
-    }

     // If the vector reduction can be performed in a smaller type, we truncate
     // then extend the loop exit value to enable InstCombine to evaluate the
@@ -3283,15 +3247,17 @@ void InnerLoopVectorizer::vectorizeLoop() {
       Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
       Builder.SetInsertPoint(LoopVectorBody.back()->getTerminator());
       for (unsigned part = 0; part < UF; ++part) {
-        Value *Trunc = Builder.CreateTrunc(RdxExitVal[part], RdxVecTy);
+        Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
         Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                           : Builder.CreateZExt(Trunc, VecTy);
-        for (Value::user_iterator UI = RdxExitVal[part]->user_begin();
-             UI != RdxExitVal[part]->user_end();)
-          if (*UI != Trunc)
-            (*UI++)->replaceUsesOfWith(RdxExitVal[part], Extnd);
-          else
+        for (Value::user_iterator UI = RdxParts[part]->user_begin();
+             UI != RdxParts[part]->user_end();)
+          if (*UI != Trunc) {
+            (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd);
+            RdxParts[part] = Extnd;
+          } else {
             ++UI;
+          }
       }
       Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
       for (unsigned part = 0; part < UF; ++part)
@@ -3362,7 +3328,8 @@ void InnerLoopVectorizer::vectorizeLoop() {
     // block and the middle block.
     PHINode *BCBlockPhi = PHINode::Create(RdxPhi->getType(), 2, "bc.merge.rdx",
                                           LoopScalarPreHeader->getTerminator());
-    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[0]);
+    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+      BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
     BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

     // Now, we need to fix the users of the reduction variable
@@ -3850,7 +3817,7 @@ void InnerLoopVectorizer::updateAnalysis() {
     }
   }

-  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks[1]);
+  DT->addNewBlock(LoopMiddleBlock, LoopVectorBody.back());
   DT->addNewBlock(LoopScalarPreHeader, LoopBypassBlocks[0]);
   DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
   DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]);
@@ -14,7 +14,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; CHECK: add i64 %index, 2, !dbg ![[LOC]]
 ; CHECK: icmp eq i64 %index.next, %n.vec, !dbg ![[LOC]]
 ; CHECK: middle.block
-; CHECK: add <2 x i32> %rdx.vec.exit.phi, %rdx.shuf, !dbg ![[LOC2]]
+; CHECK: add <2 x i32> %{{.*}}, %rdx.shuf, !dbg ![[LOC2]]
 ; CHECK: extractelement <2 x i32> %bin.rdx, i32 0, !dbg ![[LOC2]]

 define i32 @f(i32* nocapture %a, i32 %size) #0 {
@@ -115,8 +115,8 @@ define i32 @i16_loop() nounwind readnone ssp uwtable {
 ; CHECK: br i1 true, label %scalar.ph, label %min.iters.checked

 ; CHECK: scalar.ph:
-; CHECK: %bc.resume.val = phi i32 [ %resume.val, %middle.block ], [ 0, %0 ]
-; CHECK: %bc.merge.rdx = phi i32 [ 1, %0 ], [ %5, %middle.block ]
+; CHECK: %bc.resume.val = phi i32 [ 0, %middle.block ], [ 0, %0 ]
+; CHECK: %bc.merge.rdx = phi i32 [ 1, %0 ], [ 1, %min.iters.checked ], [ %5, %middle.block ]

 define i32 @max_i32_backedgetaken() nounwind readnone ssp uwtable {
@@ -175,8 +175,8 @@ for.end:                                          ; preds = %for.body, %entry
 }

 ;CHECK-LABEL: @reduction_and(
 ;CHECK: and <4 x i32>
-;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
+;CHECK: and <4 x i32>
 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 ;CHECK: and <4 x i32>
 ;CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -11,9 +11,9 @@ target triple = "x86_64-apple-macosx10.9.0"
 ;CHECK-LABEL: define i32 @foo
 ;CHECK: for.body.preheader:
-;CHECK: br i1 %cmp.zero, label %middle.block, label %vector.memcheck, !dbg [[BODY_LOC:![0-9]+]]
+;CHECK: br i1 %cmp.zero, label %scalar.ph, label %vector.memcheck, !dbg [[BODY_LOC:![0-9]+]]
 ;CHECK: vector.memcheck:
-;CHECK: br i1 %memcheck.conflict, label %middle.block, label %vector.ph, !dbg [[BODY_LOC]]
+;CHECK: br i1 %memcheck.conflict, label %scalar.ph, label %vector.ph, !dbg [[BODY_LOC]]
 ;CHECK: load <4 x float>
 define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable ssp {
 entry: