[LV] Avoid emitting trivially dead instructions

Some instructions from the original loop, when vectorized, can become trivially
dead. This happens because of the way we structure the new loop. For example,
we create new induction variables and induction variable "steps" in the new
loop. Thus, when we go to vectorize the original induction variable update, it
may no longer be needed due to the instructions we've already created. This
patch prevents us from creating these redundant instructions. This reduces code
size before simplification and allows greater flexibility in code generation
since we have fewer unnecessary instruction uses.

Differential Revision: https://reviews.llvm.org/D25631

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@284631 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Matthew Simpson 2016-10-19 19:22:02 +00:00
parent db638de2de
commit f461f21edc
2 changed files with 87 additions and 0 deletions

View File

@ -441,6 +441,10 @@ protected:
/// respective conditions.
void predicateInstructions();
/// Collect the instructions from the original loop that would be trivially
/// dead in the vectorized loop if generated.
void collectTriviallyDeadInstructions();
/// Shrinks vector element sizes to the smallest bitwidth they can be legally
/// represented as.
void truncateToMinimalBitwidths();
@ -763,6 +767,14 @@ protected:
// Record whether runtime checks are added.
bool AddedSafetyChecks;
// Holds instructions from the original loop whose counterparts in the
// vectorized loop would be trivially dead if generated. For example,
// original induction update instructions can become dead because we
// separately emit induction "steps" when generating code for the new loop.
// Similarly, we create a new latch condition when setting up the structure
// of the new loop, so the old one can become dead.
SmallPtrSet<Instruction *, 4> DeadInstructions;
};
class InnerLoopUnroller : public InnerLoopVectorizer {
@ -3802,6 +3814,11 @@ void InnerLoopVectorizer::vectorizeLoop() {
// are vectorized, so we can use them to construct the PHI.
PhiVector PHIsToFix;
// Collect instructions from the original loop that will become trivially
// dead in the vectorized loop. We don't need to vectorize these
// instructions.
collectTriviallyDeadInstructions();
// Scan the loop in a topological order to ensure that defs are vectorized
// before users.
LoopBlocksDFS DFS(OrigLoop);
@ -4209,6 +4226,29 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
}
}
void InnerLoopVectorizer::collectTriviallyDeadInstructions() {
BasicBlock *Latch = OrigLoop->getLoopLatch();
// We create new control-flow for the vectorized loop, so the original
// condition will be dead after vectorization if it's only used by the
// branch.
auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
if (Cmp && Cmp->hasOneUse())
DeadInstructions.insert(Cmp);
// We create new "steps" for induction variable updates to which the original
// induction variables map. An original update instruction will be dead if
// all its users except the induction variable are dead.
for (auto &Induction : *Legal->getInductionVars()) {
PHINode *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
if (all_of(IndUpdate->users(), [&](User *U) -> bool {
return U == Ind || DeadInstructions.count(cast<Instruction>(U));
}))
DeadInstructions.insert(IndUpdate);
}
}
void InnerLoopVectorizer::predicateInstructions() {
// For each instruction I marked for predication on value C, split I into its
@ -4536,6 +4576,11 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
// For each instruction in the old loop.
for (Instruction &I : *BB) {
// If the instruction will become trivially dead when vectorized, we don't
// need to generate it.
if (DeadInstructions.count(&I))
continue;
// Scalarize instructions that should remain scalar after vectorization.
if (!(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
isa<DbgInfoIntrinsic>(&I)) &&

View File

@ -0,0 +1,42 @@
; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=2 -loop-vectorize -S | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
; CHECK-LABEL: @dead_instructions_01
;
; This test ensures that we don't generate trivially dead instructions prior to
; instruction simplification. We don't need to generate instructions
; corresponding to the original induction variable update or branch condition,
; since we rewrite the loop structure.
;
; CHECK: vector.body:
; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK: %[[I0:.+]] = add i64 %index, 0
; CHECK: %[[I2:.+]] = add i64 %index, 2
; CHECK: getelementptr inbounds i64, i64* %a, i64 %[[I0]]
; CHECK: getelementptr inbounds i64, i64* %a, i64 %[[I2]]
; CHECK-NOT: add nuw nsw i64 %[[I0]], 1
; CHECK-NOT: add nuw nsw i64 %[[I2]], 1
; CHECK-NOT: icmp slt i64 {{.*}}, %n
; CHECK: %index.next = add i64 %index, 4
; CHECK: %[[CMP:.+]] = icmp eq i64 %index.next, %n.vec
; CHECK: br i1 %[[CMP]], label %middle.block, label %vector.body
;
; Scalar input loop for the vectorizer: accumulates the i64 values loaded from
; %a at increasing indices and returns the sum. The body runs at least once
; and repeats while the incremented index is (signed) less than %n.
define i64 @dead_instructions_01(i64 *%a, i64 %n) {
entry:
br label %for.body
for.body:
; %i is the induction variable (0 on entry, %i.next on the back edge);
; %r carries the running sum around the loop.
%i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
%r = phi i64 [ %tmp2, %for.body ], [ 0, %entry ]
%tmp0 = getelementptr inbounds i64, i64* %a, i64 %i
%tmp1 = load i64, i64* %tmp0, align 8
%tmp2 = add i64 %tmp1, %r
; The induction update and the latch compare below are the instructions the
; FileCheck directives above expect to have no counterpart in vector.body:
; %i.next is used only by the %i phi and by %cond, and %cond only by the branch.
%i.next = add nuw nsw i64 %i, 1
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end
for.end:
; Single-entry phi forwarding the final sum out of the loop.
%tmp3 = phi i64 [ %tmp2, %for.body ]
ret i64 %tmp3
}