[RuntimeUnroll] Add heuristic for unrolling multi-exit loop

Add a profitability heuristic to enable runtime unrolling of multi-exit
loop: There can be atmost two unique exit blocks for the loop and the
second exit block should be a deoptimizing block. Also, there can be one
other exiting block other than the latch exiting block. The reason for
the latter is so that we limit the number of branches in the unrolled
code to being at most the unroll factor.  Deoptimizing blocks are rarely
taken so these additional number of branches created due to the
unrolling are predictable, since one of their target is the deopt block.

Reviewers: apilipenko, reames, evstupac, mkuper

Subscribers: llvm-commits

Reviewed by: reames

Differential Revision: https://reviews.llvm.org/D35380

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@313363 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Anna Thomas 2017-09-15 15:56:05 +00:00
parent a50c5eba3e
commit 3c71a6c431
2 changed files with 128 additions and 2 deletions

View File

@ -487,9 +487,41 @@ static bool canProfitablyUnrollMultiExitLoop(
PreserveLCSSA, UseEpilogRemainder) &&
"Should be safe to unroll before checking profitability!");
#endif
// Priority goes to UnrollRuntimeMultiExit if it's supplied.
return UnrollRuntimeMultiExit.getNumOccurrences() ? UnrollRuntimeMultiExit
: false;
if (UnrollRuntimeMultiExit.getNumOccurrences())
return UnrollRuntimeMultiExit;
// The main pain point with multi-exit loop unrolling is that once unrolled,
// we will not be able to merge all blocks into a straight line code.
// There are branches within the unrolled loop that go to the OtherExits.
// The second point is the increase in code size, but this is true
// irrespective of multiple exits.
// Note: Both the heuristics below are coarse grained. We are essentially
// enabling unrolling of loops that have a single side exit other than the
// normal LatchExit (i.e. exiting into a deoptimize block).
// The heuristics considered are:
// 1. low number of branches in the unrolled version.
// 2. high predictability of these extra branches.
// We avoid unrolling loops that have more than two exiting blocks. This
// limits the total number of branches in the unrolled loop to be atmost
// the unroll factor (since one of the exiting blocks is the latch block).
SmallVector<BasicBlock*, 4> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
if (ExitingBlocks.size() > 2)
return false;
// The second heuristic is that L has one exit other than the latchexit and
// that exit is a deoptimize block. We know that deoptimize blocks are rarely
// taken, which also implies the branch leading to the deoptimize block is
// highly predictable.
return (OtherExits.size() == 1 &&
OtherExits[0]->getTerminatingDeoptimizeCall());
// TODO: These can be fine-tuned further to consider code size or deopt states
// that are captured by the deoptimize exit block.
// Also, we can extend this to support more cases, if we actually
// know of kinds of multiexit loops that would benefit from unrolling.
}
/// Insert code in the prolog/epilog code when unrolling a loop with a

View File

@ -0,0 +1,94 @@
; RUN: opt < %s -loop-unroll -unroll-runtime=true -verify-dom-info -verify-loop-info -instcombine -S | FileCheck %s
; RUN: opt < %s -loop-unroll -unroll-runtime=true -verify-dom-info -unroll-runtime-multi-exit=false -verify-loop-info -S | FileCheck %s -check-prefix=NOUNROLL
; this tests when unrolling multiple exit loop occurs by default (i.e. without specifying -unroll-runtime-multi-exit)
; the second exit block is a deopt block. The loop has one exiting block other than the latch.
define i32 @test1(i32* nocapture %a, i64 %n) {
; CHECK-LABEL: test1(
; CHECK-LABEL: header.epil:
; CHECK-NEXT: %indvars.iv.epil = phi i64 [ %indvars.iv.next.epil, %latch.epil ], [ %indvars.iv.unr, %header.epil.preheader ]
; CHECK-LABEL: otherexit.loopexit:
; CHECK-NEXT: %sum.02.lcssa.ph = phi i32 [ %sum.02, %for.exiting_block ], [ %add, %for.exiting_block.1 ], [ %add.1, %for.exiting_block.2 ], [ %add.2, %for.exiting_block.3 ], [ %add.3, %for.exiting_block.4 ], [ %add.4, %for.exiting_block.5 ], [ %add.5, %for.exiting_block.6 ],
; CHECK-NEXT: br label %otherexit
; CHECK-LABEL: otherexit.loopexit3:
; CHECK-NEXT: br label %otherexit
; CHECK-LABEL: otherexit:
; CHECK-NEXT: %sum.02.lcssa = phi i32 [ %sum.02.lcssa.ph, %otherexit.loopexit ], [ %sum.02.epil, %otherexit.loopexit3 ]
; CHECK-NEXT: %rval = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %sum.02.lcssa) ]
; CHECK-NEXT: ret i32 %rval
; CHECK-LABEL: latch.7:
; CHECK: add i64 %indvars.iv, 8
; NOUNROLL: test1(
; NOUNROLL-NOT: .epil
; NOUNROLL-NOT: .prol
; NOUNROLL: otherexit:
; NOUNROLL-NEXT: %sum.02.lcssa = phi i32 [ %sum.02, %for.exiting_block ]
; NOUNROLL-NEXT: %rval = call i32 (...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %sum.02.lcssa) ]
entry:
br label %header
header:
%indvars.iv = phi i64 [ %indvars.iv.next, %latch ], [ 0, %entry ]
%sum.02 = phi i32 [ %add, %latch ], [ 0, %entry ]
br label %for.exiting_block
for.exiting_block:
%cmp = icmp eq i64 %n, 42
br i1 %cmp, label %otherexit, label %latch
latch:
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%add = add nsw i32 %0, %sum.02
%indvars.iv.next = add i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, %n
br i1 %exitcond, label %latchexit, label %header
latchexit: ; preds = %latch
%sum.0.lcssa = phi i32 [ %add, %latch ]
ret i32 %sum.0.lcssa
otherexit:
%rval = call i32(...) @llvm.experimental.deoptimize.i32() [ "deopt"(i32 %sum.02) ]
ret i32 %rval
}
; the exit block is not a deopt block.
define i32 @test2(i32* nocapture %a, i64 %n) {
; CHECK-LABEL: test2(
; CHECK-NOT: .epil
; CHECK-NOT: .prol
; CHECK-LABEL: otherexit:
; CHECK-NEXT: ret i32 %sum.02
entry:
br label %header
header:
%indvars.iv = phi i64 [ %indvars.iv.next, %latch ], [ 0, %entry ]
%sum.02 = phi i32 [ %add, %latch ], [ 0, %entry ]
br label %for.exiting_block
for.exiting_block:
%cmp = icmp eq i64 %n, 42
br i1 %cmp, label %otherexit, label %latch
latch:
%arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%add = add nsw i32 %0, %sum.02
%indvars.iv.next = add i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, %n
br i1 %exitcond, label %latchexit, label %header
latchexit: ; preds = %latch
%sum.0.lcssa = phi i32 [ %add, %latch ]
ret i32 %sum.0.lcssa
otherexit:
%rval = phi i32 [%sum.02, %for.exiting_block ]
ret i32 %rval
}
declare i32 @llvm.experimental.deoptimize.i32(...)