From c40a2f9b2a0a76c1cc10f96deba6b19074c02c4b Mon Sep 17 00:00:00 2001 From: Evgeny Stupachenko Date: Thu, 2 Mar 2017 17:38:46 +0000 Subject: [PATCH] The patch turns on epilogue unroll for loops with constant recurrence start. Summary: Set unroll remainder to epilog if a loop contains a phi with constant parameter: loop: pn = phi [Const, PreHeader], [pn.next, Latch] ... Reviewer: hfinkel Differential Revision: http://reviews.llvm.org/D27004 From: Evgeny Stupachenko git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@296770 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Utils/LoopUnroll.cpp | 45 ++++++++++++++++++- test/Transforms/LoopUnroll/revisit.ll | 6 +-- test/Transforms/LoopUnroll/runtime-loop5.ll | 7 +-- .../LoopUnroll/unroll-heuristics-pgo.ll | 4 +- test/Transforms/LoopUnroll/unroll-pragmas.ll | 24 +++++----- 5 files changed, 65 insertions(+), 21 deletions(-) diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index a675797af27..3c669ce644e 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -216,6 +216,45 @@ const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB, } } +/// The function chooses which type of unroll (epilog or prolog) is more +/// profitable. +/// Epilog unroll is more profitable when there is PHI that starts from +/// constant. In this case epilog will leave PHI start from constant, +/// but prolog will convert it to non-constant. +/// +/// loop: +/// PN = PHI [I, Latch], [CI, PreHeader] +/// I = foo(PN) +/// ... +/// +/// Epilog unroll case. +/// loop: +/// PN = PHI [I2, Latch], [CI, PreHeader] +/// I1 = foo(PN) +/// I2 = foo(I1) +/// ... +/// Prolog unroll case. +/// NewPN = PHI [PrologI, Prolog], [CI, PreHeader] +/// loop: +/// PN = PHI [I2, Latch], [NewPN, PreHeader] +/// I1 = foo(PN) +/// I2 = foo(I1) +/// ... 
+/// +static bool isEpilogProfitable(Loop *L) { + BasicBlock *PreHeader = L->getLoopPreheader(); + BasicBlock *Header = L->getHeader(); + assert(PreHeader && Header); + for (Instruction &BBI : *Header) { + PHINode *PN = dyn_cast(&BBI); + if (!PN) + break; + if (isa(PN->getIncomingValueForBlock(PreHeader))) + return true; + } + return false; +} + /// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true /// if unrolling was successful, or false if the loop was unmodified. Unrolling /// can only fail when the loop's latch block is not terminated by a conditional @@ -359,9 +398,13 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force, "convergent operation."); }); + bool EpilogProfitability = + UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog + : isEpilogProfitable(L); + if (RuntimeTripCount && TripMultiple % Count != 0 && !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount, - UnrollRuntimeEpilog, LI, SE, DT, + EpilogProfitability, LI, SE, DT, PreserveLCSSA)) { if (Force) RuntimeTripCount = false; diff --git a/test/Transforms/LoopUnroll/revisit.ll b/test/Transforms/LoopUnroll/revisit.ll index 88c9f7ba21a..fddf6cd1c4e 100644 --- a/test/Transforms/LoopUnroll/revisit.ll +++ b/test/Transforms/LoopUnroll/revisit.ll @@ -138,11 +138,11 @@ l0.0.latch: ; CHECK-CHILDREN: LoopUnrollPass on Loop at depth 2 containing: %l0.0
; CHECK-CHILDREN-NOT: LoopUnrollPass ; -; Revisit the children of the outer loop that are part of the prologue. +; Revisit the children of the outer loop that are part of the epilogue. ; -; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0.0.prol
+; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0.0.epil
; CHECK-NOT: LoopUnrollPass -; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0.1.prol
+; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0.1.epil
; CHECK-NOT: LoopUnrollPass l0.latch: br label %l0 diff --git a/test/Transforms/LoopUnroll/runtime-loop5.ll b/test/Transforms/LoopUnroll/runtime-loop5.ll index 6340058411f..86a26baca65 100644 --- a/test/Transforms/LoopUnroll/runtime-loop5.ll +++ b/test/Transforms/LoopUnroll/runtime-loop5.ll @@ -14,9 +14,6 @@ entry: %cmp1 = icmp eq i3 %n, 0 br i1 %cmp1, label %for.end, label %for.body -; UNROLL-16-NOT: for.body.prol: -; UNROLL-4: for.body.prol: - for.body: ; preds = %for.body, %entry ; UNROLL-16-LABEL: for.body: ; UNROLL-4-LABEL: for.body: @@ -42,6 +39,10 @@ for.body: ; preds = %for.body, %entry ; UNROLL-16-LABEL: for.end ; UNROLL-4-LABEL: for.end + +; UNROLL-16-NOT: for.body.epil: +; UNROLL-4: for.body.epil: + for.end: ; preds = %for.body, %entry %sum.0.lcssa = phi i3 [ 0, %entry ], [ %add, %for.body ] ret i3 %sum.0.lcssa diff --git a/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll b/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll index f7add40b9d1..6778a52b3af 100644 --- a/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll +++ b/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll @@ -3,12 +3,12 @@ @known_constant = internal unnamed_addr constant [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16 ; CHECK-LABEL: @bar_prof -; CHECK: loop.prol: ; CHECK: loop: ; CHECK: %mul = mul ; CHECK: %mul.1 = mul ; CHECK: %mul.2 = mul ; CHECK: %mul.3 = mul +; CHECK: loop.epil: define i32 @bar_prof(i32* noalias nocapture readonly %src, i64 %c) !prof !1 { entry: br label %loop @@ -32,7 +32,7 @@ loop.end: } ; CHECK-LABEL: @bar_prof_flat -; CHECK-NOT: loop.prol +; CHECK-NOT: loop.epil define i32 @bar_prof_flat(i32* noalias nocapture readonly %src, i64 %c) !prof !1 { entry: br label %loop diff --git a/test/Transforms/LoopUnroll/unroll-pragmas.ll b/test/Transforms/LoopUnroll/unroll-pragmas.ll index 2843e627b3c..88f32c92d69 100644 --- a/test/Transforms/LoopUnroll/unroll-pragmas.ll +++ b/test/Transforms/LoopUnroll/unroll-pragmas.ll @@ -171,10 
+171,6 @@ for.end: ; preds = %for.body, %entry ; should be duplicated (original and 4x unrolled). ; ; CHECK-LABEL: @runtime_loop_with_count4( -; CHECK: for.body.prol: -; CHECK: store -; CHECK-NOT: store -; CHECK: br i1 ; CHECK: for.body ; CHECK: store ; CHECK: store @@ -182,6 +178,10 @@ for.end: ; preds = %for.body, %entry ; CHECK: store ; CHECK-NOT: store ; CHECK: br i1 +; CHECK: for.body.epil: +; CHECK: store +; CHECK-NOT: store +; CHECK: br i1 define void @runtime_loop_with_count4(i32* nocapture %a, i32 %b) { entry: %cmp3 = icmp sgt i32 %b, 0 @@ -287,10 +287,6 @@ for.end: ; preds = %for.body ; (original and 8x). ; ; CHECK-LABEL: @runtime_loop_with_enable( -; CHECK: for.body.prol: -; CHECK: store -; CHECK-NOT: store -; CHECK: br i1 ; CHECK: for.body: ; CHECK: store i32 ; CHECK: store i32 @@ -302,6 +298,10 @@ for.end: ; preds = %for.body ; CHECK: store i32 ; CHECK-NOT: store i32 ; CHECK: br i1 +; CHECK: for.body.epil: +; CHECK: store +; CHECK-NOT: store +; CHECK: br i1 define void @runtime_loop_with_enable(i32* nocapture %a, i32 %b) { entry: %cmp3 = icmp sgt i32 %b, 0 @@ -328,16 +328,16 @@ for.end: ; preds = %for.body, %entry ; should be duplicated (original and 3x unrolled). ; ; CHECK-LABEL: @runtime_loop_with_count3( -; CHECK: for.body.prol: -; CHECK: store -; CHECK-NOT: store -; CHECK: br i1 ; CHECK: for.body ; CHECK: store ; CHECK: store ; CHECK: store ; CHECK-NOT: store ; CHECK: br i1 +; CHECK: for.body.epil: +; CHECK: store +; CHECK-NOT: store +; CHECK: br i1 define void @runtime_loop_with_count3(i32* nocapture %a, i32 %b) { entry: %cmp3 = icmp sgt i32 %b, 0