[Loop Fusion] Integrate Loop Peeling into Loop Fusion

Summary:
This patch adds the ability to peel off iterations of the first loop in loop
fusion. This can allow for both loops to have the same trip count, making it
legal for them to be fused together.

Here is a simple scenario peeling can be used in loop fusion:

for (i = 0; i < 10; ++i)
  a[i] = a[i] + 3;
for (j = 1; j < 10; ++j)
  b[j] = b[j] + 5;

Here is we can make use of peeling, and then fuse the two loops together. We can
peel off the 0th iteration of the loop i, and then combine loop i and j for
i = 1 to 10.

a[0] = a[0] +3;
for (i = 1; i < 10; ++i) {
  a[i] = a[i] + 3;
  b[i] = b[i] + 5;
}

Currently peeling with loop fusion is only supported for loops with constant
trip counts and a single exit point. Both unguarded and guarded loops are
supported.

Author: sidbav (Sidharth Baveja)

Reviewers: kbarton, Meinersbur, bkramer, Whitney, skatkov, ashlykov, fhahn, bmahjour

Reviewed By: bmahjour

Subscribers: bmahjour, mgorny, hiraditya, zzheng

Tags: LLVM

Differential Revision: https://reviews.llvm.org/D82927
This commit is contained in:
Sidharth Baveja 2020-07-21 15:59:14 +00:00
parent 670f47ee88
commit 4628407302
5 changed files with 605 additions and 45 deletions

View File

@ -46,6 +46,7 @@
#include "llvm/Transforms/Scalar/LoopFuse.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LoopInfo.h"
@ -53,6 +54,7 @@
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
@ -64,6 +66,7 @@
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeMoverUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
using namespace llvm;
@ -114,6 +117,11 @@ static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis(
"Use all available analyses")),
cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL), cl::ZeroOrMore);
static cl::opt<unsigned> FusionPeelMaxCount(
"loop-fusion-peel-max-count", cl::init(0), cl::Hidden,
cl::desc("Max number of iterations to be peeled from a loop, such that "
"fusion can take place"));
#ifndef NDEBUG
static cl::opt<bool>
VerboseFusionDebugging("loop-fusion-verbose-debug",
@ -157,6 +165,12 @@ struct FusionCandidate {
bool Valid;
/// Guard branch of the loop, if it exists
BranchInst *GuardBranch;
/// Peeling Paramaters of the Loop.
TTI::PeelingPreferences PP;
/// Can you Peel this Loop?
bool AbleToPeel;
/// Has this loop been Peeled
bool Peeled;
/// Dominator and PostDominator trees are needed for the
/// FusionCandidateCompare function, required by FusionCandidateSet to
@ -168,11 +182,13 @@ struct FusionCandidate {
OptimizationRemarkEmitter &ORE;
FusionCandidate(Loop *L, const DominatorTree *DT,
const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE)
const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE,
TTI::PeelingPreferences PP)
: Preheader(L->getLoopPreheader()), Header(L->getHeader()),
ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
Latch(L->getLoopLatch()), L(L), Valid(true),
GuardBranch(L->getLoopGuardBranch()), DT(DT), PDT(PDT), ORE(ORE) {
GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)),
Peeled(false), DT(DT), PDT(PDT), ORE(ORE) {
// Walk over all blocks in the loop and check for conditions that may
// prevent fusion. For each block, walk over all instructions and collect
@ -243,6 +259,17 @@ struct FusionCandidate {
return Preheader;
}
/// After Peeling the loop is modified quite a bit, hence all of the Blocks
/// need to be updated accordingly.
void updateAfterPeeling() {
Preheader = L->getLoopPreheader();
Header = L->getHeader();
ExitingBlock = L->getExitingBlock();
ExitBlock = L->getExitBlock();
Latch = L->getLoopLatch();
verify();
}
/// Given a guarded loop, get the successor of the guard that is not in the
/// loop.
///
@ -254,6 +281,8 @@ struct FusionCandidate {
assert(GuardBranch && "Only valid on guarded loops.");
assert(GuardBranch->isConditional() &&
"Expecting guard to be a conditional branch.");
if (Peeled)
return GuardBranch->getSuccessor(1);
return (GuardBranch->getSuccessor(0) == Preheader)
? GuardBranch->getSuccessor(1)
: GuardBranch->getSuccessor(0);
@ -358,25 +387,26 @@ struct FusionCandidateCompare {
bool operator()(const FusionCandidate &LHS,
const FusionCandidate &RHS) const {
const DominatorTree *DT = LHS.DT;
const PostDominatorTree *PDT = LHS.PDT;
BasicBlock *LHSEntryBlock = LHS.getEntryBlock();
BasicBlock *RHSEntryBlock = RHS.getEntryBlock();
// Do not save PDT to local variable as it is only used in asserts and thus
// will trigger an unused variable warning if building without asserts.
assert(DT && LHS.PDT && "Expecting valid dominator tree");
assert(DT && PDT && "Expecting valid dominator tree");
// Do this compare first so if LHS == RHS, function returns false.
if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) {
// RHS dominates LHS
// Verify LHS post-dominates RHS
assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock));
assert(PDT->dominates(LHSEntryBlock, RHSEntryBlock));
return false;
}
if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) {
// Verify RHS Postdominates LHS
assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock));
assert(PDT->dominates(RHSEntryBlock, LHSEntryBlock));
return true;
}
@ -515,13 +545,17 @@ private:
ScalarEvolution &SE;
PostDominatorTree &PDT;
OptimizationRemarkEmitter &ORE;
AssumptionCache &AC;
const TargetTransformInfo &TTI;
public:
LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI,
ScalarEvolution &SE, PostDominatorTree &PDT,
OptimizationRemarkEmitter &ORE, const DataLayout &DL)
OptimizationRemarkEmitter &ORE, const DataLayout &DL,
AssumptionCache &AC, const TargetTransformInfo &TTI)
: LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI),
DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE) {}
DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {}
/// This is the main entry point for loop fusion. It will traverse the
/// specified function and collect candidate loops to fuse, starting at the
@ -606,7 +640,9 @@ private:
/// Flow Equivalent sets, sorted by dominance.
void collectFusionCandidates(const LoopVector &LV) {
for (Loop *L : LV) {
FusionCandidate CurrCand(L, &DT, &PDT, ORE);
TTI::PeelingPreferences PP =
gatherPeelingPreferences(L, SE, TTI, None, None);
FusionCandidate CurrCand(L, &DT, &PDT, ORE, PP);
if (!CurrCand.isEligibleForFusion(SE))
continue;
@ -656,33 +692,126 @@ private:
/// Determine if two fusion candidates have the same trip count (i.e., they
/// execute the same number of iterations).
///
/// Note that for now this method simply returns a boolean value because there
/// are no mechanisms in loop fusion to handle different trip counts. In the
/// future, this behaviour can be extended to adjust one of the loops to make
/// the trip counts equal (e.g., loop peeling). When this is added, this
/// interface may need to change to return more information than just a
/// boolean value.
bool identicalTripCounts(const FusionCandidate &FC0,
const FusionCandidate &FC1) const {
/// This function will return a pair of values. The first is a boolean,
/// stating whether or not the two candidates are known at compile time to
/// have the same TripCount. The second is the difference in the two
/// TripCounts. This information can be used later to determine whether or not
/// peeling can be performed on either one of the candiates.
std::pair<bool, Optional<unsigned>>
haveIdenticalTripCounts(const FusionCandidate &FC0,
const FusionCandidate &FC1) const {
const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L);
if (isa<SCEVCouldNotCompute>(TripCount0)) {
UncomputableTripCount++;
LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!");
return false;
return {false, None};
}
const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L);
if (isa<SCEVCouldNotCompute>(TripCount1)) {
UncomputableTripCount++;
LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!");
return false;
return {false, None};
}
LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & "
<< *TripCount1 << " are "
<< (TripCount0 == TripCount1 ? "identical" : "different")
<< "\n");
return (TripCount0 == TripCount1);
if (TripCount0 == TripCount1)
return {true, 0};
LLVM_DEBUG(dbgs() << "The loops do not have the same tripcount, "
"determining the difference between trip counts\n");
// Currently only considering loops with a single exit point
// and a non-constant trip count.
unsigned TC0 = SE.getSmallConstantTripCount(FC0.L);
unsigned TC1 = SE.getSmallConstantTripCount(FC1.L);
// If any of the tripcounts are zero that means that loop(s) do not have
// a single exit or a constant tripcount.
if (TC0 == 0 || TC1 == 0) {
LLVM_DEBUG(dbgs() << "Loop(s) do not have a single exit point or do not "
"have a constant number of iterations. Peeling "
"is not benefical\n");
return {false, None};
}
Optional<unsigned> Difference = None;
int Diff = TC0 - TC1;
if (Diff > 0)
Difference = Diff;
else {
LLVM_DEBUG(
dbgs()
<< "Difference is less than 0. FC1 (second loop) has more "
"iterations than the first one. Currently not supported.\n");
}
LLVM_DEBUG(dbgs() << "Difference in loop trip count is: " << Difference
<< "\n");
return {false, Difference};
}
void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1,
unsigned PeelCount) {
assert(FC0.AbleToPeel && "Should be able to peel loop");
LLVM_DEBUG(dbgs() << "Attempting to peel first " << PeelCount
<< " iterations of the first loop. \n");
FC0.Peeled = peelLoop(FC0.L, PeelCount, &LI, &SE, &DT, &AC, true);
if (FC0.Peeled) {
LLVM_DEBUG(dbgs() << "Done Peeling\n");
#ifndef NDEBUG
auto IdenticalTripCount = haveIdenticalTripCounts(FC0, FC1);
assert(IdenticalTripCount.first && *IdenticalTripCount.second == 0 &&
"Loops should have identical trip counts after peeling");
#endif
FC0.PP.PeelCount = PeelCount;
// Peeling does not update the PDT
PDT.recalculate(*FC0.Preheader->getParent());
FC0.updateAfterPeeling();
// In this case the iterations of the loop are constant, so the first
// loop will execute completely (will not jump from one of
// the peeled blocks to the second loop). Here we are updating the
// branch conditions of each of the peeled blocks, such that it will
// branch to its successor which is not the Preheader of the second Loop.
// Doing this update will ensure that the entry block of the first loop
// dominates the entry block of the second loop.
BasicBlock *BB =
FC0.GuardBranch ? FC0.ExitBlock->getUniqueSuccessor() : FC1.Preheader;
SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
for (BasicBlock *Pred : predecessors(BB)) {
if (Pred != FC0.ExitBlock) {
BranchInst *Old = dyn_cast<BranchInst>(Pred->getTerminator());
BasicBlock *Succ = Old->getSuccessor(0);
if (Succ == BB)
Succ = Old->getSuccessor(1);
BranchInst *NewBranch = BranchInst::Create(Succ);
ReplaceInstWithInst(Old, NewBranch);
TreeUpdates.emplace_back(
DominatorTree::UpdateType(DominatorTree::Delete, Pred, BB));
}
}
DTU.applyUpdates(TreeUpdates);
DTU.flush();
LLVM_DEBUG(
dbgs() << "Sucessfully peeled " << FC0.PP.PeelCount
<< " iterations from the first loop.\n"
"Both Loops have the same number of iterations now.\n");
}
}
/// Walk each set of control flow equivalent fusion candidates and attempt to
@ -716,7 +845,32 @@ private:
FC0->verify();
FC1->verify();
if (!identicalTripCounts(*FC0, *FC1)) {
// Check if the candidates have identical tripcounts (first value of
// pair), and if not check the difference in the tripcounts between
// the loops (second value of pair). The difference is not equal to
// None iff the loops iterate a constant number of times, and have a
// single exit.
std::pair<bool, Optional<unsigned>> IdenticalTripCountRes =
haveIdenticalTripCounts(*FC0, *FC1);
bool SameTripCount = IdenticalTripCountRes.first;
Optional<unsigned> TCDifference = IdenticalTripCountRes.second;
// Here we are checking that FC0 (the first loop) can be peeled, and
// both loops have different tripcounts.
if (FC0->AbleToPeel && !SameTripCount && TCDifference) {
if (*TCDifference > FusionPeelMaxCount) {
LLVM_DEBUG(dbgs()
<< "Difference in loop trip counts: " << *TCDifference
<< " is greater than maximum peel count specificed: "
<< FusionPeelMaxCount << "\n");
} else {
// Dependent on peeling being performed on the first loop, and
// assuming all other conditions for fusion return true.
SameTripCount = true;
}
}
if (!SameTripCount) {
LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "
"counts. Not fusing.\n");
reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
@ -734,7 +888,7 @@ private:
// Ensure that FC0 and FC1 have identical guards.
// If one (or both) are not guarded, this check is not necessary.
if (FC0->GuardBranch && FC1->GuardBranch &&
!haveIdenticalGuards(*FC0, *FC1)) {
!haveIdenticalGuards(*FC0, *FC1) && !TCDifference) {
LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "
"guards. Not Fusing.\n");
reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
@ -803,13 +957,23 @@ private:
LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and "
<< *FC1 << "\n");
FusionCandidate FC0Copy = *FC0;
// Peel the loop after determining that fusion is legal. The Loops
// will still be safe to fuse after the peeling is performed.
bool Peel = TCDifference && *TCDifference > 0;
if (Peel)
peelFusionCandidate(FC0Copy, *FC1, *TCDifference);
// Report fusion to the Optimization Remarks.
// Note this needs to be done *before* performFusion because
// performFusion will change the original loops, making it not
// possible to identify them after fusion is complete.
reportLoopFusion<OptimizationRemark>(*FC0, *FC1, FuseCounter);
reportLoopFusion<OptimizationRemark>((Peel ? FC0Copy : *FC0), *FC1,
FuseCounter);
FusionCandidate FusedCand(performFusion(*FC0, *FC1), &DT, &PDT, ORE);
FusionCandidate FusedCand(
performFusion((Peel ? FC0Copy : *FC0), *FC1), &DT, &PDT, ORE,
FC0Copy.PP);
FusedCand.verify();
assert(FusedCand.isEligibleForFusion(SE) &&
"Fused candidate should be eligible for fusion!");
@ -1086,16 +1250,17 @@ private:
return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);
}
/// Simplify the condition of the latch branch of \p FC to true, when both of
/// its successors are the same.
/// Modify the latch branch of FC to be unconditional since successors of the
/// branch are the same.
void simplifyLatchBranch(const FusionCandidate &FC) const {
BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator());
if (FCLatchBranch) {
assert(FCLatchBranch->isConditional() &&
FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) &&
"Expecting the two successors of FCLatchBranch to be the same");
FCLatchBranch->setCondition(
llvm::ConstantInt::getTrue(FCLatchBranch->getCondition()->getType()));
BranchInst *NewBranch =
BranchInst::Create(FCLatchBranch->getSuccessor(0));
ReplaceInstWithInst(FCLatchBranch, NewBranch);
}
}
@ -1155,7 +1320,8 @@ private:
if (FC0.GuardBranch)
return fuseGuardedLoops(FC0, FC1);
assert(FC1.Preheader == FC0.ExitBlock);
assert(FC1.Preheader ==
(FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock));
assert(FC1.Preheader->size() == 1 &&
FC1.Preheader->getSingleSuccessor() == FC1.Header);
@ -1178,7 +1344,7 @@ private:
FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
// Then modify the control flow and update DT and PDT.
SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
SmallVector<DominatorTree::UpdateType, 16> TreeUpdates;
// The old exiting block of the first loop (FC0) has to jump to the header
// of the second as we need to execute the code in the second header block
@ -1197,12 +1363,27 @@ private:
// to FC1.Header? I think this is basically what the three sequences are
// trying to accomplish; however, doing this directly in the CFG may mean
// the DT/PDT becomes invalid
FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader,
FC1.Header);
TreeUpdates.emplace_back(DominatorTree::UpdateType(
DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader));
TreeUpdates.emplace_back(DominatorTree::UpdateType(
DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
if (!FC0.Peeled) {
FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader,
FC1.Header);
TreeUpdates.emplace_back(DominatorTree::UpdateType(
DominatorTree::Delete, FC0.ExitingBlock, FC1.Preheader));
TreeUpdates.emplace_back(DominatorTree::UpdateType(
DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
} else {
TreeUpdates.emplace_back(DominatorTree::UpdateType(
DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader));
// Remove the ExitBlock of the first Loop (also not needed)
FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
FC1.Header);
TreeUpdates.emplace_back(DominatorTree::UpdateType(
DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
FC0.ExitBlock->getTerminator()->eraseFromParent();
TreeUpdates.emplace_back(DominatorTree::UpdateType(
DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
}
// The pre-header of L1 is not necessary anymore.
assert(pred_begin(FC1.Preheader) == pred_end(FC1.Preheader));
@ -1246,7 +1427,7 @@ private:
FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
// Change the condition of FC0 latch branch to true, as both successors of
// Modify the latch branch of FC0 to be unconditional as both successors of
// the branch are the same.
simplifyLatchBranch(FC0);
@ -1268,6 +1449,11 @@ private:
LI.removeBlock(FC1.Preheader);
DTU.deleteBB(FC1.Preheader);
if (FC0.Peeled) {
LI.removeBlock(FC0.ExitBlock);
DTU.deleteBB(FC0.ExitBlock);
}
DTU.flush();
// Is there a way to keep SE up-to-date so we don't need to forget the loops
@ -1364,10 +1550,15 @@ private:
BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();
BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor();
// Move instructions from the exit block of FC0 to the beginning of the exit
// block of FC1.
moveInstructionsToTheBeginning(*FC0.ExitBlock, *FC1.ExitBlock, DT, PDT, DI);
// block of FC1, in the case that the FC0 loop has not been peeled. In the
// case that FC0 loop is peeled, then move the instructions of the successor
// of the FC0 Exit block to the beginning of the exit block of FC1.
moveInstructionsToTheBeginning(
(FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), *FC1.ExitBlock,
DT, PDT, DI);
// Move instructions from the guard block of FC1 to the end of the guard
// block of FC0.
@ -1387,8 +1578,9 @@ private:
// for FC1 (where FC1 guard would have gone if FC1 was not executed).
FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock);
FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);
FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock,
FC1.Header);
BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock;
BBToUpdate->getTerminator()->replaceUsesOfWith(FC1GuardBlock, FC1.Header);
// The guard of FC1 is not necessary anymore.
FC1.GuardBranch->eraseFromParent();
@ -1403,6 +1595,15 @@ private:
TreeUpdates.emplace_back(DominatorTree::UpdateType(
DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));
if (FC0.Peeled) {
// Remove the Block after the ExitBlock of FC0
TreeUpdates.emplace_back(DominatorTree::UpdateType(
DominatorTree::Delete, FC0ExitBlockSuccessor, FC1GuardBlock));
FC0ExitBlockSuccessor->getTerminator()->eraseFromParent();
new UnreachableInst(FC0ExitBlockSuccessor->getContext(),
FC0ExitBlockSuccessor);
}
assert(pred_begin(FC1GuardBlock) == pred_end(FC1GuardBlock) &&
"Expecting guard block to have no predecessors");
assert(succ_begin(FC1GuardBlock) == succ_end(FC1GuardBlock) &&
@ -1509,7 +1710,7 @@ private:
FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
// Change the condition of FC0 latch branch to true, as both successors of
// Modify the latch branch of FC0 to be unconditional as both successors of
// the branch are the same.
simplifyLatchBranch(FC0);
@ -1540,6 +1741,10 @@ private:
LI.removeBlock(FC1GuardBlock);
LI.removeBlock(FC1.Preheader);
LI.removeBlock(FC0.ExitBlock);
if (FC0.Peeled) {
LI.removeBlock(FC0ExitBlockSuccessor);
DTU.deleteBB(FC0ExitBlockSuccessor);
}
DTU.deleteBB(FC1GuardBlock);
DTU.deleteBB(FC1.Preheader);
DTU.deleteBB(FC0.ExitBlock);
@ -1606,6 +1811,8 @@ struct LoopFuseLegacy : public FunctionPass {
AU.addRequired<PostDominatorTreeWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addRequired<DependenceAnalysisWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addPreserved<ScalarEvolutionWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
@ -1622,9 +1829,12 @@ struct LoopFuseLegacy : public FunctionPass {
auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
const TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
const DataLayout &DL = F.getParent()->getDataLayout();
LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL);
LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
return LF.fuseLoops(F);
}
};
@ -1637,9 +1847,11 @@ PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) {
auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
const DataLayout &DL = F.getParent()->getDataLayout();
LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL);
LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
bool Changed = LF.fuseLoops(F);
if (!Changed)
return PreservedAnalyses::all();
@ -1662,6 +1874,8 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false)
FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); }

View File

@ -0,0 +1,85 @@
; RUN: opt -S -loop-fusion -loop-fusion-peel-max-count=3 < %s | FileCheck %s
; This will test if we are able to fuse two guarded loops which have constant
; but different trip counts. The first two iterations of the first loop should
; be peeled off, and then the loops should be fused together.
@B = common global [1024 x i32] zeroinitializer, align 16
; CHECK: void @main
; CHECK-NEXT: entry:
; CHECK: br i1 %cmp4, label %for.first.entry, label %for.end
; CHECK: for.first.entry
; CHECK-NEXT: br label %for.first.peel.begin
; CHECK: for.first.peel.begin:
; CHECK-NEXT: br label %for.first.peel
; CHECK: for.first.peel:
; CHECK: br label %for.first.peel.next
; CHECK: for.first.peel.next:
; CHECK-NEXT: br label %for.first.peel2
; CHECK: for.first.peel2:
; CHECK: br label %for.first.peel.next1
; CHECK: for.first.peel.next1:
; CHECK-NEXT: br label %for.first.peel.next11
; CHECK: for.first.peel.next11:
; CHECK-NEXT: br label %for.first.entry.peel.newph
; CHECK: for.first.entry.peel.newph:
; CHECK: br label %for.first
; CHECK: for.first:
; CHECK: br i1 %cmp3, label %for.first, label %for.second.exit
; CHECK: for.second.exit:
; CHECK: br label %for.end
; CHECK: for.end:
; CHECK-NEXT: ret void
define void @main(i32* noalias %A) {
entry:
%cmp4 = icmp slt i64 0, 45
br i1 %cmp4, label %for.first.entry, label %for.second.guard
for.first.entry: ; preds = %entry
br label %for.first
for.first: ; preds = %for.first.entry, %for.first
%i.05 = phi i64 [ %inc, %for.first ], [ 0, %for.first.entry ]
%sub = sub nsw i64 %i.05, 3
%add = add nsw i64 %i.05, 3
%mul = mul nsw i64 %sub, %add
%rem = srem i64 %mul, %i.05
%conv = trunc i64 %rem to i32
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.05
store i32 %conv, i32* %arrayidx, align 4
%inc = add nsw i64 %i.05, 1
%cmp = icmp slt i64 %inc, 45
br i1 %cmp, label %for.first, label %for.first.exit
for.first.exit: ; preds = %for.first
br label %for.second.guard
for.second.guard: ; preds = %for.first.exit, %entry
%cmp31 = icmp slt i64 2, 45
br i1 %cmp31, label %for.second.entry, label %for.end
for.second.entry: ; preds = %for.second.guard
br label %for.second
for.second: ; preds = %for.second.entry, %for.second
%i1.02 = phi i64 [ %inc14, %for.second ], [ 2, %for.second.entry ]
%sub7 = sub nsw i64 %i1.02, 3
%add8 = add nsw i64 %i1.02, 3
%mul9 = mul nsw i64 %sub7, %add8
%rem10 = srem i64 %mul9, %i1.02
%conv11 = trunc i64 %rem10 to i32
%arrayidx12 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %i1.02
store i32 %conv11, i32* %arrayidx12, align 4
%inc14 = add nsw i64 %i1.02, 1
%cmp3 = icmp slt i64 %inc14, 45
br i1 %cmp3, label %for.second, label %for.second.exit
for.second.exit: ; preds = %for.second
br label %for.end
for.end: ; preds = %for.second.exit, %for.second.guard
ret void
}

View File

@ -0,0 +1,72 @@
; RUN: opt -S -loop-fusion -loop-fusion-peel-max-count=3 < %s | FileCheck %s
; This will test that we do not fuse two guarded loops together.
; These loops do not have the same trip count, fusing should be possible after
; peeling the loops. However, the exit block of the first loop makes the loops
; unsafe to peel.
; The expected output of this test is the function as below.
; CHECK: void @unsafe_exitblock
; CHECK: for.first.guard
; CHECK: br i1 %cmp3, label %for.first.preheader, label %for.second.guard
; CHECK: for.first.preheader:
; CHECK-NEXT: br label %for.first
; CHECK: for.first:
; CHECK: br i1 %cmp, label %for.first, label %for.first.exit
; CHECK: for.first.exit:
; CHECK-NEXT: call void @bar()
; CHECK-NEXT: br label %for.second.guard
; CHECK: for.second.guard:
; CHECK: br i1 %cmp21, label %for.second.preheader, label %for.end
; CHECK: for.second.preheader:
; CHECK-NEXT: br label %for.second
; CHECK: for.second:
; CHECK: br i1 %cmp2, label %for.second, label %for.second.exit
; CHECK: for.second.exit:
; CHECK-NEXT: br label %for.end
; CHECK: for.end:
; CHECK-NEXT: ret void
define void @unsafe_exitblock(i32* noalias %A, i32* noalias %B) {
for.first.guard:
%cmp3 = icmp slt i64 0, 45
br i1 %cmp3, label %for.first.preheader, label %for.second.guard
for.first.preheader:
br label %for.first
for.first:
%i.04 = phi i64 [ %inc, %for.first ], [ 0, %for.first.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.04
store i32 0, i32* %arrayidx, align 4
%inc = add nsw i64 %i.04, 1
%cmp = icmp slt i64 %inc, 45
br i1 %cmp, label %for.first, label %for.first.exit
for.first.exit:
call void @bar()
br label %for.second.guard
for.second.guard:
%cmp21 = icmp slt i64 2,45
br i1 %cmp21, label %for.second.preheader, label %for.end
for.second.preheader:
br label %for.second
for.second:
%j.02 = phi i64 [ %inc6, %for.second ], [ 2, %for.second.preheader ]
%arrayidx4 = getelementptr inbounds i32, i32* %B, i64 %j.02
store i32 0, i32* %arrayidx4, align 4
%inc6 = add nsw i64 %j.02, 1
%cmp2 = icmp slt i64 %inc6, 45
br i1 %cmp2, label %for.second, label %for.second.exit
for.second.exit:
br label %for.end
for.end:
ret void
}
declare void @bar()

View File

@ -0,0 +1,83 @@
; RUN: opt -S -loop-fusion -loop-fusion-peel-max-count=3 < %s | FileCheck %s
; This will check that we do not fuse these two loops together. These loops are
; valid cadidates for peeling, however they are not adjacent.
; The expected output of this test is the function below.
; CHECK: void @function
; CHECK-NEXT: for.first.preheader:
; CHECK-NEXT: br label %for.first
; CHECK: for.first:
; CHECK: br label %for.first.latch
; CHECK: for.first.latch:
; CHECK: br i1 %exitcond4, label %for.first, label %for.first.exit
; CHECK: for.first.exit:
; CHECK-NEXT: br label %for.next
; CHECK: for.next:
; CHECK-NEXT: br label %for.second.preheader
; CHECK: for.second.preheader:
; CHECK: br label %for.second
; CHECK: for.second:
; CHECK: br label %for.second.latch
; CHECK: for.second.latch:
; CHECK: br i1 %exitcond, label %for.second, label %for.end
; CHECK: for.end:
; CHECK-NEXT: ret void
@B = common global [1024 x i32] zeroinitializer, align 16
define void @function(i32* noalias %arg) {
for.first.preheader:
br label %for.first
for.first: ; preds = %for.first.preheader, %for.first.latch
%.014 = phi i32 [ 0, %for.first.preheader ], [ %tmp15, %for.first.latch ]
%indvars.iv23 = phi i64 [ 0, %for.first.preheader ], [ %indvars.iv.next3, %for.first.latch ]
%tmp = add nsw i32 %.014, -3
%tmp8 = add nuw nsw i64 %indvars.iv23, 3
%tmp9 = trunc i64 %tmp8 to i32
%tmp10 = mul nsw i32 %tmp, %tmp9
%tmp11 = trunc i64 %indvars.iv23 to i32
%tmp12 = srem i32 %tmp10, %tmp11
%tmp13 = getelementptr inbounds i32, i32* %arg, i64 %indvars.iv23
store i32 %tmp12, i32* %tmp13, align 4
br label %for.first.latch
for.first.latch: ; preds = %for.first
%indvars.iv.next3 = add nuw nsw i64 %indvars.iv23, 1
%tmp15 = add nuw nsw i32 %.014, 1
%exitcond4 = icmp ne i64 %indvars.iv.next3, 100
br i1 %exitcond4, label %for.first, label %for.first.exit
for.first.exit: ; preds: %for.first.latch
br label %for.next
for.next: ; preds = %for.first.exit
br label %for.second.preheader
for.second.preheader: ; preds = %for.next
br label %for.second
for.second: ; preds = %for.second.preheader, %for.second.latch
%.02 = phi i32 [ 0, %for.second.preheader ], [ %tmp28, %for.second.latch ]
%indvars.iv1 = phi i64 [ 3, %for.second.preheader ], [ %indvars.iv.next, %for.second.latch ]
%tmp20 = add nsw i32 %.02, -3
%tmp21 = add nuw nsw i64 %indvars.iv1, 3
%tmp22 = trunc i64 %tmp21 to i32
%tmp23 = mul nsw i32 %tmp20, %tmp22
%tmp24 = trunc i64 %indvars.iv1 to i32
%tmp25 = srem i32 %tmp23, %tmp24
%tmp26 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv1
store i32 %tmp25, i32* %tmp26, align 4
br label %for.second.latch
for.second.latch: ; preds = %for.second
%indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1
%tmp28 = add nuw nsw i32 %.02, 1
%exitcond = icmp ne i64 %indvars.iv.next, 100
br i1 %exitcond, label %for.second, label %for.end
for.end: ; preds = %for.second.latch
ret void
}

View File

@ -0,0 +1,106 @@
; RUN: opt -S -loop-fusion -loop-fusion-peel-max-count=3 < %s | FileCheck %s
; This will test whether we can fuse two loops together if they have constant
; but a different tripcount.
; The first three iterations of the first loop should be peeled, and then the
; two loops should be fused together in this example.
; C Code
;
; int B[1024];
;
; void function(int *arg) {
; for (int i = 0; i != 100; ++i)
; arg[i] = ((i - 3)*(i+3)) % i;
;
; for (int i = 3; i != 100; ++i)
; B[i] = ((i-6)*(i+3)) % i;
; }
; CHECK: void @function
; CHECK-NEXT: for.first.preheader:
; CHECK-NEXT: br label %for.first.peel.begin
; CHECK: for.first.peel.begin:
; CHECK-NEXT: br label %for.first.peel
; CHECK: for.first.peel
; CHECK: br label %for.first.latch.peel
; CHECK: for.first.latch.peel:
; CHECK: br label %for.first.peel.next
; CHECK: for.first.peel.next:
; CHECK-NEXT: br label %for.first.peel2
; CHECK: for.first.peel2:
; CHECK: br label %for.first.latch.peel10
; CHECK: for.first.latch.peel10:
; CHECK: br label %for.first.peel.next1
; CHECK: for.first.peel.next1:
; CHECK-NEXT: br label %for.first.peel15
; CHECK: for.first.peel15:
; CHECK: br label %for.first.latch.peel23
; CHECK: for.first.latch.peel23:
; CHECK: br label %for.first.peel.next14
; CHECK: for.first.peel.next14:
; CHECK-NEXT: br label %for.first.peel.next27
; CHECK: for.first.peel.next27:
; CHECK-NEXT: br label %for.first.preheader.peel.newph
; CHECK: for.first.preheader.peel.newph:
; CHECK-NEXT: br label %for.first
; CHECK: for.first:
; CHECK: br label %for.first.latch
; CHECK: for.first.latch:
; CHECK: br label %for.second.latch
; CHECK: for.second.latch:
; CHECK: br i1 %exitcond, label %for.first, label %for.end
; CHECK: for.end:
; CHECK-NEXT: ret void
@B = common global [1024 x i32] zeroinitializer, align 16
define void @function(i32* noalias %arg) {
for.first.preheader:
br label %for.first
for.first: ; preds = %for.first.preheader, %for.first.latch
%.014 = phi i32 [ 0, %for.first.preheader ], [ %tmp15, %for.first.latch ]
%indvars.iv23 = phi i64 [ 0, %for.first.preheader ], [ %indvars.iv.next3, %for.first.latch ]
%tmp = add nsw i32 %.014, -3
%tmp8 = add nuw nsw i64 %indvars.iv23, 3
%tmp9 = trunc i64 %tmp8 to i32
%tmp10 = mul nsw i32 %tmp, %tmp9
%tmp11 = trunc i64 %indvars.iv23 to i32
%tmp12 = srem i32 %tmp10, %tmp11
%tmp13 = getelementptr inbounds i32, i32* %arg, i64 %indvars.iv23
store i32 %tmp12, i32* %tmp13, align 4
br label %for.first.latch
for.first.latch: ; preds = %for.first
%indvars.iv.next3 = add nuw nsw i64 %indvars.iv23, 1
%tmp15 = add nuw nsw i32 %.014, 1
%exitcond4 = icmp ne i64 %indvars.iv.next3, 100
br i1 %exitcond4, label %for.first, label %for.second.preheader
for.second.preheader: ; preds = %for.first.latch
br label %for.second
for.second: ; preds = %for.second.preheader, %for.second.latch
%.02 = phi i32 [ 0, %for.second.preheader ], [ %tmp28, %for.second.latch ]
%indvars.iv1 = phi i64 [ 3, %for.second.preheader ], [ %indvars.iv.next, %for.second.latch ]
%tmp20 = add nsw i32 %.02, -3
%tmp21 = add nuw nsw i64 %indvars.iv1, 3
%tmp22 = trunc i64 %tmp21 to i32
%tmp23 = mul nsw i32 %tmp20, %tmp22
%tmp24 = trunc i64 %indvars.iv1 to i32
%tmp25 = srem i32 %tmp23, %tmp24
%tmp26 = getelementptr inbounds [1024 x i32], [1024 x i32]* @B, i64 0, i64 %indvars.iv1
store i32 %tmp25, i32* %tmp26, align 4
br label %for.second.latch
for.second.latch: ; preds = %for.second
%indvars.iv.next = add nuw nsw i64 %indvars.iv1, 1
%tmp28 = add nuw nsw i32 %.02, 1
%exitcond = icmp ne i64 %indvars.iv.next, 100
br i1 %exitcond, label %for.second, label %for.end
for.end: ; preds = %for.second.latch
ret void
}