[CodeGen] Generic Hardware Loop Support

Patch which introduces a target-independent framework for generating
hardware loops at the IR level. Most of the code has been taken from
PowerPC CTRLoops and PowerPC has been ported over to use this generic
pass. The target dependent parts have been moved into
TargetTransformInfo, via isHardwareLoopProfitable, with
HardwareLoopInfo introduced to transfer information from the backend.
    
Three generic intrinsics have been introduced:
- void @llvm.set_loop_iterations
  Takes as a single operand, the number of iterations to be executed.
- i1 @llvm.loop_decrement(anyint)
  Takes the maximum number of elements processed in an iteration of
  the loop body and subtracts this from the total count. Returns
  false when the loop should exit.
- anyint @llvm.loop_decrement_reg(anyint, anyint)
  Takes the number of elements remaining to be processed as well as
  the maximum number of elements processed in an iteration of the loop
  body. Returns the updated number of elements remaining.

llvm-svn: 362774
This commit is contained in:
Sam Parker 2019-06-07 07:35:30 +00:00
parent 6484b770e8
commit 7df94e8bbc
24 changed files with 1083 additions and 597 deletions

View File

@ -35,6 +35,8 @@ namespace Intrinsic {
enum ID : unsigned;
}
class AssumptionCache;
class BranchInst;
class Function;
class GlobalValue;
class IntrinsicInst;
@ -44,6 +46,7 @@ class SCEV;
class ScalarEvolution;
class StoreInst;
class SwitchInst;
class TargetLibraryInfo;
class Type;
class User;
class Value;
@ -445,6 +448,32 @@ public:
void getUnrollingPreferences(Loop *L, ScalarEvolution &,
UnrollingPreferences &UP) const;
/// Attributes of a target dependent hardware loop. Here, the term 'element'
/// describes the work performed by an IR loop that has not been vectorized
/// by the compiler.
struct HardwareLoopInfo {
// An instance always describes a specific loop, so there is no default
// constructor.
HardwareLoopInfo() = delete;
HardwareLoopInfo(Loop *L) : L(L) { }
// The loop being considered for conversion.
Loop *L = nullptr;
// The exiting block selected to hold the loop decrement.
BasicBlock *ExitBlock = nullptr;
// The conditional branch terminating ExitBlock.
BranchInst *ExitBranch = nullptr;
// The exit count computed by ScalarEvolution for ExitBlock.
const SCEV *ExitCount = nullptr;
// The integer type used for the loop counter.
IntegerType *CountType = nullptr;
Value *LoopDecrement = nullptr; // The maximum number of elements
// processed in the loop body.
bool IsNestingLegal = false; // Can a hardware loop be a parent to
// another hardware loop.
bool CounterInReg = false; // Should loop counter be updated in
// the loop via a phi?
};
/// Query the target whether it would be profitable to convert the given loop
/// into a hardware loop.
/// \p HWLoopInfo is used to transfer the hardware loop attributes chosen by
/// the backend back to the caller.
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo) const;
/// @}
/// \name Scalar Target Information
@ -1073,6 +1102,10 @@ public:
virtual bool isLoweredToCall(const Function *F) = 0;
virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &,
UnrollingPreferences &UP) = 0;
virtual bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo) = 0;
virtual bool isLegalAddImmediate(int64_t Imm) = 0;
virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
@ -1304,6 +1337,12 @@ public:
UnrollingPreferences &UP) override {
return Impl.getUnrollingPreferences(L, SE, UP);
}
// Forward the hardware loop query to the wrapped target implementation.
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
HardwareLoopInfo &HWLoopInfo) override {
return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
}
bool isLegalAddImmediate(int64_t Imm) override {
return Impl.isLegalAddImmediate(Imm);
}

View File

@ -190,6 +190,13 @@ public:
return true;
}
// Default implementation: hardware loops are never reported as profitable
// and HWLoopInfo is left untouched.
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
TTI::HardwareLoopInfo &HWLoopInfo) {
return false;
}
void getUnrollingPreferences(Loop *, ScalarEvolution &,
TTI::UnrollingPreferences &) {}

View File

@ -491,6 +491,13 @@ public:
UP.BEInsns = 2;
}
// No additional logic at this level: defer to the BaseT implementation.
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
TTI::HardwareLoopInfo &HWLoopInfo) {
return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
}
int getInstructionLatency(const Instruction *I) {
if (isa<LoadInst>(I))
return getST()->getSchedModel().DefaultLoadLatency;

View File

@ -446,6 +446,9 @@ namespace llvm {
/// Creates CFI Instruction Inserter pass. \see CFIInstrInserter.cpp
FunctionPass *createCFIInstrInserter();
/// Create Hardware Loop pass. \see HardwareLoops.cpp
FunctionPass *createHardwareLoopsPass();
} // End llvm namespace
#endif

View File

@ -1182,6 +1182,27 @@ def int_experimental_vector_reduce_fmin : Intrinsic<[llvm_anyfloat_ty],
[llvm_anyvector_ty],
[IntrNoMem]>;
//===---------- Intrinsics to control hardware supported loops ----------===//
// Specify that the value given is the number of iterations that the next loop
// will execute.
def int_set_loop_iterations :
Intrinsic<[], [llvm_anyint_ty], [IntrNoDuplicate]>;
// Decrement loop counter by the given argument. Return false if the loop
// should exit.
def int_loop_decrement :
Intrinsic<[llvm_i1_ty], [llvm_anyint_ty], [IntrNoDuplicate]>;
// Decrement the first operand (the loop counter) by the second operand (the
// maximum number of elements processed in an iteration). Return the remaining
// number of iterations still to be executed. This is effectively a sub which
// can be used with a phi, icmp and br to control the number of iterations
// executed, as usual.
def int_loop_decrement_reg :
Intrinsic<[llvm_anyint_ty],
[llvm_anyint_ty, llvm_anyint_ty], [IntrNoDuplicate]>;
//===----- Intrinsics that are used to provide predicate information -----===//
def int_ssa_copy : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],

View File

@ -163,6 +163,7 @@ void initializeGlobalOptLegacyPassPass(PassRegistry&);
void initializeGlobalSplitPass(PassRegistry&);
void initializeGlobalsAAWrapperPassPass(PassRegistry&);
void initializeGuardWideningLegacyPassPass(PassRegistry&);
void initializeHardwareLoopsPass(PassRegistry&);
void initializeHotColdSplittingLegacyPassPass(PassRegistry&);
void initializeHWAddressSanitizerLegacyPassPass(PassRegistry &);
void initializeIPCPPass(PassRegistry&);

View File

@ -223,6 +223,7 @@ namespace {
(void) llvm::createEliminateAvailableExternallyPass();
(void) llvm::createScalarizeMaskedMemIntrinPass();
(void) llvm::createWarnMissedTransformationsPass();
(void) llvm::createHardwareLoopsPass();
(void)new llvm::IntervalPartition();
(void)new llvm::ScalarEvolutionWrapperPass();

View File

@ -130,6 +130,12 @@ bool TargetTransformInfo::isLoweredToCall(const Function *F) const {
return TTIImpl->isLoweredToCall(F);
}
// Dispatch the hardware loop profitability query to the target specific
// implementation held in TTIImpl.
bool TargetTransformInfo::isHardwareLoopProfitable(
Loop *L, ScalarEvolution &SE, AssumptionCache &AC,
TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const {
return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
}
void TargetTransformInfo::getUnrollingPreferences(
Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {
return TTIImpl->getUnrollingPreferences(L, SE, UP);

View File

@ -33,6 +33,7 @@ add_llvm_library(LLVMCodeGen
GCRootLowering.cpp
GCStrategy.cpp
GlobalMerge.cpp
HardwareLoops.cpp
IfConversion.cpp
ImplicitNullChecks.cpp
IndirectBrExpandPass.cpp

View File

@ -38,6 +38,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeFuncletLayoutPass(Registry);
initializeGCMachineCodeAnalysisPass(Registry);
initializeGCModuleInfoPass(Registry);
initializeHardwareLoopsPass(Registry);
initializeIfConverterPass(Registry);
initializeImplicitNullChecksPass(Registry);
initializeIndirectBrExpandPassPass(Registry);

View File

@ -0,0 +1,441 @@
//===-- HardwareLoops.cpp - Target Independent Hardware Loops --*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// Insert hardware loop intrinsics into loops which are deemed profitable by
/// the target, by querying TargetTransformInfo. A hardware loop comprises
/// two intrinsics: one, outside the loop, to set the loop iteration count and
/// another, in the exit block, to decrement the counter. The decremented value
/// can either be carried through the loop via a phi or handled in some opaque
/// way by the target.
///
//===----------------------------------------------------------------------===//
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/PassSupport.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#define DEBUG_TYPE "hardware-loops"
#define HW_LOOPS_NAME "Hardware Loop Insertion"
using namespace llvm;
static cl::opt<bool>
ForceHardwareLoops("force-hardware-loops", cl::Hidden, cl::init(false),
cl::desc("Force hardware loops intrinsics to be inserted"));
static cl::opt<bool>
ForceHardwareLoopPHI(
"force-hardware-loop-phi", cl::Hidden, cl::init(false),
cl::desc("Force hardware loop counter to be updated through a phi"));
static cl::opt<bool>
ForceNestedLoop("force-nested-hardware-loop", cl::Hidden, cl::init(false),
cl::desc("Force allowance of nested hardware loops"));
static cl::opt<unsigned>
LoopDecrement("hardware-loop-decrement", cl::Hidden, cl::init(1),
cl::desc("Set the loop decrement value"));
static cl::opt<unsigned>
CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32),
cl::desc("Set the loop counter bitwidth"));
STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");
namespace {
using TTI = TargetTransformInfo;
// Function pass that asks TargetTransformInfo which loops are profitable as
// hardware loops and converts those loops by inserting the hardware loop
// intrinsics.
class HardwareLoops : public FunctionPass {
public:
static char ID;
HardwareLoops() : FunctionPass(ID) {
initializeHardwareLoopsPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
// Loop info and the dominator tree are both required and preserved; scalar
// evolution, the assumption cache and TTI are required only.
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
}
// Try to convert the given Loop into a hardware loop.
bool TryConvertLoop(Loop *L);
// Given that the target believes the loop to be profitable, try to
// convert it.
bool TryConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo);
private:
// Analyses and context for the function being processed; all of them are
// assigned in runOnFunction.
ScalarEvolution *SE = nullptr;
LoopInfo *LI = nullptr;
const DataLayout *DL = nullptr;
// NOTE(review): this member has the same name as the file-level alias
// 'TTI' used in the declaration above - confirm all supported compilers
// accept this.
const TargetTransformInfo *TTI = nullptr;
DominatorTree *DT = nullptr;
// Passed on to InsertPreheaderForLoop when a preheader has to be created.
bool PreserveLCSSA = false;
AssumptionCache *AC = nullptr;
// May be null: the library info is only used if it is available.
TargetLibraryInfo *LibInfo = nullptr;
Module *M = nullptr;
// Set once a loop has been converted; the value returned by runOnFunction.
bool MadeChange = false;
};
// Helper that performs the actual IR rewriting for a single loop, using the
// attributes gathered in a TTI::HardwareLoopInfo.
class HardwareLoop {
// Expand the trip count scev into a value that we can use.
Value *InitLoopCount(BasicBlock *BB);
// Insert the set_loop_iteration intrinsic.
void InsertIterationSetup(Value *LoopCountInit, BasicBlock *BB);
// Insert the loop_decrement intrinsic.
void InsertLoopDec();
// Insert the loop_decrement_reg intrinsic.
Instruction *InsertLoopRegDec(Value *EltsRem);
// If the target requires the counter value to be updated in the loop,
// insert a phi to hold the value. The intended purpose is for use by
// loop_decrement_reg.
PHINode *InsertPHICounter(Value *NumElts, Value *EltsRem);
// Create a new cmp, that checks the returned value of loop_decrement*,
// and update the exit branch to use it.
void UpdateBranch(Value *EltsRem);
public:
// Copies the relevant attributes out of Info; the module is taken from the
// header block of the loop.
HardwareLoop(TTI::HardwareLoopInfo &Info, ScalarEvolution &SE,
const DataLayout &DL) :
SE(SE), DL(DL), L(Info.L), M(L->getHeader()->getModule()),
ExitCount(Info.ExitCount),
CountType(Info.CountType),
ExitBranch(Info.ExitBranch),
LoopDecrement(Info.LoopDecrement),
UsePHICounter(Info.CounterInReg) { }
// Insert the intrinsics and rewrite the exit branch. Returns without
// inserting anything if the loop count cannot be safely expanded.
void Create();
private:
ScalarEvolution &SE;
const DataLayout &DL;
Loop *L = nullptr;
Module *M = nullptr;
// Exit count of the loop; InitLoopCount rewrites it into the trip count.
const SCEV *ExitCount = nullptr;
Type *CountType = nullptr;
// The conditional branch that will be controlled by the loop decrement.
BranchInst *ExitBranch = nullptr;
Value *LoopDecrement = nullptr;
// True if the counter should be carried through the loop with a phi.
bool UsePHICounter = false;
};
}
char HardwareLoops::ID = 0;
/// Gather the analyses needed for the conversion and then try to convert
/// every top-level loop of \p F; nested loops are handled recursively by
/// TryConvertLoop.
///
/// \returns true if a loop of \p F was converted into a hardware loop.
bool HardwareLoops::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  LLVM_DEBUG(dbgs() << "HWLoops: Running on " << F.getName() << "\n");

  // MadeChange is a member, so without this reset it would still hold the
  // result of a previously processed function and this function would be
  // reported as modified even when nothing was converted.
  MadeChange = false;

  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  DL = &F.getParent()->getDataLayout();
  // The library info is optional; targets receive a null pointer without it.
  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
  LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  M = F.getParent();

  for (Loop *L : *LI)
    if (!L->getParentLoop())
      TryConvertLoop(L);

  return MadeChange;
}
// Try to convert L, visiting the loops nested inside it first.
// Return true if the search should stop, which will be when an inner loop is
// converted and the parent loop doesn't support containing a hardware loop.
bool HardwareLoops::TryConvertLoop(Loop *L) {
  // Process nested loops first.
  for (Loop *SubLoop : *L)
    if (TryConvertLoop(SubLoop))
      return true; // Stop search.

  // Bail out if the loop has irreducible control flow.
  LoopBlocksRPO RPOT(L);
  RPOT.perform(LI);
  if (containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI))
    return false;

  TTI::HardwareLoopInfo HWLoopInfo(L);
  if (!TTI->isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo) &&
      !ForceHardwareLoops)
    return false;

  // Allow overriding of the counter width and loop decrement value.
  if (CounterBitWidth.getNumOccurrences())
    HWLoopInfo.CountType =
      IntegerType::get(M->getContext(), CounterBitWidth);

  // When the conversion is forced, the target may not have provided a counter
  // type. It is needed to build the decrement constant below and to validate
  // the exit count later, so give up instead of dereferencing a null type.
  if (!HWLoopInfo.CountType) {
    LLVM_DEBUG(dbgs() << "HWLoops: Bailing, no counter type available\n");
    return false;
  }

  if (LoopDecrement.getNumOccurrences())
    HWLoopInfo.LoopDecrement =
      ConstantInt::get(HWLoopInfo.CountType, LoopDecrement);

  // The decrement value is an operand of the loop_decrement intrinsics, so
  // the loop cannot be converted without it.
  if (!HWLoopInfo.LoopDecrement) {
    LLVM_DEBUG(dbgs() << "HWLoops: Bailing, no loop decrement available\n");
    return false;
  }

  // Only the outcome for *this* loop decides whether the enclosing loops have
  // to be skipped. MadeChange accumulates over every loop of the function, so
  // using it here would let an unrelated, earlier conversion block the parent
  // of a loop that was not converted.
  const bool Converted = TryConvertLoop(HWLoopInfo);
  MadeChange |= Converted;
  return Converted && !HWLoopInfo.IsNestingLegal && !ForceNestedLoop;
}
// Search the exiting blocks of the loop for one that can host the loop
// decrement. The first suitable block, its branch and its exit count are
// recorded in HWLoopInfo; a preheader is created when the loop has none and
// the loop is then handed to HardwareLoop for rewriting.
// Returns false if no exiting block qualifies or no preheader can be created.
bool HardwareLoops::TryConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo) {
  Loop *L = HWLoopInfo.L;
  LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable loop: " << *L);

  SmallVector<BasicBlock*, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  for (BasicBlock *Exiting : ExitingBlocks) {
    const SCEV *EC = SE->getExitCount(L, Exiting);
    if (isa<SCEVCouldNotCompute>(EC))
      continue;

    // A constant count must be non-zero, any other count must be invariant
    // in the loop.
    const auto *ConstEC = dyn_cast<SCEVConstant>(EC);
    const bool CountUsable = ConstEC ? !ConstEC->getValue()->isZero()
                                     : SE->isLoopInvariant(EC, L);
    if (!CountUsable)
      continue;

    // The count has to fit into the counter type.
    if (SE->getTypeSizeInBits(EC->getType()) >
        HWLoopInfo.CountType->getBitWidth())
      continue;

    // An exiting block that lives in a nested loop cannot hold the
    // branch-and-decrement unless nesting is permitted, because the inner
    // loop would disturb the counter value.
    const bool NestingAllowed = HWLoopInfo.IsNestingLegal || ForceNestedLoop;
    if (!NestingAllowed && LI->getLoopFor(Exiting) != L)
      continue;

    // The block must execute on every iteration, i.e. it has to dominate
    // every in-loop predecessor of the header (the sources of the backedges).
    bool RunsEveryIteration = true;
    for (BasicBlock *Pred : predecessors(L->getHeader())) {
      if (L->contains(Pred) && !DT->dominates(Exiting, Pred)) {
        RunsEveryIteration = false;
        break;
      }
    }
    if (!RunsEveryIteration)
      continue;

    // Only a conditional branch can be rewritten to test the counter.
    auto *BI = dyn_cast_or_null<BranchInst>(Exiting->getTerminator());
    if (!BI || !BI->isConditional())
      continue;

    // Note that the chosen block need not be the latch of the loop, even if
    // the loop has one.
    HWLoopInfo.ExitBranch = BI;
    HWLoopInfo.ExitBlock = Exiting;
    HWLoopInfo.ExitCount = EC;
    break;
  }

  if (!HWLoopInfo.ExitBlock)
    return false;

  // The iteration set-up is placed in the preheader, so make sure one exists.
  BasicBlock *Preheader = L->getLoopPreheader();
  if (!Preheader) {
    Preheader = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
    if (!Preheader)
      return false;
  }

  HardwareLoop HWLoop(HWLoopInfo, *SE, *DL);
  HWLoop.Create();
  ++NumHWLoops;
  return true;
}
void HardwareLoop::Create() {
LLVM_DEBUG(dbgs() << "HWLoops: Converting loop..\n");
BasicBlock *BeginBB = L->getLoopPreheader();
Value *LoopCountInit = InitLoopCount(BeginBB);
if (!LoopCountInit)
return;
InsertIterationSetup(LoopCountInit, BeginBB);
if (UsePHICounter || ForceHardwareLoopPHI) {
Instruction *LoopDec = InsertLoopRegDec(LoopCountInit);
Value *EltsRem = InsertPHICounter(LoopCountInit, LoopDec);
LoopDec->setOperand(0, EltsRem);
UpdateBranch(LoopDec);
} else
InsertLoopDec();
// Run through the basic blocks of the loop and see if any of them have dead
// PHIs that can be removed.
for (auto I : L->blocks())
DeleteDeadPHIs(I);
}
// Turn the exit count into a trip count (exit count + 1) of type CountType
// and expand it in front of the terminator of BB. Returns null if the
// expression is not safe to expand there.
Value *HardwareLoop::InitLoopCount(BasicBlock *BB) {
  // Widen non-pointer counts of a different type to the counter type.
  Type *ExitTy = ExitCount->getType();
  const bool NeedsExtension = !ExitTy->isPointerTy() && ExitTy != CountType;
  if (NeedsExtension)
    ExitCount = SE.getZeroExtendExpr(ExitCount, CountType);

  ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType));

  Instruction *InsertPt = BB->getTerminator();
  if (!isSafeToExpandAt(ExitCount, InsertPt, SE)) {
    LLVM_DEBUG(dbgs() << "HWLoops: Bailing, unsafe to expand ExitCount "
               << *ExitCount << "\n");
    return nullptr;
  }

  SCEVExpander Expander(SE, DL, "loopcnt");
  Value *Count = Expander.expandCodeFor(ExitCount, CountType, InsertPt);
  LLVM_DEBUG(dbgs() << "HWLoops: Loop Count: " << *Count << "\n");
  return Count;
}
// Emit a call to llvm.set_loop_iterations, overloaded on the type of
// LoopCountInit, immediately before the terminator of BB.
void HardwareLoop::InsertIterationSetup(Value *LoopCountInit,
                                        BasicBlock *BB) {
  Function *SetIterations = Intrinsic::getDeclaration(
      M, Intrinsic::set_loop_iterations, LoopCountInit->getType());

  IRBuilder<> Builder(BB->getTerminator());
  Value *SetupCall = Builder.CreateCall(SetIterations, LoopCountInit);
  LLVM_DEBUG(dbgs() << "HWLoops: Iteration set: " << *SetupCall << "\n");
}
void HardwareLoop::InsertLoopDec() {
IRBuilder<> CondBuilder(ExitBranch);
Function *DecFunc =
Intrinsic::getDeclaration(M, Intrinsic::loop_decrement,
LoopDecrement->getType());
Value *Ops[] = { LoopDecrement };
Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops);
Value *OldCond = ExitBranch->getCondition();
ExitBranch->setCondition(NewCond);
// The false branch must exit the loop.
if (!L->contains(ExitBranch->getSuccessor(0)))
ExitBranch->swapSuccessors();
// The old condition may be dead now, and may have even created a dead PHI
// (the original induction variable).
RecursivelyDeleteTriviallyDeadInstructions(OldCond);
LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *NewCond << "\n");
}
// Emit a call to llvm.loop_decrement_reg in front of the exit branch, taking
// EltsRem and the loop decrement as operands, and return the call.
Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) {
  // Overload types: result, counter operand, decrement operand.
  Type *CounterTy = EltsRem->getType();
  Type *OverloadTys[] = { CounterTy, CounterTy, LoopDecrement->getType() };
  Function *Decrement = Intrinsic::getDeclaration(
      M, Intrinsic::loop_decrement_reg, OverloadTys);

  IRBuilder<> Builder(ExitBranch);
  Value *Args[] = { EltsRem, LoopDecrement };
  Value *DecCall = Builder.CreateCall(Decrement, Args);

  LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *DecCall << "\n");
  return cast<Instruction>(DecCall);
}
// Create the counter phi in the loop header: it receives NumElts from the
// preheader and EltsRem from the block that holds the exit branch.
PHINode* HardwareLoop::InsertPHICounter(Value *NumElts, Value *EltsRem) {
  IRBuilder<> Builder(L->getHeader()->getFirstNonPHI());
  PHINode *Counter = Builder.CreatePHI(NumElts->getType(), 2);

  Counter->addIncoming(NumElts, L->getLoopPreheader());
  Counter->addIncoming(EltsRem, ExitBranch->getParent());

  LLVM_DEBUG(dbgs() << "HWLoops: PHI Counter: " << *Counter << "\n");
  return Counter;
}
// Make the exit branch continue looping for as long as EltsRem is non-zero.
void HardwareLoop::UpdateBranch(Value *EltsRem) {
  IRBuilder<> Builder(ExitBranch);
  Value *Zero = ConstantInt::get(EltsRem->getType(), 0);
  Value *StillRunning = Builder.CreateICmpNE(EltsRem, Zero);

  Value *PrevCond = ExitBranch->getCondition();
  ExitBranch->setCondition(StillRunning);

  // The false edge has to be the one leaving the loop.
  const bool TrueEdgeLeavesLoop = !L->contains(ExitBranch->getSuccessor(0));
  if (TrueEdgeLeavesLoop)
    ExitBranch->swapSuccessors();

  // The previous condition, and possibly the induction variable PHI feeding
  // it, may be unused now.
  RecursivelyDeleteTriviallyDeadInstructions(PrevCond);
}
INITIALIZE_PASS_BEGIN(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
// Factory function returning a newly allocated instance of the HardwareLoops
// pass.
FunctionPass *llvm::createHardwareLoopsPass() { return new HardwareLoops(); }

View File

@ -71,63 +71,7 @@ using namespace llvm;
static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
#endif
// The latency of mtctr is only justified if there are more than 4
// comparisons that will be removed as a result.
static cl::opt<unsigned>
SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
cl::desc("Loops with a constant trip count smaller than "
"this value will not use the count register."));
STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
namespace {
// Function pass that converts suitable loops into PowerPC CTR (counter
// register) loops.
struct PPCCTRLoops : public FunctionPass {
#ifndef NDEBUG
// Debug builds only: running count that is compared against the
// ppc-max-ctrloop limit.
static int Counter;
#endif
public:
static char ID;
PPCCTRLoops() : FunctionPass(ID) {
initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
// Loop info and the dominator tree are both required and preserved; scalar
// evolution, the assumption cache and TTI are required only.
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
}
private:
// Returns true if an instruction in BB might use the counter register.
bool mightUseCTR(BasicBlock *BB);
// Try to convert L (after its nested loops) into a CTR loop.
bool convertToCTRLoop(Loop *L);
private:
// Target, analysis and context pointers; all assigned in runOnFunction.
const PPCTargetMachine *TM;
const PPCSubtarget *STI;
const PPCTargetLowering *TLI;
const DataLayout *DL;
// May be null: the library info is only used if it is available.
const TargetLibraryInfo *LibInfo;
const TargetTransformInfo *TTI;
LoopInfo *LI;
ScalarEvolution *SE;
DominatorTree *DT;
bool PreserveLCSSA;
TargetSchedModel SchedModel;
};
char PPCCTRLoops::ID = 0;
#ifndef NDEBUG
int PPCCTRLoops::Counter = 0;
#endif
#ifndef NDEBUG
struct PPCCTRLoopsVerify : public MachineFunctionPass {
@ -153,16 +97,6 @@ namespace {
#endif // NDEBUG
} // end anonymous namespace
INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
false, false)
FunctionPass *llvm::createPPCCTRLoops() { return new PPCCTRLoops(); }
#ifndef NDEBUG
INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
"PowerPC CTR Loops Verify", false, false)
@ -175,512 +109,6 @@ FunctionPass *llvm::createPPCCTRLoopsVerify() {
}
#endif // NDEBUG
// Collect the target hooks and analyses for F and try to convert each of its
// top-level loops into a CTR loop. Returns true if any loop was converted.
bool PPCCTRLoops::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  // The target machine is only reachable through the pass configuration.
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  TM = &TPC->getTM<PPCTargetMachine>();
  STI = TM->getSubtargetImpl(F);
  TLI = STI->getTargetLowering();

  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  DL = &F.getParent()->getDataLayout();

  // The library info is optional.
  auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
  LibInfo = TLIP ? &TLIP->getTLI() : nullptr;

  PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
  SchedModel.init(STI);

  bool Changed = false;
  for (Loop *TopLevel : *LI)
    if (!TopLevel->getParentLoop())
      Changed |= convertToCTRLoop(TopLevel);

  return Changed;
}
// Returns true if Ty is an integer type wider than 32 bits (when Is32Bit is
// set) or wider than 64 bits (otherwise). Non-integer types are never large.
static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) {
  auto *IntTy = dyn_cast<IntegerType>(Ty);
  if (!IntTy)
    return false;

  const unsigned WidthLimit = Is32Bit ? 32U : 64U;
  return IntTy->getBitWidth() > WidthLimit;
}
// Determining the address of a TLS variable results in a function call in
// certain TLS models.
static bool memAddrUsesCTR(const PPCTargetMachine &TM, const Value *MemAddr) {
const auto *GV = dyn_cast<GlobalValue>(MemAddr);
if (!GV) {
// Recurse to check for constants that refer to TLS global variables.
if (const auto *CV = dyn_cast<Constant>(MemAddr))
for (const auto &CO : CV->operands())
if (memAddrUsesCTR(TM, CO))
return true;
return false;
}
if (!GV->isThreadLocal())
return false;
TLSModel::Model Model = TM.getTLSModel(GV);
return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic;
}
// Returns true if any non-input constraint of the inline asm names the ctr
// register (compared case-insensitively against "{ctr}").
static bool asmClobbersCTR(InlineAsm *IA) {
  InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
  for (InlineAsm::ConstraintInfo &Info : Constraints) {
    // Inputs cannot clobber anything.
    if (Info.Type == InlineAsm::isInput)
      continue;

    for (const auto &Code : Info.Codes)
      if (StringRef(Code).equals_lower("{ctr}"))
        return true;
  }
  return false;
}
// Scan every instruction of BB and return true as soon as one is found that
// might use the counter register: calls (other than inline asm, intrinsics
// and library functions known not to become calls), operations that are
// lowered to calls, indirect branches, invokes, large switches and operands
// that refer to dynamic-model TLS variables. Returns false if no such
// instruction is found.
bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) {
for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
J != JE; ++J) {
if (CallInst *CI = dyn_cast<CallInst>(J)) {
// Inline ASM is okay, unless it clobbers the ctr register.
if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
if (asmClobbersCTR(IA))
return true;
continue;
}
if (Function *F = CI->getCalledFunction()) {
// Most intrinsics don't become function calls, but some might.
// sin, cos, exp and log are always calls.
// Opcode stays zero unless the call maps to an ISD node whose legality
// has to be checked further down.
unsigned Opcode = 0;
if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
switch (F->getIntrinsicID()) {
default: continue;
// If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr
// we're definitely using CTR.
case Intrinsic::ppc_is_decremented_ctr_nonzero:
case Intrinsic::ppc_mtctr:
return true;
// VisualStudio defines setjmp as _setjmp
#if defined(_MSC_VER) && defined(setjmp) && \
!defined(setjmp_undefined_for_msvc)
# pragma push_macro("setjmp")
# undef setjmp
# define setjmp_undefined_for_msvc
#endif
case Intrinsic::setjmp:
#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
// let's return it to _setjmp state
# pragma pop_macro("setjmp")
# undef setjmp_undefined_for_msvc
#endif
case Intrinsic::longjmp:
// Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
// because, although it does clobber the counter register, the
// control can't then return to inside the loop unless there is also
// an eh_sjlj_setjmp.
case Intrinsic::eh_sjlj_setjmp:
case Intrinsic::memcpy:
case Intrinsic::memmove:
case Intrinsic::memset:
case Intrinsic::powi:
case Intrinsic::log:
case Intrinsic::log2:
case Intrinsic::log10:
case Intrinsic::exp:
case Intrinsic::exp2:
case Intrinsic::pow:
case Intrinsic::sin:
case Intrinsic::cos:
return true;
case Intrinsic::copysign:
if (CI->getArgOperand(0)->getType()->getScalarType()->
isPPC_FP128Ty())
return true;
else
continue; // ISD::FCOPYSIGN is never a library call.
case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
case Intrinsic::rint: Opcode = ISD::FRINT; break;
case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
case Intrinsic::round: Opcode = ISD::FROUND; break;
case Intrinsic::minnum: Opcode = ISD::FMINNUM; break;
case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break;
case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break;
case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break;
}
}
// PowerPC does not use [US]DIVREM or other library calls for
// operations on regular types which are not otherwise library calls
// (i.e. soft float or atomics). If adapting for targets that do,
// additional care is required here.
LibFunc Func;
if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
LibInfo->getLibFunc(F->getName(), Func) &&
LibInfo->hasOptimizedCodeGen(Func)) {
// Non-read-only functions are never treated as intrinsics.
if (!CI->onlyReadsMemory())
return true;
// Conversion happens only for FP calls.
if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
return true;
switch (Func) {
default: return true;
case LibFunc_copysign:
case LibFunc_copysignf:
continue; // ISD::FCOPYSIGN is never a library call.
case LibFunc_copysignl:
return true;
case LibFunc_fabs:
case LibFunc_fabsf:
case LibFunc_fabsl:
continue; // ISD::FABS is never a library call.
case LibFunc_sqrt:
case LibFunc_sqrtf:
case LibFunc_sqrtl:
Opcode = ISD::FSQRT; break;
case LibFunc_floor:
case LibFunc_floorf:
case LibFunc_floorl:
Opcode = ISD::FFLOOR; break;
case LibFunc_nearbyint:
case LibFunc_nearbyintf:
case LibFunc_nearbyintl:
Opcode = ISD::FNEARBYINT; break;
case LibFunc_ceil:
case LibFunc_ceilf:
case LibFunc_ceill:
Opcode = ISD::FCEIL; break;
case LibFunc_rint:
case LibFunc_rintf:
case LibFunc_rintl:
Opcode = ISD::FRINT; break;
case LibFunc_round:
case LibFunc_roundf:
case LibFunc_roundl:
Opcode = ISD::FROUND; break;
case LibFunc_trunc:
case LibFunc_truncf:
case LibFunc_truncl:
Opcode = ISD::FTRUNC; break;
case LibFunc_fmin:
case LibFunc_fminf:
case LibFunc_fminl:
Opcode = ISD::FMINNUM; break;
case LibFunc_fmax:
case LibFunc_fmaxf:
case LibFunc_fmaxl:
Opcode = ISD::FMAXNUM; break;
}
}
// The call maps to an ISD opcode: it is acceptable only if that operation
// is legal or custom for the type of the first argument (or, for vectors,
// for its scalar type).
if (Opcode) {
EVT EVTy =
TLI->getValueType(*DL, CI->getArgOperand(0)->getType(), true);
if (EVTy == MVT::Other)
return true;
if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
continue;
else if (EVTy.isVector() &&
TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
continue;
return true;
}
}
// Any other call (including indirect calls) is treated as using CTR.
return true;
} else if (isa<BinaryOperator>(J) &&
J->getType()->getScalarType()->isPPC_FP128Ty()) {
// Most operations on ppc_f128 values become calls.
return true;
} else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
CastInst *CI = cast<CastInst>(J);
if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
isLargeIntegerTy(!TM->isPPC64(), CI->getSrcTy()->getScalarType()) ||
isLargeIntegerTy(!TM->isPPC64(), CI->getDestTy()->getScalarType()))
return true;
} else if (isLargeIntegerTy(!TM->isPPC64(),
J->getType()->getScalarType()) &&
(J->getOpcode() == Instruction::UDiv ||
J->getOpcode() == Instruction::SDiv ||
J->getOpcode() == Instruction::URem ||
J->getOpcode() == Instruction::SRem)) {
return true;
} else if (!TM->isPPC64() &&
isLargeIntegerTy(false, J->getType()->getScalarType()) &&
(J->getOpcode() == Instruction::Shl ||
J->getOpcode() == Instruction::AShr ||
J->getOpcode() == Instruction::LShr)) {
// Only on PPC32, for 128-bit integers (specifically not 64-bit
// integers), these might be runtime calls.
return true;
} else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
// On PowerPC, indirect jumps use the counter register.
return true;
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
// NOTE(review): presumably a switch this large may be lowered to a jump
// table, i.e. an indirect jump - confirm against the target lowering.
if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
return true;
}
// FREM is always a call.
if (J->getOpcode() == Instruction::FRem)
return true;
// With soft float, every floating point operation listed below is rejected.
if (STI->useSoftFloat()) {
switch(J->getOpcode()) {
case Instruction::FAdd:
case Instruction::FSub:
case Instruction::FMul:
case Instruction::FDiv:
case Instruction::FPTrunc:
case Instruction::FPExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::UIToFP:
case Instruction::SIToFP:
case Instruction::FCmp:
return true;
}
}
// Operands referring to dynamic-model TLS variables require a call to
// compute their address.
for (Value *Operand : J->operands())
if (memAddrUsesCTR(*TM, Operand))
return true;
}
return false;
}
/// Try to turn \p L into a counter-register (CTR) loop, visiting nested loops
/// first.
///
/// The loop is rewritten only when it has an exiting block that
///   - has a computable, loop-invariant exit count that is not the constant
///     zero and fits in the counter width (64 bits on PPC64, 32 otherwise),
///   - belongs directly to \p L (not to a nested loop),
///   - dominates every in-loop predecessor of the header, and
///   - ends in a conditional branch.
/// In that case the iteration count (exit count + 1) is materialised in the
/// preheader through the ppc_mtctr intrinsic and the selected exit branch is
/// re-conditioned on ppc_is_decremented_ctr_nonzero.
///
/// \returns true if this loop or one of its sub-loops was converted.
bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
  bool MadeChange = false;

  // Do not convert small short loops to CTR loop.
  unsigned ConstTripCount = SE->getSmallConstantTripCount(L);
  if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
    SmallPtrSet<const Value *, 32> EphValues;
    // Bind the cache by reference: a plain 'auto' deduces a value type and
    // would copy the whole AssumptionCache just to pass its address along.
    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
        *L->getHeader()->getParent());
    CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
    CodeMetrics Metrics;
    for (BasicBlock *BB : L->blocks())
      Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
    // 6 is an approximate latency for the mtctr instruction.
    if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
      return false;
  }

  // Process nested loops first.
  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
    // Only report a conversion when one actually happened.
    if (convertToCTRLoop(*I)) {
      MadeChange = true;
      LLVM_DEBUG(dbgs() << "Nested loop converted\n");
    }
  }

  // If a nested loop has been converted, then we can't convert this loop.
  if (MadeChange)
    return MadeChange;

  // Bail out if the loop has irreducible control flow.
  LoopBlocksRPO RPOT(L);
  RPOT.perform(LI);
  if (containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI))
    return false;

#ifndef NDEBUG
  // Stop trying after reaching the limit (if any).
  int Limit = CTRLoopLimit;
  if (Limit >= 0) {
    if (Counter >= CTRLoopLimit)
      return false;
    Counter++;
  }
#endif

  // We don't want to spill/restore the counter register, and so we don't
  // want to use the counter register if the loop contains calls.
  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
       I != IE; ++I)
    if (mightUseCTR(*I))
      return MadeChange;

  SmallVector<BasicBlock*, 4> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  // If there is an exit edge known to be frequently taken,
  // we should not transform this loop.
  for (auto &BB : ExitingBlocks) {
    Instruction *TI = BB->getTerminator();
    if (!TI) continue;

    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      uint64_t TrueWeight = 0, FalseWeight = 0;
      if (!BI->isConditional() ||
          !BI->extractProfMetadata(TrueWeight, FalseWeight))
        continue;

      // If the exit path is more frequent than the loop path,
      // we return here without further analysis for this loop.
      bool TrueIsExit = !L->contains(BI->getSuccessor(0));
      if (( TrueIsExit && FalseWeight < TrueWeight) ||
          (!TrueIsExit && FalseWeight > TrueWeight))
        return MadeChange;
    }
  }

  // Pick the first exiting block that qualifies as the counted exit.
  BasicBlock *CountedExitBlock = nullptr;
  const SCEV *ExitCount = nullptr;
  BranchInst *CountedExitBranch = nullptr;
  for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
       IE = ExitingBlocks.end(); I != IE; ++I) {
    const SCEV *EC = SE->getExitCount(L, *I);
    LLVM_DEBUG(dbgs() << "Exit Count for " << *L << " from block "
                      << (*I)->getName() << ": " << *EC << "\n");
    if (isa<SCEVCouldNotCompute>(EC))
      continue;
    if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
      if (ConstEC->getValue()->isZero())
        continue;
    } else if (!SE->isLoopInvariant(EC, L))
      continue;

    // The count must fit in the counter register.
    if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 64 : 32))
      continue;

    // If this exiting block is contained in a nested loop, it is not eligible
    // for insertion of the branch-and-decrement since the inner loop would
    // end up messing up the value in the CTR.
    if (LI->getLoopFor(*I) != L)
      continue;

    // We now have a loop-invariant count of loop iterations (which is not the
    // constant zero) for which we know that this loop will not exit via this
    // exiting block.

    // We need to make sure that this block will run on every loop iteration.
    // For this to be true, we must dominate all blocks with backedges. Such
    // blocks are in-loop predecessors to the header block.
    bool NotAlways = false;
    for (pred_iterator PI = pred_begin(L->getHeader()),
         PIE = pred_end(L->getHeader()); PI != PIE; ++PI) {
      if (!L->contains(*PI))
        continue;

      if (!DT->dominates(*I, *PI)) {
        NotAlways = true;
        break;
      }
    }

    if (NotAlways)
      continue;

    // Make sure this block ends with a conditional branch.
    Instruction *TI = (*I)->getTerminator();
    if (!TI)
      continue;

    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
      if (!BI->isConditional())
        continue;

      CountedExitBranch = BI;
    } else
      continue;

    // Note that this block may not be the loop latch block, even if the loop
    // has a latch block.
    CountedExitBlock = *I;
    ExitCount = EC;
    break;
  }

  if (!CountedExitBlock)
    return MadeChange;

  BasicBlock *Preheader = L->getLoopPreheader();

  // If we don't have a preheader, then insert one. If we already have a
  // preheader, then we can use it (except if the preheader contains a use of
  // the CTR register because some such uses might be reordered by the
  // selection DAG after the mtctr instruction).
  if (!Preheader || mightUseCTR(Preheader))
    Preheader = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
  if (!Preheader)
    return MadeChange;

  LLVM_DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName()
                    << "\n");

  // Insert the count into the preheader and replace the condition used by the
  // selected branch.
  MadeChange = true;

  SCEVExpander SCEVE(*SE, *DL, "loopcnt");
  LLVMContext &C = SE->getContext();
  Type *CountType = TM->isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C);
  if (!ExitCount->getType()->isPointerTy() &&
      ExitCount->getType() != CountType)
    ExitCount = SE->getZeroExtendExpr(ExitCount, CountType);
  // The value written to the counter is the exit count plus one.
  ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType));
  Value *ECValue =
      SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator());

  IRBuilder<> CountBuilder(Preheader->getTerminator());
  Module *M = Preheader->getParent()->getParent();
  Function *MTCTRFunc =
      Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr, CountType);
  CountBuilder.CreateCall(MTCTRFunc, ECValue);

  IRBuilder<> CondBuilder(CountedExitBranch);
  Function *DecFunc =
      Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero);
  Value *NewCond = CondBuilder.CreateCall(DecFunc, {});
  Value *OldCond = CountedExitBranch->getCondition();
  CountedExitBranch->setCondition(NewCond);

  // The false branch must exit the loop.
  if (!L->contains(CountedExitBranch->getSuccessor(0)))
    CountedExitBranch->swapSuccessors();

  // The old condition may be dead now, and may have even created a dead PHI
  // (the original induction variable).
  RecursivelyDeleteTriviallyDeadInstructions(OldCond);

  // Run through the basic blocks of the loop and see if any of them have dead
  // PHIs that can be removed.
  for (auto I : L->blocks())
    DeleteDeadPHIs(I);

  ++NumCTRLoops;
  return MadeChange;
}
#ifndef NDEBUG
static bool clobbersCTR(const MachineInstr &MI) {
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {

View File

@ -9944,7 +9944,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::INTRINSIC_W_CHAIN: {
if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
Intrinsic::ppc_is_decremented_ctr_nonzero)
Intrinsic::loop_decrement)
break;
assert(N->getValueType(0) == MVT::i1 &&
@ -13636,7 +13636,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
Intrinsic::ppc_is_decremented_ctr_nonzero) {
Intrinsic::loop_decrement) {
// We now need to make the intrinsic dead (it cannot be instruction
// selected).
@ -13662,14 +13662,14 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
if (LHS.getOpcode() == ISD::AND &&
LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
Intrinsic::ppc_is_decremented_ctr_nonzero &&
Intrinsic::loop_decrement &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
!isNullConstant(LHS.getOperand(1)))
LHS = LHS.getOperand(0);
if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
Intrinsic::ppc_is_decremented_ctr_nonzero &&
Intrinsic::loop_decrement &&
isa<ConstantSDNode>(RHS)) {
assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
"Counter decrement comparison is not EQ or NE");

View File

@ -388,7 +388,7 @@ def MTCTR8 : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
PPC970_DGroup_First, PPC970_Unit_FXU;
}
let hasSideEffects = 1, Defs = [CTR8] in {
let Pattern = [(int_ppc_mtctr i64:$rS)] in
let Pattern = [(int_set_loop_iterations i64:$rS)] in
def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
"mtctr $rS", IIC_SprMTSPR>,
PPC970_DGroup_First, PPC970_Unit_FXU;

View File

@ -2605,7 +2605,7 @@ def MTCTR : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
PPC970_DGroup_First, PPC970_Unit_FXU;
}
let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in {
let Pattern = [(int_ppc_mtctr i32:$rS)] in
let Pattern = [(int_set_loop_iterations i32:$rS)] in
def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
"mtctr $rS", IIC_SprMTSPR>,
PPC970_DGroup_First, PPC970_Unit_FXU;

View File

@ -101,7 +101,6 @@ extern "C" void LLVMInitializePowerPCTarget() {
RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget());
PassRegistry &PR = *PassRegistry::getPassRegistry();
initializePPCCTRLoopsPass(PR);
#ifndef NDEBUG
initializePPCCTRLoopsVerifyPass(PR);
#endif
@ -422,7 +421,7 @@ bool PPCPassConfig::addPreISel() {
addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine()));
if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
addPass(createPPCCTRLoops());
addPass(createHardwareLoopsPass());
return false;
}

View File

@ -7,10 +7,12 @@
//===----------------------------------------------------------------------===//
#include "PPCTargetTransformInfo.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
@ -31,6 +33,13 @@ EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
cl::desc("Enable using coldcc calling conv for cold "
"internal functions"));
// The latency of mtctr is only justified if there are more than 4
// comparisons that will be removed as a result.
static cl::opt<unsigned>
SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
cl::desc("Loops with a constant trip count smaller than "
"this value will not use the count register."));
//===----------------------------------------------------------------------===//
//
// PPC cost model.
@ -204,6 +213,341 @@ unsigned PPCTTIImpl::getUserCost(const User *U,
return BaseT::getUserCost(U, Operands);
}
/// Conservatively decide whether any instruction in \p BB might use or
/// clobber the PowerPC counter register (CTR).
///
/// The block is scanned instruction by instruction. It is reported as a
/// potential CTR user when it contains, among others: inline asm with a
/// non-input "{ctr}" constraint, a call that is not known to be lowered
/// without a library call, ppc_fp128 arithmetic or conversions, wide integer
/// division/remainder/shifts, indirect branches or invokes, a switch large
/// enough for a jump table, FRem, floating-point operations under soft-float,
/// or an operand that refers to a thread-local global using the general- or
/// local-dynamic TLS model.
///
/// \param BB      Block to inspect.
/// \param LibInfo Optional library info; when null, calls to named library
///                functions are never recognised and count as real calls.
/// \returns true if CTR might be used in \p BB, false otherwise.
bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
TargetLibraryInfo *LibInfo) {
const PPCTargetMachine &TM = ST->getTargetMachine();
// Loop through the inline asm constraints and look for something that
// clobbers ctr.
auto asmClobbersCTR = [](InlineAsm *IA) {
InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
InlineAsm::ConstraintInfo &C = CIV[i];
if (C.Type != InlineAsm::isInput)
for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
return true;
}
return false;
};
// Determining the address of a TLS variable results in a function call in
// certain TLS models.
std::function<bool(const Value*)> memAddrUsesCTR =
[&memAddrUsesCTR, &TM](const Value *MemAddr) -> bool {
const auto *GV = dyn_cast<GlobalValue>(MemAddr);
if (!GV) {
// Recurse to check for constants that refer to TLS global variables.
if (const auto *CV = dyn_cast<Constant>(MemAddr))
for (const auto &CO : CV->operands())
if (memAddrUsesCTR(CO))
return true;
return false;
}
if (!GV->isThreadLocal())
return false;
TLSModel::Model Model = TM.getTLSModel(GV);
return Model == TLSModel::GeneralDynamic ||
Model == TLSModel::LocalDynamic;
};
// True when Ty is an integer type wider than 32 bits (Is32Bit) or 64 bits.
auto isLargeIntegerTy = [](bool Is32Bit, Type *Ty) {
if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
return false;
};
for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
J != JE; ++J) {
if (CallInst *CI = dyn_cast<CallInst>(J)) {
// Inline ASM is okay, unless it clobbers the ctr register.
if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
if (asmClobbersCTR(IA))
return true;
continue;
}
if (Function *F = CI->getCalledFunction()) {
// Most intrinsics don't become function calls, but some might.
// sin, cos, exp and log are always calls.
// A non-zero Opcode after the switches below names the ISD node whose
// legality decides whether the call stays inline.
unsigned Opcode = 0;
if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
switch (F->getIntrinsicID()) {
default: continue;
// If we have a call to set_loop_iterations or loop_decrement, we're
// definitely using CTR.
case Intrinsic::set_loop_iterations:
case Intrinsic::loop_decrement:
return true;
// VisualStudio defines setjmp as _setjmp
#if defined(_MSC_VER) && defined(setjmp) && \
!defined(setjmp_undefined_for_msvc)
# pragma push_macro("setjmp")
# undef setjmp
# define setjmp_undefined_for_msvc
#endif
case Intrinsic::setjmp:
#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
// let's return it to _setjmp state
# pragma pop_macro("setjmp")
# undef setjmp_undefined_for_msvc
#endif
case Intrinsic::longjmp:
// Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
// because, although it does clobber the counter register, the
// control can't then return to inside the loop unless there is also
// an eh_sjlj_setjmp.
case Intrinsic::eh_sjlj_setjmp:
case Intrinsic::memcpy:
case Intrinsic::memmove:
case Intrinsic::memset:
case Intrinsic::powi:
case Intrinsic::log:
case Intrinsic::log2:
case Intrinsic::log10:
case Intrinsic::exp:
case Intrinsic::exp2:
case Intrinsic::pow:
case Intrinsic::sin:
case Intrinsic::cos:
return true;
case Intrinsic::copysign:
if (CI->getArgOperand(0)->getType()->getScalarType()->
isPPC_FP128Ty())
return true;
else
continue; // ISD::FCOPYSIGN is never a library call.
case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
case Intrinsic::rint: Opcode = ISD::FRINT; break;
case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
case Intrinsic::round: Opcode = ISD::FROUND; break;
case Intrinsic::minnum: Opcode = ISD::FMINNUM; break;
case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break;
case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break;
case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break;
}
}
// PowerPC does not use [US]DIVREM or other library calls for
// operations on regular types which are not otherwise library calls
// (i.e. soft float or atomics). If adapting for targets that do,
// additional care is required here.
LibFunc Func;
if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
LibInfo->getLibFunc(F->getName(), Func) &&
LibInfo->hasOptimizedCodeGen(Func)) {
// Non-read-only functions are never treated as intrinsics.
if (!CI->onlyReadsMemory())
return true;
// Conversion happens only for FP calls.
if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
return true;
switch (Func) {
default: return true;
case LibFunc_copysign:
case LibFunc_copysignf:
continue; // ISD::FCOPYSIGN is never a library call.
case LibFunc_copysignl:
return true;
case LibFunc_fabs:
case LibFunc_fabsf:
case LibFunc_fabsl:
continue; // ISD::FABS is never a library call.
case LibFunc_sqrt:
case LibFunc_sqrtf:
case LibFunc_sqrtl:
Opcode = ISD::FSQRT; break;
case LibFunc_floor:
case LibFunc_floorf:
case LibFunc_floorl:
Opcode = ISD::FFLOOR; break;
case LibFunc_nearbyint:
case LibFunc_nearbyintf:
case LibFunc_nearbyintl:
Opcode = ISD::FNEARBYINT; break;
case LibFunc_ceil:
case LibFunc_ceilf:
case LibFunc_ceill:
Opcode = ISD::FCEIL; break;
case LibFunc_rint:
case LibFunc_rintf:
case LibFunc_rintl:
Opcode = ISD::FRINT; break;
case LibFunc_round:
case LibFunc_roundf:
case LibFunc_roundl:
Opcode = ISD::FROUND; break;
case LibFunc_trunc:
case LibFunc_truncf:
case LibFunc_truncl:
Opcode = ISD::FTRUNC; break;
case LibFunc_fmin:
case LibFunc_fminf:
case LibFunc_fminl:
Opcode = ISD::FMINNUM; break;
case LibFunc_fmax:
case LibFunc_fmaxf:
case LibFunc_fmaxl:
Opcode = ISD::FMAXNUM; break;
}
}
// The call is harmless only if the mapped operation is legal or custom
// for the argument type (or, for vectors, for the scalar element type).
if (Opcode) {
EVT EVTy =
TLI->getValueType(DL, CI->getArgOperand(0)->getType(), true);
if (EVTy == MVT::Other)
return true;
if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
continue;
else if (EVTy.isVector() &&
TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
continue;
return true;
}
}
// Any call that was not proven harmless above counts as using CTR.
return true;
} else if (isa<BinaryOperator>(J) &&
J->getType()->getScalarType()->isPPC_FP128Ty()) {
// Most operations on ppc_f128 values become calls.
return true;
} else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
// Int<->FP conversions involving ppc_fp128 or an integer wider than the
// native width are treated as CTR users.
CastInst *CI = cast<CastInst>(J);
if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
isLargeIntegerTy(!TM.isPPC64(), CI->getSrcTy()->getScalarType()) ||
isLargeIntegerTy(!TM.isPPC64(), CI->getDestTy()->getScalarType()))
return true;
} else if (isLargeIntegerTy(!TM.isPPC64(),
J->getType()->getScalarType()) &&
(J->getOpcode() == Instruction::UDiv ||
J->getOpcode() == Instruction::SDiv ||
J->getOpcode() == Instruction::URem ||
J->getOpcode() == Instruction::SRem)) {
return true;
} else if (!TM.isPPC64() &&
isLargeIntegerTy(false, J->getType()->getScalarType()) &&
(J->getOpcode() == Instruction::Shl ||
J->getOpcode() == Instruction::AShr ||
J->getOpcode() == Instruction::LShr)) {
// Only on PPC32, for 128-bit integers (specifically not 64-bit
// integers), these might be runtime calls.
return true;
} else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
// On PowerPC, indirect jumps use the counter register.
return true;
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
// A switch with at least the minimum number of jump-table entries
// (cases plus default) is treated as a CTR user.
if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
return true;
}
// FREM is always a call.
if (J->getOpcode() == Instruction::FRem)
return true;
// With soft-float, every floating-point operation below is treated as a
// CTR user.
if (ST->useSoftFloat()) {
switch(J->getOpcode()) {
case Instruction::FAdd:
case Instruction::FSub:
case Instruction::FMul:
case Instruction::FDiv:
case Instruction::FPTrunc:
case Instruction::FPExt:
case Instruction::FPToUI:
case Instruction::FPToSI:
case Instruction::UIToFP:
case Instruction::SIToFP:
case Instruction::FCmp:
return true;
}
}
// Finally, check every operand for a TLS address that needs a call.
for (Value *Operand : J->operands())
if (memAddrUsesCTR(Operand))
return true;
}
return false;
}
/// Decide whether \p L should be turned into a CTR-based hardware loop.
///
/// The answer is "no" when
///   - the loop has a constant trip count below SmallCTRLoopThreshold and its
///     body is too small to amortise the mtctr latency,
///   - any block of the loop might use the counter register, or
///   - branch weights show that an exit edge is taken more often than the
///     edge that stays in the loop.
/// Otherwise \p HWLoopInfo receives the counter type (i64 on PPC64, i32
/// otherwise) and a per-iteration decrement of 1.
///
/// \returns true if a hardware loop should be generated for \p L.
bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                          AssumptionCache &AC,
                                          TargetLibraryInfo *LibInfo,
                                          TTI::HardwareLoopInfo &HWLoopInfo) {
  const PPCTargetMachine &TM = ST->getTargetMachine();
  TargetSchedModel SchedModel;
  SchedModel.init(ST);

  // Do not convert small short loops to CTR loop.
  const unsigned TripCount = SE.getSmallConstantTripCount(L);
  const bool IsShortLoop = TripCount && TripCount < SmallCTRLoopThreshold;
  if (IsShortLoop) {
    SmallPtrSet<const Value *, 32> Ephemerals;
    CodeMetrics::collectEphemeralValues(L, &AC, Ephemerals);

    CodeMetrics LoopMetrics;
    for (BasicBlock *Block : L->blocks())
      LoopMetrics.analyzeBasicBlock(Block, *this, Ephemerals);

    // 6 is an approximate latency for the mtctr instruction.
    if (LoopMetrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
      return false;
  }

  // We don't want to spill/restore the counter register, and so we don't
  // want to use the counter register if the loop contains calls.
  for (BasicBlock *Block : L->blocks()) {
    if (mightUseCTR(Block, LibInfo))
      return false;
  }

  // If there is an exit edge known to be frequently taken,
  // we should not transform this loop.
  SmallVector<BasicBlock *, 4> Exiting;
  L->getExitingBlocks(Exiting);
  for (BasicBlock *ExitingBB : Exiting) {
    Instruction *Term = ExitingBB->getTerminator();
    if (!Term)
      continue;

    BranchInst *Branch = dyn_cast<BranchInst>(Term);
    if (!Branch || !Branch->isConditional())
      continue;

    uint64_t TakenWeight = 0, NotTakenWeight = 0;
    if (!Branch->extractProfMetadata(TakenWeight, NotTakenWeight))
      continue;

    // Successor 0 is the "true" edge; work out which edge leaves the loop and
    // reject the loop if that edge is the hotter one.
    const bool TakenLeavesLoop = !L->contains(Branch->getSuccessor(0));
    const uint64_t ExitWeight = TakenLeavesLoop ? TakenWeight : NotTakenWeight;
    const uint64_t StayWeight = TakenLeavesLoop ? NotTakenWeight : TakenWeight;
    if (ExitWeight > StayWeight)
      return false;
  }

  // Describe the counter: register-width integer, decremented by one.
  LLVMContext &Ctx = L->getHeader()->getContext();
  if (TM.isPPC64())
    HWLoopInfo.CountType = Type::getInt64Ty(Ctx);
  else
    HWLoopInfo.CountType = Type::getInt32Ty(Ctx);
  HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
  return true;
}
void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
if (ST->getDarwinDirective() == PPC::DIR_A2) {

View File

@ -33,6 +33,7 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
const PPCSubtarget *getST() const { return ST; }
const PPCTargetLowering *getTLI() const { return TLI; }
bool mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo);
public:
explicit PPCTTIImpl(const PPCTargetMachine *TM, const Function &F)
@ -52,6 +53,10 @@ public:
unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
AssumptionCache &AC,
TargetLibraryInfo *LibInfo,
TTI::HardwareLoopInfo &HWLoopInfo);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);

View File

@ -263,7 +263,7 @@ for.body.116.preheader: ; preds = %for.cond.112.prehea
%8 = sub i64 0, %int_part_ptr.02534
%scevgep5 = getelementptr i8, i8* %call109, i64 %8
%scevgep56 = ptrtoint i8* %scevgep5 to i64
call void @llvm.ppc.mtctr.i64(i64 %scevgep56)
call void @llvm.set.loop.iterations.i64(i64 %scevgep56)
br label %for.body.116
for.cond.cleanup: ; preds = %if.end.138, %if.end.105
@ -298,8 +298,9 @@ for.body.116: ; preds = %for.body.116, %for.
%conv134 = trunc i32 %add133 to i8
%scevgep = getelementptr i8, i8* inttoptr (i64 -1 to i8*), i64 %call109.pn2
store i8 %conv134, i8* %scevgep, align 1, !tbaa !10
%12 = call i1 @llvm.ppc.is.decremented.ctr.nonzero()
br i1 %12, label %for.body.116, label %for.cond.cleanup.115
%12 = call i64 @llvm.loop.dec(i64 %scevgep56, i64 1)
%dec.cmp = icmp ne i64 %12, 0
br i1 %dec.cmp, label %for.body.116, label %for.cond.cleanup.115
if.then.136: ; preds = %for.cond.cleanup.115
%incdec.ptr137 = getelementptr inbounds i8, i8* %int_part_ptr.0253, i64 -1
@ -323,10 +324,10 @@ cleanup.148: ; preds = %for.cond.cleanup, %
declare i8* @memcpy(i8*, i8* nocapture readonly, i64) #1
; Function Attrs: nounwind
declare void @llvm.ppc.mtctr.i64(i64) #0
declare void @llvm.set.loop.iterations.i64(i64) #0
; Function Attrs: nounwind
declare i1 @llvm.ppc.is.decremented.ctr.nonzero() #0
declare i64 @llvm.loop.dec(i64, i64) #0
attributes #0 = { nounwind }
attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }

View File

@ -1,15 +1,3 @@
; Test pass name: ppc-ctr-loops.
; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-ctr-loops -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-CTR-LOOPS
; STOP-BEFORE-CTR-LOOPS-NOT: -ppc-ctr-loops
; STOP-BEFORE-CTR-LOOPS-NOT: "ppc-ctr-loops" pass is not registered.
; STOP-BEFORE-CTR-LOOPS-NOT: PowerPC CTR Loops
; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-after=ppc-ctr-loops -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-AFTER-CTR-LOOPS
; STOP-AFTER-CTR-LOOPS: -ppc-ctr-loops
; STOP-AFTER-CTR-LOOPS-NOT: "ppc-ctr-loops" pass is not registered.
; STOP-AFTER-CTR-LOOPS: PowerPC CTR Loops
; Test pass name: ppc-loop-preinc-prep.
; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-loop-preinc-prep -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-LOOP-PREINC-PREP
; STOP-BEFORE-LOOP-PREINC-PREP-NOT: -ppc-loop-preinc-prep

View File

@ -0,0 +1,144 @@
; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEC
; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-REGDEC
; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-nested-hardware-loop=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEC --check-prefix=CHECK-NESTED
; CHECK-LABEL: while_lt
; Up-counting loop: entered only when %i is unsigned-less-than %N, stores the
; induction variable to A[i], increments by 1 and exits once %inc equals %N.
; The checks inside the function expect the count "sub i32 %N, %i" to be
; computed in the preheader and passed to llvm.set.loop.iterations.
define void @while_lt(i32 %i, i32 %N, i32* nocapture %A) {
entry:
%cmp4 = icmp ult i32 %i, %N
br i1 %cmp4, label %while.body, label %while.end
; CHECK: while.body.preheader:
; CHECK: [[COUNT:%[^ ]+]] = sub i32 %N, %i
; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
; CHECK: br label %while.body
; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-REGDEC: br i1 [[CMP]], label %while.body, label %while.end
; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
; CHECK-DEC: br i1 [[LOOP_DEC]], label %while.body, label %while.end
while.body:
%i.addr.05 = phi i32 [ %inc, %while.body ], [ %i, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
store i32 %i.addr.05, i32* %arrayidx, align 4
%inc = add nuw i32 %i.addr.05, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %while.end, label %while.body
while.end:
ret void
}
; CHECK-LABEL: while_gt
; CHECK: while.body.preheader:
; CHECK: [[COUNT:%[^ ]+]] = sub i32 %i, %N
; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
; CHECK: br label %while.body
; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-REGDEC: br i1 [[CMP]], label %while.body, label %while.end
; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
; CHECK-DEC: br i1 [[LOOP_DEC]], label %while.body, label %while.end
; Down-counting loop: entered only when %i is signed-greater-than %N, stores
; the induction variable to A[i], decrements by 1 and keeps iterating while
; the decremented value is still signed-greater-than %N.
define void @while_gt(i32 %i, i32 %N, i32* nocapture %A) {
entry:
%cmp4 = icmp sgt i32 %i, %N
br i1 %cmp4, label %while.body, label %while.end
while.body:
%i.addr.05 = phi i32 [ %dec, %while.body ], [ %i, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
store i32 %i.addr.05, i32* %arrayidx, align 4
%dec = add nsw i32 %i.addr.05, -1
%cmp = icmp sgt i32 %dec, %N
br i1 %cmp, label %while.body, label %while.end
while.end:
ret void
}
; CHECK-LABEL: while_gte
; CHECK: while.body.preheader:
; CHECK: [[ADD:%[^ ]+]] = add i32 %i, 1
; CHECK: [[SEL:%[^ ]+]] = icmp slt i32 %N, %i
; CHECK: [[MIN:%[^ ]+]] = select i1 [[SEL]], i32 %N, i32 %i
; CHECK: [[COUNT:%[^ ]+]] = sub i32 [[ADD]], [[MIN]]
; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
; CHECK: br label %while.body
; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-REGDEC: br i1 [[CMP]], label %while.body, label %while.end
; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
; CHECK-DEC: br i1 [[LOOP_DEC]], label %while.body, label %while.end
; Down-counting loop with an inclusive bound: skipped when %i is
; signed-less-than %N. Unlike @while_gt, the back-edge condition compares the
; value before the decrement (%i.addr.05) against %N.
define void @while_gte(i32 %i, i32 %N, i32* nocapture %A) {
entry:
%cmp4 = icmp slt i32 %i, %N
br i1 %cmp4, label %while.end, label %while.body
while.body:
%i.addr.05 = phi i32 [ %dec, %while.body ], [ %i, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
store i32 %i.addr.05, i32* %arrayidx, align 4
%dec = add nsw i32 %i.addr.05, -1
%cmp = icmp sgt i32 %i.addr.05, %N
br i1 %cmp, label %while.body, label %while.end
while.end:
ret void
}
; CHECK-LABEL: nested
; CHECK-NESTED: call void @llvm.set.loop.iterations.i32(i32 %N)
; CHECK-NESTED: br label %while.cond1.preheader.us
; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
; CHECK: br label %while.body3.us
; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ %N, %while.cond1.preheader.us ], [ [[LOOP_DEC:%[^ ]+]], %while.body3.us ]
; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
; CHECK-REGDEC: br i1 [[CMP]], label %while.body3.us, label %while.cond1.while.end_crit_edge.us
; CHECK-NESTED: [[LOOP_DEC1:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
; CHECK-NESTED: br i1 [[LOOP_DEC1]], label %while.cond1.preheader.us, label %while.end7
; Two-level loop nest, skipped entirely when %N is zero. The outer loop
; (while.cond1.preheader.us) counts %i from 0 until %inc6.us equals %N; the
; inner loop (while.body3.us) counts %j from 0 until %inc.us equals %N and
; stores i*N + j to A[i*N + j].
define void @nested(i32* nocapture %A, i32 %N) {
entry:
%cmp20 = icmp eq i32 %N, 0
br i1 %cmp20, label %while.end7, label %while.cond1.preheader.us
while.cond1.preheader.us:
%i.021.us = phi i32 [ %inc6.us, %while.cond1.while.end_crit_edge.us ], [ 0, %entry ]
%mul.us = mul i32 %i.021.us, %N
br label %while.body3.us
while.body3.us:
%j.019.us = phi i32 [ 0, %while.cond1.preheader.us ], [ %inc.us, %while.body3.us ]
%add.us = add i32 %j.019.us, %mul.us
%arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us
store i32 %add.us, i32* %arrayidx.us, align 4
%inc.us = add nuw i32 %j.019.us, 1
%exitcond = icmp eq i32 %inc.us, %N
br i1 %exitcond, label %while.cond1.while.end_crit_edge.us, label %while.body3.us
while.cond1.while.end_crit_edge.us:
%inc6.us = add nuw i32 %i.021.us, 1
%exitcond23 = icmp eq i32 %inc6.us, %N
br i1 %exitcond23, label %while.end7, label %while.cond1.preheader.us
while.end7:
ret void
}

View File

@ -0,0 +1,47 @@
; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -S %s -o - | FileCheck %s
; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -S %s -o - | FileCheck %s
; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-nested-hardware-loop=true -S %s -o - | FileCheck %s
; CHECK-LABEL: float_counter
; CHECK-NOT: set.loop.iterations
; CHECK-NOT: loop.decrement
; Loop whose exit test is a floating-point compare: the integer counter is
; converted with uitofp and compared (fcmp olt) against the float bound %N.
; The negative checks before this function expect no hardware-loop intrinsics
; to be emitted for it.
define void @float_counter(i32* nocapture %A, float %N) {
entry:
%cmp6 = fcmp ogt float %N, 0.000000e+00
br i1 %cmp6, label %while.body, label %while.end
while.body:
%i.07 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
%arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.07
store i32 %i.07, i32* %arrayidx, align 4
%inc = add i32 %i.07, 1
%conv = uitofp i32 %inc to float
%cmp = fcmp olt float %conv, %N
br i1 %cmp, label %while.body, label %while.end
while.end:
ret void
}
; CHECK-LABEL: variant_counter
; CHECK-NOT: set.loop.iterations
; CHECK-NOT: loop.decrement
; Loop whose bound changes every iteration: the limit is reloaded from B[inc]
; inside the loop body before the unsigned compare against %inc.
; The negative checks before this function expect no hardware-loop intrinsics
; to be emitted for it.
define void @variant_counter(i32* nocapture %A, i32* nocapture readonly %B) {
entry:
%0 = load i32, i32* %B, align 4
%cmp7 = icmp eq i32 %0, 0
br i1 %cmp7, label %while.end, label %while.body
while.body:
%i.08 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
%arrayidx1 = getelementptr inbounds i32, i32* %A, i32 %i.08
store i32 %i.08, i32* %arrayidx1, align 4
%inc = add nuw i32 %i.08, 1
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %inc
%1 = load i32, i32* %arrayidx, align 4
%cmp = icmp ult i32 %inc, %1
br i1 %cmp, label %while.body, label %while.end
while.end:
ret void
}

View File

@ -308,6 +308,7 @@ int main(int argc, char **argv) {
initializeVectorization(*Registry);
initializeScalarizeMaskedMemIntrinPass(*Registry);
initializeExpandReductionsPass(*Registry);
initializeHardwareLoopsPass(*Registry);
// Initialize debugging passes.
initializeScavengerTestPass(*Registry);

View File

@ -528,6 +528,7 @@ int main(int argc, char **argv) {
initializeExpandReductionsPass(Registry);
initializeWasmEHPreparePass(Registry);
initializeWriteBitcodePassPass(Registry);
initializeHardwareLoopsPass(Registry);
#ifdef LINK_POLLY_INTO_TOOLS
polly::initializePollyPasses(Registry);