From a7c032d8b541b066cad7820a71740c67995d666b Mon Sep 17 00:00:00 2001 From: Shuxin Yang Date: Thu, 29 Nov 2012 19:38:54 +0000 Subject: [PATCH] rdar://12100355 (part 1) This revision attempts to recognize following population-count pattern: while(a) { c++; ... ; a &= a - 1; ... }, where and could be used multiple times in the loop body. TODO: On X8664 and ARM, __buildin_ctpop() are not expanded to a efficent instruction sequence, which need to be improved in the following commits. Reviewed by Nadav, really appreciate! llvm-svn: 168931 --- include/llvm/Target/TargetTransformImpl.h | 2 +- include/llvm/TargetTransformInfo.h | 17 + lib/Target/X86/X86ISelLowering.cpp | 11 + lib/Target/X86/X86ISelLowering.h | 8 + lib/Target/X86/X86TargetMachine.h | 2 +- lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 570 +++++++++++++++++-- test/Transforms/LoopIdiom/popcnt.ll | 76 +++ 7 files changed, 646 insertions(+), 40 deletions(-) create mode 100644 test/Transforms/LoopIdiom/popcnt.ll diff --git a/include/llvm/Target/TargetTransformImpl.h b/include/llvm/Target/TargetTransformImpl.h index 7ea2396076d..ad38eaf95f0 100644 --- a/include/llvm/Target/TargetTransformImpl.h +++ b/include/llvm/Target/TargetTransformImpl.h @@ -26,7 +26,7 @@ class TargetLowering; /// ScalarTargetTransformInfo interface. Different targets can implement /// this interface differently. class ScalarTargetTransformImpl : public ScalarTargetTransformInfo { -private: +protected: const TargetLowering *TLI; public: diff --git a/include/llvm/TargetTransformInfo.h b/include/llvm/TargetTransformInfo.h index 94db4904433..dbe35ec4c36 100644 --- a/include/llvm/TargetTransformInfo.h +++ b/include/llvm/TargetTransformInfo.h @@ -75,6 +75,18 @@ public: /// LSR, and LowerInvoke use this interface. class ScalarTargetTransformInfo { public: + /// PopcntHwSupport - Hardware support for population count. Compared to the + /// SW implementation, HW support is supposed to significantly boost the + /// performance when the population is dense, and it may or not may degrade + /// performance if the population is sparse. A HW support is considered as + /// "Fast" if it can outperform, or is on a par with, SW implementaion when + /// the population is sparse; otherwise, it is considered as "Slow". + enum PopcntHwSupport { + None, + Fast, + Slow + }; + virtual ~ScalarTargetTransformInfo() {} /// isLegalAddImmediate - Return true if the specified immediate is legal @@ -122,6 +134,11 @@ public: virtual bool shouldBuildLookupTables() const { return true; } + + /// getPopcntHwSupport - Return hardware support for population count. + virtual PopcntHwSupport getPopcntHwSupport(unsigned IntTyWidthInBit) const { + return None; + } }; /// VectorTargetTransformInfo - This interface is used by the vectorizers diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index d1552cd512e..d8e72632b0c 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -17670,6 +17670,17 @@ FindInConvertTable(const X86TypeConversionCostTblEntry *Tbl, unsigned len, return -1; } +ScalarTargetTransformInfo::PopcntHwSupport +X86ScalarTargetTransformImpl::getPopcntHwSupport(unsigned TyWidth) const { + assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2"); + const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget(); + + // TODO: Currently the __builtin_popcount() implementation using SSE3 + // instructions is inefficient. Once the problem is fixed, we should + // call ST.hasSSE3() instead of ST.hasSSE4(). + return ST.hasSSE41() ? Fast : None; +} + unsigned X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 465c6036ada..06733680a8c 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -933,6 +933,14 @@ namespace llvm { const TargetLibraryInfo *libInfo); } + class X86ScalarTargetTransformImpl : public ScalarTargetTransformImpl { + public: + explicit X86ScalarTargetTransformImpl(const TargetLowering *TL) : + ScalarTargetTransformImpl(TL) {}; + + virtual PopcntHwSupport getPopcntHwSupport(unsigned TyWidth) const; + }; + class X86VectorTargetTransformInfo : public VectorTargetTransformImpl { public: explicit X86VectorTargetTransformInfo(const TargetLowering *TL) : diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index 12311a1abfb..3f60c95092b 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -118,7 +118,7 @@ class X86_64TargetMachine : public X86TargetMachine { X86SelectionDAGInfo TSInfo; X86TargetLowering TLInfo; X86JITInfo JITInfo; - ScalarTargetTransformImpl STTI; + X86ScalarTargetTransformImpl STTI; X86VectorTargetTransformInfo VTTI; public: X86_64TargetMachine(const Target &T, StringRef TT, diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index a44e798f121..abcd636d0ac 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -56,6 +56,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/DataLayout.h" #include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/TargetTransformInfo.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -63,16 +64,83 @@ STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); namespace { + + class LoopIdiomRecognize; + + /// This class defines some utility functions for loop idiom recognization. + class LIRUtil { + public: + /// Return true iff the block contains nothing but an uncondition branch + /// (aka goto instruction). + static bool isAlmostEmpty(BasicBlock *); + + static BranchInst *getBranch(BasicBlock *BB) { + return dyn_cast(BB->getTerminator()); + } + + /// Return the condition of the branch terminating the given basic block. + static Value *getBrCondtion(BasicBlock *); + + /// Derive the precondition block (i.e the block that guards the loop + /// preheader) from the given preheader. + static BasicBlock *getPrecondBb(BasicBlock *PreHead); + }; + + /// This class is to recoginize idioms of population-count conducted in + /// a noncountable loop. Currently it only recognizes this pattern: + /// \code + /// while(x) {cnt++; ...; x &= x - 1; ...} + /// \endcode + class NclPopcountRecognize { + LoopIdiomRecognize &LIR; + Loop *CurLoop; + BasicBlock *PreCondBB; + + typedef IRBuilder<> IRBuilderTy; + + public: + explicit NclPopcountRecognize(LoopIdiomRecognize &TheLIR); + bool recognize(); + + private: + /// Take a glimpse of the loop to see if we need to go ahead recoginizing + /// the idiom. + bool preliminaryScreen(); + + /// Check if the given conditional branch is based on the comparison + /// beween a variable and zero, and if the variable is non-zero, the + /// control yeilds to the loop entry. If the branch matches the behavior, + /// the variable involved in the comparion is returned. This function will + /// be called to see if the precondition and postcondition of the loop + /// are in desirable form. + Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const; + + /// Return true iff the idiom is detected in the loop. and 1) \p CntInst + /// is set to the instruction counting the pupulation bit. 2) \p CntPhi + /// is set to the corresponding phi node. 3) \p Var is set to the value + /// whose population bits are being counted. + bool detectIdiom + (Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const; + + /// Insert ctpop intrinsic function and some obviously dead instructions. + void transform (Instruction *CntInst, PHINode *CntPhi, Value *Var); + + /// Create llvm.ctpop.* intrinsic function. + CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL); + }; + class LoopIdiomRecognize : public LoopPass { Loop *CurLoop; const DataLayout *TD; DominatorTree *DT; ScalarEvolution *SE; TargetLibraryInfo *TLI; + const ScalarTargetTransformInfo *STTI; public: static char ID; explicit LoopIdiomRecognize() : LoopPass(ID) { initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); + TD = 0; DT = 0; SE = 0; TLI = 0; STTI = 0; } bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -110,6 +178,36 @@ namespace { AU.addRequired(); AU.addRequired(); } + + const DataLayout *getDataLayout() { + return TD ? TD : TD=getAnalysisIfAvailable(); + } + + DominatorTree *getDominatorTree() { + return DT ? DT : (DT=&getAnalysis()); + } + + ScalarEvolution *getScalarEvolution() { + return SE ? SE : (SE = &getAnalysis()); + } + + TargetLibraryInfo *getTargetLibraryInfo() { + return TLI ? TLI : (TLI = &getAnalysis()); + } + + const ScalarTargetTransformInfo *getScalarTargetTransformInfo() { + if (!STTI) { + TargetTransformInfo *TTI = getAnalysisIfAvailable(); + if (TTI) STTI = TTI->getScalarTargetTransformInfo(); + } + return STTI; + } + + Loop *getLoop() const { return CurLoop; } + + private: + bool runOnNoncountableLoop(); + bool runOnCountableLoop(); }; } @@ -172,6 +270,437 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE, deleteDeadInstruction(I, SE, TLI); } +//===----------------------------------------------------------------------===// +// +// Implementation of LIRUtil +// +//===----------------------------------------------------------------------===// + +// This fucntion will return true iff the given block contains nothing but goto. +// A typical usage of this function is to check if the preheader fucntion is +// "almost" empty such that generated intrinsic function can be moved across +// preheader and to be placed at the end of the preconditiona block without +// concerning of breaking data dependence. +bool LIRUtil::isAlmostEmpty(BasicBlock *BB) { + if (BranchInst *Br = getBranch(BB)) { + return Br->isUnconditional() && BB->size() == 1; + } + return false; +} + +Value *LIRUtil::getBrCondtion(BasicBlock *BB) { + BranchInst *Br = getBranch(BB); + return Br ? Br->getCondition() : 0; +} + +BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) { + if (BasicBlock *BB = PreHead->getSinglePredecessor()) { + BranchInst *Br = getBranch(BB); + return Br && Br->isConditional() ? BB : 0; + } + return 0; +} + +//===----------------------------------------------------------------------===// +// +// Implementation of NclPopcountRecognize +// +//===----------------------------------------------------------------------===// + +NclPopcountRecognize::NclPopcountRecognize(LoopIdiomRecognize &TheLIR): + LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(0) { +} + +bool NclPopcountRecognize::preliminaryScreen() { + const ScalarTargetTransformInfo *STTI = LIR.getScalarTargetTransformInfo(); + if (STTI->getPopcntHwSupport(32) != ScalarTargetTransformInfo::Fast) + return false; + + // Counting population are usually conducted by few arithmetic instrutions. + // Such instructions can be easilly "absorbed" by vacant slots in a + // non-compact loop. Therefore, recognizing popcount idiom only makes sense + // in a compact loop. + + // Give up if the loop has multiple blocks or multiple backedges. + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) + return false; + + BasicBlock *LoopBody = *(CurLoop->block_begin()); + if (LoopBody->size() >= 20) { + // The loop is too big, bail out. + return false; + } + + // It should have a preheader containing nothing but a goto instruction. + BasicBlock *PreHead = CurLoop->getLoopPreheader(); + if (!PreHead || !LIRUtil::isAlmostEmpty(PreHead)) + return false; + + // It should have a precondition block where the generated popcount instrinsic + // function will be inserted. + PreCondBB = LIRUtil::getPrecondBb(PreHead); + if (!PreCondBB) + return false; + + return true; +} + +Value *NclPopcountRecognize::matchCondition (BranchInst *Br, + BasicBlock *LoopEntry) const { + if (!Br || !Br->isConditional()) + return 0; + + ICmpInst *Cond = dyn_cast(Br->getCondition()); + if (!Cond) + return 0; + + ConstantInt *CmpZero = dyn_cast(Cond->getOperand(1)); + if (!CmpZero || !CmpZero->isZero()) + return 0; + + ICmpInst::Predicate Pred = Cond->getPredicate(); + if ((Pred == ICmpInst::ICMP_NE && Br->getSuccessor(0) == LoopEntry) || + (Pred == ICmpInst::ICMP_EQ && Br->getSuccessor(1) == LoopEntry)) + return Cond->getOperand(0); + + return 0; +} + +bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst, + PHINode *&CntPhi, + Value *&Var) const { + // Following code tries to detect this idiom: + // + // if (x0 != 0) + // goto loop-exit // the precondition of the loop + // cnt0 = init-val; + // do { + // x1 = phi (x0, x2); + // cnt1 = phi(cnt0, cnt2); + // + // cnt2 = cnt1 + 1; + // ... + // x2 = x1 & (x1 - 1); + // ... + // } while(x != 0); + // + // loop-exit: + // + + // step 1: Check to see if the look-back branch match this pattern: + // "if (a!=0) goto loop-entry". + BasicBlock *LoopEntry; + Instruction *DefX2, *CountInst; + Value *VarX1, *VarX0; + PHINode *PhiX, *CountPhi; + + DefX2 = CountInst = 0; + VarX1 = VarX0 = 0; + PhiX = CountPhi = 0; + LoopEntry = *(CurLoop->block_begin()); + + // step 1: Check if the loop-back branch is in desirable form. + { + if (Value *T = matchCondition (LIRUtil::getBranch(LoopEntry), LoopEntry)) + DefX2 = dyn_cast(T); + else + return false; + } + + // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)" + { + if (DefX2->getOpcode() != Instruction::And) + return false; + + BinaryOperator *SubOneOp; + + if ((SubOneOp = dyn_cast(DefX2->getOperand(0)))) + VarX1 = DefX2->getOperand(1); + else { + VarX1 = DefX2->getOperand(0); + SubOneOp = dyn_cast(DefX2->getOperand(1)); + } + if (!SubOneOp) + return false; + + Instruction *SubInst = cast(SubOneOp); + ConstantInt *Dec = dyn_cast(SubInst->getOperand(1)); + if (!Dec || + !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) || + (SubInst->getOpcode() == Instruction::Add && Dec->isAllOnesValue()))) { + return false; + } + } + + // step 3: Check the recurrence of variable X + { + PhiX = dyn_cast(VarX1); + if (!PhiX || + (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) { + return false; + } + } + + // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1 + { + CountInst = NULL; + for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI(), + IterE = LoopEntry->end(); Iter != IterE; Iter++) { + Instruction *Inst = Iter; + if (Inst->getOpcode() != Instruction::Add) + continue; + + ConstantInt *Inc = dyn_cast(Inst->getOperand(1)); + if (!Inc || !Inc->isOne()) + continue; + + PHINode *Phi = dyn_cast(Inst->getOperand(0)); + if (!Phi && Phi->getParent() != LoopEntry) + continue; + + // Check if the result of the instruction is live of the loop. + bool LiveOutLoop = false; + for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end(); + I != E; I++) { + if ((cast(*I))->getParent() != LoopEntry) { + LiveOutLoop = true; break; + } + } + + if (LiveOutLoop) { + CountInst = Inst; + CountPhi = Phi; + break; + } + } + + if (!CountInst) + return false; + } + + // step 5: check if the precondition is in this form: + // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;" + { + BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB); + Value *T = matchCondition (PreCondBr, CurLoop->getLoopPreheader()); + if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1)) + return false; + + CntInst = CountInst; + CntPhi = CountPhi; + Var = T; + } + + return true; +} + +void NclPopcountRecognize::transform(Instruction *CntInst, + PHINode *CntPhi, Value *Var) { + + ScalarEvolution *SE = LIR.getScalarEvolution(); + TargetLibraryInfo *TLI = LIR.getTargetLibraryInfo(); + BasicBlock *PreHead = CurLoop->getLoopPreheader(); + BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB); + const DebugLoc DL = CntInst->getDebugLoc(); + + // Assuming before transformation, the loop is following: + // if (x) // the precondition + // do { cnt++; x &= x - 1; } while(x); + + // Step 1: Insert the ctpop instruction at the end of the precondition block + IRBuilderTy Builder(PreCondBr); + Value *PopCnt, *PopCntZext, *NewCount; + { + PopCnt = createPopcntIntrinsic(Builder, Var, DL); + NewCount = PopCntZext = + Builder.CreateZExtOrTrunc(PopCnt, cast(CntPhi->getType())); + + if (NewCount != PopCnt) + (cast(NewCount))->setDebugLoc(DL); + + // If the popoulation counter's initial value is not zero, insert Add Inst. + Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead); + ConstantInt *InitConst = dyn_cast(CntInitVal); + if (!InitConst || !InitConst->isZero()) { + NewCount = Builder.CreateAdd(PopCnt, InitConst); + (cast(NewCount))->setDebugLoc(DL); + } + } + + // Step 2: Replace the precondition from "if(x == 0) goto loop-exit" to + // "if(NewCount == 0) loop-exit". Withtout this change, the intrinsic + // function would be partial dead code, and downstream passes will drag + // it back from the precondition block to the preheader. + { + ICmpInst *PreCond = cast(PreCondBr->getCondition()); + + Value *Opnd0 = PopCntZext; + Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0); + if (PreCond->getOperand(0) != Var) + std::swap(Opnd0, Opnd1); + + ICmpInst *NewPreCond = + cast(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1)); + PreCond->replaceAllUsesWith(NewPreCond); + + deleteDeadInstruction(PreCond, *SE, TLI); + } + + // Step 3: Note that the population count is exactly the trip count of the + // loop in question, which enble us to to convert the loop from noncountable + // loop into a countable one. The benefit is twofold: + // + // - If the loop only counts population, the entire loop become dead after + // the transformation. It is lots easier to prove a countable loop dead + // than to prove a noncountable one. (In some C dialects, a infite loop + // isn't dead even if it computes nothing useful. In general, DCE needs + // to prove a noncountable loop finite before safely delete it.) + // + // - If the loop also performs something else, it remains alive. + // Since it is transformed to countable form, it can be aggressively + // optimized by some optimizations which are in general not applicable + // to a noncountable loop. + // + // After this step, this loop (conceptually) would look like following: + // newcnt = __builtin_ctpop(x); + // t = newcnt; + // if (x) + // do { cnt++; x &= x-1; t--) } while (t > 0); + BasicBlock *Body = *(CurLoop->block_begin()); + { + BranchInst *LbBr = LIRUtil::getBranch(Body); + ICmpInst *LbCond = cast(LbBr->getCondition()); + Type *Ty = NewCount->getType(); + + PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", Body->begin()); + + Builder.SetInsertPoint(LbCond); + Value *Opnd1 = cast(TcPhi); + Value *Opnd2 = cast(ConstantInt::get(Ty, 1)); + Instruction *TcDec = + cast(Builder.CreateSub(Opnd1, Opnd2, "tcdec", false, true)); + + TcPhi->addIncoming(NewCount, PreHead); + TcPhi->addIncoming(TcDec, Body); + + CmpInst::Predicate Pred = (LbBr->getSuccessor(0) == Body) ? + CmpInst::ICMP_UGT : CmpInst::ICMP_SLE; + LbCond->setPredicate(Pred); + LbCond->setOperand(0, TcDec); + LbCond->setOperand(1, cast(ConstantInt::get(Ty, 0))); + } + + // Step 4: All the references to the original population counter outside + // the loop are replaced with the NewCount -- the value returned from + // __builtin_ctpop(). + { + SmallVector CntUses; + for (Value::use_iterator I = CntInst->use_begin(), E = CntInst->use_end(); + I != E; I++) { + if (cast(*I)->getParent() != Body) + CntUses.push_back(*I); + } + for (unsigned Idx = 0; Idx < CntUses.size(); Idx++) { + (cast(CntUses[Idx]))->replaceUsesOfWith(CntInst, NewCount); + } + } + + // step 5: Forget the "non-computable" trip-count SCEV associated with the + // loop. The loop would otherwise not be deleted even if it becomes empty. + SE->forgetLoop(CurLoop); +} + +CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder, + Value *Val, DebugLoc DL) { + Value *Ops[] = { Val }; + Type *Tys[] = { Val->getType() }; + + Module *M = (*(CurLoop->block_begin()))->getParent()->getParent(); + Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); + CallInst *CI = IRBuilder.CreateCall(Func, Ops); + CI->setDebugLoc(DL); + + return CI; +} + +/// recognize - detect population count idiom in a non-countable loop. If +/// detected, transform the relevant code to popcount intrinsic function +/// call, and return true; otherwise, return false. +bool NclPopcountRecognize::recognize() { + + if (!LIR.getScalarTargetTransformInfo()) + return false; + + LIR.getScalarEvolution(); + + if (!preliminaryScreen()) + return false; + + Instruction *CntInst; + PHINode *CntPhi; + Value *Val; + if (!detectIdiom(CntInst, CntPhi, Val)) + return false; + + transform(CntInst, CntPhi, Val); + return true; +} + +//===----------------------------------------------------------------------===// +// +// Implementation of LoopIdiomRecognize +// +//===----------------------------------------------------------------------===// + +bool LoopIdiomRecognize::runOnCountableLoop() { + const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop); + if (isa(BECount)) return false; + + // If this loop executes exactly one time, then it should be peeled, not + // optimized by this pass. + if (const SCEVConstant *BECst = dyn_cast(BECount)) + if (BECst->getValue()->getValue() == 0) + return false; + + // We require target data for now. + if (!getDataLayout()) + return false; + + getDominatorTree(); + + LoopInfo &LI = getAnalysis(); + TLI = &getAnalysis(); + + getTargetLibraryInfo(); + + SmallVector ExitBlocks; + CurLoop->getUniqueExitBlocks(ExitBlocks); + + DEBUG(dbgs() << "loop-idiom Scanning: F[" + << CurLoop->getHeader()->getParent()->getName() + << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); + + bool MadeChange = false; + // Scan all the blocks in the loop that are not in subloops. + for (Loop::block_iterator BI = CurLoop->block_begin(), + E = CurLoop->block_end(); BI != E; ++BI) { + // Ignore blocks in subloops. + if (LI.getLoopFor(*BI) != CurLoop) + continue; + + MadeChange |= runOnLoopBlock(*BI, BECount, ExitBlocks); + } + return MadeChange; +} + +bool LoopIdiomRecognize::runOnNoncountableLoop() { + NclPopcountRecognize Popcount(*this); + if (Popcount.recognize()) + return true; + + return false; +} + bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { CurLoop = L; @@ -185,45 +714,10 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { if (Name == "memset" || Name == "memcpy") return false; - // The trip count of the loop must be analyzable. SE = &getAnalysis(); - if (!SE->hasLoopInvariantBackedgeTakenCount(L)) - return false; - const SCEV *BECount = SE->getBackedgeTakenCount(L); - if (isa(BECount)) return false; - - // If this loop executes exactly one time, then it should be peeled, not - // optimized by this pass. - if (const SCEVConstant *BECst = dyn_cast(BECount)) - if (BECst->getValue()->getValue() == 0) - return false; - - // We require target data for now. - TD = getAnalysisIfAvailable(); - if (TD == 0) return false; - - DT = &getAnalysis(); - LoopInfo &LI = getAnalysis(); - TLI = &getAnalysis(); - - SmallVector ExitBlocks; - CurLoop->getUniqueExitBlocks(ExitBlocks); - - DEBUG(dbgs() << "loop-idiom Scanning: F[" - << L->getHeader()->getParent()->getName() - << "] Loop %" << L->getHeader()->getName() << "\n"); - - bool MadeChange = false; - // Scan all the blocks in the loop that are not in subloops. - for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; - ++BI) { - // Ignore blocks in subloops. - if (LI.getLoopFor(*BI) != CurLoop) - continue; - - MadeChange |= runOnLoopBlock(*BI, BECount, ExitBlocks); - } - return MadeChange; + if (SE->hasLoopInvariantBackedgeTakenCount(L)) + return runOnCountableLoop(); + return runOnNoncountableLoop(); } /// runOnLoopBlock - Process the specified block, which lives in a counted loop diff --git a/test/Transforms/LoopIdiom/popcnt.ll b/test/Transforms/LoopIdiom/popcnt.ll new file mode 100644 index 00000000000..039af8024d3 --- /dev/null +++ b/test/Transforms/LoopIdiom/popcnt.ll @@ -0,0 +1,76 @@ +; RUN: opt -loop-idiom < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -S | FileCheck %s + +;To recognize this pattern: +;int popcount(unsigned long long a) { +; int c = 0; +; while (a) { +; c++; +; a &= a - 1; +; } +; return c; +;} +; +; CHECK: entry +; CHECK: llvm.ctpop.i64 +; CHECK: ret +define i32 @popcount(i64 %a) nounwind uwtable readnone ssp { +entry: + %tobool3 = icmp eq i64 %a, 0 + br i1 %tobool3, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %c.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %a.addr.04 = phi i64 [ %and, %while.body ], [ %a, %entry ] + %inc = add nsw i32 %c.05, 1 + %sub = add i64 %a.addr.04, -1 + %and = and i64 %sub, %a.addr.04 + %tobool = icmp eq i64 %and, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ] + ret i32 %c.0.lcssa +} + +; To recognize this pattern: +;int popcount(unsigned long long a, int mydata1, int mydata2) { +; int c = 0; +; while (a) { +; c++; +; a &= a - 1; +; mydata1 *= c; +; mydata2 *= (int)a; +; } +; return c + mydata1 + mydata2; +;} +; CHECK: entry +; CHECK: llvm.ctpop.i64 +; CHECK: ret +define i32 @popcount2(i64 %a, i32 %mydata1, i32 %mydata2) nounwind uwtable readnone ssp { +entry: + %tobool9 = icmp eq i64 %a, 0 + br i1 %tobool9, label %while.end, label %while.body + +while.body: ; preds = %entry, %while.body + %c.013 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %mydata2.addr.012 = phi i32 [ %mul1, %while.body ], [ %mydata2, %entry ] + %mydata1.addr.011 = phi i32 [ %mul, %while.body ], [ %mydata1, %entry ] + %a.addr.010 = phi i64 [ %and, %while.body ], [ %a, %entry ] + %inc = add nsw i32 %c.013, 1 + %sub = add i64 %a.addr.010, -1 + %and = and i64 %sub, %a.addr.010 + %mul = mul nsw i32 %inc, %mydata1.addr.011 + %conv = trunc i64 %and to i32 + %mul1 = mul nsw i32 %conv, %mydata2.addr.012 + %tobool = icmp eq i64 %and, 0 + br i1 %tobool, label %while.end, label %while.body + +while.end: ; preds = %while.body, %entry + %c.0.lcssa = phi i32 [ 0, %entry ], [ %inc, %while.body ] + %mydata2.addr.0.lcssa = phi i32 [ %mydata2, %entry ], [ %mul1, %while.body ] + %mydata1.addr.0.lcssa = phi i32 [ %mydata1, %entry ], [ %mul, %while.body ] + %add = add i32 %mydata2.addr.0.lcssa, %mydata1.addr.0.lcssa + %add2 = add i32 %add, %c.0.lcssa + ret i32 %add2 +} +