//===-- MachineLICM.cpp - Machine Loop Invariant Code Motion Pass ---------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This pass performs loop invariant code motion on machine instructions. We // attempt to remove as much code from the body of a loop as possible. // // This pass is not intended to be a replacement or a complete alternative // for the LLVM-IR-level LICM pass. It is only designed to hoist simple // constructs that are not exposed before lowering and instruction selection. // //===----------------------------------------------------------------------===// #include "llvm/CodeGen/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; #define DEBUG_TYPE "machine-licm" static cl::opt AvoidSpeculation("avoid-speculation", cl::desc("MachineLICM should avoid speculation"), cl::init(true), cl::Hidden); static cl::opt HoistCheapInsts("hoist-cheap-insts", cl::desc("MachineLICM should hoist even cheap instructions"), cl::init(false), cl::Hidden); static cl::opt SinkInstsToAvoidSpills("sink-insts-to-avoid-spills", cl::desc("MachineLICM should sink instructions into " "loops to avoid register spills"), cl::init(false), cl::Hidden); STATISTIC(NumHoisted, "Number of machine instructions hoisted out of loops"); STATISTIC(NumLowRP, "Number of instructions hoisted in low reg pressure situation"); STATISTIC(NumHighLatency, "Number of high latency instructions hoisted"); STATISTIC(NumCSEed, "Number of hoisted machine instructions CSEed"); STATISTIC(NumPostRAHoisted, "Number of machine instructions hoisted out of loops post regalloc"); namespace { class MachineLICM : public MachineFunctionPass { const TargetInstrInfo *TII; const TargetLoweringBase *TLI; const TargetRegisterInfo *TRI; const MachineFrameInfo *MFI; MachineRegisterInfo *MRI; TargetSchedModel SchedModel; bool PreRegAlloc; // Various analyses that we use... AliasAnalysis *AA; // Alias analysis info. MachineLoopInfo *MLI; // Current MachineLoopInfo MachineDominatorTree *DT; // Machine dominator tree for the cur loop // State that is updated as we process loops bool Changed; // True if a loop is changed. bool FirstInLoop; // True if it's the first LICM in the loop. MachineLoop *CurLoop; // The current loop we are working on. MachineBasicBlock *CurPreheader; // The preheader for CurLoop. // Exit blocks for CurLoop. SmallVector ExitBlocks; bool isExitBlock(const MachineBasicBlock *MBB) const { return is_contained(ExitBlocks, MBB); } // Track 'estimated' register pressure. SmallSet RegSeen; SmallVector RegPressure; // Register pressure "limit" per register pressure set. If the pressure // is higher than the limit, then it's considered high. SmallVector RegLimit; // Register pressure on path leading from loop preheader to current BB. SmallVector, 16> BackTrace; // For each opcode, keep a list of potential CSE instructions. DenseMap > CSEMap; enum { SpeculateFalse = 0, SpeculateTrue = 1, SpeculateUnknown = 2 }; // If a MBB does not dominate loop exiting blocks then it may not safe // to hoist loads from this block. // Tri-state: 0 - false, 1 - true, 2 - unknown unsigned SpeculationState; public: static char ID; // Pass identification, replacement for typeid MachineLICM() : MachineFunctionPass(ID), PreRegAlloc(true) { initializeMachineLICMPass(*PassRegistry::getPassRegistry()); } explicit MachineLICM(bool PreRA) : MachineFunctionPass(ID), PreRegAlloc(PreRA) { initializeMachineLICMPass(*PassRegistry::getPassRegistry()); } bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } void releaseMemory() override { RegSeen.clear(); RegPressure.clear(); RegLimit.clear(); BackTrace.clear(); CSEMap.clear(); } private: /// Keep track of information about hoisting candidates. struct CandidateInfo { MachineInstr *MI; unsigned Def; int FI; CandidateInfo(MachineInstr *mi, unsigned def, int fi) : MI(mi), Def(def), FI(fi) {} }; void HoistRegionPostRA(); void HoistPostRA(MachineInstr *MI, unsigned Def); void ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs, BitVector &PhysRegClobbers, SmallSet &StoredFIs, SmallVectorImpl &Candidates); void AddToLiveIns(unsigned Reg); bool IsLICMCandidate(MachineInstr &I); bool IsLoopInvariantInst(MachineInstr &I); bool HasLoopPHIUse(const MachineInstr *MI) const; bool HasHighOperandLatency(MachineInstr &MI, unsigned DefIdx, unsigned Reg) const; bool IsCheapInstruction(MachineInstr &MI) const; bool CanCauseHighRegPressure(const DenseMap &Cost, bool Cheap); void UpdateBackTraceRegPressure(const MachineInstr *MI); bool IsProfitableToHoist(MachineInstr &MI); bool IsGuaranteedToExecute(MachineBasicBlock *BB); void EnterScope(MachineBasicBlock *MBB); void ExitScope(MachineBasicBlock *MBB); void ExitScopeIfDone( MachineDomTreeNode *Node, DenseMap &OpenChildren, DenseMap &ParentMap); void HoistOutOfLoop(MachineDomTreeNode *LoopHeaderNode); void HoistRegion(MachineDomTreeNode *N, bool IsHeader); void SinkIntoLoop(); void InitRegPressure(MachineBasicBlock *BB); DenseMap calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen, bool ConsiderUnseenAsDef); void UpdateRegPressure(const MachineInstr *MI, bool ConsiderUnseenAsDef = false); MachineInstr *ExtractHoistableLoad(MachineInstr *MI); const MachineInstr * LookForDuplicate(const MachineInstr *MI, std::vector &PrevMIs); bool EliminateCSE( MachineInstr *MI, DenseMap>::iterator &CI); bool MayCSE(MachineInstr *MI); bool Hoist(MachineInstr *MI, MachineBasicBlock *Preheader); void InitCSEMap(MachineBasicBlock *BB); MachineBasicBlock *getCurPreheader(); }; } // end anonymous namespace char MachineLICM::ID = 0; char &llvm::MachineLICMID = MachineLICM::ID; INITIALIZE_PASS_BEGIN(MachineLICM, "machinelicm", "Machine Loop Invariant Code Motion", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(MachineLICM, "machinelicm", "Machine Loop Invariant Code Motion", false, false) /// Test if the given loop is the outer-most loop that has a unique predecessor. static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) { // Check whether this loop even has a unique predecessor. if (!CurLoop->getLoopPredecessor()) return false; // Ok, now check to see if any of its outer loops do. for (MachineLoop *L = CurLoop->getParentLoop(); L; L = L->getParentLoop()) if (L->getLoopPredecessor()) return false; // None of them did, so this is the outermost with a unique predecessor. return true; } bool MachineLICM::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) return false; Changed = FirstInLoop = false; const TargetSubtargetInfo &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); TLI = ST.getTargetLowering(); TRI = ST.getRegisterInfo(); MFI = &MF.getFrameInfo(); MRI = &MF.getRegInfo(); SchedModel.init(ST.getSchedModel(), &ST, TII); PreRegAlloc = MRI->isSSA(); if (PreRegAlloc) DEBUG(dbgs() << "******** Pre-regalloc Machine LICM: "); else DEBUG(dbgs() << "******** Post-regalloc Machine LICM: "); DEBUG(dbgs() << MF.getName() << " ********\n"); if (PreRegAlloc) { // Estimate register pressure during pre-regalloc pass. unsigned NumRPS = TRI->getNumRegPressureSets(); RegPressure.resize(NumRPS); std::fill(RegPressure.begin(), RegPressure.end(), 0); RegLimit.resize(NumRPS); for (unsigned i = 0, e = NumRPS; i != e; ++i) RegLimit[i] = TRI->getRegPressureSetLimit(MF, i); } // Get our Loop information... MLI = &getAnalysis(); DT = &getAnalysis(); AA = &getAnalysis().getAAResults(); SmallVector Worklist(MLI->begin(), MLI->end()); while (!Worklist.empty()) { CurLoop = Worklist.pop_back_val(); CurPreheader = nullptr; ExitBlocks.clear(); // If this is done before regalloc, only visit outer-most preheader-sporting // loops. if (PreRegAlloc && !LoopIsOuterMostWithPredecessor(CurLoop)) { Worklist.append(CurLoop->begin(), CurLoop->end()); continue; } CurLoop->getExitBlocks(ExitBlocks); if (!PreRegAlloc) HoistRegionPostRA(); else { // CSEMap is initialized for loop header when the first instruction is // being hoisted. MachineDomTreeNode *N = DT->getNode(CurLoop->getHeader()); FirstInLoop = true; HoistOutOfLoop(N); CSEMap.clear(); if (SinkInstsToAvoidSpills) SinkIntoLoop(); } } return Changed; } /// Return true if instruction stores to the specified frame. static bool InstructionStoresToFI(const MachineInstr *MI, int FI) { // If we lost memory operands, conservatively assume that the instruction // writes to all slots. if (MI->memoperands_empty()) return true; for (const MachineMemOperand *MemOp : MI->memoperands()) { if (!MemOp->isStore() || !MemOp->getPseudoValue()) continue; if (const FixedStackPseudoSourceValue *Value = dyn_cast(MemOp->getPseudoValue())) { if (Value->getFrameIndex() == FI) return true; } } return false; } /// Examine the instruction for potentai LICM candidate. Also /// gather register def and frame object update information. void MachineLICM::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs, BitVector &PhysRegClobbers, SmallSet &StoredFIs, SmallVectorImpl &Candidates) { bool RuledOut = false; bool HasNonInvariantUse = false; unsigned Def = 0; for (const MachineOperand &MO : MI->operands()) { if (MO.isFI()) { // Remember if the instruction stores to the frame index. int FI = MO.getIndex(); if (!StoredFIs.count(FI) && MFI->isSpillSlotObjectIndex(FI) && InstructionStoresToFI(MI, FI)) StoredFIs.insert(FI); HasNonInvariantUse = true; continue; } // We can't hoist an instruction defining a physreg that is clobbered in // the loop. if (MO.isRegMask()) { PhysRegClobbers.setBitsNotInMask(MO.getRegMask()); continue; } if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (!Reg) continue; assert(TargetRegisterInfo::isPhysicalRegister(Reg) && "Not expecting virtual register!"); if (!MO.isDef()) { if (Reg && (PhysRegDefs.test(Reg) || PhysRegClobbers.test(Reg))) // If it's using a non-loop-invariant register, then it's obviously not // safe to hoist. HasNonInvariantUse = true; continue; } if (MO.isImplicit()) { for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) PhysRegClobbers.set(*AI); if (!MO.isDead()) // Non-dead implicit def? This cannot be hoisted. RuledOut = true; // No need to check if a dead implicit def is also defined by // another instruction. continue; } // FIXME: For now, avoid instructions with multiple defs, unless // it's a dead implicit def. if (Def) RuledOut = true; else Def = Reg; // If we have already seen another instruction that defines the same // register, then this is not safe. Two defs is indicated by setting a // PhysRegClobbers bit. for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) { if (PhysRegDefs.test(*AS)) PhysRegClobbers.set(*AS); PhysRegDefs.set(*AS); } if (PhysRegClobbers.test(Reg)) // MI defined register is seen defined by another instruction in // the loop, it cannot be a LICM candidate. RuledOut = true; } // Only consider reloads for now and remats which do not have register // operands. FIXME: Consider unfold load folding instructions. if (Def && !RuledOut) { int FI = INT_MIN; if ((!HasNonInvariantUse && IsLICMCandidate(*MI)) || (TII->isLoadFromStackSlot(*MI, FI) && MFI->isSpillSlotObjectIndex(FI))) Candidates.push_back(CandidateInfo(MI, Def, FI)); } } /// Walk the specified region of the CFG and hoist loop invariants out to the /// preheader. void MachineLICM::HoistRegionPostRA() { MachineBasicBlock *Preheader = getCurPreheader(); if (!Preheader) return; unsigned NumRegs = TRI->getNumRegs(); BitVector PhysRegDefs(NumRegs); // Regs defined once in the loop. BitVector PhysRegClobbers(NumRegs); // Regs defined more than once. SmallVector Candidates; SmallSet StoredFIs; // Walk the entire region, count number of defs for each register, and // collect potential LICM candidates. const std::vector &Blocks = CurLoop->getBlocks(); for (MachineBasicBlock *BB : Blocks) { // If the header of the loop containing this basic block is a landing pad, // then don't try to hoist instructions out of this loop. const MachineLoop *ML = MLI->getLoopFor(BB); if (ML && ML->getHeader()->isEHPad()) continue; // Conservatively treat live-in's as an external def. // FIXME: That means a reload that're reused in successor block(s) will not // be LICM'ed. for (const auto &LI : BB->liveins()) { for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI) PhysRegDefs.set(*AI); } SpeculationState = SpeculateUnknown; for (MachineInstr &MI : *BB) ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates); } // Gather the registers read / clobbered by the terminator. BitVector TermRegs(NumRegs); MachineBasicBlock::iterator TI = Preheader->getFirstTerminator(); if (TI != Preheader->end()) { for (const MachineOperand &MO : TI->operands()) { if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (!Reg) continue; for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) TermRegs.set(*AI); } } // Now evaluate whether the potential candidates qualify. // 1. Check if the candidate defined register is defined by another // instruction in the loop. // 2. If the candidate is a load from stack slot (always true for now), // check if the slot is stored anywhere in the loop. // 3. Make sure candidate def should not clobber // registers read by the terminator. Similarly its def should not be // clobbered by the terminator. for (CandidateInfo &Candidate : Candidates) { if (Candidate.FI != INT_MIN && StoredFIs.count(Candidate.FI)) continue; unsigned Def = Candidate.Def; if (!PhysRegClobbers.test(Def) && !TermRegs.test(Def)) { bool Safe = true; MachineInstr *MI = Candidate.MI; for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || MO.isDef() || !MO.getReg()) continue; unsigned Reg = MO.getReg(); if (PhysRegDefs.test(Reg) || PhysRegClobbers.test(Reg)) { // If it's using a non-loop-invariant register, then it's obviously // not safe to hoist. Safe = false; break; } } if (Safe) HoistPostRA(MI, Candidate.Def); } } } /// Add register 'Reg' to the livein sets of BBs in the current loop, and make /// sure it is not killed by any instructions in the loop. void MachineLICM::AddToLiveIns(unsigned Reg) { const std::vector &Blocks = CurLoop->getBlocks(); for (MachineBasicBlock *BB : Blocks) { if (!BB->isLiveIn(Reg)) BB->addLiveIn(Reg); for (MachineInstr &MI : *BB) { for (MachineOperand &MO : MI.operands()) { if (!MO.isReg() || !MO.getReg() || MO.isDef()) continue; if (MO.getReg() == Reg || TRI->isSuperRegister(Reg, MO.getReg())) MO.setIsKill(false); } } } } /// When an instruction is found to only use loop invariant operands that is /// safe to hoist, this instruction is called to do the dirty work. void MachineLICM::HoistPostRA(MachineInstr *MI, unsigned Def) { MachineBasicBlock *Preheader = getCurPreheader(); // Now move the instructions to the predecessor, inserting it before any // terminator instructions. DEBUG(dbgs() << "Hoisting to BB#" << Preheader->getNumber() << " from BB#" << MI->getParent()->getNumber() << ": " << *MI); // Splice the instruction to the preheader. MachineBasicBlock *MBB = MI->getParent(); Preheader->splice(Preheader->getFirstTerminator(), MBB, MI); // Add register to livein list to all the BBs in the current loop since a // loop invariant must be kept live throughout the whole loop. This is // important to ensure later passes do not scavenge the def register. AddToLiveIns(Def); ++NumPostRAHoisted; Changed = true; } /// Check if this mbb is guaranteed to execute. If not then a load from this mbb /// may not be safe to hoist. bool MachineLICM::IsGuaranteedToExecute(MachineBasicBlock *BB) { if (SpeculationState != SpeculateUnknown) return SpeculationState == SpeculateFalse; if (BB != CurLoop->getHeader()) { // Check loop exiting blocks. SmallVector CurrentLoopExitingBlocks; CurLoop->getExitingBlocks(CurrentLoopExitingBlocks); for (MachineBasicBlock *CurrentLoopExitingBlock : CurrentLoopExitingBlocks) if (!DT->dominates(BB, CurrentLoopExitingBlock)) { SpeculationState = SpeculateTrue; return false; } } SpeculationState = SpeculateFalse; return true; } void MachineLICM::EnterScope(MachineBasicBlock *MBB) { DEBUG(dbgs() << "Entering BB#" << MBB->getNumber() << '\n'); // Remember livein register pressure. BackTrace.push_back(RegPressure); } void MachineLICM::ExitScope(MachineBasicBlock *MBB) { DEBUG(dbgs() << "Exiting BB#" << MBB->getNumber() << '\n'); BackTrace.pop_back(); } /// Destroy scope for the MBB that corresponds to the given dominator tree node /// if its a leaf or all of its children are done. Walk up the dominator tree to /// destroy ancestors which are now done. void MachineLICM::ExitScopeIfDone(MachineDomTreeNode *Node, DenseMap &OpenChildren, DenseMap &ParentMap) { if (OpenChildren[Node]) return; // Pop scope. ExitScope(Node->getBlock()); // Now traverse upwards to pop ancestors whose offsprings are all done. while (MachineDomTreeNode *Parent = ParentMap[Node]) { unsigned Left = --OpenChildren[Parent]; if (Left != 0) break; ExitScope(Parent->getBlock()); Node = Parent; } } /// Walk the specified loop in the CFG (defined by all blocks dominated by the /// specified header block, and that are in the current loop) in depth first /// order w.r.t the DominatorTree. This allows us to visit definitions before /// uses, allowing us to hoist a loop body in one pass without iteration. /// void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) { MachineBasicBlock *Preheader = getCurPreheader(); if (!Preheader) return; SmallVector Scopes; SmallVector WorkList; DenseMap ParentMap; DenseMap OpenChildren; // Perform a DFS walk to determine the order of visit. WorkList.push_back(HeaderN); while (!WorkList.empty()) { MachineDomTreeNode *Node = WorkList.pop_back_val(); assert(Node && "Null dominator tree node?"); MachineBasicBlock *BB = Node->getBlock(); // If the header of the loop containing this basic block is a landing pad, // then don't try to hoist instructions out of this loop. const MachineLoop *ML = MLI->getLoopFor(BB); if (ML && ML->getHeader()->isEHPad()) continue; // If this subregion is not in the top level loop at all, exit. if (!CurLoop->contains(BB)) continue; Scopes.push_back(Node); const std::vector &Children = Node->getChildren(); unsigned NumChildren = Children.size(); // Don't hoist things out of a large switch statement. This often causes // code to be hoisted that wasn't going to be executed, and increases // register pressure in a situation where it's likely to matter. if (BB->succ_size() >= 25) NumChildren = 0; OpenChildren[Node] = NumChildren; // Add children in reverse order as then the next popped worklist node is // the first child of this node. This means we ultimately traverse the // DOM tree in exactly the same order as if we'd recursed. for (int i = (int)NumChildren-1; i >= 0; --i) { MachineDomTreeNode *Child = Children[i]; ParentMap[Child] = Node; WorkList.push_back(Child); } } if (Scopes.size() == 0) return; // Compute registers which are livein into the loop headers. RegSeen.clear(); BackTrace.clear(); InitRegPressure(Preheader); // Now perform LICM. for (MachineDomTreeNode *Node : Scopes) { MachineBasicBlock *MBB = Node->getBlock(); EnterScope(MBB); // Process the block SpeculationState = SpeculateUnknown; for (MachineBasicBlock::iterator MII = MBB->begin(), E = MBB->end(); MII != E; ) { MachineBasicBlock::iterator NextMII = MII; ++NextMII; MachineInstr *MI = &*MII; if (!Hoist(MI, Preheader)) UpdateRegPressure(MI); MII = NextMII; } // If it's a leaf node, it's done. Traverse upwards to pop ancestors. ExitScopeIfDone(Node, OpenChildren, ParentMap); } } /// Sink instructions into loops if profitable. This especially tries to prevent /// register spills caused by register pressure if there is little to no /// overhead moving instructions into loops. void MachineLICM::SinkIntoLoop() { MachineBasicBlock *Preheader = getCurPreheader(); if (!Preheader) return; SmallVector Candidates; for (MachineBasicBlock::instr_iterator I = Preheader->instr_begin(); I != Preheader->instr_end(); ++I) { // We need to ensure that we can safely move this instruction into the loop. // As such, it must not have side-effects, e.g. such as a call has. if (IsLoopInvariantInst(*I) && !HasLoopPHIUse(&*I)) Candidates.push_back(&*I); } for (MachineInstr *I : Candidates) { const MachineOperand &MO = I->getOperand(0); if (!MO.isDef() || !MO.isReg() || !MO.getReg()) continue; if (!MRI->hasOneDef(MO.getReg())) continue; bool CanSink = true; MachineBasicBlock *B = nullptr; for (MachineInstr &MI : MRI->use_instructions(MO.getReg())) { // FIXME: Come up with a proper cost model that estimates whether sinking // the instruction (and thus possibly executing it on every loop // iteration) is more expensive than a register. // For now assumes that copies are cheap and thus almost always worth it. if (!MI.isCopy()) { CanSink = false; break; } if (!B) { B = MI.getParent(); continue; } B = DT->findNearestCommonDominator(B, MI.getParent()); if (!B) { CanSink = false; break; } } if (!CanSink || !B || B == Preheader) continue; B->splice(B->getFirstNonPHI(), Preheader, I); } } static bool isOperandKill(const MachineOperand &MO, MachineRegisterInfo *MRI) { return MO.isKill() || MRI->hasOneNonDBGUse(MO.getReg()); } /// Find all virtual register references that are liveout of the preheader to /// initialize the starting "register pressure". Note this does not count live /// through (livein but not used) registers. void MachineLICM::InitRegPressure(MachineBasicBlock *BB) { std::fill(RegPressure.begin(), RegPressure.end(), 0); // If the preheader has only a single predecessor and it ends with a // fallthrough or an unconditional branch, then scan its predecessor for live // defs as well. This happens whenever the preheader is created by splitting // the critical edge from the loop predecessor to the loop header. if (BB->pred_size() == 1) { MachineBasicBlock *TBB = nullptr, *FBB = nullptr; SmallVector Cond; if (!TII->analyzeBranch(*BB, TBB, FBB, Cond, false) && Cond.empty()) InitRegPressure(*BB->pred_begin()); } for (const MachineInstr &MI : *BB) UpdateRegPressure(&MI, /*ConsiderUnseenAsDef=*/true); } /// Update estimate of register pressure after the specified instruction. void MachineLICM::UpdateRegPressure(const MachineInstr *MI, bool ConsiderUnseenAsDef) { auto Cost = calcRegisterCost(MI, /*ConsiderSeen=*/true, ConsiderUnseenAsDef); for (const auto &RPIdAndCost : Cost) { unsigned Class = RPIdAndCost.first; if (static_cast(RegPressure[Class]) < -RPIdAndCost.second) RegPressure[Class] = 0; else RegPressure[Class] += RPIdAndCost.second; } } /// Calculate the additional register pressure that the registers used in MI /// cause. /// /// If 'ConsiderSeen' is true, updates 'RegSeen' and uses the information to /// figure out which usages are live-ins. /// FIXME: Figure out a way to consider 'RegSeen' from all code paths. DenseMap MachineLICM::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen, bool ConsiderUnseenAsDef) { DenseMap Cost; if (MI->isImplicitDef()) return Cost; for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || MO.isImplicit()) continue; unsigned Reg = MO.getReg(); if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; // FIXME: It seems bad to use RegSeen only for some of these calculations. bool isNew = ConsiderSeen ? RegSeen.insert(Reg).second : false; const TargetRegisterClass *RC = MRI->getRegClass(Reg); RegClassWeight W = TRI->getRegClassWeight(RC); int RCCost = 0; if (MO.isDef()) RCCost = W.RegWeight; else { bool isKill = isOperandKill(MO, MRI); if (isNew && !isKill && ConsiderUnseenAsDef) // Haven't seen this, it must be a livein. RCCost = W.RegWeight; else if (!isNew && isKill) RCCost = -W.RegWeight; } if (RCCost == 0) continue; const int *PS = TRI->getRegClassPressureSets(RC); for (; *PS != -1; ++PS) { if (Cost.find(*PS) == Cost.end()) Cost[*PS] = RCCost; else Cost[*PS] += RCCost; } } return Cost; } /// Return true if this machine instruction loads from global offset table or /// constant pool. static bool mayLoadFromGOTOrConstantPool(MachineInstr &MI) { assert (MI.mayLoad() && "Expected MI that loads!"); // If we lost memory operands, conservatively assume that the instruction // reads from everything.. if (MI.memoperands_empty()) return true; for (MachineMemOperand *MemOp : MI.memoperands()) if (const PseudoSourceValue *PSV = MemOp->getPseudoValue()) if (PSV->isGOT() || PSV->isConstantPool()) return true; return false; } /// Returns true if the instruction may be a suitable candidate for LICM. /// e.g. If the instruction is a call, then it's obviously not safe to hoist it. bool MachineLICM::IsLICMCandidate(MachineInstr &I) { // Check if it's safe to move the instruction. bool DontMoveAcrossStore = true; if (!I.isSafeToMove(AA, DontMoveAcrossStore)) return false; // If it is load then check if it is guaranteed to execute by making sure that // it dominates all exiting blocks. If it doesn't, then there is a path out of // the loop which does not execute this load, so we can't hoist it. Loads // from constant memory are not safe to speculate all the time, for example // indexed load from a jump table. // Stores and side effects are already checked by isSafeToMove. if (I.mayLoad() && !mayLoadFromGOTOrConstantPool(I) && !IsGuaranteedToExecute(I.getParent())) return false; return true; } /// Returns true if the instruction is loop invariant. /// I.e., all virtual register operands are defined outside of the loop, /// physical registers aren't accessed explicitly, and there are no side /// effects that aren't captured by the operands or other flags. /// bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) { if (!IsLICMCandidate(I)) return false; // The instruction is loop invariant if all of its operands are. for (const MachineOperand &MO : I.operands()) { if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (Reg == 0) continue; // Don't hoist an instruction that uses or defines a physical register. if (TargetRegisterInfo::isPhysicalRegister(Reg)) { if (MO.isUse()) { // If the physreg has no defs anywhere, it's just an ambient register // and we can freely move its uses. Alternatively, if it's allocatable, // it could get allocated to something with a def during allocation. if (!MRI->isConstantPhysReg(Reg)) return false; // Otherwise it's safe to move. continue; } else if (!MO.isDead()) { // A def that isn't dead. We can't move it. return false; } else if (CurLoop->getHeader()->isLiveIn(Reg)) { // If the reg is live into the loop, we can't hoist an instruction // which would clobber it. return false; } } if (!MO.isUse()) continue; assert(MRI->getVRegDef(Reg) && "Machine instr not mapped for this vreg?!"); // If the loop contains the definition of an operand, then the instruction // isn't loop invariant. if (CurLoop->contains(MRI->getVRegDef(Reg))) return false; } // If we got this far, the instruction is loop invariant! return true; } /// Return true if the specified instruction is used by a phi node and hoisting /// it could cause a copy to be inserted. bool MachineLICM::HasLoopPHIUse(const MachineInstr *MI) const { SmallVector Work(1, MI); do { MI = Work.pop_back_val(); for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isDef()) continue; unsigned Reg = MO.getReg(); if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; for (MachineInstr &UseMI : MRI->use_instructions(Reg)) { // A PHI may cause a copy to be inserted. if (UseMI.isPHI()) { // A PHI inside the loop causes a copy because the live range of Reg is // extended across the PHI. if (CurLoop->contains(&UseMI)) return true; // A PHI in an exit block can cause a copy to be inserted if the PHI // has multiple predecessors in the loop with different values. // For now, approximate by rejecting all exit blocks. if (isExitBlock(UseMI.getParent())) return true; continue; } // Look past copies as well. if (UseMI.isCopy() && CurLoop->contains(&UseMI)) Work.push_back(&UseMI); } } } while (!Work.empty()); return false; } /// Compute operand latency between a def of 'Reg' and an use in the current /// loop, return true if the target considered it high. bool MachineLICM::HasHighOperandLatency(MachineInstr &MI, unsigned DefIdx, unsigned Reg) const { if (MRI->use_nodbg_empty(Reg)) return false; for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg)) { if (UseMI.isCopyLike()) continue; if (!CurLoop->contains(UseMI.getParent())) continue; for (unsigned i = 0, e = UseMI.getNumOperands(); i != e; ++i) { const MachineOperand &MO = UseMI.getOperand(i); if (!MO.isReg() || !MO.isUse()) continue; unsigned MOReg = MO.getReg(); if (MOReg != Reg) continue; if (TII->hasHighOperandLatency(SchedModel, MRI, MI, DefIdx, UseMI, i)) return true; } // Only look at the first in loop use. break; } return false; } /// Return true if the instruction is marked "cheap" or the operand latency /// between its def and a use is one or less. bool MachineLICM::IsCheapInstruction(MachineInstr &MI) const { if (TII->isAsCheapAsAMove(MI) || MI.isCopyLike()) return true; bool isCheap = false; unsigned NumDefs = MI.getDesc().getNumDefs(); for (unsigned i = 0, e = MI.getNumOperands(); NumDefs && i != e; ++i) { MachineOperand &DefMO = MI.getOperand(i); if (!DefMO.isReg() || !DefMO.isDef()) continue; --NumDefs; unsigned Reg = DefMO.getReg(); if (TargetRegisterInfo::isPhysicalRegister(Reg)) continue; if (!TII->hasLowDefLatency(SchedModel, MI, i)) return false; isCheap = true; } return isCheap; } /// Visit BBs from header to current BB, check if hoisting an instruction of the /// given cost matrix can cause high register pressure. bool MachineLICM::CanCauseHighRegPressure(const DenseMap& Cost, bool CheapInstr) { for (const auto &RPIdAndCost : Cost) { if (RPIdAndCost.second <= 0) continue; unsigned Class = RPIdAndCost.first; int Limit = RegLimit[Class]; // Don't hoist cheap instructions if they would increase register pressure, // even if we're under the limit. if (CheapInstr && !HoistCheapInsts) return true; for (const auto &RP : BackTrace) if (static_cast(RP[Class]) + RPIdAndCost.second >= Limit) return true; } return false; } /// Traverse the back trace from header to the current block and update their /// register pressures to reflect the effect of hoisting MI from the current /// block to the preheader. void MachineLICM::UpdateBackTraceRegPressure(const MachineInstr *MI) { // First compute the 'cost' of the instruction, i.e. its contribution // to register pressure. auto Cost = calcRegisterCost(MI, /*ConsiderSeen=*/false, /*ConsiderUnseenAsDef=*/false); // Update register pressure of blocks from loop header to current block. for (auto &RP : BackTrace) for (const auto &RPIdAndCost : Cost) RP[RPIdAndCost.first] += RPIdAndCost.second; } /// Return true if it is potentially profitable to hoist the given loop /// invariant. bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) { if (MI.isImplicitDef()) return true; // Besides removing computation from the loop, hoisting an instruction has // these effects: // // - The value defined by the instruction becomes live across the entire // loop. This increases register pressure in the loop. // // - If the value is used by a PHI in the loop, a copy will be required for // lowering the PHI after extending the live range. // // - When hoisting the last use of a value in the loop, that value no longer // needs to be live in the loop. This lowers register pressure in the loop. bool CheapInstr = IsCheapInstruction(MI); bool CreatesCopy = HasLoopPHIUse(&MI); // Don't hoist a cheap instruction if it would create a copy in the loop. if (CheapInstr && CreatesCopy) { DEBUG(dbgs() << "Won't hoist cheap instr with loop PHI use: " << MI); return false; } // Rematerializable instructions should always be hoisted since the register // allocator can just pull them down again when needed. if (TII->isTriviallyReMaterializable(MI, AA)) return true; // FIXME: If there are long latency loop-invariant instructions inside the // loop at this point, why didn't the optimizer's LICM hoist them? for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI.getOperand(i); if (!MO.isReg() || MO.isImplicit()) continue; unsigned Reg = MO.getReg(); if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; if (MO.isDef() && HasHighOperandLatency(MI, i, Reg)) { DEBUG(dbgs() << "Hoist High Latency: " << MI); ++NumHighLatency; return true; } } // Estimate register pressure to determine whether to LICM the instruction. // In low register pressure situation, we can be more aggressive about // hoisting. Also, favors hoisting long latency instructions even in // moderately high pressure situation. // Cheap instructions will only be hoisted if they don't increase register // pressure at all. auto Cost = calcRegisterCost(&MI, /*ConsiderSeen=*/false, /*ConsiderUnseenAsDef=*/false); // Visit BBs from header to current BB, if hoisting this doesn't cause // high register pressure, then it's safe to proceed. if (!CanCauseHighRegPressure(Cost, CheapInstr)) { DEBUG(dbgs() << "Hoist non-reg-pressure: " << MI); ++NumLowRP; return true; } // Don't risk increasing register pressure if it would create copies. if (CreatesCopy) { DEBUG(dbgs() << "Won't hoist instr with loop PHI use: " << MI); return false; } // Do not "speculate" in high register pressure situation. If an // instruction is not guaranteed to be executed in the loop, it's best to be // conservative. if (AvoidSpeculation && (!IsGuaranteedToExecute(MI.getParent()) && !MayCSE(&MI))) { DEBUG(dbgs() << "Won't speculate: " << MI); return false; } // High register pressure situation, only hoist if the instruction is going // to be remat'ed. if (!TII->isTriviallyReMaterializable(MI, AA) && !MI.isDereferenceableInvariantLoad(AA)) { DEBUG(dbgs() << "Can't remat / high reg-pressure: " << MI); return false; } return true; } /// Unfold a load from the given machineinstr if the load itself could be /// hoisted. Return the unfolded and hoistable load, or null if the load /// couldn't be unfolded or if it wouldn't be hoistable. MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) { // Don't unfold simple loads. if (MI->canFoldAsLoad()) return nullptr; // If not, we may be able to unfold a load and hoist that. // First test whether the instruction is loading from an amenable // memory location. if (!MI->isDereferenceableInvariantLoad(AA)) return nullptr; // Next determine the register class for a temporary register. unsigned LoadRegIndex; unsigned NewOpc = TII->getOpcodeAfterMemoryUnfold(MI->getOpcode(), /*UnfoldLoad=*/true, /*UnfoldStore=*/false, &LoadRegIndex); if (NewOpc == 0) return nullptr; const MCInstrDesc &MID = TII->get(NewOpc); MachineFunction &MF = *MI->getParent()->getParent(); const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI, MF); // Ok, we're unfolding. Create a temporary register and do the unfold. unsigned Reg = MRI->createVirtualRegister(RC); SmallVector NewMIs; bool Success = TII->unfoldMemoryOperand(MF, *MI, Reg, /*UnfoldLoad=*/true, /*UnfoldStore=*/false, NewMIs); (void)Success; assert(Success && "unfoldMemoryOperand failed when getOpcodeAfterMemoryUnfold " "succeeded!"); assert(NewMIs.size() == 2 && "Unfolded a load into multiple instructions!"); MachineBasicBlock *MBB = MI->getParent(); MachineBasicBlock::iterator Pos = MI; MBB->insert(Pos, NewMIs[0]); MBB->insert(Pos, NewMIs[1]); // If unfolding produced a load that wasn't loop-invariant or profitable to // hoist, discard the new instructions and bail. if (!IsLoopInvariantInst(*NewMIs[0]) || !IsProfitableToHoist(*NewMIs[0])) { NewMIs[0]->eraseFromParent(); NewMIs[1]->eraseFromParent(); return nullptr; } // Update register pressure for the unfolded instruction. UpdateRegPressure(NewMIs[1]); // Otherwise we successfully unfolded a load that we can hoist. MI->eraseFromParent(); return NewMIs[0]; } /// Initialize the CSE map with instructions that are in the current loop /// preheader that may become duplicates of instructions that are hoisted /// out of the loop. void MachineLICM::InitCSEMap(MachineBasicBlock *BB) { for (MachineInstr &MI : *BB) CSEMap[MI.getOpcode()].push_back(&MI); } /// Find an instruction amount PrevMIs that is a duplicate of MI. /// Return this instruction if it's found. const MachineInstr* MachineLICM::LookForDuplicate(const MachineInstr *MI, std::vector &PrevMIs) { for (const MachineInstr *PrevMI : PrevMIs) if (TII->produceSameValue(*MI, *PrevMI, (PreRegAlloc ? MRI : nullptr))) return PrevMI; return nullptr; } /// Given a LICM'ed instruction, look for an instruction on the preheader that /// computes the same value. If it's found, do a RAU on with the definition of /// the existing instruction rather than hoisting the instruction to the /// preheader. bool MachineLICM::EliminateCSE(MachineInstr *MI, DenseMap >::iterator &CI) { // Do not CSE implicit_def so ProcessImplicitDefs can properly propagate // the undef property onto uses. if (CI == CSEMap.end() || MI->isImplicitDef()) return false; if (const MachineInstr *Dup = LookForDuplicate(MI, CI->second)) { DEBUG(dbgs() << "CSEing " << *MI << " with " << *Dup); // Replace virtual registers defined by MI by their counterparts defined // by Dup. SmallVector Defs; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); // Physical registers may not differ here. assert((!MO.isReg() || MO.getReg() == 0 || !TargetRegisterInfo::isPhysicalRegister(MO.getReg()) || MO.getReg() == Dup->getOperand(i).getReg()) && "Instructions with different phys regs are not identical!"); if (MO.isReg() && MO.isDef() && !TargetRegisterInfo::isPhysicalRegister(MO.getReg())) Defs.push_back(i); } SmallVector OrigRCs; for (unsigned i = 0, e = Defs.size(); i != e; ++i) { unsigned Idx = Defs[i]; unsigned Reg = MI->getOperand(Idx).getReg(); unsigned DupReg = Dup->getOperand(Idx).getReg(); OrigRCs.push_back(MRI->getRegClass(DupReg)); if (!MRI->constrainRegClass(DupReg, MRI->getRegClass(Reg))) { // Restore old RCs if more than one defs. for (unsigned j = 0; j != i; ++j) MRI->setRegClass(Dup->getOperand(Defs[j]).getReg(), OrigRCs[j]); return false; } } for (unsigned Idx : Defs) { unsigned Reg = MI->getOperand(Idx).getReg(); unsigned DupReg = Dup->getOperand(Idx).getReg(); MRI->replaceRegWith(Reg, DupReg); MRI->clearKillFlags(DupReg); } MI->eraseFromParent(); ++NumCSEed; return true; } return false; } /// Return true if the given instruction will be CSE'd if it's hoisted out of /// the loop. bool MachineLICM::MayCSE(MachineInstr *MI) { unsigned Opcode = MI->getOpcode(); DenseMap >::iterator CI = CSEMap.find(Opcode); // Do not CSE implicit_def so ProcessImplicitDefs can properly propagate // the undef property onto uses. if (CI == CSEMap.end() || MI->isImplicitDef()) return false; return LookForDuplicate(MI, CI->second) != nullptr; } /// When an instruction is found to use only loop invariant operands /// that are safe to hoist, this instruction is called to do the dirty work. /// It returns true if the instruction is hoisted. bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) { // First check whether we should hoist this instruction. if (!IsLoopInvariantInst(*MI) || !IsProfitableToHoist(*MI)) { // If not, try unfolding a hoistable load. MI = ExtractHoistableLoad(MI); if (!MI) return false; } // Now move the instructions to the predecessor, inserting it before any // terminator instructions. DEBUG({ dbgs() << "Hoisting " << *MI; if (MI->getParent()->getBasicBlock()) dbgs() << " from BB#" << MI->getParent()->getNumber(); if (Preheader->getBasicBlock()) dbgs() << " to BB#" << Preheader->getNumber(); dbgs() << "\n"; }); // If this is the first instruction being hoisted to the preheader, // initialize the CSE map with potential common expressions. if (FirstInLoop) { InitCSEMap(Preheader); FirstInLoop = false; } // Look for opportunity to CSE the hoisted instruction. unsigned Opcode = MI->getOpcode(); DenseMap >::iterator CI = CSEMap.find(Opcode); if (!EliminateCSE(MI, CI)) { // Otherwise, splice the instruction to the preheader. Preheader->splice(Preheader->getFirstTerminator(),MI->getParent(),MI); // Update register pressure for BBs from header to this block. UpdateBackTraceRegPressure(MI); // Clear the kill flags of any register this instruction defines, // since they may need to be live throughout the entire loop // rather than just live for part of it. for (MachineOperand &MO : MI->operands()) if (MO.isReg() && MO.isDef() && !MO.isDead()) MRI->clearKillFlags(MO.getReg()); // Add to the CSE map. if (CI != CSEMap.end()) CI->second.push_back(MI); else CSEMap[Opcode].push_back(MI); } ++NumHoisted; Changed = true; return true; } /// Get the preheader for the current loop, splitting a critical edge if needed. MachineBasicBlock *MachineLICM::getCurPreheader() { // Determine the block to which to hoist instructions. If we can't find a // suitable loop predecessor, we can't do any hoisting. // If we've tried to get a preheader and failed, don't try again. if (CurPreheader == reinterpret_cast(-1)) return nullptr; if (!CurPreheader) { CurPreheader = CurLoop->getLoopPreheader(); if (!CurPreheader) { MachineBasicBlock *Pred = CurLoop->getLoopPredecessor(); if (!Pred) { CurPreheader = reinterpret_cast(-1); return nullptr; } CurPreheader = Pred->SplitCriticalEdge(CurLoop->getHeader(), *this); if (!CurPreheader) { CurPreheader = reinterpret_cast(-1); return nullptr; } } } return CurPreheader; }