diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h index 57febe77464..99cbd870ec3 100644 --- a/include/llvm/CodeGen/MachineScheduler.h +++ b/include/llvm/CodeGen/MachineScheduler.h @@ -297,6 +297,10 @@ public: /// reorderable instructions. virtual void schedule(); + /// Change the position of an instruction within the basic block and update + /// live ranges and region boundary iterators. + void moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos); + /// Get current register pressure for the top scheduled instructions. const IntervalPressure &getTopPressure() const { return TopPressure; } const RegPressureTracker &getTopRPTracker() const { return TopRPTracker; } @@ -362,7 +366,6 @@ protected: void updateScheduledPressure(const std::vector &NewMaxPressure); - void moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos); bool checkSchedLimit(); void findRootsAndBiasEdges(SmallVectorImpl &TopRoots, diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h index 8c959da696d..e13636c2504 100644 --- a/include/llvm/CodeGen/ScheduleDAG.h +++ b/include/llvm/CodeGen/ScheduleDAG.h @@ -302,6 +302,7 @@ namespace llvm { bool isCallOp : 1; // Is a function call operand. bool isTwoAddress : 1; // Is a two-address instruction. bool isCommutable : 1; // Is a commutable instruction. + bool hasPhysRegUses : 1; // Has physreg uses. bool hasPhysRegDefs : 1; // Has physreg defs that are being used. bool hasPhysRegClobbers : 1; // Has any physreg defs, used or not. bool isPending : 1; // True once pending. @@ -331,10 +332,10 @@ namespace llvm { NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0), NumSuccsLeft(0), WeakPredsLeft(0), WeakSuccsLeft(0), NumRegDefsLeft(0), Latency(0), isVRegCycle(false), isCall(false), isCallOp(false), - isTwoAddress(false), isCommutable(false), hasPhysRegDefs(false), - hasPhysRegClobbers(false), isPending(false), isAvailable(false), - isScheduled(false), isScheduleHigh(false), isScheduleLow(false), - isCloned(false), SchedulingPref(Sched::None), + isTwoAddress(false), isCommutable(false), hasPhysRegUses(false), + hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false), + isAvailable(false), isScheduled(false), isScheduleHigh(false), + isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None), isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} @@ -345,10 +346,10 @@ namespace llvm { NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0), NumSuccsLeft(0), WeakPredsLeft(0), WeakSuccsLeft(0), NumRegDefsLeft(0), Latency(0), isVRegCycle(false), isCall(false), isCallOp(false), - isTwoAddress(false), isCommutable(false), hasPhysRegDefs(false), - hasPhysRegClobbers(false), isPending(false), isAvailable(false), - isScheduled(false), isScheduleHigh(false), isScheduleLow(false), - isCloned(false), SchedulingPref(Sched::None), + isTwoAddress(false), isCommutable(false), hasPhysRegUses(false), + hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false), + isAvailable(false), isScheduled(false), isScheduleHigh(false), + isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None), isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} @@ -358,10 +359,10 @@ namespace llvm { NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0), NumSuccsLeft(0), WeakPredsLeft(0), WeakSuccsLeft(0), NumRegDefsLeft(0), Latency(0), isVRegCycle(false), isCall(false), isCallOp(false), - isTwoAddress(false), isCommutable(false), hasPhysRegDefs(false), - hasPhysRegClobbers(false), isPending(false), isAvailable(false), - isScheduled(false), isScheduleHigh(false), isScheduleLow(false), - isCloned(false), SchedulingPref(Sched::None), + isTwoAddress(false), isCommutable(false), hasPhysRegUses(false), + hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false), + isAvailable(false), isScheduled(false), isScheduleHigh(false), + isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None), isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {} diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 5bd2349b50f..736f6deae9c 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -404,6 +404,8 @@ void ScheduleDAGMI::releasePredecessors(SUnit *SU) { } } +/// This is normally called from the main scheduler loop but may also be invoked +/// by the scheduling strategy to perform additional code motion. void ScheduleDAGMI::moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos) { // Advance RegionBegin if the first instruction moves down. @@ -916,7 +918,7 @@ public: /// Represent the type of SchedCandidate found within a single queue. /// pickNodeBidirectional depends on these listed by decreasing priority. enum CandReason { - NoCand, SingleExcess, SingleCritical, Cluster, + NoCand, PhysRegCopy, SingleExcess, SingleCritical, Cluster, ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce, TopDepthReduce, TopPathReduce, SingleMax, MultiPressure, NextDefUse, NodeOrder}; @@ -1191,6 +1193,8 @@ protected: const RegPressureTracker &RPTracker, SchedCandidate &Candidate); + void reschedulePhysRegCopies(SUnit *SU, bool isTop); + #ifndef NDEBUG void traceCandidate(const SchedCandidate &Cand); #endif @@ -1696,6 +1700,34 @@ static unsigned getWeakLeft(const SUnit *SU, bool isTop) { return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft; } +/// Minimize physical register live ranges. Regalloc wants them adjacent to +/// their physreg def/use. +/// +/// FIXME: This is an unnecessary check on the critical path. Most are root/leaf +/// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled +/// with the operation that produces or consumes the physreg. We'll do this when +/// regalloc has support for parallel copies. +static int biasPhysRegCopy(const SUnit *SU, bool isTop) { + const MachineInstr *MI = SU->getInstr(); + if (!MI->isCopy()) + return 0; + + unsigned ScheduledOper = isTop ? 1 : 0; + unsigned UnscheduledOper = isTop ? 0 : 1; + // If we have already scheduled the physreg produce/consumer, immediately + // schedule the copy. + if (TargetRegisterInfo::isPhysicalRegister( + MI->getOperand(ScheduledOper).getReg())) + return 1; + // If the physreg is at the boundary, defer it. Otherwise schedule it + // immediately to free the dependent. We can hoist the copy later. + bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft; + if (TargetRegisterInfo::isPhysicalRegister( + MI->getOperand(UnscheduledOper).getReg())) + return AtBoundary ? -1 : 1; + return 0; +} + /// Apply a set of heursitics to a new candidate. Heuristics are currently /// hierarchical. This may be more efficient than a graduated cost model because /// we don't need to evaluate all aspects of the model for each node in the @@ -1723,6 +1755,12 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand, TryCand.Reason = NodeOrder; return; } + + if (tryGreater(biasPhysRegCopy(TryCand.SU, Zone.isTop()), + biasPhysRegCopy(Cand.SU, Zone.isTop()), + TryCand, Cand, PhysRegCopy)) + return; + // Avoid exceeding the target's limit. if (tryLess(TryCand.RPDelta.Excess.UnitIncrease, Cand.RPDelta.Excess.UnitIncrease, TryCand, Cand, SingleExcess)) @@ -1851,6 +1889,7 @@ const char *ConvergingScheduler::getReasonStr( ConvergingScheduler::CandReason Reason) { switch (Reason) { case NoCand: return "NOCAND "; + case PhysRegCopy: return "PREG-COPY"; case SingleExcess: return "REG-EXCESS"; case SingleCritical: return "REG-CRIT "; case Cluster: return "CLUSTER "; @@ -2069,17 +2108,49 @@ SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) { return SU; } +void ConvergingScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) { + + MachineBasicBlock::iterator InsertPos = SU->getInstr(); + if (!isTop) + ++InsertPos; + SmallVectorImpl &Deps = isTop ? SU->Preds : SU->Succs; + + // Find already scheduled copies with a single physreg dependence and move + // them just above the scheduled instruction. + for (SmallVectorImpl::iterator I = Deps.begin(), E = Deps.end(); + I != E; ++I) { + if (I->getKind() != SDep::Data || !TRI->isPhysicalRegister(I->getReg())) + continue; + SUnit *DepSU = I->getSUnit(); + if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1) + continue; + MachineInstr *Copy = DepSU->getInstr(); + if (!Copy->isCopy()) + continue; + DEBUG(dbgs() << " Rescheduling physreg copy "; + I->getSUnit()->dump(DAG)); + DAG->moveInstruction(Copy, InsertPos); + } +} + /// Update the scheduler's state after scheduling a node. This is the same node /// that was just returned by pickNode(). However, ScheduleDAGMI needs to update /// it's state based on the current cycle before MachineSchedStrategy does. +/// +/// FIXME: Eventually, we may bundle physreg copies rather than rescheduling +/// them here. See comments in biasPhysRegCopy. void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) { if (IsTopNode) { SU->TopReadyCycle = Top.CurrCycle; Top.bumpNode(SU); + if (SU->hasPhysRegUses) + reschedulePhysRegCopies(SU, true); } else { SU->BotReadyCycle = Bot.CurrCycle; Bot.bumpNode(SU); + if (SU->hasPhysRegDefs) + reschedulePhysRegCopies(SU, false); } } diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 71e7a21ef2b..e4da6a41eea 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -262,6 +262,9 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) { if (UseOp < 0) Dep = SDep(SU, SDep::Artificial); else { + // Set the hasPhysRegDefs only for physreg defs that have a use within + // the scheduling region. + SU->hasPhysRegDefs = true; Dep = SDep(SU, SDep::Data, *Alias); RegUse = UseSU->getInstr(); Dep.setMinLatency( @@ -318,6 +321,7 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) { } if (!MO.isDef()) { + SU->hasPhysRegUses = true; // Either insert a new Reg2SUnits entry with an empty SUnits list, or // retrieve the existing SUnits list for this register's uses. // Push this SUnit on the use list. diff --git a/test/CodeGen/X86/misched-copy.ll b/test/CodeGen/X86/misched-copy.ll new file mode 100644 index 00000000000..d04df999b98 --- /dev/null +++ b/test/CodeGen/X86/misched-copy.ll @@ -0,0 +1,48 @@ +; RUN: llc %s -march=x86 -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched 2>&1 | FileCheck %s +; +; Test scheduling of copy instructions. +; +; Argument copies should be hoisted to the top of the block. +; Return copies should be sunk to the end. +; MUL_HiLo PhysReg use copies should be just above the mul. +; MUL_HiLo PhysReg def copies should be just below the mul. +; +; CHECK: *** Final schedule for BB#1 *** +; CHECK-NEXT: %EAX = COPY +; CHECK: MUL32r %vreg{{[0-6]+}}, %EAX, %EDX, %EFLAGS, %EAX; +; CHECK-NEXT: COPY %EAX; +; CHECK-NEXT: COPY %EDX; +; CHECK: DIVSSrm +define i64 @mulhoist(i32 %a, i32 %b) #0 { +entry: + br label %body + +body: + %convb = sitofp i32 %b to float + ; Generates an iMUL64r to legalize types. + %aa = zext i32 %a to i64 + %mul = mul i64 %aa, 74383 + ; Do some dependent long latency stuff. + %trunc = trunc i64 %mul to i32 + %convm = sitofp i32 %trunc to float + %divm = fdiv float %convm, 0.75 + ;%addmb = fadd float %divm, %convb + ;%divmb = fdiv float %addmb, 0.125 + ; Do some independent long latency stuff. + %conva = sitofp i32 %a to float + %diva = fdiv float %conva, 0.75 + %addab = fadd float %diva, %convb + %divab = fdiv float %addab, 0.125 + br label %end + +end: + %val = fptosi float %divab to i64 + %add = add i64 %mul, %val + ret i64 %add +} + +attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!0 = metadata !{metadata !"float", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"}