MI Sched: eliminate local vreg copies.
For now, we just reschedule instructions that use the copied vregs and let regalloc eliminate them. I would really like to eliminate the copies on the fly during scheduling, but we need a complete implementation of repairIntervalsInRange() first.

The general strategy is for the register coalescer to eliminate as many global copies as possible and shrink live ranges to be extended-basic-block local. The coalescer should not have to worry about resolving local copies (e.g. it shouldn't attempt to reorder instructions). The scheduler is a much better place to deal with local interference. The coalescer side of this equation still needs work.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180193 91177308-0d34-0410-b5e6-96231b3b80d8
parent e2326ad2c0
commit e38afe1e33
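To make the strategy concrete, here is a toy model of the mechanism (hypothetical standalone C++, not the LLVM API; the instruction indices follow the "Local src" case documented in the patch below). Hard DAG edges are mandatory ordering constraints; the weak edges this patch adds are only preferences, honored when nothing mandatory conflicts:

// Toy list scheduler: hard edges must be respected, weak edges are preferred.
// I0:     = dst    I1: src = ...    I2:     = dst    I3: dst = src (copy)
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  const int N = 4;
  std::vector<std::pair<int, int>> Hard = {{1, 3}};         // I3 reads src (true dep)
  std::vector<std::pair<int, int>> Weak = {{0, 1}, {2, 1}}; // edges the pass adds

  std::vector<bool> Done(N, false);
  for (int Count = 0; Count < N; ++Count) {
    // Pick a node whose hard preds are all scheduled, preferring one whose
    // weak preds are scheduled too (the DAG is acyclic, so one always exists).
    int Pick = -1;
    bool PickWeakOK = false;
    for (int I = 0; I < N; ++I) {
      if (Done[I])
        continue;
      bool HardOK = true, WeakOK = true;
      for (auto &E : Hard) if (E.second == I && !Done[E.first]) HardOK = false;
      for (auto &E : Weak) if (E.second == I && !Done[E.first]) WeakOK = false;
      if (!HardOK)
        continue;
      if (Pick < 0 || (WeakOK && !PickWeakOK)) {
        Pick = I;
        PickWeakOK = WeakOK;
      }
    }
    Done[Pick] = true;
    std::printf("I%d ", Pick);
  }
  std::printf("\n"); // prints: I0 I2 I1 I3
}

With the weak edges honored, both uses of dst (I0, I2) precede the redefinition of src (I1), so the live ranges of src and dst no longer overlap and the copy at I3 becomes coalescable.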
include/llvm/CodeGen/LiveInterval.h
@@ -399,6 +399,15 @@ namespace llvm {
       return r != end() && r->containsRange(Start, End);
     }
 
+    /// True iff this live range is a single segment that lies between the
+    /// specified boundaries, exclusively. Vregs live across a backedge are not
+    /// considered local. The boundaries are expected to lie within an extended
+    /// basic block, so vregs that are not live out should contain no holes.
+    bool isLocal(SlotIndex Start, SlotIndex End) const {
+      return beginIndex() > Start.getBaseIndex() &&
+             endIndex() < End.getBoundaryIndex();
+    }
+
     /// removeRange - Remove the specified range from this interval. Note that
     /// the range must be a single LiveRange in its entirety.
     void removeRange(SlotIndex Start, SlotIndex End,
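A minimal standalone sketch of what the new predicate computes (toy types, not the SlotIndex API; the single-segment requirement is implied here by modeling the range as one segment):

// Toy model of LiveInterval::isLocal(): a single-segment range is local to
// the region [Start, End) iff it lies strictly inside it, i.e. the vreg is
// neither live-in nor live-out.
#include <cassert>

struct Range { unsigned Begin, End; }; // one live segment, half-open

bool isLocal(Range LI, unsigned Start, unsigned End) {
  return LI.Begin > Start && LI.End < End; // strictly inside the region
}

int main() {
  assert(isLocal({12, 20}, 10, 30));  // defined and killed inside: local
  assert(!isLocal({10, 20}, 10, 30)); // live-in at Start: not local
  assert(!isLocal({12, 30}, 10, 30)); // live-out at End: not local
}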
include/llvm/CodeGen/MachineScheduler.h
@@ -274,6 +274,10 @@ public:
     Mutations.push_back(Mutation);
   }
 
+  /// \brief True if an edge can be added from PredSU to SuccSU without creating
+  /// a cycle.
+  bool canAddEdge(SUnit *SuccSU, SUnit *PredSU);
+
   /// \brief Add a DAG edge to the given SU with the given predecessor
   /// dependence data.
   ///
include/llvm/CodeGen/ScheduleDAG.h
@@ -727,9 +727,8 @@ namespace llvm {
     /// IsReachable - Checks if SU is reachable from TargetSU.
     bool IsReachable(const SUnit *SU, const SUnit *TargetSU);
 
-    /// WillCreateCycle - Returns true if adding an edge from SU to TargetSU
-    /// will create a cycle.
-    bool WillCreateCycle(SUnit *SU, SUnit *TargetSU);
+    /// WillCreateCycle - Return true if addPred(TargetSU, SU) creates a cycle.
+    bool WillCreateCycle(SUnit *TargetSU, SUnit *SU);
 
     /// AddPred - Updates the topological ordering to accommodate an edge
     /// to be added from SUnit X to SUnit Y.
include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -150,6 +150,9 @@ namespace llvm {
 
     virtual ~ScheduleDAGInstrs() {}
 
+    /// \brief Expose LiveIntervals for use in DAG mutators and such.
+    LiveIntervals *getLIS() const { return LIS; }
+
     /// \brief Get the machine model for instruction scheduling.
     const TargetSchedModel *getSchedModel() const { return &SchedModel; }
 
lib/CodeGen/MachineScheduler.cpp
@@ -51,7 +51,11 @@ static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
 static bool ViewMISchedDAGs = false;
 #endif // NDEBUG
 
 // Experimental heuristics
+// FIXME: remove this flag after initial testing. It should always be a good
+// thing.
+static cl::opt<bool> EnableCopyConstrain("misched-vcopy", cl::Hidden,
+  cl::desc("Constrain vreg copies."), cl::init(true));
 
 static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
   cl::desc("Enable load clustering."), cl::init(true));
@@ -323,6 +327,10 @@ ScheduleDAGMI::~ScheduleDAGMI() {
   delete SchedImpl;
 }
 
+bool ScheduleDAGMI::canAddEdge(SUnit *SuccSU, SUnit *PredSU) {
+  return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU);
+}
+
 bool ScheduleDAGMI::addEdge(SUnit *SuccSU, const SDep &PredDep) {
   if (SuccSU != &ExitSU) {
     // Do not use WillCreateCycle, it assumes SD scheduling.
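Mirroring how constrainLocalCopy below uses these helpers, the intended pattern for a mutation is roughly the following sketch (UseSU and DefSU are illustrative names, not identifiers from the patch):

// Verify that making UseSU a predecessor of DefSU cannot close a cycle,
// then record it as a weak (non-mandatory) edge.
if (!DAG->canAddEdge(DefSU, UseSU))           // SuccSU = DefSU, PredSU = UseSU
  return;
DAG->addEdge(DefSU, SDep(UseSU, SDep::Weak)); // prefer UseSU before DefSU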
@@ -914,6 +922,180 @@ void MacroFusion::apply(ScheduleDAGMI *DAG) {
   }
 }
 
+//===----------------------------------------------------------------------===//
+// CopyConstrain - DAG post-processing to encourage copy elimination.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Post-process the DAG to create weak edges from all uses of a copy to
+/// the one use that defines the copy's source vreg, most likely an induction
+/// variable increment.
+class CopyConstrain : public ScheduleDAGMutation {
+  // Transient state.
+  SlotIndex RegionBeginIdx;
+  SlotIndex RegionEndIdx;
+
+public:
+  CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}
+
+  virtual void apply(ScheduleDAGMI *DAG);
+
+protected:
+  void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG);
+};
+} // anonymous
+
+/// constrainLocalCopy handles two possibilities:
+/// 1) Local src:
+///  I0:     = dst
+///  I1: src = ...
+///  I2:     = dst
+///  I3: dst = src (copy)
+///  (create pred->succ edges I0->I1, I2->I1)
+///
+/// 2) Local copy:
+///  I0: dst = src (copy)
+///  I1:     = dst
+///  I2: src = ...
+///  I3:     = dst
+///  (create pred->succ edges I1->I2, I3->I2)
+///
+/// Although the MachineScheduler is currently constrained to single blocks,
+/// this algorithm should handle extended blocks. An EBB is a set of
+/// contiguously numbered blocks such that the previous block in the EBB is
+/// always the single predecessor.
+void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG) {
+  LiveIntervals *LIS = DAG->getLIS();
+  MachineInstr *Copy = CopySU->getInstr();
+
+  // Check for pure vreg copies.
+  unsigned SrcReg = Copy->getOperand(1).getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+    return;
+
+  unsigned DstReg = Copy->getOperand(0).getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+    return;
+
+  // Check if either the dest or source is local. If it's live across a back
+  // edge, it's not local. Note that if both vregs are live across the back
+  // edge, we cannot successfully constrain the copy without cyclic scheduling.
+  unsigned LocalReg = DstReg;
+  unsigned GlobalReg = SrcReg;
+  LiveInterval *LocalLI = &LIS->getInterval(LocalReg);
+  if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) {
+    LocalReg = SrcReg;
+    GlobalReg = DstReg;
+    LocalLI = &LIS->getInterval(LocalReg);
+    if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx))
+      return;
+  }
+  LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg);
+
+  // Find the global segment after the start of the local LI.
+  LiveInterval::iterator GlobalSegment = GlobalLI->find(LocalLI->beginIndex());
+  // If GlobalLI does not overlap LocalLI->start, then a copy directly feeds a
+  // local live range. We could create edges from other global uses to the local
+  // start, but the coalescer should have already eliminated these cases, so
+  // don't bother dealing with it.
+  if (GlobalSegment == GlobalLI->end())
+    return;
+
+  // If GlobalSegment is killed at the LocalLI->start, the call to find()
+  // returned the next global segment. But if GlobalSegment overlaps with
+  // LocalLI->start, then advance to the next segment. If a hole in GlobalLI
+  // exists in LocalLI's vicinity, GlobalSegment will be the end of the hole.
+  if (GlobalSegment->contains(LocalLI->beginIndex()))
+    ++GlobalSegment;
+
+  if (GlobalSegment == GlobalLI->end())
+    return;
+
+  // Check if GlobalLI contains a hole in the vicinity of LocalLI.
+  if (GlobalSegment != GlobalLI->begin()) {
+    // Two address defs have no hole.
+    if (SlotIndex::isSameInstr(llvm::prior(GlobalSegment)->end,
+                               GlobalSegment->start)) {
+      return;
+    }
+    // If GlobalLI has a prior segment, it must be live into the EBB. Otherwise
+    // it would be a disconnected component in the live range.
+    assert(llvm::prior(GlobalSegment)->start < LocalLI->beginIndex() &&
+           "Disconnected LRG within the scheduling region.");
+  }
+  MachineInstr *GlobalDef = LIS->getInstructionFromIndex(GlobalSegment->start);
+  if (!GlobalDef)
+    return;
+
+  SUnit *GlobalSU = DAG->getSUnit(GlobalDef);
+  if (!GlobalSU)
+    return;
+
+  // GlobalDef is the bottom of the GlobalLI hole. Open the hole by
+  // constraining the uses of the last local def to precede GlobalDef.
+  SmallVector<SUnit*,8> LocalUses;
+  const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex());
+  MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def);
+  SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef);
+  for (SUnit::const_succ_iterator
+         I = LastLocalSU->Succs.begin(), E = LastLocalSU->Succs.end();
+       I != E; ++I) {
+    if (I->getKind() != SDep::Data || I->getReg() != LocalReg)
+      continue;
+    if (I->getSUnit() == GlobalSU)
+      continue;
+    if (!DAG->canAddEdge(GlobalSU, I->getSUnit()))
+      return;
+    LocalUses.push_back(I->getSUnit());
+  }
+  // Open the top of the GlobalLI hole by constraining any earlier global uses
+  // to precede the start of LocalLI.
+  SmallVector<SUnit*,8> GlobalUses;
+  MachineInstr *FirstLocalDef =
+    LIS->getInstructionFromIndex(LocalLI->beginIndex());
+  SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef);
+  for (SUnit::const_pred_iterator
+         I = GlobalSU->Preds.begin(), E = GlobalSU->Preds.end(); I != E; ++I) {
+    if (I->getKind() != SDep::Anti || I->getReg() != GlobalReg)
+      continue;
+    if (I->getSUnit() == FirstLocalSU)
+      continue;
+    if (!DAG->canAddEdge(FirstLocalSU, I->getSUnit()))
+      return;
+    GlobalUses.push_back(I->getSUnit());
+  }
+  DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n");
+  // Add the weak edges.
+  for (SmallVectorImpl<SUnit*>::const_iterator
+         I = LocalUses.begin(), E = LocalUses.end(); I != E; ++I) {
+    DEBUG(dbgs() << "  Local use SU(" << (*I)->NodeNum << ") -> SU("
+          << GlobalSU->NodeNum << ")\n");
+    DAG->addEdge(GlobalSU, SDep(*I, SDep::Weak));
+  }
+  for (SmallVectorImpl<SUnit*>::const_iterator
+         I = GlobalUses.begin(), E = GlobalUses.end(); I != E; ++I) {
+    DEBUG(dbgs() << "  Global use SU(" << (*I)->NodeNum << ") -> SU("
+          << FirstLocalSU->NodeNum << ")\n");
+    DAG->addEdge(FirstLocalSU, SDep(*I, SDep::Weak));
+  }
+}
+
+/// \brief Callback from DAG postProcessing to create weak edges to encourage
+/// copy elimination.
+void CopyConstrain::apply(ScheduleDAGMI *DAG) {
+  RegionBeginIdx = DAG->getLIS()->getInstructionIndex(
+    &*nextIfDebug(DAG->begin(), DAG->end()));
+  RegionEndIdx = DAG->getLIS()->getInstructionIndex(
+    &*priorNonDebug(DAG->end(), DAG->begin()));
+
+  for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
+    SUnit *SU = &DAG->SUnits[Idx];
+    if (!SU->getInstr()->isCopy())
+      continue;
+
+    constrainLocalCopy(SU, DAG);
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // ConvergingScheduler - Implementation of the standard MachineSchedStrategy.
 //===----------------------------------------------------------------------===//
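The segment bookkeeping in constrainLocalCopy is easiest to see with concrete numbers. This toy (hypothetical standalone C++, abstracting SlotIndex into plain integers) shows the configuration the weak edges try to establish: the global vreg dead over a hole that covers the local vreg's entire range:

// Toy model of the GlobalLI "hole" that makes the copy coalescable.
#include <cassert>
#include <vector>

struct Seg { unsigned Start, End; }; // half-open [Start, End), kept sorted

// First segment ending after Pos; loosely mirrors LiveInterval::find().
const Seg *find(const std::vector<Seg> &LI, unsigned Pos) {
  for (const Seg &S : LI)
    if (S.End > Pos)
      return &S;
  return nullptr; // Pos is past the last segment
}

int main() {
  std::vector<Seg> GlobalLI = {{0, 14}, {26, 40}}; // dead ("hole") in [14, 26)
  unsigned LocalBegin = 16, LocalEnd = 24;         // LocalLI sits in the hole

  const Seg *G = find(GlobalLI, LocalBegin);       // -> {26, 40}
  assert(G && G->Start > LocalBegin);              // global dead at LocalBegin
  assert(G->Start > LocalEnd);                     // hole spans all of LocalLI
}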
@@ -926,7 +1108,7 @@ public:
   /// Represent the type of SchedCandidate found within a single queue.
   /// pickNodeBidirectional depends on these listed by decreasing priority.
   enum CandReason {
-    NoCand, PhysRegCopy, SingleExcess, SingleCritical, Cluster,
+    NoCand, PhysRegCopy, SingleExcess, SingleCritical, Cluster, Weak,
     ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
     TopDepthReduce, TopPathReduce, SingleMax, MultiPressure, NextDefUse,
     NodeOrder};
@@ -1802,13 +1984,11 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
   if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU,
                  TryCand, Cand, Cluster))
     return;
-  // Currently, weak edges are for clustering, so we hard-code that reason.
-  // However, deferring the current TryCand will not change Cand's reason.
-  CandReason OrigReason = Cand.Reason;
+
+  // Weak edges are for clustering and other constraints.
   if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()),
               getWeakLeft(Cand.SU, Zone.isTop()),
-              TryCand, Cand, Cluster)) {
-    Cand.Reason = OrigReason;
+              TryCand, Cand, Weak)) {
     return;
   }
   // Avoid critical resource consumption and balance the schedule.
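For context, getWeakLeft is an existing helper elsewhere in this file; paraphrased (not part of this diff), it counts the candidate's pending weak dependencies on the side being scheduled:

// Paraphrase of the existing helper (not reproduced verbatim from the tree).
static int getWeakLeft(const SUnit *SU, bool isTop) {
  return isTop ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
}

tryLess then prefers the candidate with fewer pending weak edges; with this change that preference is reported under its own Weak reason rather than being folded into Cluster, so the OrigReason save/restore dance is no longer needed.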
@@ -1908,6 +2088,7 @@ const char *ConvergingScheduler::getReasonStr(
   case SingleExcess:   return "REG-EXCESS";
   case SingleCritical: return "REG-CRIT  ";
   case Cluster:        return "CLUSTER   ";
+  case Weak:           return "WEAK      ";
   case SingleMax:      return "REG-MAX   ";
   case MultiPressure:  return "REG-MULTI ";
   case ResourceReduce: return "RES-REDUCE";
@@ -2177,6 +2358,12 @@ static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
          "-misched-topdown incompatible with -misched-bottomup");
   ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new ConvergingScheduler());
   // Register DAG post-processors.
+  //
+  // FIXME: extend the mutation API to allow earlier mutations to instantiate
+  // data and pass it to later mutations. Have a single mutation that gathers
+  // the interesting nodes in one pass.
+  if (EnableCopyConstrain)
+    DAG->addMutation(new CopyConstrain(DAG->TII, DAG->TRI));
   if (EnableLoadCluster)
     DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI));
   if (EnableMacroFusion)
test/CodeGen/ARM/misched-copy-arm.ll (new file, 30 lines)
@@ -0,0 +1,30 @@
+; REQUIRES: asserts
+; RUN: llc < %s -march=thumb -mcpu=swift -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+;
+; Loop counter copies should be eliminated.
+; There is also a MUL here, but we don't care where it is scheduled.
+; CHECK: postinc
+; CHECK: *** Final schedule for BB#2 ***
+; CHECK: t2LDRs
+; CHECK: t2ADDrr
+; CHECK: t2CMPrr
+; CHECK: COPY
+define i32 @postinc(i32 %a, i32* nocapture %d, i32 %s) nounwind {
+entry:
+  %cmp4 = icmp eq i32 %a, 0
+  br i1 %cmp4, label %for.end, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %s.05 = phi i32 [ %mul, %for.body ], [ 0, %entry ]
+  %indvars.iv.next = add i32 %indvars.iv, %s
+  %arrayidx = getelementptr inbounds i32* %d, i32 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %mul = mul nsw i32 %0, %s.05
+  %exitcond = icmp eq i32 %indvars.iv.next, %a
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  %s.0.lcssa = phi i32 [ 0, %entry ], [ %mul, %for.body ]
+  ret i32 %s.0.lcssa
+}