MI Sched: eliminate local vreg copies.

For now, we just reschedule instructions that use the copied vregs and
let regalloc elliminate it. I would really like to eliminate the
copies on-the-fly during scheduling, but we need a complete
implementation of repairIntervalsInRange() first.

The general strategy is for the register coalescer to eliminate as
many global copies as possible and shrink live ranges to be
extended-basic-block local. The coalescer should not have to worry
about resolving local copies (e.g. it shouldn't attemp to reorder
instructions). The scheduler is a much better place to deal with local
interference. The coalescer side of this equation needs work.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180193 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Andrew Trick 2013-04-24 15:54:43 +00:00
parent e2326ad2c0
commit e38afe1e33
6 changed files with 242 additions and 10 deletions

View File

@ -399,6 +399,15 @@ namespace llvm {
return r != end() && r->containsRange(Start, End);
}
/// True iff this live range is a single segment that lies between the
/// specified boundaries, exclusively. Vregs live across a backedge are not
/// considered local. The boundaries are expected to lie within an extended
/// basic block, so vregs that are not live out should contain no holes.
bool isLocal(SlotIndex Start, SlotIndex End) const {
return beginIndex() > Start.getBaseIndex() &&
endIndex() < End.getBoundaryIndex();
}
/// removeRange - Remove the specified range from this interval. Note that
/// the range must be a single LiveRange in its entirety.
void removeRange(SlotIndex Start, SlotIndex End,

View File

@ -274,6 +274,10 @@ public:
Mutations.push_back(Mutation);
}
/// \brief True if an edge can be added from PredSU to SuccSU without creating
/// a cycle.
bool canAddEdge(SUnit *SuccSU, SUnit *PredSU);
/// \brief Add a DAG edge to the given SU with the given predecessor
/// dependence data.
///

View File

@ -727,9 +727,8 @@ namespace llvm {
/// IsReachable - Checks if SU is reachable from TargetSU.
bool IsReachable(const SUnit *SU, const SUnit *TargetSU);
/// WillCreateCycle - Returns true if adding an edge from SU to TargetSU
/// will create a cycle.
bool WillCreateCycle(SUnit *SU, SUnit *TargetSU);
/// WillCreateCycle - Return true if addPred(TargetSU, SU) creates a cycle.
bool WillCreateCycle(SUnit *TargetSU, SUnit *SU);
/// AddPred - Updates the topological ordering to accommodate an edge
/// to be added from SUnit X to SUnit Y.

View File

@ -150,6 +150,9 @@ namespace llvm {
virtual ~ScheduleDAGInstrs() {}
/// \brief Expose LiveIntervals for use in DAG mutators and such.
LiveIntervals *getLIS() const { return LIS; }
/// \brief Get the machine model for instruction scheduling.
const TargetSchedModel *getSchedModel() const { return &SchedModel; }

View File

@ -51,7 +51,11 @@ static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
static bool ViewMISchedDAGs = false;
#endif // NDEBUG
// Experimental heuristics
// FIXME: remove this flag after initial testing. It should always be a good
// thing.
static cl::opt<bool> EnableCopyConstrain("misched-vcopy", cl::Hidden,
cl::desc("Constrain vreg copies."), cl::init(true));
static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
cl::desc("Enable load clustering."), cl::init(true));
@ -323,6 +327,10 @@ ScheduleDAGMI::~ScheduleDAGMI() {
delete SchedImpl;
}
bool ScheduleDAGMI::canAddEdge(SUnit *SuccSU, SUnit *PredSU) {
return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU);
}
bool ScheduleDAGMI::addEdge(SUnit *SuccSU, const SDep &PredDep) {
if (SuccSU != &ExitSU) {
// Do not use WillCreateCycle, it assumes SD scheduling.
@ -914,6 +922,180 @@ void MacroFusion::apply(ScheduleDAGMI *DAG) {
}
}
//===----------------------------------------------------------------------===//
// CopyConstrain - DAG post-processing to encourage copy elimination.
//===----------------------------------------------------------------------===//
namespace {
/// \brief Post-process the DAG to create weak edges from all uses of a copy to
/// the one use that defines the copy's source vreg, most likely an induction
/// variable increment.
class CopyConstrain : public ScheduleDAGMutation {
// Transient state.
SlotIndex RegionBeginIdx;
SlotIndex RegionEndIdx;
public:
CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}
virtual void apply(ScheduleDAGMI *DAG);
protected:
void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG);
};
} // anonymous
/// constrainLocalCopy handles two possibilities:
/// 1) Local src:
/// I0: = dst
/// I1: src = ...
/// I2: = dst
/// I3: dst = src (copy)
/// (create pred->succ edges I0->I1, I2->I1)
///
/// 2) Local copy:
/// I0: dst = src (copy)
/// I1: = dst
/// I2: src = ...
/// I3: = dst
/// (create pred->succ edges I1->I2, I3->I2)
///
/// Although the MachineScheduler is currently constrained to single blocks,
/// this algorithm should handle extended blocks. An EBB is a set of
/// contiguously numbered blocks such that the previous block in the EBB is
/// always the single predecessor.
void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG) {
LiveIntervals *LIS = DAG->getLIS();
MachineInstr *Copy = CopySU->getInstr();
// Check for pure vreg copies.
unsigned SrcReg = Copy->getOperand(1).getReg();
if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
return;
unsigned DstReg = Copy->getOperand(0).getReg();
if (!TargetRegisterInfo::isVirtualRegister(DstReg))
return;
// Check if either the dest or source is local. If it's live across a back
// edge, it's not local. Note that if both vregs are live across the back
// edge, we cannot successfully contrain the copy without cyclic scheduling.
unsigned LocalReg = DstReg;
unsigned GlobalReg = SrcReg;
LiveInterval *LocalLI = &LIS->getInterval(LocalReg);
if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) {
LocalReg = SrcReg;
GlobalReg = DstReg;
LocalLI = &LIS->getInterval(LocalReg);
if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx))
return;
}
LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg);
// Find the global segment after the start of the local LI.
LiveInterval::iterator GlobalSegment = GlobalLI->find(LocalLI->beginIndex());
// If GlobalLI does not overlap LocalLI->start, then a copy directly feeds a
// local live range. We could create edges from other global uses to the local
// start, but the coalescer should have already eliminated these cases, so
// don't bother dealing with it.
if (GlobalSegment == GlobalLI->end())
return;
// If GlobalSegment is killed at the LocalLI->start, the call to find()
// returned the next global segment. But if GlobalSegment overlaps with
// LocalLI->start, then advance to the next segement. If a hole in GlobalLI
// exists in LocalLI's vicinity, GlobalSegment will be the end of the hole.
if (GlobalSegment->contains(LocalLI->beginIndex()))
++GlobalSegment;
if (GlobalSegment == GlobalLI->end())
return;
// Check if GlobalLI contains a hole in the vicinity of LocalLI.
if (GlobalSegment != GlobalLI->begin()) {
// Two address defs have no hole.
if (SlotIndex::isSameInstr(llvm::prior(GlobalSegment)->end,
GlobalSegment->start)) {
return;
}
// If GlobalLI has a prior segment, it must be live into the EBB. Otherwise
// it would be a disconnected component in the live range.
assert(llvm::prior(GlobalSegment)->start < LocalLI->beginIndex() &&
"Disconnected LRG within the scheduling region.");
}
MachineInstr *GlobalDef = LIS->getInstructionFromIndex(GlobalSegment->start);
if (!GlobalDef)
return;
SUnit *GlobalSU = DAG->getSUnit(GlobalDef);
if (!GlobalSU)
return;
// GlobalDef is the bottom of the GlobalLI hole. Open the hole by
// constraining the uses of the last local def to precede GlobalDef.
SmallVector<SUnit*,8> LocalUses;
const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex());
MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def);
SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef);
for (SUnit::const_succ_iterator
I = LastLocalSU->Succs.begin(), E = LastLocalSU->Succs.end();
I != E; ++I) {
if (I->getKind() != SDep::Data || I->getReg() != LocalReg)
continue;
if (I->getSUnit() == GlobalSU)
continue;
if (!DAG->canAddEdge(GlobalSU, I->getSUnit()))
return;
LocalUses.push_back(I->getSUnit());
}
// Open the top of the GlobalLI hole by constraining any earlier global uses
// to precede the start of LocalLI.
SmallVector<SUnit*,8> GlobalUses;
MachineInstr *FirstLocalDef =
LIS->getInstructionFromIndex(LocalLI->beginIndex());
SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef);
for (SUnit::const_pred_iterator
I = GlobalSU->Preds.begin(), E = GlobalSU->Preds.end(); I != E; ++I) {
if (I->getKind() != SDep::Anti || I->getReg() != GlobalReg)
continue;
if (I->getSUnit() == FirstLocalSU)
continue;
if (!DAG->canAddEdge(FirstLocalSU, I->getSUnit()))
return;
GlobalUses.push_back(I->getSUnit());
}
DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n");
// Add the weak edges.
for (SmallVectorImpl<SUnit*>::const_iterator
I = LocalUses.begin(), E = LocalUses.end(); I != E; ++I) {
DEBUG(dbgs() << " Local use SU(" << (*I)->NodeNum << ") -> SU("
<< GlobalSU->NodeNum << ")\n");
DAG->addEdge(GlobalSU, SDep(*I, SDep::Weak));
}
for (SmallVectorImpl<SUnit*>::const_iterator
I = GlobalUses.begin(), E = GlobalUses.end(); I != E; ++I) {
DEBUG(dbgs() << " Global use SU(" << (*I)->NodeNum << ") -> SU("
<< FirstLocalSU->NodeNum << ")\n");
DAG->addEdge(FirstLocalSU, SDep(*I, SDep::Weak));
}
}
/// \brief Callback from DAG postProcessing to create weak edges to encourage
/// copy elimination.
void CopyConstrain::apply(ScheduleDAGMI *DAG) {
RegionBeginIdx = DAG->getLIS()->getInstructionIndex(
&*nextIfDebug(DAG->begin(), DAG->end()));
RegionEndIdx = DAG->getLIS()->getInstructionIndex(
&*priorNonDebug(DAG->end(), DAG->begin()));
for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
SUnit *SU = &DAG->SUnits[Idx];
if (!SU->getInstr()->isCopy())
continue;
constrainLocalCopy(SU, DAG);
}
}
//===----------------------------------------------------------------------===//
// ConvergingScheduler - Implementation of the standard MachineSchedStrategy.
//===----------------------------------------------------------------------===//
@ -926,7 +1108,7 @@ public:
/// Represent the type of SchedCandidate found within a single queue.
/// pickNodeBidirectional depends on these listed by decreasing priority.
enum CandReason {
NoCand, PhysRegCopy, SingleExcess, SingleCritical, Cluster,
NoCand, PhysRegCopy, SingleExcess, SingleCritical, Cluster, Weak,
ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
TopDepthReduce, TopPathReduce, SingleMax, MultiPressure, NextDefUse,
NodeOrder};
@ -1802,13 +1984,11 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU,
TryCand, Cand, Cluster))
return;
// Currently, weak edges are for clustering, so we hard-code that reason.
// However, deferring the current TryCand will not change Cand's reason.
CandReason OrigReason = Cand.Reason;
// Weak edges are for clustering and other constraints.
if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()),
getWeakLeft(Cand.SU, Zone.isTop()),
TryCand, Cand, Cluster)) {
Cand.Reason = OrigReason;
TryCand, Cand, Weak)) {
return;
}
// Avoid critical resource consumption and balance the schedule.
@ -1908,6 +2088,7 @@ const char *ConvergingScheduler::getReasonStr(
case SingleExcess: return "REG-EXCESS";
case SingleCritical: return "REG-CRIT ";
case Cluster: return "CLUSTER ";
case Weak: return "WEAK ";
case SingleMax: return "REG-MAX ";
case MultiPressure: return "REG-MULTI ";
case ResourceReduce: return "RES-REDUCE";
@ -2177,6 +2358,12 @@ static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
"-misched-topdown incompatible with -misched-bottomup");
ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new ConvergingScheduler());
// Register DAG post-processors.
//
// FIXME: extend the mutation API to allow earlier mutations to instantiate
// data and pass it to later mutations. Have a single mutation that gathers
// the interesting nodes in one pass.
if (EnableCopyConstrain)
DAG->addMutation(new CopyConstrain(DAG->TII, DAG->TRI));
if (EnableLoadCluster)
DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI));
if (EnableMacroFusion)

View File

@ -0,0 +1,30 @@
; REQUIRES: asserts
; RUN: llc < %s -march=thumb -mcpu=swift -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
;
; Loop counter copies should be eliminated.
; There is also a MUL here, but we don't care where it is scheduled.
; CHECK: postinc
; CHECK: *** Final schedule for BB#2 ***
; CHECK: t2LDRs
; CHECK: t2ADDrr
; CHECK: t2CMPrr
; CHECK: COPY
define i32 @postinc(i32 %a, i32* nocapture %d, i32 %s) nounwind {
entry:
%cmp4 = icmp eq i32 %a, 0
br i1 %cmp4, label %for.end, label %for.body
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
%s.05 = phi i32 [ %mul, %for.body ], [ 0, %entry ]
%indvars.iv.next = add i32 %indvars.iv, %s
%arrayidx = getelementptr inbounds i32* %d, i32 %indvars.iv
%0 = load i32* %arrayidx, align 4
%mul = mul nsw i32 %0, %s.05
%exitcond = icmp eq i32 %indvars.iv.next, %a
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body, %entry
%s.0.lcssa = phi i32 [ 0, %entry ], [ %mul, %for.body ]
ret i32 %s.0.lcssa
}