Comment and revise the cyclic critical path code.

This should be much clearer now. It's still disabled pending testing.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189597 91177308-0d34-0410-b5e6-96231b3b80d8
Andrew Trick 2013-08-29 18:04:49 +00:00
parent 33f4c796ac
commit 851bb2c9cb
4 changed files with 118 additions and 74 deletions


@@ -331,6 +331,9 @@ public:
BitVector &getScheduledTrees() { return ScheduledTrees; }
/// Compute the cyclic critical path through the DAG.
unsigned computeCyclicCriticalPath();
void viewGraph(const Twine &Name, const Twine &Title) LLVM_OVERRIDE;
void viewGraph() LLVM_OVERRIDE;


@@ -197,9 +197,6 @@ namespace llvm {
/// input.
void buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker = 0);
/// Compute the cyclic critical path through the DAG.
unsigned computeCyclicCriticalPath();
/// addSchedBarrierDeps - Add dependencies from instructions in the current
/// list of instructions being scheduled to scheduling barrier. We want to
/// make sure instructions which define registers that are either used by


@@ -642,6 +642,90 @@ void ScheduleDAGMI::findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
ExitSU.biasCriticalPath();
}
/// Compute the max cyclic critical path through the DAG. The scheduling DAG
/// only provides the critical path for single block loops. To handle loops that
/// span blocks, we could use the vreg path latencies provided by
/// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
/// available for use in the scheduler.
///
/// The cyclic path estimation identifies a def-use pair that crosses the back
/// edge and considers the depth and height of the nodes. For example, consider
/// the following instruction sequence where each instruction has unit latency
/// and defines an eponymous virtual register:
///
/// a->b(a,c)->c(b)->d(c)->exit
///
/// The cyclic critical path is two cycles: b->c->b
/// The acyclic critical path is four cycles: a->b->c->d->exit
/// LiveOutHeight = height(c) = len(c->d->exit) = 2
/// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
/// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
/// LiveInDepth = depth(b) = len(a->b) = 1
///
/// LiveOutDepth - LiveInDepth = 3 - 1 = 2
/// LiveInHeight - LiveOutHeight = 4 - 2 = 2
/// CyclicCriticalPath = min(2, 2) = 2
unsigned ScheduleDAGMI::computeCyclicCriticalPath() {
// This only applies to single-block loops.
if (!BB->isSuccessor(BB))
return 0;
unsigned MaxCyclicLatency = 0;
// Visit each live out vreg def to find def/use pairs that cross iterations.
ArrayRef<unsigned> LiveOuts = RPTracker.getPressure().LiveOutRegs;
for (ArrayRef<unsigned>::iterator RI = LiveOuts.begin(), RE = LiveOuts.end();
RI != RE; ++RI) {
unsigned Reg = *RI;
if (!TRI->isVirtualRegister(Reg))
continue;
const LiveInterval &LI = LIS->getInterval(Reg);
const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
if (!DefVNI)
continue;
MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
const SUnit *DefSU = getSUnit(DefMI);
if (!DefSU)
continue;
unsigned LiveOutHeight = DefSU->getHeight();
unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
// Visit all local users of the vreg def.
for (VReg2UseMap::iterator
UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
if (UI->SU == &ExitSU)
continue;
// Only consider uses of the phi.
LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(UI->SU->getInstr()));
if (!LRQ.valueIn()->isPHIDef())
continue;
// Assume that a path spanning two iterations is a cycle, which could
// overestimate in strange cases. This allows cyclic latency to be
// estimated as the minimum slack of the vreg's depth or height.
unsigned CyclicLatency = 0;
if (LiveOutDepth > UI->SU->getDepth())
CyclicLatency = LiveOutDepth - UI->SU->getDepth();
unsigned LiveInHeight = UI->SU->getHeight() + DefSU->Latency;
if (LiveInHeight > LiveOutHeight) {
if (LiveInHeight - LiveOutHeight < CyclicLatency)
CyclicLatency = LiveInHeight - LiveOutHeight;
}
else
CyclicLatency = 0;
DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
<< UI->SU->NodeNum << ") = " << CyclicLatency << "c\n");
if (CyclicLatency > MaxCyclicLatency)
MaxCyclicLatency = CyclicLatency;
}
}
DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
return MaxCyclicLatency;
}
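
A minimal standalone sketch of the same slack arithmetic, with the numbers hard-coded from the unit-latency a->b(a,c)->c(b)->d(c)->exit example in the comment above (it does not read a real DAG; the variable names are only illustrative):

#include <algorithm>
#include <cstdio>

int main() {
  // Hard-coded from the example: the live-out def is c, and its
  // cross-iteration (phi) use is b. All instructions have unit latency.
  const unsigned DefLatency = 1; // latency of c's def
  const unsigned DepthC = 2;     // len(a->b->c)
  const unsigned HeightC = 2;    // len(c->d->exit)
  const unsigned DepthB = 1;     // len(a->b)
  const unsigned HeightB = 3;    // len(b->c->d->exit)

  const unsigned LiveOutHeight = HeightC;             // 2
  const unsigned LiveOutDepth = DepthC + DefLatency;  // 3
  const unsigned LiveInHeight = HeightB + DefLatency; // 4
  const unsigned LiveInDepth = DepthB;                // 1

  // Cyclic latency is the smaller of the depth slack and the height slack,
  // mirroring the comparisons in computeCyclicCriticalPath above.
  unsigned CyclicLatency = 0;
  if (LiveOutDepth > LiveInDepth)
    CyclicLatency = LiveOutDepth - LiveInDepth;
  if (LiveInHeight > LiveOutHeight)
    CyclicLatency = std::min(CyclicLatency, LiveInHeight - LiveOutHeight);
  else
    CyclicLatency = 0;

  std::printf("CyclicCriticalPath = %uc\n", CyclicLatency); // prints 2c
  return 0;
}
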
/// Identify DAG roots and setup scheduler queues.
void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
ArrayRef<SUnit*> BotRoots) {
@@ -1557,21 +1641,39 @@ void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
Bot.releaseNode(SU, SU->BotReadyCycle);
}
/// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
/// critical path by more cycles than it takes to drain the instruction buffer.
/// We estimate an upper bound on in-flight instructions as:
///
/// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height )
/// InFlightIterations = AcyclicPath / CyclesPerIteration
/// InFlightResources = InFlightIterations * LoopResources
///
/// TODO: Check execution resources in addition to IssueCount.
void ConvergingScheduler::checkAcyclicLatency() {
if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
return;
// Scaled number of cycles per loop iteration.
unsigned IterCount =
std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(),
Rem.RemIssueCount);
// Scaled acyclic critical path.
unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor();
// InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop
unsigned InFlightCount =
(AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount;
unsigned BufferLimit =
SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
unsigned LatencyLag = Rem.CriticalPath - Rem.CyclicCritPath;
Rem.IsAcyclicLatencyLimited =
(LatencyLag * SchedModel->getLatencyFactor()) > BufferLimit;
DEBUG(dbgs() << "BufferLimit " << BufferLimit << "u / "
<< Rem.RemIssueCount << "u = "
<< (BufferLimit + Rem.RemIssueCount) / Rem.RemIssueCount << " iters. "
<< "Latency = " << LatencyLag << "c = "
<< LatencyLag * SchedModel->getLatencyFactor() << "u\n";
Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
DEBUG(dbgs() << "IssueCycles="
<< Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
<< "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
<< "c NumIters=" << (AcyclicCount + IterCount-1) / IterCount
<< " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
<< "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
if (Rem.IsAcyclicLatencyLimited)
dbgs() << " ACYCLIC LATENCY LIMIT\n");
}
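
As a rough worked example of the estimate described in the comment above, with made-up machine-model numbers (latency and micro-op factors of 1, a 16-entry micro-op buffer; none of these come from a real target), the same scaled arithmetic can be checked in isolation:

#include <algorithm>
#include <cstdio>

int main() {
  // Hypothetical loop: 2-cycle cyclic path, 10-cycle acyclic critical path,
  // 4 micro-ops issued per iteration, and a 16-entry micro-op buffer.
  // Factors of 1 keep the scaled units equal to plain cycles and micro-ops.
  const unsigned CyclicCritPath = 2;
  const unsigned CriticalPath = 10;
  const unsigned RemIssueCount = 4;
  const unsigned LatencyFactor = 1;
  const unsigned MicroOpFactor = 1;
  const unsigned MicroOpBufferSize = 16;

  // Scaled cycles per iteration: bounded either by the cyclic latency or by
  // the micro-ops the loop must issue.
  const unsigned IterCount =
      std::max(CyclicCritPath * LatencyFactor, RemIssueCount);
  // Scaled acyclic critical path.
  const unsigned AcyclicCount = CriticalPath * LatencyFactor;
  // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop, rounded up.
  const unsigned InFlightCount =
      (AcyclicCount * RemIssueCount + IterCount - 1) / IterCount;
  const unsigned BufferLimit = MicroOpBufferSize * MicroOpFactor;

  // ceil(10 * 4 / 4) = 10 micro-ops in flight against a 16 micro-op buffer,
  // so the buffer can cover the acyclic latency and the flag stays false.
  std::printf("InFlight=%u BufferLimit=%u limited=%d\n",
              InFlightCount, BufferLimit, InFlightCount > BufferLimit ? 1 : 0);
  return 0;
}
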
@@ -1579,10 +1681,6 @@ void ConvergingScheduler::checkAcyclicLatency() {
void ConvergingScheduler::registerRoots() {
Rem.CriticalPath = DAG->ExitSU.getDepth();
if (EnableCyclicPath) {
Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
checkAcyclicLatency();
}
// Some roots may not feed into ExitSU. Check all of them in case.
for (std::vector<SUnit*>::const_iterator
I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
@@ -1590,6 +1688,11 @@ void ConvergingScheduler::registerRoots() {
Rem.CriticalPath = (*I)->getDepth();
}
DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');
if (EnableCyclicPath) {
Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
checkAcyclicLatency();
}
}
/// Does this SU have a hazard within the current instruction group.


@@ -987,65 +987,6 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
PendingLoads.clear();
}
/// Compute the max cyclic critical path through the DAG. For loops that span
/// basic blocks, MachineTraceMetrics should be used for this instead.
unsigned ScheduleDAGInstrs::computeCyclicCriticalPath() {
// This only applies to single-block loops.
if (!BB->isSuccessor(BB))
return 0;
unsigned MaxCyclicLatency = 0;
// Visit each live out vreg def to find def/use pairs that cross iterations.
for (SUnit::const_pred_iterator
PI = ExitSU.Preds.begin(), PE = ExitSU.Preds.end(); PI != PE; ++PI) {
MachineInstr *MI = PI->getSUnit()->getInstr();
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || !MO.isDef())
break;
unsigned Reg = MO.getReg();
if (!Reg || TRI->isPhysicalRegister(Reg))
continue;
const LiveInterval &LI = LIS->getInterval(Reg);
unsigned LiveOutHeight = PI->getSUnit()->getHeight();
unsigned LiveOutDepth = PI->getSUnit()->getDepth() + PI->getLatency();
// Visit all local users of the vreg def.
for (VReg2UseMap::iterator
UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
if (UI->SU == &ExitSU)
continue;
// Only consider uses of the phi.
LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(UI->SU->getInstr()));
if (!LRQ.valueIn()->isPHIDef())
continue;
// Cheat a bit and assume that a path spanning two iterations is a
// cycle, which could overestimate in strange cases. This allows cyclic
// latency to be estimated as the minimum height or depth slack.
unsigned CyclicLatency = 0;
if (LiveOutDepth > UI->SU->getDepth())
CyclicLatency = LiveOutDepth - UI->SU->getDepth();
unsigned LiveInHeight = UI->SU->getHeight() + PI->getLatency();
if (LiveInHeight > LiveOutHeight) {
if (LiveInHeight - LiveOutHeight < CyclicLatency)
CyclicLatency = LiveInHeight - LiveOutHeight;
}
else
CyclicLatency = 0;
DEBUG(dbgs() << "Cyclic Path: SU(" << PI->getSUnit()->NodeNum
<< ") -> SU(" << UI->SU->NodeNum << ") = "
<< CyclicLatency << "\n");
if (CyclicLatency > MaxCyclicLatency)
MaxCyclicLatency = CyclicLatency;
}
}
}
DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "\n");
return MaxCyclicLatency;
}
void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
SU->getInstr()->dump();