Mirror of https://github.com/RPCS3/llvm.git (synced 2025-01-14 05:42:45 +00:00)
Comment and revise the cyclic critical path code.

This should be much clearer now. It's still disabled pending testing.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189597 91177308-0d34-0410-b5e6-96231b3b80d8
parent 33f4c796ac
commit 851bb2c9cb
@@ -331,6 +331,9 @@ public:

   BitVector &getScheduledTrees() { return ScheduledTrees; }

+  /// Compute the cyclic critical path through the DAG.
+  unsigned computeCyclicCriticalPath();
+
   void viewGraph(const Twine &Name, const Twine &Title) LLVM_OVERRIDE;
   void viewGraph() LLVM_OVERRIDE;

@@ -197,9 +197,6 @@ namespace llvm {
     /// input.
     void buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker = 0);

-    /// Compute the cyclic critical path through the DAG.
-    unsigned computeCyclicCriticalPath();
-
     /// addSchedBarrierDeps - Add dependencies from instructions in the current
     /// list of instructions being scheduled to scheduling barrier. We want to
     /// make sure instructions which define registers that are either used by

@@ -642,6 +642,90 @@ void ScheduleDAGMI::findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
   ExitSU.biasCriticalPath();
 }

+/// Compute the max cyclic critical path through the DAG. The scheduling DAG
+/// only provides the critical path for single-block loops. To handle loops
+/// that span blocks, we could use the vreg path latencies provided by
+/// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
+/// available for use in the scheduler.
+///
+/// The cyclic path estimation identifies a def-use pair that crosses the back
+/// edge and considers the depth and height of the nodes. For example, consider
+/// the following instruction sequence where each instruction has unit latency
+/// and defines an eponymous virtual register:
+///
+///   a->b(a,c)->c(b)->d(c)->exit
+///
+/// The cyclic critical path is two cycles: b->c->b
+/// The acyclic critical path is four cycles: a->b->c->d->exit
+/// LiveOutHeight = height(c) = len(c->d->exit) = 2
+/// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
+/// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
+/// LiveInDepth = depth(b) = len(a->b) = 1
+///
+/// LiveOutDepth - LiveInDepth = 3 - 1 = 2
+/// LiveInHeight - LiveOutHeight = 4 - 2 = 2
+/// CyclicCriticalPath = min(2, 2) = 2
+unsigned ScheduleDAGMI::computeCyclicCriticalPath() {
+  // This only applies to single-block loops.
+  if (!BB->isSuccessor(BB))
+    return 0;
+
+  unsigned MaxCyclicLatency = 0;
+  // Visit each live out vreg def to find def/use pairs that cross iterations.
+  ArrayRef<unsigned> LiveOuts = RPTracker.getPressure().LiveOutRegs;
+  for (ArrayRef<unsigned>::iterator RI = LiveOuts.begin(), RE = LiveOuts.end();
+       RI != RE; ++RI) {
+    unsigned Reg = *RI;
+    if (!TRI->isVirtualRegister(Reg))
+      continue;
+    const LiveInterval &LI = LIS->getInterval(Reg);
+    const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
+    if (!DefVNI)
+      continue;
+
+    MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
+    const SUnit *DefSU = getSUnit(DefMI);
+    if (!DefSU)
+      continue;
+
+    unsigned LiveOutHeight = DefSU->getHeight();
+    unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
+    // Visit all local users of the vreg def.
+    for (VReg2UseMap::iterator
+           UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
+      if (UI->SU == &ExitSU)
+        continue;
+
+      // Only consider uses of the phi.
+      LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(UI->SU->getInstr()));
+      if (!LRQ.valueIn()->isPHIDef())
+        continue;
+
+      // Assume that a path spanning two iterations is a cycle, which could
+      // overestimate in strange cases. This allows cyclic latency to be
+      // estimated as the minimum slack of the vreg's depth or height.
+      unsigned CyclicLatency = 0;
+      if (LiveOutDepth > UI->SU->getDepth())
+        CyclicLatency = LiveOutDepth - UI->SU->getDepth();
+
+      unsigned LiveInHeight = UI->SU->getHeight() + DefSU->Latency;
+      if (LiveInHeight > LiveOutHeight) {
+        if (LiveInHeight - LiveOutHeight < CyclicLatency)
+          CyclicLatency = LiveInHeight - LiveOutHeight;
+      }
+      else
+        CyclicLatency = 0;
+
+      DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
+            << UI->SU->NodeNum << ") = " << CyclicLatency << "c\n");
+      if (CyclicLatency > MaxCyclicLatency)
+        MaxCyclicLatency = CyclicLatency;
+    }
+  }
+  DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
+  return MaxCyclicLatency;
+}
+
 /// Identify DAG roots and setup scheduler queues.
 void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
                                ArrayRef<SUnit*> BotRoots) {
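As a reading aid, the worked example in the new comment can be checked with a short standalone C++ sketch. This is not part of the commit: the depth and height values are transcribed by hand from the unit-latency chain a->b(a,c)->c(b)->d(c)->exit, and the slack computation mirrors the loop body above (the std::min form is equivalent to the nested-if form in the committed code).

#include <algorithm>
#include <iostream>

int main() {
  // Values from the comment's example, where every instruction has unit
  // latency: depth(x) = longest path from entry to x, height(x) = longest
  // path from x to exit.
  unsigned DefDepth = 2, DefHeight = 2, DefLatency = 1; // def of c
  unsigned UseDepth = 1, UseHeight = 3;                 // use of c in b

  unsigned LiveOutHeight = DefHeight;              // height(c)     = 2
  unsigned LiveOutDepth  = DefDepth + DefLatency;  // depth(c) + 1  = 3
  unsigned LiveInHeight  = UseHeight + DefLatency; // height(b) + 1 = 4
  unsigned LiveInDepth   = UseDepth;               // depth(b)      = 1

  // Same min-of-slacks rule as the scheduler loop above: treat a def-use
  // pair spanning two iterations as a cycle.
  unsigned CyclicLatency = 0;
  if (LiveOutDepth > LiveInDepth)
    CyclicLatency = LiveOutDepth - LiveInDepth;
  if (LiveInHeight > LiveOutHeight)
    CyclicLatency = std::min(CyclicLatency, LiveInHeight - LiveOutHeight);
  else
    CyclicLatency = 0;

  std::cout << "CyclicCriticalPath = " << CyclicLatency << "c\n"; // prints 2c
  return 0;
}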
@@ -1557,21 +1641,39 @@ void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
   Bot.releaseNode(SU, SU->BotReadyCycle);
 }

+/// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
+/// critical path by more cycles than it takes to drain the instruction buffer.
+/// We estimate an upper bound on in-flight instructions as:
+///
+///   CyclesPerIteration = max( CyclicPath, Loop-Resource-Height )
+///   InFlightIterations = AcyclicPath / CyclesPerIteration
+///   InFlightResources = InFlightIterations * LoopResources
+///
+/// TODO: Check execution resources in addition to IssueCount.
 void ConvergingScheduler::checkAcyclicLatency() {
   if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
     return;

+  // Scaled number of cycles per loop iteration.
+  unsigned IterCount =
+    std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(),
+             Rem.RemIssueCount);
+  // Scaled acyclic critical path.
+  unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor();
+  // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop
+  unsigned InFlightCount =
+    (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount;
   unsigned BufferLimit =
     SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
-  unsigned LatencyLag = Rem.CriticalPath - Rem.CyclicCritPath;
-  Rem.IsAcyclicLatencyLimited =
-    (LatencyLag * SchedModel->getLatencyFactor()) > BufferLimit;

-  DEBUG(dbgs() << "BufferLimit " << BufferLimit << "u / "
-        << Rem.RemIssueCount << "u = "
-        << (BufferLimit + Rem.RemIssueCount) / Rem.RemIssueCount << " iters. "
-        << "Latency = " << LatencyLag << "c = "
-        << LatencyLag * SchedModel->getLatencyFactor() << "u\n";
+  Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
+
+  DEBUG(dbgs() << "IssueCycles="
+        << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
+        << "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
+        << "c NumIters=" << (AcyclicCount + IterCount-1) / IterCount
+        << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
+        << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
         if (Rem.IsAcyclicLatencyLimited)
           dbgs() << "  ACYCLIC LATENCY LIMIT\n");
 }
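The buffer heuristic in the new checkAcyclicLatency can likewise be tried in isolation. The sketch below is illustrative only: the scale factors and counts are made-up stand-ins for what TargetSchedModel and the scheduler's Rem state would provide, not values from any real subtarget.

#include <algorithm>
#include <iostream>

int main() {
  unsigned LatencyFactor = 1, MicroOpFactor = 1; // scale factors, assumed 1
  unsigned CriticalPath = 12;     // acyclic critical path, in cycles (assumed)
  unsigned CyclicCritPath = 2;    // cyclic critical path, in cycles (assumed)
  unsigned RemIssueCount = 8;     // scaled micro-ops per iteration (assumed)
  unsigned MicroOpBufferSize = 4; // hardware buffer, in micro-ops (assumed)

  if (CyclicCritPath == 0 || CyclicCritPath >= CriticalPath)
    return 0; // nothing to check: latency is already bounded by the cycle

  // Scaled cycles per loop iteration: the longer of the cyclic path and the
  // issue height of one iteration.
  unsigned IterCount = std::max(CyclicCritPath * LatencyFactor, RemIssueCount);
  unsigned AcyclicCount = CriticalPath * LatencyFactor;
  // InFlightCount = ceil(AcyclicPath / IterCycles) * InstrPerLoop
  unsigned InFlightCount =
      (AcyclicCount * RemIssueCount + IterCount - 1) / IterCount;
  unsigned BufferLimit = MicroOpBufferSize * MicroOpFactor;

  // Limited when the micro-ops that must be in flight to hide the latency
  // gap exceed what the buffer can hold. Here: InFlight=12u > BufferLim=4u.
  std::cout << "InFlight=" << InFlightCount << "u BufferLim=" << BufferLimit
            << "u -> AcyclicLatencyLimited="
            << (InFlightCount > BufferLimit ? "true" : "false") << "\n";
  return 0;
}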
@@ -1579,10 +1681,6 @@ void ConvergingScheduler::checkAcyclicLatency() {
 void ConvergingScheduler::registerRoots() {
   Rem.CriticalPath = DAG->ExitSU.getDepth();

-  if (EnableCyclicPath) {
-    Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
-    checkAcyclicLatency();
-  }
   // Some roots may not feed into ExitSU. Check all of them in case.
   for (std::vector<SUnit*>::const_iterator
          I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
@@ -1590,6 +1688,11 @@ void ConvergingScheduler::registerRoots() {
       Rem.CriticalPath = (*I)->getDepth();
   }
   DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');
+
+  if (EnableCyclicPath) {
+    Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
+    checkAcyclicLatency();
+  }
 }

 /// Does this SU have a hazard within the current instruction group.
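The reordering in registerRoots matters because checkAcyclicLatency compares Rem.CyclicCritPath against the finished Rem.CriticalPath. A simplified sketch of the corrected flow, with the scheduler's types reduced to plain integers and all names illustrative:

#include <algorithm>
#include <vector>

// Sketch: CriticalPath starts at ExitSU's depth and is widened by bottom
// roots that do not feed into ExitSU, so the cyclic-path check (which
// compares against CriticalPath) has to run after this maximum is taken.
unsigned finalCriticalPath(unsigned ExitDepth,
                           const std::vector<unsigned> &BotRootDepths) {
  unsigned CriticalPath = ExitDepth;
  for (unsigned Depth : BotRootDepths)
    CriticalPath = std::max(CriticalPath, Depth);
  return CriticalPath; // only now is it safe to run checkAcyclicLatency()
}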
@@ -987,65 +987,6 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
   PendingLoads.clear();
 }

-/// Compute the max cyclic critical path through the DAG. For loops that span
-/// basic blocks, MachineTraceMetrics should be used for this instead.
-unsigned ScheduleDAGInstrs::computeCyclicCriticalPath() {
-  // This only applies to single block loop.
-  if (!BB->isSuccessor(BB))
-    return 0;
-
-  unsigned MaxCyclicLatency = 0;
-  // Visit each live out vreg def to find def/use pairs that cross iterations.
-  for (SUnit::const_pred_iterator
-         PI = ExitSU.Preds.begin(), PE = ExitSU.Preds.end(); PI != PE; ++PI) {
-    MachineInstr *MI = PI->getSUnit()->getInstr();
-    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-      const MachineOperand &MO = MI->getOperand(i);
-      if (!MO.isReg() || !MO.isDef())
-        break;
-      unsigned Reg = MO.getReg();
-      if (!Reg || TRI->isPhysicalRegister(Reg))
-        continue;
-
-      const LiveInterval &LI = LIS->getInterval(Reg);
-      unsigned LiveOutHeight = PI->getSUnit()->getHeight();
-      unsigned LiveOutDepth = PI->getSUnit()->getDepth() + PI->getLatency();
-      // Visit all local users of the vreg def.
-      for (VReg2UseMap::iterator
-             UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
-        if (UI->SU == &ExitSU)
-          continue;
-
-        // Only consider uses of the phi.
-        LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(UI->SU->getInstr()));
-        if (!LRQ.valueIn()->isPHIDef())
-          continue;
-
-        // Cheat a bit and assume that a path spanning two iterations is a
-        // cycle, which could overestimate in strange cases. This allows cyclic
-        // latency to be estimated as the minimum height or depth slack.
-        unsigned CyclicLatency = 0;
-        if (LiveOutDepth > UI->SU->getDepth())
-          CyclicLatency = LiveOutDepth - UI->SU->getDepth();
-        unsigned LiveInHeight = UI->SU->getHeight() + PI->getLatency();
-        if (LiveInHeight > LiveOutHeight) {
-          if (LiveInHeight - LiveOutHeight < CyclicLatency)
-            CyclicLatency = LiveInHeight - LiveOutHeight;
-        }
-        else
-          CyclicLatency = 0;
-        DEBUG(dbgs() << "Cyclic Path: SU(" << PI->getSUnit()->NodeNum
-              << ") -> SU(" << UI->SU->NodeNum << ") = "
-              << CyclicLatency << "\n");
-        if (CyclicLatency > MaxCyclicLatency)
-          MaxCyclicLatency = CyclicLatency;
-      }
-    }
-  }
-  DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "\n");
-  return MaxCyclicLatency;
-}
-
 void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   SU->getInstr()->dump();