Increased the register pressure limit on x86_64 from 8 to 12 regs.

This is the only change in this checkin that may affect the
default scheduler. With better register tracking and heuristics, it
doesn't make sense to artificially lower the register limit so much.

Added -sched-high-latency-cycles and X86InstrInfo::isHighLatencyDef to
give the scheduler a way to account for div and sqrt on targets that
don't have an itinerary. It currently defaults to 10 (the actual
number doesn't matter much), but only takes effect on non-default
schedulers: list-hybrid and list-ilp.

Added several heuristics that can be individually disabled for the
non-default sched=list-ilp mode. This helps us determine how much
better we can do on a given benchmark than the default
scheduler. Certain compute intensive loops run much faster in this
mode with the right set of heuristics, and it doesn't seem to have
much negative impact elsewhere. Not all of the heuristics are needed,
but we still need to experiment to decide which should be disabled by
default for sched=list-ilp.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@127067 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Andrew Trick 2011-03-05 08:00:22 +00:00
parent a4a1c3f0d9
commit e0ef509aeb
6 changed files with 173 additions and 30 deletions

View File

@ -641,6 +641,10 @@ public:
virtual int getInstrLatency(const InstrItineraryData *ItinData, virtual int getInstrLatency(const InstrItineraryData *ItinData,
SDNode *Node) const; SDNode *Node) const;
/// isHighLatencyDef - Return true if this opcode has high latency to its
/// result.
bool isHighLatencyDef(int opc) const { return false; }
/// hasHighOperandLatency - Compute operand latency between a def of 'Reg' /// hasHighOperandLatency - Compute operand latency between a def of 'Reg'
/// and an use in the current loop, return true if the target considered /// and an use in the current loop, return true if the target considered
/// it 'high'. This is used by optimization passes such as machine LICM to /// it 'high'. This is used by optimization passes such as machine LICM to

View File

@ -70,6 +70,43 @@ static cl::opt<bool> DisableSchedCycles(
"disable-sched-cycles", cl::Hidden, cl::init(false), "disable-sched-cycles", cl::Hidden, cl::init(false),
cl::desc("Disable cycle-level precision during preRA scheduling")); cl::desc("Disable cycle-level precision during preRA scheduling"));
// Temporary sched=list-ilp flags until the heuristics are robust.
// Each Disable* flag turns off one priority heuristic so that its effect can
// be measured on its own; all of them default to false (heuristic enabled).

// Turns off the register-pressure-difference priority.
static cl::opt<bool> DisableSchedRegPressure(
  "disable-sched-reg-pressure", cl::Hidden, cl::init(false),
  cl::desc("Disable regpressure priority in sched=list-ilp"));
// Turns off the live-use-count priority.
static cl::opt<bool> DisableSchedLiveUses(
  "disable-sched-live-uses", cl::Hidden, cl::init(false),
  cl::desc("Disable live use priority in sched=list-ilp"));
// Turns off the no-stall priority.
static cl::opt<bool> DisableSchedStalls(
  "disable-sched-stalls", cl::Hidden, cl::init(false),
  cl::desc("Disable no-stall priority in sched=list-ilp"));
// Turns off the critical-path (depth) priority.
static cl::opt<bool> DisableSchedCriticalPath(
  "disable-sched-critical-path", cl::Hidden, cl::init(false),
  cl::desc("Disable critical path priority in sched=list-ilp"));
// Turns off the scheduled-height priority.
static cl::opt<bool> DisableSchedHeight(
  "disable-sched-height", cl::Hidden, cl::init(false),
  cl::desc("Disable scheduled-height priority in sched=list-ilp"));
// Depth difference beyond which the critical-path priority takes effect.
static cl::opt<int> MaxReorderWindow(
  "max-sched-reorder", cl::Hidden, cl::init(6),
  cl::desc("Number of instructions to allow ahead of the critical path "
           "in sched=list-ilp"));
// Instructions assumed to issue per cycle when the hazard recognizer is
// disabled; the default of 1 keeps the one-instruction-per-cycle behavior.
static cl::opt<unsigned> AvgIPC(
  "sched-avg-ipc", cl::Hidden, cl::init(1),
  cl::desc("Average inst/cycle when no target itinerary exists."));
#ifndef NDEBUG
namespace {
// For sched=list-ilp, count the number of times each factor comes into play.
// Each enumerator indexes FactorName and FactorCount below; NumFactors is the
// number of factors and sizes both arrays.
enum { FactPressureDiff, FactRegUses, FactHeight, FactDepth, FactUllman,
NumFactors };
}
// Printable label for each factor, in enumerator order.
static const char *FactorName[NumFactors] =
{"PressureDiff", "RegUses", "Height", "Depth","Ullman"};
// Per-factor tally. Schedule() zeroes it before scheduling and prints it with
// the matching FactorName afterwards; the comparison functions increment it.
static int FactorCount[NumFactors];
#endif //!NDEBUG
namespace { namespace {
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
/// ScheduleDAGRRList - The actual register reduction list scheduler /// ScheduleDAGRRList - The actual register reduction list scheduler
@ -103,6 +140,10 @@ private:
/// MinAvailableCycle - Cycle of the soonest available instruction. /// MinAvailableCycle - Cycle of the soonest available instruction.
unsigned MinAvailableCycle; unsigned MinAvailableCycle;
/// IssueCount - Count instructions issued in this cycle
/// Currently valid only for bottom-up scheduling.
unsigned IssueCount;
/// LiveRegDefs - A set of physical registers and their definition /// LiveRegDefs - A set of physical registers and their definition
/// that are "live". These nodes must be scheduled before any other nodes that /// that are "live". These nodes must be scheduled before any other nodes that
/// modifies the registers can be scheduled. /// modifies the registers can be scheduled.
@ -234,8 +275,14 @@ void ScheduleDAGRRList::Schedule() {
DEBUG(dbgs() DEBUG(dbgs()
<< "********** List Scheduling BB#" << BB->getNumber() << "********** List Scheduling BB#" << BB->getNumber()
<< " '" << BB->getName() << "' **********\n"); << " '" << BB->getName() << "' **********\n");
#ifndef NDEBUG
for (int i = 0; i < NumFactors; ++i) {
FactorCount[i] = 0;
}
#endif //!NDEBUG
CurCycle = 0; CurCycle = 0;
IssueCount = 0;
MinAvailableCycle = DisableSchedCycles ? 0 : UINT_MAX; MinAvailableCycle = DisableSchedCycles ? 0 : UINT_MAX;
NumLiveRegs = 0; NumLiveRegs = 0;
LiveRegDefs.resize(TRI->getNumRegs(), NULL); LiveRegDefs.resize(TRI->getNumRegs(), NULL);
@ -258,6 +305,11 @@ void ScheduleDAGRRList::Schedule() {
else else
ListScheduleTopDown(); ListScheduleTopDown();
#ifndef NDEBUG
for (int i = 0; i < NumFactors; ++i) {
DEBUG(dbgs() << FactorName[i] << "\t" << FactorCount[i] << "\n");
}
#endif // !NDEBUG
AvailableQueue->releaseState(); AvailableQueue->releaseState();
} }
@ -383,6 +435,7 @@ void ScheduleDAGRRList::AdvanceToCycle(unsigned NextCycle) {
if (NextCycle <= CurCycle) if (NextCycle <= CurCycle)
return; return;
IssueCount = 0;
AvailableQueue->setCurCycle(NextCycle); AvailableQueue->setCurCycle(NextCycle);
if (!HazardRec->isEnabled()) { if (!HazardRec->isEnabled()) {
// Bypass lots of virtual calls in case of long latency. // Bypass lots of virtual calls in case of long latency.
@ -502,10 +555,10 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
AvailableQueue->ScheduledNode(SU); AvailableQueue->ScheduledNode(SU);
// If HazardRec is disabled, count each inst as one cycle. // If HazardRec is disabled, and each inst counts as one cycle, then
// Advance CurCycle before ReleasePredecessors to avoid useles pushed to // advance CurCycle before ReleasePredecessors to avoid useles pushed to
// PendingQueue for schedulers that implement HasReadyFilter. // PendingQueue for schedulers that implement HasReadyFilter.
if (!HazardRec->isEnabled()) if (!HazardRec->isEnabled() && AvgIPC < 2)
AdvanceToCycle(CurCycle + 1); AdvanceToCycle(CurCycle + 1);
// Update liveness of predecessors before successors to avoid treating a // Update liveness of predecessors before successors to avoid treating a
@ -533,7 +586,9 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
// If HazardRec is disabled, the cycle was advanced earlier. // If HazardRec is disabled, the cycle was advanced earlier.
// //
// Check AvailableQueue after ReleasePredecessors in case of zero latency. // Check AvailableQueue after ReleasePredecessors in case of zero latency.
++IssueCount;
if ((HazardRec->isEnabled() && HazardRec->atIssueLimit()) if ((HazardRec->isEnabled() && HazardRec->atIssueLimit())
|| (!HazardRec->isEnabled() && AvgIPC > 1 && IssueCount == AvgIPC)
|| AvailableQueue->empty()) || AvailableQueue->empty())
AdvanceToCycle(CurCycle + 1); AdvanceToCycle(CurCycle + 1);
} }
@ -1458,7 +1513,9 @@ public:
bool HighRegPressure(const SUnit *SU) const; bool HighRegPressure(const SUnit *SU) const;
bool MayReduceRegPressure(SUnit *SU); bool MayReduceRegPressure(SUnit *SU) const;
int RegPressureDiff(SUnit *SU, unsigned &LiveUses) const;
void ScheduledNode(SUnit *SU); void ScheduledNode(SUnit *SU);
@ -1678,7 +1735,7 @@ bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const {
return false; return false;
} }
bool RegReductionPQBase::MayReduceRegPressure(SUnit *SU) { bool RegReductionPQBase::MayReduceRegPressure(SUnit *SU) const {
const SDNode *N = SU->getNode(); const SDNode *N = SU->getNode();
if (!N->isMachineOpcode() || !SU->NumSuccs) if (!N->isMachineOpcode() || !SU->NumSuccs)
@ -1696,6 +1753,53 @@ bool RegReductionPQBase::MayReduceRegPressure(SUnit *SU) {
return false; return false;
} }
// Compute the register pressure contribution of this instruction by counting
// up for uses that are not live and down for defs. Only register classes
// that are already under high pressure are counted. As a side effect, compute
// the number of uses of registers that are already live.
//
// SU       - the scheduling unit being evaluated.
// LiveUses - out parameter; reset to zero, then incremented once for every
//            data predecessor that is a machine opcode and whose
//            NumRegDefsLeft is zero.
// Returns the signed pressure difference: one up per not-yet-live register
// def of a data predecessor, one down per used def of this node, each counted
// only when its register class is at or above its limit.
//
// FIXME: This encompasses the logic in HighRegPressure and MayReduceRegPressure
// so could probably be factored.
int RegReductionPQBase::RegPressureDiff(SUnit *SU, unsigned &LiveUses) const {
  LiveUses = 0;
  int PDiff = 0;
  for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
       I != E; ++I) {
    // Control dependences carry no register value.
    if (I->isCtrl())
      continue;
    SUnit *PredSU = I->getSUnit();
    // NumRegDefsLeft is zero when enough uses of this node have been scheduled
    // to cover the number of registers defined (they are all live).
    if (PredSU->NumRegDefsLeft == 0) {
      // A scheduling unit may have no SDNode attached; treat that case like a
      // non-machine opcode instead of dereferencing a null pointer.
      const SDNode *PredN = PredSU->getNode();
      if (PredN && PredN->isMachineOpcode())
        ++LiveUses;
      continue;
    }
    // Scheduling SU would make these predecessor defs live: count each one
    // whose register class is already at or above its limit.
    for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG);
         RegDefPos.IsValid(); RegDefPos.Advance()) {
      EVT VT = RegDefPos.GetValue();
      unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
      if (RegPressure[RCId] >= RegLimit[RCId])
        ++PDiff;
    }
  }
  const SDNode *N = SU->getNode();

  // Nothing to subtract when there is no node, the node is not a machine
  // opcode, or nothing depends on this unit.
  if (!N || !N->isMachineOpcode() || !SU->NumSuccs)
    return PDiff;

  // Count down for each used def whose register class is under high pressure.
  unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
  for (unsigned i = 0; i != NumDefs; ++i) {
    EVT VT = N->getValueType(i);
    if (!N->hasAnyUseOfValue(i))
      continue;
    unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
    if (RegPressure[RCId] >= RegLimit[RCId])
      --PDiff;
  }
  return PDiff;
}
void RegReductionPQBase::ScheduledNode(SUnit *SU) { void RegReductionPQBase::ScheduledNode(SUnit *SU) {
if (!TracksRegPressure) if (!TracksRegPressure)
return; return;
@ -1998,9 +2102,10 @@ static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref,
static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) { static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
unsigned LPriority = SPQ->getNodePriority(left); unsigned LPriority = SPQ->getNodePriority(left);
unsigned RPriority = SPQ->getNodePriority(right); unsigned RPriority = SPQ->getNodePriority(right);
if (LPriority != RPriority) if (LPriority != RPriority) {
DEBUG(++FactorCount[FactUllman]);
return LPriority > RPriority; return LPriority > RPriority;
}
// Try schedule def + use closer when Sethi-Ullman numbers are the same. // Try schedule def + use closer when Sethi-Ullman numbers are the same.
// e.g. // e.g.
// t1 = op t2, c1 // t1 = op t2, c1
@ -2128,21 +2233,37 @@ bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
// No way to compute latency of calls. // No way to compute latency of calls.
return BURRSort(left, right, SPQ); return BURRSort(left, right, SPQ);
bool LHigh = SPQ->HighRegPressure(left); unsigned LLiveUses, RLiveUses;
bool RHigh = SPQ->HighRegPressure(right); int LPDiff = SPQ->RegPressureDiff(left, LLiveUses);
// Avoid causing spills. If register pressure is high, schedule for int RPDiff = SPQ->RegPressureDiff(right, RLiveUses);
// register pressure reduction. if (!DisableSchedRegPressure && LPDiff != RPDiff) {
if (LHigh && !RHigh) DEBUG(++FactorCount[FactPressureDiff]);
return true; return LPDiff > RPDiff;
else if (!LHigh && RHigh) }
return false;
else if (!LHigh && !RHigh) { if (!DisableSchedLiveUses && LLiveUses != RLiveUses) {
// Low register pressure situation, schedule to maximize instruction level DEBUG(dbgs() << "Live uses " << left->NodeNum << " = " << LLiveUses
// parallelism. << " != " << right->NodeNum << " = " << RLiveUses << "\n");
if (left->NumPreds > right->NumPreds) DEBUG(++FactorCount[FactRegUses]);
return false; return LLiveUses < RLiveUses;
else if (left->NumPreds < right->NumPreds) }
return true;
bool LStall = BUHasStall(left, left->getHeight(), SPQ);
bool RStall = BUHasStall(right, right->getHeight(), SPQ);
if (!DisableSchedStalls && LStall != RStall) {
DEBUG(++FactorCount[FactHeight]);
return left->getHeight() > right->getHeight();
}
if (!DisableSchedCriticalPath
&& abs((long)left->getDepth() - right->getDepth()) > MaxReorderWindow) {
DEBUG(++FactorCount[FactDepth]);
return left->getDepth() < right->getDepth();
}
if (!DisableSchedHeight && left->getHeight() != right->getHeight()) {
DEBUG(++FactorCount[FactHeight]);
return left->getHeight() > right->getHeight();
} }
return BURRSort(left, right, SPQ); return BURRSort(left, right, SPQ);

View File

@ -27,12 +27,21 @@
#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h" #include "llvm/ADT/Statistic.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h" #include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h" #include "llvm/Support/raw_ostream.h"
using namespace llvm; using namespace llvm;
STATISTIC(LoadsClustered, "Number of loads clustered together"); STATISTIC(LoadsClustered, "Number of loads clustered together");
// This allows the latency-based scheduler to notice high-latency instructions
// without a target itinerary. The choice of number here has more to do with
// balancing scheduler heuristics than with the actual machine latency.
static cl::opt<int> HighLatencyCycles(
  "sched-high-latency-cycles", cl::Hidden, cl::init(10),
  // Adjacent literals are concatenated verbatim, so the first one must end
  // with a space to keep the words of the help text separated.
  cl::desc("Roughly estimate the number of cycles that 'long latency' "
           "instructions take for targets with no itinerary"));
ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf) ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf)
: ScheduleDAG(mf), : ScheduleDAG(mf),
InstrItins(mf.getTarget().getInstrItineraryData()) {} InstrItins(mf.getTarget().getInstrItineraryData()) {}
@ -506,7 +515,10 @@ void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) {
} }
if (!InstrItins || InstrItins->isEmpty()) { if (!InstrItins || InstrItins->isEmpty()) {
SU->Latency = 1; if (SU->getNode() && TII->isHighLatencyDef(SU->getNode()->getOpcode()))
SU->Latency = HighLatencyCycles;
else
SU->Latency = 1;
return; return;
} }

View File

@ -1284,7 +1284,7 @@ X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC,
case X86::GR32RegClassID: case X86::GR32RegClassID:
return 4 - FPDiff; return 4 - FPDiff;
case X86::GR64RegClassID: case X86::GR64RegClassID:
return 8 - FPDiff; return 12 - FPDiff;
case X86::VR128RegClassID: case X86::VR128RegClassID:
return Subtarget->is64Bit() ? 10 : 4; return Subtarget->is64Bit() ? 10 : 4;
case X86::VR64RegClassID: case X86::VR64RegClassID:

View File

@ -3085,12 +3085,8 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
NopInst.setOpcode(X86::NOOP); NopInst.setOpcode(X86::NOOP);
} }
bool X86InstrInfo:: bool X86InstrInfo::isHighLatencyDef(int opc) const {
hasHighOperandLatency(const InstrItineraryData *ItinData, switch (opc) {
const MachineRegisterInfo *MRI,
const MachineInstr *DefMI, unsigned DefIdx,
const MachineInstr *UseMI, unsigned UseIdx) const {
switch (DefMI->getOpcode()) {
default: return false; default: return false;
case X86::DIVSDrm: case X86::DIVSDrm:
case X86::DIVSDrm_Int: case X86::DIVSDrm_Int:
@ -3120,6 +3116,14 @@ hasHighOperandLatency(const InstrItineraryData *ItinData,
} }
} }
/// hasHighOperandLatency - Report whether the defining instruction is a
/// high-latency def. Only DefMI's opcode is consulted, by forwarding it to
/// isHighLatencyDef; ItinData, MRI, DefIdx, UseMI and UseIdx are not used.
bool X86InstrInfo::
hasHighOperandLatency(const InstrItineraryData *ItinData,
const MachineRegisterInfo *MRI,
const MachineInstr *DefMI, unsigned DefIdx,
const MachineInstr *UseMI, unsigned UseIdx) const {
return isHighLatencyDef(DefMI->getOpcode());
}
namespace { namespace {
/// CGBR - Create Global Base Reg pass. This initializes the PIC /// CGBR - Create Global Base Reg pass. This initializes the PIC
/// global base register for x86-32. /// global base register for x86-32.

View File

@ -858,6 +858,8 @@ public:
const SmallVectorImpl<MachineOperand> &MOs, const SmallVectorImpl<MachineOperand> &MOs,
unsigned Size, unsigned Alignment) const; unsigned Size, unsigned Alignment) const;
bool isHighLatencyDef(int opc) const;
bool hasHighOperandLatency(const InstrItineraryData *ItinData, bool hasHighOperandLatency(const InstrItineraryData *ItinData,
const MachineRegisterInfo *MRI, const MachineRegisterInfo *MRI,
const MachineInstr *DefMI, unsigned DefIdx, const MachineInstr *DefMI, unsigned DefIdx,