[AMDGPU] Add GCNMaxILPSchedStrategy

Creates a new scheduling strategy that attempts to maximize ILP for a single
wave.
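
The strategy is selected through the existing -misched machinery, e.g.
(mirroring the RUN lines updated below; the input file name is a placeholder):

  llc -march=amdgcn -mcpu=tonga -misched=gcn-max-ilp -verify-machineinstrs < kernel.ll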

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D130869
Author: Austin Kerbow
Date:   2022-07-30 07:40:11 -07:00
Parent: ce6aff8d13
Commit: d7100b398b

7 changed files with 266 additions and 74 deletions

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

@@ -426,6 +426,15 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   return DAG;
 }
 
+static ScheduleDAGInstrs *
+createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
+  ScheduleDAGMILive *DAG =
+      new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
+  DAG->addMutation(createIGroupLPDAGMutation());
+  DAG->addMutation(createSchedBarrierDAGMutation());
+  return DAG;
+}
+
 static ScheduleDAGInstrs *
 createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
@@ -464,19 +473,23 @@ GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                              createGCNMaxOccupancyMachineScheduler);
 
 static MachineSchedRegistry
-IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
-  "Run GCN scheduler to maximize occupancy (experimental)",
-  createIterativeGCNMaxOccupancyMachineScheduler);
+    GCNMaxILPSchedRegistry("gcn-max-ilp", "Run GCN scheduler to maximize ilp",
+                           createGCNMaxILPMachineScheduler);
 
-static MachineSchedRegistry
-GCNMinRegSchedRegistry("gcn-minreg",
-  "Run GCN iterative scheduler for minimal register usage (experimental)",
-  createMinRegScheduler);
+static MachineSchedRegistry IterativeGCNMaxOccupancySchedRegistry(
+    "gcn-iterative-max-occupancy-experimental",
+    "Run GCN scheduler to maximize occupancy (experimental)",
+    createIterativeGCNMaxOccupancyMachineScheduler);
 
-static MachineSchedRegistry
-GCNILPSchedRegistry("gcn-ilp",
-  "Run GCN iterative scheduler for ILP scheduling (experimental)",
-  createIterativeILPMachineScheduler);
+static MachineSchedRegistry GCNMinRegSchedRegistry(
+    "gcn-iterative-minreg",
+    "Run GCN iterative scheduler for minimal register usage (experimental)",
+    createMinRegScheduler);
+
+static MachineSchedRegistry GCNILPSchedRegistry(
+    "gcn-iterative-ilp",
+    "Run GCN iterative scheduler for ILP scheduling (experimental)",
+    createIterativeILPMachineScheduler);
 
 static StringRef computeDataLayout(const Triple &TT) {
   if (TT.getArch() == Triple::r600) {

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

@@ -38,12 +38,11 @@ cl::opt<bool>
                  "reduction scheduling stage."),
         cl::init(false));
 
-GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
-    const MachineSchedContext *C)
+GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
     : GenericScheduler(C), TargetOccupancy(0), MF(nullptr),
       HasHighPressure(false) {}
 
-void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
+void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
   GenericScheduler::initialize(DAG);
 
   MF = &DAG->MF;
@@ -74,8 +73,9 @@ void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
   VGPRExcessLimit = std::min(VGPRExcessLimit - ErrorMargin, VGPRExcessLimit);
 }
 
-void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
-                                     bool AtTop, const RegPressureTracker &RPTracker,
+void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
+                                     bool AtTop,
+                                     const RegPressureTracker &RPTracker,
                                      const SIRegisterInfo *SRI,
                                      unsigned SGPRPressure,
                                      unsigned VGPRPressure) {
@@ -161,7 +161,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
 
 // This function is mostly cut and pasted from
 // GenericScheduler::pickNodeFromQueue()
-void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
+void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                          const CandPolicy &ZonePolicy,
                                          const RegPressureTracker &RPTracker,
                                          SchedCandidate &Cand) {
@@ -181,7 +181,7 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                   SGPRPressure, VGPRPressure);
     // Pass SchedBoundary only when comparing nodes from the same boundary.
     SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
-    GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
+    tryCandidate(Cand, TryCand, ZoneArg);
     if (TryCand.Reason != NoCand) {
       // Initialize resource delta if needed in case future heuristics query it.
       if (TryCand.ResDelta == SchedResourceDelta())
@@ -194,7 +194,7 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
 
 // This function is mostly cut and pasted from
 // GenericScheduler::pickNodeBidirectional()
-SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
+SUnit *GCNSchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
   // Schedule as far as possible in the direction of no choice. This is most
   // efficient, but also provides the best heuristics for CriticalPSets.
   if (SUnit *SU = Bot.pickOnlyChoice()) {
@@ -259,7 +259,7 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
              dbgs() << "Bot Cand: "; traceCandidate(BotCand););
     SchedCandidate Cand = BotCand;
     TopCand.Reason = NoCand;
-    GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
+    tryCandidate(Cand, TopCand, nullptr);
     if (TopCand.Reason != NoCand) {
       Cand.setBest(TopCand);
     }
@@ -271,7 +271,7 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
 
 // This function is mostly cut and pasted from
 // GenericScheduler::pickNode()
-SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
+SUnit *GCNSchedStrategy::pickNode(bool &IsTopNode) {
   if (DAG->top() == DAG->bottom()) {
     assert(Top.Available.empty() && Top.Pending.empty() &&
            Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
@@ -314,6 +314,129 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
   return SU;
 }
 
+GCNSchedStageID GCNSchedStrategy::getCurrentStage() {
+  assert(CurrentStage && CurrentStage != SchedStages.end());
+  return *CurrentStage;
+}
+
+bool GCNSchedStrategy::advanceStage() {
+  assert(CurrentStage != SchedStages.end());
+
+  if (!CurrentStage)
+    CurrentStage = SchedStages.begin();
+  else
+    CurrentStage++;
+
+  return CurrentStage != SchedStages.end();
+}
+
+bool GCNSchedStrategy::hasNextStage() const {
+  assert(CurrentStage);
+  return std::next(CurrentStage) != SchedStages.end();
+}
+
+GCNSchedStageID GCNSchedStrategy::getNextStage() const {
+  assert(CurrentStage && std::next(CurrentStage) != SchedStages.end());
+  return *std::next(CurrentStage);
+}
+
+GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
+    const MachineSchedContext *C)
+    : GCNSchedStrategy(C) {
+  SchedStages.push_back(GCNSchedStageID::OccInitialSchedule);
+  SchedStages.push_back(GCNSchedStageID::UnclusteredHighRPReschedule);
+  SchedStages.push_back(GCNSchedStageID::ClusteredLowOccupancyReschedule);
+  SchedStages.push_back(GCNSchedStageID::PreRARematerialize);
+}
+
+GCNMaxILPSchedStrategy::GCNMaxILPSchedStrategy(const MachineSchedContext *C)
+    : GCNSchedStrategy(C) {
+  SchedStages.push_back(GCNSchedStageID::ILPInitialSchedule);
+}
+
+bool GCNMaxILPSchedStrategy::tryCandidate(SchedCandidate &Cand,
+                                          SchedCandidate &TryCand,
+                                          SchedBoundary *Zone) const {
+  // Initialize the candidate if needed.
+  if (!Cand.isValid()) {
+    TryCand.Reason = NodeOrder;
+    return true;
+  }
+
+  // Avoid spilling by exceeding the register limit.
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
+                  RegExcess, TRI, DAG->MF))
+    return TryCand.Reason != NoCand;
+
+  // Bias PhysReg Defs and copies to their uses and defined respectively.
+  if (tryGreater(biasPhysReg(TryCand.SU, TryCand.AtTop),
+                 biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
+    return TryCand.Reason != NoCand;
+
+  bool SameBoundary = Zone != nullptr;
+  if (SameBoundary) {
+    // Prioritize instructions that read unbuffered resources by stall cycles.
+    if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
+                Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+      return TryCand.Reason != NoCand;
+
+    // Avoid critical resource consumption and balance the schedule.
+    TryCand.initResourceDelta(DAG, SchedModel);
+    if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+                TryCand, Cand, ResourceReduce))
+      return TryCand.Reason != NoCand;
+    if (tryGreater(TryCand.ResDelta.DemandedResources,
+                   Cand.ResDelta.DemandedResources, TryCand, Cand,
+                   ResourceDemand))
+      return TryCand.Reason != NoCand;
+
+    // Unconditionally try to reduce latency.
+    if (tryLatency(TryCand, Cand, *Zone))
+      return TryCand.Reason != NoCand;
+
+    // Weak edges are for clustering and other constraints.
+    if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
+                getWeakLeft(Cand.SU, Cand.AtTop), TryCand, Cand, Weak))
+      return TryCand.Reason != NoCand;
+  }
+
+  // Keep clustered nodes together to encourage downstream peephole
+  // optimizations which may reduce resource requirements.
+  //
+  // This is a best effort to set things up for a post-RA pass. Optimizations
+  // like generating loads of multiple registers should ideally be done within
+  // the scheduler pass by combining the loads during DAG postprocessing.
+  const SUnit *CandNextClusterSU =
+      Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+  const SUnit *TryCandNextClusterSU =
+      TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+  if (tryGreater(TryCand.SU == TryCandNextClusterSU,
+                 Cand.SU == CandNextClusterSU, TryCand, Cand, Cluster))
+    return TryCand.Reason != NoCand;
+
+  // Avoid increasing the max critical pressure in the scheduled region.
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
+                  TryCand, Cand, RegCritical, TRI, DAG->MF))
+    return TryCand.Reason != NoCand;
+
+  // Avoid increasing the max pressure of the entire region.
+  if (DAG->isTrackingPressure() &&
+      tryPressure(TryCand.RPDelta.CurrentMax, Cand.RPDelta.CurrentMax, TryCand,
+                  Cand, RegMax, TRI, DAG->MF))
+    return TryCand.Reason != NoCand;
+
+  if (SameBoundary) {
+    // Fall through to original instruction order.
+    if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
+        (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
+      TryCand.Reason = NodeOrder;
+      return true;
+    }
+  }
+
+  return false;
+}
+
 GCNScheduleDAGMILive::GCNScheduleDAGMILive(
     MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
     : ScheduleDAGMILive(C, std::move(S)), ST(MF.getSubtarget<GCNSubtarget>()),
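
Note on the tryCandidate override added above: like
GenericScheduler::tryCandidate, it is a chain of tie-breakers in which each
heuristic either decides the comparison and returns, or falls through to the
next one, so the order of the checks is the policy. Here tryLatency runs
ahead of the clustering and critical/max register-pressure checks, biasing
picks toward latency hiding at the cost of pressure. A minimal standalone
sketch of that fall-through pattern (toy types and fields, not LLVM code):

  #include <cstdio>

  // Toy candidate: lower is better for every field.
  struct Cand {
    int RegExcess; // pressure above the spill limit
    int Stall;     // stall cycles waiting on operands
    int Latency;   // remaining critical-path latency
    int NodeNum;   // original instruction order
  };

  // Returns true if B should replace A as the current best pick. Each check
  // either decides and returns, or falls through, so moving a check earlier
  // (as this patch does with latency) changes the whole policy.
  bool tryCandidate(const Cand &A, const Cand &B) {
    if (B.RegExcess != A.RegExcess) // 1. avoid spilling first
      return B.RegExcess < A.RegExcess;
    if (B.Stall != A.Stall)         // 2. then avoid stalls
      return B.Stall < A.Stall;
    if (B.Latency != A.Latency)     // 3. then reduce latency (the ILP bias)
      return B.Latency < A.Latency;
    return B.NodeNum < A.NodeNum;   // 4. finally keep source order
  }

  int main() {
    Cand A{0, 2, 9, 0}, B{0, 2, 4, 1};
    printf("pick B? %d\n", tryCandidate(A, B)); // prints: pick B? 1
  }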
@@ -323,6 +446,22 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(
   LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
 }
 
+std::unique_ptr<GCNSchedStage>
+GCNScheduleDAGMILive::createSchedStage(GCNSchedStageID SchedStageID) {
+  switch (SchedStageID) {
+  case GCNSchedStageID::OccInitialSchedule:
+    return std::make_unique<OccInitialScheduleStage>(SchedStageID, *this);
+  case GCNSchedStageID::UnclusteredHighRPReschedule:
+    return std::make_unique<UnclusteredHighRPStage>(SchedStageID, *this);
+  case GCNSchedStageID::ClusteredLowOccupancyReschedule:
+    return std::make_unique<ClusteredLowOccStage>(SchedStageID, *this);
+  case GCNSchedStageID::PreRARematerialize:
+    return std::make_unique<PreRARematStage>(SchedStageID, *this);
+  case GCNSchedStageID::ILPInitialSchedule:
+    return std::make_unique<ILPInitialScheduleStage>(SchedStageID, *this);
+  }
+}
+
 void GCNScheduleDAGMILive::schedule() {
   // Collect all scheduling regions. The actual scheduling is performed in
   // GCNScheduleDAGMILive::finalizeSchedule.
@@ -439,18 +578,13 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
 
 void GCNScheduleDAGMILive::runSchedStages() {
   LLVM_DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n");
 
-  InitialScheduleStage S0(GCNSchedStageID::InitialSchedule, *this);
-  UnclusteredHighRPStage S1(GCNSchedStageID::UnclusteredHighRPReschedule,
-                            *this);
-  ClusteredLowOccStage S2(GCNSchedStageID::ClusteredLowOccupancyReschedule,
-                          *this);
-  PreRARematStage S3(GCNSchedStageID::PreRARematerialize, *this);
-  GCNSchedStage *SchedStages[] = {&S0, &S1, &S2, &S3};
-
   if (!Regions.empty())
     BBLiveInMap = getBBLiveInMap();
 
-  for (auto *Stage : SchedStages) {
+  GCNSchedStrategy &S = static_cast<GCNSchedStrategy &>(*SchedImpl);
+  while (S.advanceStage()) {
+    auto Stage = createSchedStage(S.getCurrentStage());
     if (!Stage->initGCNSchedStage())
       continue;
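
The loop above replaces the fixed S0-S3 stage array with stages supplied by
the strategy. SmallVector's iterator is a raw pointer, which is why the new
CurrentStage member can start out as nullptr, a "not started yet" sentinel.
A standalone sketch of the same advance-then-create driver shape
(hypothetical names, plain pointers instead of SmallVector):

  #include <cstdio>

  enum class StageID { OccInitial, UnclusteredHighRP, ClusteredLowOcc, PreRARemat };

  int main() {
    const StageID Stages[] = {StageID::OccInitial, StageID::UnclusteredHighRP,
                              StageID::ClusteredLowOcc, StageID::PreRARemat};
    const StageID *End = Stages + 4;
    const StageID *Current = nullptr; // nullptr = before the first stage

    // advanceStage(): the first call starts at the first stage, later calls
    // step forward; returns true while a stage remains to run.
    auto AdvanceStage = [&] {
      Current = Current ? Current + 1 : Stages;
      return Current != End;
    };

    int Ran = 0;
    while (AdvanceStage()) {
      // The real code builds a stage object here via
      // createSchedStage(S.getCurrentStage()) and runs it.
      ++Ran;
    }
    printf("ran %d stages\n", Ran); // prints: ran 4 stages
  }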
@@ -475,8 +609,8 @@ void GCNScheduleDAGMILive::runSchedStages() {
 #ifndef NDEBUG
 raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
   switch (StageID) {
-  case GCNSchedStageID::InitialSchedule:
-    OS << "Initial Schedule";
+  case GCNSchedStageID::OccInitialSchedule:
+    OS << "Max Occupancy Initial Schedule";
     break;
   case GCNSchedStageID::UnclusteredHighRPReschedule:
     OS << "Unclustered High Register Pressure Reschedule";
@@ -487,14 +621,18 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const GCNSchedStageID &StageID) {
   case GCNSchedStageID::PreRARematerialize:
     OS << "Pre-RA Rematerialize";
     break;
+  case GCNSchedStageID::ILPInitialSchedule:
+    OS << "Max ILP Initial Schedule";
+    break;
   }
 
   return OS;
 }
 #endif
 
 GCNSchedStage::GCNSchedStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
-    : DAG(DAG), S(static_cast<GCNMaxOccupancySchedStrategy &>(*DAG.SchedImpl)),
-      MF(DAG.MF), MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
+    : DAG(DAG), S(static_cast<GCNSchedStrategy &>(*DAG.SchedImpl)), MF(DAG.MF),
+      MFI(DAG.MFI), ST(DAG.ST), StageID(StageID) {}
 
 bool GCNSchedStage::initGCNSchedStage() {
   if (!DAG.LIS)
@@ -564,6 +702,7 @@ bool PreRARematStage::initGCNSchedStage() {
   // inbetween the defs and region we sinked the def to. Cached pressure
   // for regions where a def is sinked from will also be invalidated. Will
   // need to be fixed if there is another pass after this pass.
+  assert(!S.hasNextStage());
 
   collectRematerializableInstructions();
   if (RematerializableInsts.empty() || !sinkTriviallyRematInsts(ST, TII))
@@ -674,7 +813,7 @@ void GCNSchedStage::setupNewBlock() {
   DAG.startBlock(CurrentMBB);
 
   // Get real RP for the region if it hasn't be calculated before. After the
   // initial schedule stage real RP will be collected after scheduling.
-  if (StageID == GCNSchedStageID::InitialSchedule)
+  if (StageID == GCNSchedStageID::OccInitialSchedule)
     DAG.computeBlockPressure(RegionIdx, CurrentMBB);
 }
@@ -767,7 +906,7 @@ bool GCNSchedStage::shouldRevertScheduling(unsigned WavesAfter) {
   return false;
 }
 
-bool InitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
+bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
   if (GCNSchedStage::shouldRevertScheduling(WavesAfter))
     return true;
@@ -810,6 +949,13 @@ bool PreRARematStage::shouldRevertScheduling(unsigned WavesAfter) {
   return false;
 }
 
+bool ILPInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
+  if (mayCauseSpilling(WavesAfter))
+    return true;
+
+  return false;
+}
+
 bool GCNSchedStage::mayCauseSpilling(unsigned WavesAfter) {
   if (WavesAfter <= MFI.getMinWavesPerEU() &&
       !PressureAfter.less(ST, PressureBefore) &&
@@ -826,7 +972,8 @@ void GCNSchedStage::revertScheduling() {
       PressureBefore.getOccupancy(ST) == DAG.MinOccupancy;
   LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
   DAG.RescheduleRegions[RegionIdx] =
-      (nextStage(StageID)) != GCNSchedStageID::UnclusteredHighRPReschedule;
+      S.hasNextStage() &&
+      S.getNextStage() != GCNSchedStageID::UnclusteredHighRPReschedule;
   DAG.RegionEnd = DAG.RegionBegin;
   int SkippedDebugInstr = 0;
   for (MachineInstr *MI : Unsched) {

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

@@ -22,12 +22,25 @@ namespace llvm {
 class SIMachineFunctionInfo;
 class SIRegisterInfo;
 class GCNSubtarget;
+class GCNSchedStage;
+
+enum class GCNSchedStageID : unsigned {
+  OccInitialSchedule = 0,
+  UnclusteredHighRPReschedule = 1,
+  ClusteredLowOccupancyReschedule = 2,
+  PreRARematerialize = 3,
+  ILPInitialSchedule = 4
+};
+
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
+#endif
 
 /// This is a minimal scheduler strategy. The main difference between this
 /// and the GenericScheduler is that GCNSchedStrategy uses different
-/// heuristics to determine excess/critical pressure sets. Its goal is to
-/// maximize kernel occupancy (i.e. maximum number of waves per simd).
-class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
+/// heuristics to determine excess/critical pressure sets.
+class GCNSchedStrategy : public GenericScheduler {
+protected:
   SUnit *pickNodeBidirectional(bool &IsTopNode);
 
   void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
@@ -51,6 +64,12 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
 
   MachineFunction *MF;
 
+  // Scheduling stages for this strategy.
+  SmallVector<GCNSchedStageID, 4> SchedStages;
+
+  // Pointer to the current SchedStageID.
+  SmallVectorImpl<GCNSchedStageID>::iterator CurrentStage = nullptr;
+
 public:
   // schedule() have seen register pressure over the critical limits and had to
   // track register pressure for actual scheduling heuristics.
@@ -69,7 +88,7 @@
   unsigned VGPRCriticalLimit;
 
-  GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
+  GCNSchedStrategy(const MachineSchedContext *C);
 
   SUnit *pickNode(bool &IsTopNode) override;
@@ -78,40 +97,42 @@ public:
   unsigned getTargetOccupancy() { return TargetOccupancy; }
 
   void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
+
+  GCNSchedStageID getCurrentStage();
+
+  // Advances stage. Returns true if there are remaining stages.
+  bool advanceStage();
+
+  bool hasNextStage() const;
+
+  GCNSchedStageID getNextStage() const;
 };
 
-enum class GCNSchedStageID : unsigned {
-  InitialSchedule = 0,
-  UnclusteredHighRPReschedule = 1,
-  ClusteredLowOccupancyReschedule = 2,
-  PreRARematerialize = 3,
-  LastStage = PreRARematerialize
-};
-
-#ifndef NDEBUG
-raw_ostream &operator<<(raw_ostream &OS, const GCNSchedStageID &StageID);
-#endif
-
-inline GCNSchedStageID &operator++(GCNSchedStageID &Stage, int) {
-  assert(Stage != GCNSchedStageID::PreRARematerialize);
-  Stage = static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1);
-  return Stage;
-}
-
-inline GCNSchedStageID nextStage(const GCNSchedStageID Stage) {
-  return static_cast<GCNSchedStageID>(static_cast<unsigned>(Stage) + 1);
-}
-
-inline bool operator>(GCNSchedStageID &LHS, GCNSchedStageID &RHS) {
-  return static_cast<unsigned>(LHS) > static_cast<unsigned>(RHS);
-}
+/// The goal of this scheduling strategy is to maximize kernel occupancy (i.e.
+/// maximum number of waves per simd).
+class GCNMaxOccupancySchedStrategy final : public GCNSchedStrategy {
+public:
+  GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
+};
+
+/// The goal of this scheduling strategy is to maximize ILP for a single wave
+/// (i.e. latency hiding).
+class GCNMaxILPSchedStrategy final : public GCNSchedStrategy {
+protected:
+  bool tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+                    SchedBoundary *Zone) const override;
+
+public:
+  GCNMaxILPSchedStrategy(const MachineSchedContext *C);
+};
 
 class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
   friend class GCNSchedStage;
-  friend class InitialScheduleStage;
+  friend class OccInitialScheduleStage;
   friend class UnclusteredHighRPStage;
   friend class ClusteredLowOccStage;
   friend class PreRARematStage;
+  friend class ILPInitialScheduleStage;
 
   const GCNSubtarget &ST;
@@ -169,6 +190,8 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
 
   void runSchedStages();
 
+  std::unique_ptr<GCNSchedStage> createSchedStage(GCNSchedStageID SchedStageID);
+
 public:
   GCNScheduleDAGMILive(MachineSchedContext *C,
                        std::unique_ptr<MachineSchedStrategy> S);
@@ -183,7 +206,7 @@ class GCNSchedStage {
 protected:
   GCNScheduleDAGMILive &DAG;
 
-  GCNMaxOccupancySchedStrategy &S;
+  GCNSchedStrategy &S;
 
   MachineFunction &MF;
@@ -245,11 +268,11 @@ public:
   virtual ~GCNSchedStage() = default;
 };
 
-class InitialScheduleStage : public GCNSchedStage {
+class OccInitialScheduleStage : public GCNSchedStage {
 public:
   bool shouldRevertScheduling(unsigned WavesAfter) override;
 
-  InitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+  OccInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
       : GCNSchedStage(StageID, DAG) {}
 };
@@ -324,6 +347,14 @@ public:
       : GCNSchedStage(StageID, DAG) {}
 };
 
+class ILPInitialScheduleStage : public GCNSchedStage {
+public:
+  bool shouldRevertScheduling(unsigned WavesAfter) override;
+
+  ILPInitialScheduleStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
+      : GCNSchedStage(StageID, DAG) {}
+};
+
 } // End namespace llvm
 
 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H


@@ -1,4 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-iterative-ilp -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-max-ilp -verify-machineinstrs < %s | FileCheck %s
 
 ; CHECK: NumVgprs: {{[0-9][0-9][0-9]$}}


@@ -1,6 +1,6 @@
 ; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -enable-amdgpu-aa=0 -march=amdgcn -mcpu=tonga -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s
 
 ; We expect a two digit VGPR usage here, not a three digit.
 ; CHECK: NumVgprs: {{[0-9][0-9]$}}


@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MINREG %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MAXOCC %s
-; RUN: llc -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MINREG %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=SI-MAXOCC %s
+; RUN: llc -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-minreg -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -misched=gcn-iterative-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
 
 ; SI-MINREG: NumSgprs: {{[1-9]$}}
 ; SI-MINREG: NumVgprs: {{[1-9]$}}


@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=MISCHED %s
-; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-ilp -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-ILP %s
+; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-iterative-ilp -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-ILP %s
 
 ; Test the scheduler when only one wave is requested. The result should be high register usage and max ILP.