mirror of
https://github.com/RPCS3/llvm.git
synced 2025-01-10 06:03:52 +00:00
MachineScheduler: Fully compare top/bottom candidates
In bidirectional scheduling, this gives more stable results than just comparing the "reason" fields of the top and bottom candidates, because the reason field may be higher depending on what other nodes are in the queue. Differential Revision: http://reviews.llvm.org/D19401 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@273755 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
c1fd19fd0b
commit
f011e37181
@ -763,7 +763,7 @@ class GenericSchedulerBase : public MachineSchedStrategy {
|
||||
public:
|
||||
/// Represent the type of SchedCandidate found within a single queue.
|
||||
/// pickNodeBidirectional depends on these listed by decreasing priority.
|
||||
enum CandReason {
|
||||
enum CandReason : uint8_t {
|
||||
NoCand, Only1, PhysRegCopy, RegExcess, RegCritical, Stall, Cluster, Weak,
|
||||
RegMax, ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
|
||||
TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};
|
||||
@ -811,8 +811,8 @@ public:
|
||||
// The reason for this candidate.
|
||||
CandReason Reason;
|
||||
|
||||
// Set of reasons that apply to multiple candidates.
|
||||
uint32_t RepeatReasonSet;
|
||||
// Whether this candidate should be scheduled at top/bottom.
|
||||
bool AtTop;
|
||||
|
||||
// Register pressure values for the best candidate.
|
||||
RegPressureDelta RPDelta;
|
||||
@ -821,7 +821,7 @@ public:
|
||||
SchedResourceDelta ResDelta;
|
||||
|
||||
SchedCandidate(const CandPolicy &policy)
|
||||
: Policy(policy), SU(nullptr), Reason(NoCand), RepeatReasonSet(0) {}
|
||||
: Policy(policy), SU(nullptr), Reason(NoCand), AtTop(false) {}
|
||||
|
||||
bool isValid() const { return SU; }
|
||||
|
||||
@ -830,13 +830,11 @@ public:
|
||||
assert(Best.Reason != NoCand && "uninitialized Sched candidate");
|
||||
SU = Best.SU;
|
||||
Reason = Best.Reason;
|
||||
AtTop = Best.AtTop;
|
||||
RPDelta = Best.RPDelta;
|
||||
ResDelta = Best.ResDelta;
|
||||
}
|
||||
|
||||
bool isRepeat(CandReason R) { return RepeatReasonSet & (1 << R); }
|
||||
void setRepeat(CandReason R) { RepeatReasonSet |= (1 << R); }
|
||||
|
||||
void initResourceDelta(const ScheduleDAGMI *DAG,
|
||||
const TargetSchedModel *SchedModel);
|
||||
};
|
||||
@ -913,11 +911,12 @@ protected:
|
||||
|
||||
void tryCandidate(SchedCandidate &Cand,
|
||||
SchedCandidate &TryCand,
|
||||
SchedBoundary &Zone);
|
||||
SchedBoundary *Zone);
|
||||
|
||||
SUnit *pickNodeBidirectional(bool &IsTopNode);
|
||||
|
||||
void pickNodeFromQueue(SchedBoundary &Zone,
|
||||
const CandPolicy &ZonePolicy,
|
||||
const RegPressureTracker &RPTracker,
|
||||
SchedCandidate &Candidate);
|
||||
|
||||
|
@ -2478,7 +2478,6 @@ static bool tryLess(int TryVal, int CandVal,
|
||||
Cand.Reason = Reason;
|
||||
return true;
|
||||
}
|
||||
Cand.setRepeat(Reason);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -2495,7 +2494,6 @@ static bool tryGreater(int TryVal, int CandVal,
|
||||
Cand.Reason = Reason;
|
||||
return true;
|
||||
}
|
||||
Cand.setRepeat(Reason);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -2529,9 +2527,8 @@ static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop) {
|
||||
<< GenericSchedulerBase::getReasonStr(Reason) << '\n');
|
||||
}
|
||||
|
||||
static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand,
|
||||
bool IsTop) {
|
||||
tracePick(Cand.Reason, IsTop);
|
||||
static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) {
|
||||
tracePick(Cand.Reason, Cand.AtTop);
|
||||
}
|
||||
|
||||
void GenericScheduler::initialize(ScheduleDAGMI *dag) {
|
||||
@ -2682,19 +2679,25 @@ static bool tryPressure(const PressureChange &TryP,
|
||||
GenericSchedulerBase::CandReason Reason,
|
||||
const TargetRegisterInfo *TRI,
|
||||
const MachineFunction &MF) {
|
||||
unsigned TryPSet = TryP.getPSetOrMax();
|
||||
unsigned CandPSet = CandP.getPSetOrMax();
|
||||
// If both candidates affect the same set, go with the smallest increase.
|
||||
if (TryPSet == CandPSet) {
|
||||
return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand,
|
||||
Reason);
|
||||
}
|
||||
// If one candidate decreases and the other increases, go with it.
|
||||
// Invalid candidates have UnitInc==0.
|
||||
if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
|
||||
Reason)) {
|
||||
return true;
|
||||
}
|
||||
// Do not compare the magnitude of pressure changes between top and bottom
|
||||
// boundary.
|
||||
if (Cand.AtTop != TryCand.AtTop)
|
||||
return false;
|
||||
|
||||
// If both candidates affect the same set in the same boundary, go with the
|
||||
// smallest increase.
|
||||
unsigned TryPSet = TryP.getPSetOrMax();
|
||||
unsigned CandPSet = CandP.getPSetOrMax();
|
||||
if (TryPSet == CandPSet) {
|
||||
return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand,
|
||||
Reason);
|
||||
}
|
||||
|
||||
int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) :
|
||||
std::numeric_limits<int>::max();
|
||||
@ -2745,6 +2748,7 @@ void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
|
||||
const RegPressureTracker &RPTracker,
|
||||
RegPressureTracker &TempTracker) {
|
||||
Cand.SU = SU;
|
||||
Cand.AtTop = AtTop;
|
||||
if (DAG->isTrackingPressure()) {
|
||||
if (AtTop) {
|
||||
TempTracker.getMaxDownwardPressureDelta(
|
||||
@ -2784,18 +2788,19 @@ void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
|
||||
///
|
||||
/// \param Cand provides the policy and current best candidate.
|
||||
/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
|
||||
/// \param Zone describes the scheduled zone that we are extending.
|
||||
/// \param Zone describes the scheduled zone that we are extending, or nullptr
|
||||
/// if Cand is from a different zone than TryCand.
|
||||
void GenericScheduler::tryCandidate(SchedCandidate &Cand,
|
||||
SchedCandidate &TryCand,
|
||||
SchedBoundary &Zone) {
|
||||
SchedBoundary *Zone) {
|
||||
// Initialize the candidate if needed.
|
||||
if (!Cand.isValid()) {
|
||||
TryCand.Reason = NodeOrder;
|
||||
return;
|
||||
}
|
||||
|
||||
if (tryGreater(biasPhysRegCopy(TryCand.SU, Zone.isTop()),
|
||||
biasPhysRegCopy(Cand.SU, Zone.isTop()),
|
||||
if (tryGreater(biasPhysRegCopy(TryCand.SU, TryCand.AtTop),
|
||||
biasPhysRegCopy(Cand.SU, Cand.AtTop),
|
||||
TryCand, Cand, PhysRegCopy))
|
||||
return;
|
||||
|
||||
@ -2813,17 +2818,26 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
|
||||
DAG->MF))
|
||||
return;
|
||||
|
||||
// For loops that are acyclic path limited, aggressively schedule for latency.
|
||||
// This can result in very long dependence chains scheduled in sequence, so
|
||||
// once every cycle (when CurrMOps == 0), switch to normal heuristics.
|
||||
if (Rem.IsAcyclicLatencyLimited && !Zone.getCurrMOps()
|
||||
&& tryLatency(TryCand, Cand, Zone))
|
||||
return;
|
||||
// We only compare a subset of features when comparing nodes between
|
||||
// Top and Bottom boundary. Some properties are simply incomparable, in many
|
||||
// other instances we should only override the other boundary if something
|
||||
// is a clear good pick on one boundary. Skip heuristics that are more
|
||||
// "tie-breaking" in nature.
|
||||
bool SameBoundary = Zone != nullptr;
|
||||
if (SameBoundary) {
|
||||
// For loops that are acyclic path limited, aggressively schedule for
|
||||
// latency. This can result in very long dependence chains scheduled in
|
||||
// sequence, so once every cycle (when CurrMOps == 0), switch to normal
|
||||
// heuristics.
|
||||
if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
|
||||
tryLatency(TryCand, Cand, *Zone))
|
||||
return;
|
||||
|
||||
// Prioritize instructions that read unbuffered resources by stall cycles.
|
||||
if (tryLess(Zone.getLatencyStallCycles(TryCand.SU),
|
||||
Zone.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
|
||||
return;
|
||||
// Prioritize instructions that read unbuffered resources by stall cycles.
|
||||
if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
|
||||
Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
|
||||
return;
|
||||
}
|
||||
|
||||
// Keep clustered nodes together to encourage downstream peephole
|
||||
// optimizations which may reduce resource requirements.
|
||||
@ -2831,18 +2845,23 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
|
||||
// This is a best effort to set things up for a post-RA pass. Optimizations
|
||||
// like generating loads of multiple registers should ideally be done within
|
||||
// the scheduler pass by combining the loads during DAG postprocessing.
|
||||
const SUnit *NextClusterSU =
|
||||
Zone.isTop() ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
|
||||
if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU,
|
||||
const SUnit *CandNextClusterSU =
|
||||
Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
|
||||
const SUnit *TryCandNextClusterSU =
|
||||
TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
|
||||
if (tryGreater(TryCand.SU == TryCandNextClusterSU,
|
||||
Cand.SU == CandNextClusterSU,
|
||||
TryCand, Cand, Cluster))
|
||||
return;
|
||||
|
||||
// Weak edges are for clustering and other constraints.
|
||||
if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()),
|
||||
getWeakLeft(Cand.SU, Zone.isTop()),
|
||||
TryCand, Cand, Weak)) {
|
||||
return;
|
||||
if (SameBoundary) {
|
||||
// Weak edges are for clustering and other constraints.
|
||||
if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
|
||||
getWeakLeft(Cand.SU, Cand.AtTop),
|
||||
TryCand, Cand, Weak))
|
||||
return;
|
||||
}
|
||||
|
||||
// Avoid increasing the max pressure of the entire region.
|
||||
if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax,
|
||||
Cand.RPDelta.CurrentMax,
|
||||
@ -2850,34 +2869,35 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
|
||||
DAG->MF))
|
||||
return;
|
||||
|
||||
// Avoid critical resource consumption and balance the schedule.
|
||||
TryCand.initResourceDelta(DAG, SchedModel);
|
||||
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
|
||||
TryCand, Cand, ResourceReduce))
|
||||
return;
|
||||
if (tryGreater(TryCand.ResDelta.DemandedResources,
|
||||
Cand.ResDelta.DemandedResources,
|
||||
TryCand, Cand, ResourceDemand))
|
||||
return;
|
||||
if (SameBoundary) {
|
||||
// Avoid critical resource consumption and balance the schedule.
|
||||
TryCand.initResourceDelta(DAG, SchedModel);
|
||||
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
|
||||
TryCand, Cand, ResourceReduce))
|
||||
return;
|
||||
if (tryGreater(TryCand.ResDelta.DemandedResources,
|
||||
Cand.ResDelta.DemandedResources,
|
||||
TryCand, Cand, ResourceDemand))
|
||||
return;
|
||||
|
||||
// Avoid serializing long latency dependence chains.
|
||||
// For acyclic path limited loops, latency was already checked above.
|
||||
if (!RegionPolicy.DisableLatencyHeuristic && Cand.Policy.ReduceLatency &&
|
||||
!Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone)) {
|
||||
return;
|
||||
}
|
||||
// Avoid serializing long latency dependence chains.
|
||||
// For acyclic path limited loops, latency was already checked above.
|
||||
if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
|
||||
!Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
|
||||
return;
|
||||
|
||||
// Prefer immediate defs/users of the last scheduled instruction. This is a
|
||||
// local pressure avoidance strategy that also makes the machine code
|
||||
// readable.
|
||||
if (tryGreater(Zone.isNextSU(TryCand.SU), Zone.isNextSU(Cand.SU),
|
||||
TryCand, Cand, NextDefUse))
|
||||
return;
|
||||
// Prefer immediate defs/users of the last scheduled instruction. This is a
|
||||
// local pressure avoidance strategy that also makes the machine code
|
||||
// readable.
|
||||
if (tryGreater(Zone->isNextSU(TryCand.SU), Zone->isNextSU(Cand.SU),
|
||||
TryCand, Cand, NextDefUse))
|
||||
return;
|
||||
|
||||
// Fall through to original instruction order.
|
||||
if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
|
||||
|| (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
|
||||
TryCand.Reason = NodeOrder;
|
||||
// Fall through to original instruction order.
|
||||
if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
|
||||
|| (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
|
||||
TryCand.Reason = NodeOrder;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -2887,6 +2907,7 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
|
||||
/// DAG building. To adjust for the current scheduling location we need to
|
||||
/// maintain the number of vreg uses remaining to be top-scheduled.
|
||||
void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
|
||||
const CandPolicy &ZonePolicy,
|
||||
const RegPressureTracker &RPTracker,
|
||||
SchedCandidate &Cand) {
|
||||
// getMaxPressureDelta temporarily modifies the tracker.
|
||||
@ -2895,9 +2916,11 @@ void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
|
||||
ReadyQueue &Q = Zone.Available;
|
||||
for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
|
||||
|
||||
SchedCandidate TryCand(Cand.Policy);
|
||||
SchedCandidate TryCand(ZonePolicy);
|
||||
initCandidate(TryCand, *I, Zone.isTop(), RPTracker, TempTracker);
|
||||
tryCandidate(Cand, TryCand, Zone);
|
||||
// Pass SchedBoundary only when comparing nodes from the same boundary.
|
||||
SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
|
||||
tryCandidate(Cand, TryCand, ZoneArg);
|
||||
if (TryCand.Reason != NoCand) {
|
||||
// Initialize resource delta if needed in case future heuristics query it.
|
||||
if (TryCand.ResDelta == SchedResourceDelta())
|
||||
@ -2922,50 +2945,30 @@ SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
|
||||
tracePick(Only1, true);
|
||||
return SU;
|
||||
}
|
||||
CandPolicy NoPolicy;
|
||||
SchedCandidate BotCand(NoPolicy);
|
||||
SchedCandidate TopCand(NoPolicy);
|
||||
// Set the bottom-up policy based on the state of the current bottom zone and
|
||||
// the instructions outside the zone, including the top zone.
|
||||
setPolicy(BotCand.Policy, /*IsPostRA=*/false, Bot, &Top);
|
||||
CandPolicy BotPolicy;
|
||||
setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
|
||||
// Set the top-down policy based on the state of the current top zone and
|
||||
// the instructions outside the zone, including the bottom zone.
|
||||
setPolicy(TopCand.Policy, /*IsPostRA=*/false, Top, &Bot);
|
||||
CandPolicy TopPolicy;
|
||||
setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
|
||||
|
||||
// Prefer bottom scheduling when heuristics are silent.
|
||||
CandPolicy NoPolicy;
|
||||
SchedCandidate Cand(NoPolicy);
|
||||
DEBUG(dbgs() << "Picking from Bot:\n");
|
||||
pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
|
||||
assert(BotCand.Reason != NoCand && "failed to find the first candidate");
|
||||
pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), Cand);
|
||||
assert(Cand.Reason != NoCand && "failed to find the first candidate");
|
||||
|
||||
// If either Q has a single candidate that provides the least increase in
|
||||
// Excess pressure, we can immediately schedule from that Q.
|
||||
//
|
||||
// RegionCriticalPSets summarizes the pressure within the scheduled region and
|
||||
// affects picking from either Q. If scheduling in one direction must
|
||||
// increase pressure for one of the excess PSets, then schedule in that
|
||||
// direction first to provide more freedom in the other direction.
|
||||
if ((BotCand.Reason == RegExcess && !BotCand.isRepeat(RegExcess))
|
||||
|| (BotCand.Reason == RegCritical && !BotCand.isRepeat(RegCritical)))
|
||||
{
|
||||
IsTopNode = false;
|
||||
tracePick(BotCand, IsTopNode);
|
||||
return BotCand.SU;
|
||||
}
|
||||
// Check if the top Q has a better candidate.
|
||||
DEBUG(dbgs() << "Picking from Top:\n");
|
||||
pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
|
||||
assert(TopCand.Reason != NoCand && "failed to find the first candidate");
|
||||
pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), Cand);
|
||||
assert(Cand.Reason != NoCand && "failed to find the first candidate");
|
||||
|
||||
// Choose the queue with the most important (lowest enum) reason.
|
||||
if (TopCand.Reason < BotCand.Reason) {
|
||||
IsTopNode = true;
|
||||
tracePick(TopCand, IsTopNode);
|
||||
return TopCand.SU;
|
||||
}
|
||||
// Otherwise prefer the bottom candidate, in node order if all else failed.
|
||||
IsTopNode = false;
|
||||
tracePick(BotCand, IsTopNode);
|
||||
return BotCand.SU;
|
||||
IsTopNode = Cand.AtTop;
|
||||
tracePick(Cand);
|
||||
return Cand.SU;
|
||||
}
|
||||
|
||||
/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
|
||||
@ -2982,9 +2985,9 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
|
||||
if (!SU) {
|
||||
CandPolicy NoPolicy;
|
||||
SchedCandidate TopCand(NoPolicy);
|
||||
pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
|
||||
pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
|
||||
assert(TopCand.Reason != NoCand && "failed to find a candidate");
|
||||
tracePick(TopCand, true);
|
||||
tracePick(TopCand);
|
||||
SU = TopCand.SU;
|
||||
}
|
||||
IsTopNode = true;
|
||||
@ -2993,9 +2996,9 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
|
||||
if (!SU) {
|
||||
CandPolicy NoPolicy;
|
||||
SchedCandidate BotCand(NoPolicy);
|
||||
pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
|
||||
pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
|
||||
assert(BotCand.Reason != NoCand && "failed to find a candidate");
|
||||
tracePick(BotCand, false);
|
||||
tracePick(BotCand);
|
||||
SU = BotCand.SU;
|
||||
}
|
||||
IsTopNode = false;
|
||||
@ -3165,6 +3168,7 @@ void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) {
|
||||
for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
|
||||
SchedCandidate TryCand(Cand.Policy);
|
||||
TryCand.SU = *I;
|
||||
TryCand.AtTop = true;
|
||||
TryCand.initResourceDelta(DAG, SchedModel);
|
||||
tryCandidate(Cand, TryCand);
|
||||
if (TryCand.Reason != NoCand) {
|
||||
@ -3193,7 +3197,7 @@ SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
|
||||
setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr);
|
||||
pickNodeFromQueue(TopCand);
|
||||
assert(TopCand.Reason != NoCand && "failed to find a candidate");
|
||||
tracePick(TopCand, true);
|
||||
tracePick(TopCand);
|
||||
SU = TopCand.SU;
|
||||
}
|
||||
} while (SU->isScheduled);
|
||||
|
@ -23,8 +23,8 @@ define <8 x i8> @fptosi_v4f64_to_v4i8(<8 x double>* %ptr) {
|
||||
; CHECK-DAG: xtn2 v[[NA2]].4s, v[[CONV3]].2d
|
||||
; CHECK-DAG: xtn v[[NA0:[0-9]+]].2s, v[[CONV0]].2d
|
||||
; CHECK-DAG: xtn2 v[[NA0]].4s, v[[CONV1]].2d
|
||||
; CHECK-DAG: xtn v[[TMP1:[0-9]+]].4h, v[[NA0]].4s
|
||||
; CHECK-DAG: xtn2 v[[TMP1]].8h, v[[NA2]].4s
|
||||
; CHECK-DAG: xtn v[[TMP1:[0-9]+]].4h, v[[NA2]].4s
|
||||
; CHECK-DAG: xtn2 v[[TMP1]].8h, v[[NA0]].4s
|
||||
; CHECK: xtn v0.8b, v[[TMP1]].8h
|
||||
%tmp1 = load <8 x double>, <8 x double>* %ptr
|
||||
%tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
|
||||
|
@ -52,7 +52,7 @@ define <8 x i8> @g_vec(<8 x i8> %a) {
|
||||
; CHECK-DAG: movi [[M2:v.*]], #64
|
||||
; CHECK-DAG: movi [[M3:v.*]], #32
|
||||
; CHECK-DAG: movi [[M4:v.*]], #16
|
||||
; CHECK-DAG: movi [[M5:v.*]], #8
|
||||
; CHECK-DAG: movi [[M5:v.*]], #8{{$}}
|
||||
; CHECK-DAG: movi [[M6:v.*]], #4{{$}}
|
||||
; CHECK-DAG: movi [[M7:v.*]], #2{{$}}
|
||||
; CHECK-DAG: movi [[M8:v.*]], #1{{$}}
|
||||
|
@ -44,7 +44,9 @@ __tls_init.exit:
|
||||
; CHECK-NOT: stp d3, d2
|
||||
; CHECK-NOT: stp d1, d0
|
||||
; CHECK-NOT: stp x20, x19
|
||||
; CHECK-NOT: stp x14, x13
|
||||
; FIXME: The splitting logic in the register allocator fails to split along
|
||||
; control flow here, we used to get this right by accident before...
|
||||
; CHECK-NOTXX: stp x14, x13
|
||||
; CHECK-NOT: stp x12, x11
|
||||
; CHECK-NOT: stp x10, x9
|
||||
; CHECK-NOT: stp x8, x7
|
||||
@ -63,7 +65,7 @@ __tls_init.exit:
|
||||
; CHECK-NOT: ldp x8, x7
|
||||
; CHECK-NOT: ldp x10, x9
|
||||
; CHECK-NOT: ldp x12, x11
|
||||
; CHECK-NOT: ldp x14, x13
|
||||
; CHECK-NOTXX: ldp x14, x13
|
||||
; CHECK-NOT: ldp x20, x19
|
||||
; CHECK-NOT: ldp d1, d0
|
||||
; CHECK-NOT: ldp d3, d2
|
||||
|
@ -2,8 +2,9 @@
|
||||
|
||||
define <8 x i8> @float_to_i8(<8 x float>* %in) {
|
||||
; CHECK-LABEL: float_to_i8:
|
||||
; CHECK-DAG: fadd v[[LSB:[0-9]+]].4s, v0.4s, v0.4s
|
||||
; CHECK-DAG: fadd v[[MSB:[0-9]+]].4s, v1.4s, v1.4s
|
||||
; CHECK: ldp q1, q0, [x0]
|
||||
; CHECK-DAG: fadd v[[LSB:[0-9]+]].4s, v1.4s, v1.4s
|
||||
; CHECK-DAG: fadd v[[MSB:[0-9]+]].4s, v0.4s, v0.4s
|
||||
; CHECK-DAG: fcvtzu v[[LSB2:[0-9]+]].4s, v[[LSB]].4s
|
||||
; CHECK-DAG: fcvtzu v[[MSB2:[0-9]+]].4s, v[[MSB]].4s
|
||||
; CHECK-DAG: xtn v[[TMP:[0-9]+]].4h, v[[LSB]].4s
|
||||
|
@ -94,21 +94,21 @@ define <4 x float> @test_copysign_v4f32_v4f32(<4 x float> %a, <4 x float> %b) #0
|
||||
define <4 x float> @test_copysign_v4f32_v4f64(<4 x float> %a, <4 x double> %b) #0 {
|
||||
; CHECK-LABEL: test_copysign_v4f32_v4f64:
|
||||
; CHECK-NEXT: mov s3, v0[1]
|
||||
; CHECK-NEXT: mov d4, v1[1]
|
||||
; CHECK-NEXT: movi.4s v5, #128, lsl #24
|
||||
; CHECK-NEXT: fcvt s1, d1
|
||||
; CHECK-NEXT: movi.4s v4, #128, lsl #24
|
||||
; CHECK-NEXT: fcvt s5, d1
|
||||
; CHECK-NEXT: mov s6, v0[2]
|
||||
; CHECK-NEXT: mov s7, v0[3]
|
||||
; CHECK-NEXT: fcvt s16, d2
|
||||
; CHECK-NEXT: bit.16b v0, v1, v5
|
||||
; CHECK-NEXT: bit.16b v6, v16, v5
|
||||
; CHECK-NEXT: fcvt s1, d4
|
||||
; CHECK-NEXT: bit.16b v3, v1, v5
|
||||
; CHECK-NEXT: bit.16b v0, v5, v4
|
||||
; CHECK-NEXT: fcvt s5, d2
|
||||
; CHECK-NEXT: bit.16b v6, v5, v4
|
||||
; CHECK-NEXT: mov d1, v1[1]
|
||||
; CHECK-NEXT: fcvt s1, d1
|
||||
; CHECK-NEXT: bit.16b v3, v1, v4
|
||||
; CHECK-NEXT: mov d1, v2[1]
|
||||
; CHECK-NEXT: fcvt s1, d1
|
||||
; CHECK-NEXT: ins.s v0[1], v3[0]
|
||||
; CHECK-NEXT: ins.s v0[2], v6[0]
|
||||
; CHECK-NEXT: bit.16b v7, v1, v5
|
||||
; CHECK-NEXT: bit.16b v7, v1, v4
|
||||
; CHECK-NEXT: ins.s v0[3], v7[0]
|
||||
; CHECK-NEXT: ret
|
||||
%tmp0 = fptrunc <4 x double> %b to <4 x float>
|
||||
|
@ -486,8 +486,8 @@ define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(
|
||||
; low 32-bits, which is not a valid 64-bit inline immediate.
|
||||
|
||||
; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64:
|
||||
; SI: s_load_dwordx2
|
||||
; SI: s_load_dword s
|
||||
; SI: s_load_dwordx2
|
||||
; SI-NOT: and
|
||||
; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
|
||||
; SI-NOT: and
|
||||
|
@ -21,8 +21,8 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
|
||||
; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
||||
; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; SICI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
||||
; VI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
||||
; VI-DAG: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
|
||||
; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
|
||||
|
@ -155,10 +155,10 @@ define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}s_ctpop_i65:
|
||||
; GCN: s_bcnt1_i32_b64
|
||||
; GCN: s_and_b32
|
||||
; GCN: s_bcnt1_i32_b64
|
||||
; GCN: s_add_i32
|
||||
; GCN: s_bcnt1_i32_b64 [[REG0:s[0-9]+]],
|
||||
; GCN: s_bcnt1_i32_b64 [[REG1:s[0-9]+]],
|
||||
; GCN: s_add_i32 {{s[0-9]+}}, [[REG0]], [[REG1]]
|
||||
; GCN: s_endpgm
|
||||
define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind {
|
||||
%ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone
|
||||
|
@ -8,7 +8,6 @@
|
||||
|
||||
; SI-LABEL: {{^}}offset_order:
|
||||
|
||||
; SI: ds_read2st64_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:4{{$}}
|
||||
; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
|
||||
; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:14 offset1:12
|
||||
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
|
||||
|
@ -197,8 +197,8 @@ define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double a
|
||||
|
||||
; SI-LABEL: @simple_read2st64_f64_over_max_offset
|
||||
; SI-NOT: ds_read2st64_b64
|
||||
; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
|
||||
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
|
||||
; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
|
||||
; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
|
||||
; SI: s_endpgm
|
||||
define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
|
||||
|
@ -55,7 +55,7 @@ define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fneg_fabs_f64:
|
||||
; GCN: s_load_dwordx2
|
||||
; GCN-DAG: s_load_dwordx2
|
||||
; GCN-DAG: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
|
||||
; SI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb
|
||||
; VI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c
|
||||
|
@ -179,7 +179,7 @@ entry:
|
||||
|
||||
; CHECK-LABEL: {{^}}extract_vgpr_offset_multiple_in_block:
|
||||
|
||||
; CHECK: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
|
||||
; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
|
||||
; CHECK-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7
|
||||
; CHECK-DAG: s_mov_b32 [[S_ELT1:s[0-9]+]], 9
|
||||
; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]]
|
||||
@ -199,7 +199,7 @@ entry:
|
||||
|
||||
; FIXME: Redundant copy
|
||||
; CHECK: s_mov_b64 exec, [[MASK]]
|
||||
; CHECK: s_mov_b64 [[MASK]], exec
|
||||
; CHECK: s_mov_b64 [[MASK2:s\[[0-9]+:[0-9]+\]]], exec
|
||||
|
||||
; CHECK: [[LOOP1:BB[0-9]+_[0-9]+]]:
|
||||
; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
|
||||
|
@ -10,13 +10,13 @@
|
||||
; not just directly into the vector component?
|
||||
|
||||
; GCN-LABEL: {{^}}insertelement_v4f32_0:
|
||||
; GCN: s_load_dwordx4 s{{\[}}[[LOW_REG:[0-9]+]]:
|
||||
; GCN: s_load_dwordx4
|
||||
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
|
||||
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
|
||||
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
|
||||
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
|
||||
; GCN-DAG: v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 0x40a00000
|
||||
; GCN-DAG: v_mov_b32_e32 v[[LOW_REG]], [[CONSTREG]]
|
||||
; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]]
|
||||
; GCN: buffer_store_dwordx4 v{{\[}}[[LOW_REG]]:
|
||||
define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
|
||||
%vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
|
||||
|
@ -10,10 +10,9 @@ declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone
|
||||
; TODO: this constant should be folded:
|
||||
; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], -1
|
||||
; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
|
||||
; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
|
||||
; VI-DAG: s_mov_b32 s[[LOW2:[0-9+]]], s[[LOW1]]
|
||||
; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
|
||||
; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
|
||||
; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
|
||||
; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW1]]:[[HIGH2]]]
|
||||
|
||||
define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind {
|
||||
%rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone
|
||||
|
@ -29,9 +29,8 @@ define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
|
||||
; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
|
||||
; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
|
||||
; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}
|
||||
; VI-DAG: s_mov_b32 s[[LOW2:[0-9+]]], s[[LOW1]]
|
||||
; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
|
||||
; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
|
||||
; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW1]]:[[HIGH2]]]
|
||||
define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 {
|
||||
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
|
||||
store double %rsq_clamp, double addrspace(1)* %out
|
||||
|
@ -32,7 +32,8 @@
|
||||
|
||||
|
||||
; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
|
||||
; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]*}}, {{v[0-9]+}} offset0:4
|
||||
; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16
|
||||
; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}}
|
||||
|
||||
|
||||
; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
|
||||
|
@ -9,11 +9,11 @@
|
||||
; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG1LO:[0-9]+]]:[[ARG1HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
|
||||
; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
|
||||
|
||||
; GCN-NOT: v_mov_b32
|
||||
; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
|
||||
; GCN-NOT: v_mov_b32
|
||||
; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
|
||||
; GCN-NOT: v_mov_b32
|
||||
; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
|
||||
; GCN-NOT: v_mov_b32
|
||||
|
||||
; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]]
|
||||
; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]]
|
||||
|
@ -228,9 +228,9 @@ define void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}s_ashr_63_i64:
|
||||
; GCN-DAG: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
|
||||
; GCN: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
|
||||
; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
|
||||
; GCN: s_add_u32 {{s[0-9]+}}, s[[HI]], {{s[0-9]+}}
|
||||
; GCN: s_add_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}}
|
||||
; GCN: s_addc_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}}
|
||||
define void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
|
||||
%result = ashr i64 %a, 63
|
||||
|
@ -629,10 +629,11 @@ end:
|
||||
; CHECK-LABEL: transpose
|
||||
;
|
||||
; Store of callee-save register saved by shrink wrapping
|
||||
; CHECK: std [[CSR:[0-9]+]], -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
|
||||
; FIXME: Test disabled: Improved scheduling needs no spills/reloads any longer!
|
||||
; CHECKXX: std [[CSR:[0-9]+]], -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
|
||||
;
|
||||
; Reload of callee-save register
|
||||
; CHECK: ld [[CSR]], -[[STACK_OFFSET]](1) # 8-byte Folded Reload
|
||||
; CHECKXX: ld [[CSR]], -[[STACK_OFFSET]](1) # 8-byte Folded Reload
|
||||
;
|
||||
; Ensure no subsequent uses of callee-save register before end of function
|
||||
; CHECK-NOT: {{[a-z]+}} [[CSR]]
|
||||
|
@ -35,8 +35,7 @@ entry:
|
||||
ret i64 %0
|
||||
}
|
||||
; CHECK-LABEL: @callee2
|
||||
; CHECK: ld [[REG:[0-9]+]], 128(1)
|
||||
; CHECK: mr 3, [[REG]]
|
||||
; CHECK: ld 3, 128(1)
|
||||
; CHECK: blr
|
||||
|
||||
declare i64 @test2(%struct.pad* byval, i32 signext, %struct.test* byval align 16)
|
||||
|
Loading…
Reference in New Issue
Block a user