[ScheduleDAG] Deal with already scheduled loads in ScheduleDAG.

Summary:
If we attempt to unfold an SUnit in ScheduleDAG that results in
finding an already scheduled load, we must should abort the
unfold as it will not improve scheduling.

This fixes PR32610.

Reviewers: jmolloy, sunfish, bogner, spatel

Subscribers: llvm-commits, MatzeB

Differential Revision: https://reviews.llvm.org/D32911

llvm-svn: 304321
This commit is contained in:
Nirav Dave 2017-05-31 18:43:17 +00:00
parent 6d04c8c1dc
commit 334d91ac51
2 changed files with 190 additions and 128 deletions
lib/CodeGen/SelectionDAG
test/CodeGen/X86

@ -226,6 +226,7 @@ private:
void UnscheduleNodeBottomUp(SUnit*);
void RestoreHazardCheckerBottomUp();
void BacktrackBottomUp(SUnit*, SUnit*);
SUnit *TryUnfoldSU(SUnit *);
SUnit *CopyAndMoveSuccessors(SUnit*);
void InsertCopiesAndMoveSuccs(SUnit*, unsigned,
const TargetRegisterClass*,
@ -780,7 +781,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
}
/// CapturePred - This does the opposite of ReleasePred. Since SU is being
/// unscheduled, incrcease the succ left count of its predecessors. Remove
/// unscheduled, increase the succ left count of its predecessors. Remove
/// them from AvailableQueue if necessary.
void ScheduleDAGRRList::CapturePred(SDep *PredEdge) {
SUnit *PredSU = PredEdge->getSUnit();
@ -934,6 +935,146 @@ static bool isOperandOf(const SUnit *SU, SDNode *N) {
return false;
}
/// TryUnfold - Attempt to unfold
SUnit *ScheduleDAGRRList::TryUnfoldSU(SUnit *SU) {
SDNode *N = SU->getNode();
// Use while over if to ease fall through.
SmallVector<SDNode *, 2> NewNodes;
if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes))
return nullptr;
// unfolding an x86 DEC64m operation results in store, dec, load which
// can't be handled here so quit
if (NewNodes.size() == 3)
return nullptr;
assert(NewNodes.size() == 2 && "Expected a load folding node!");
N = NewNodes[1];
SDNode *LoadNode = NewNodes[0];
unsigned NumVals = N->getNumValues();
unsigned OldNumVals = SU->getNode()->getNumValues();
// LoadNode may already exist. This can happen when there is another
// load from the same location and producing the same type of value
// but it has different alignment or volatileness.
bool isNewLoad = true;
SUnit *LoadSU;
if (LoadNode->getNodeId() != -1) {
LoadSU = &SUnits[LoadNode->getNodeId()];
// If LoadSU has already been scheduled, we should clone it but
// this would negate the benefit to unfolding so just return SU.
if (LoadSU->isScheduled)
return SU;
isNewLoad = false;
} else {
LoadSU = CreateNewSUnit(LoadNode);
LoadNode->setNodeId(LoadSU->NodeNum);
InitNumRegDefsLeft(LoadSU);
computeLatency(LoadSU);
}
DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n");
// Now that we are committed to unfolding replace DAG Uses.
for (unsigned i = 0; i != NumVals; ++i)
DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i));
DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals - 1),
SDValue(LoadNode, 1));
SUnit *NewSU = CreateNewSUnit(N);
assert(N->getNodeId() == -1 && "Node already inserted!");
N->setNodeId(NewSU->NodeNum);
const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
NewSU->isTwoAddress = true;
break;
}
}
if (MCID.isCommutable())
NewSU->isCommutable = true;
InitNumRegDefsLeft(NewSU);
computeLatency(NewSU);
// Record all the edges to and from the old SU, by category.
SmallVector<SDep, 4> ChainPreds;
SmallVector<SDep, 4> ChainSuccs;
SmallVector<SDep, 4> LoadPreds;
SmallVector<SDep, 4> NodePreds;
SmallVector<SDep, 4> NodeSuccs;
for (SDep &Pred : SU->Preds) {
if (Pred.isCtrl())
ChainPreds.push_back(Pred);
else if (isOperandOf(Pred.getSUnit(), LoadNode))
LoadPreds.push_back(Pred);
else
NodePreds.push_back(Pred);
}
for (SDep &Succ : SU->Succs) {
if (Succ.isCtrl())
ChainSuccs.push_back(Succ);
else
NodeSuccs.push_back(Succ);
}
// Now assign edges to the newly-created nodes.
for (const SDep &Pred : ChainPreds) {
RemovePred(SU, Pred);
if (isNewLoad)
AddPred(LoadSU, Pred);
}
for (const SDep &Pred : LoadPreds) {
RemovePred(SU, Pred);
if (isNewLoad)
AddPred(LoadSU, Pred);
}
for (const SDep &Pred : NodePreds) {
RemovePred(SU, Pred);
AddPred(NewSU, Pred);
}
for (SDep D : NodeSuccs) {
SUnit *SuccDep = D.getSUnit();
D.setSUnit(SU);
RemovePred(SuccDep, D);
D.setSUnit(NewSU);
AddPred(SuccDep, D);
// Balance register pressure.
if (AvailableQueue->tracksRegPressure() && SuccDep->isScheduled &&
!D.isCtrl() && NewSU->NumRegDefsLeft > 0)
--NewSU->NumRegDefsLeft;
}
for (SDep D : ChainSuccs) {
SUnit *SuccDep = D.getSUnit();
D.setSUnit(SU);
RemovePred(SuccDep, D);
if (isNewLoad) {
D.setSUnit(LoadSU);
AddPred(SuccDep, D);
}
}
// Add a data dependency to reflect that NewSU reads the value defined
// by LoadSU.
SDep D(LoadSU, SDep::Data, 0);
D.setLatency(LoadSU->Latency);
AddPred(NewSU, D);
if (isNewLoad)
AvailableQueue->addNode(LoadSU);
AvailableQueue->addNode(NewSU);
++NumUnfolds;
if (NewSU->NumSuccsLeft == 0)
NewSU->isAvailable = true;
return NewSU;
}
/// CopyAndMoveSuccessors - Clone the specified node and move its scheduled
/// successors to the newly created node.
SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
@ -959,135 +1100,16 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
return nullptr;
}
// If possible unfold instruction.
if (TryUnfold) {
SmallVector<SDNode*, 2> NewNodes;
if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes))
SUnit *UnfoldSU = TryUnfoldSU(SU);
if (!UnfoldSU)
return nullptr;
// unfolding an x86 DEC64m operation results in store, dec, load which
// can't be handled here so quit
if (NewNodes.size() == 3)
return nullptr;
DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n");
assert(NewNodes.size() == 2 && "Expected a load folding node!");
N = NewNodes[1];
SDNode *LoadNode = NewNodes[0];
unsigned NumVals = N->getNumValues();
unsigned OldNumVals = SU->getNode()->getNumValues();
for (unsigned i = 0; i != NumVals; ++i)
DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i));
DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals-1),
SDValue(LoadNode, 1));
// LoadNode may already exist. This can happen when there is another
// load from the same location and producing the same type of value
// but it has different alignment or volatileness.
bool isNewLoad = true;
SUnit *LoadSU;
if (LoadNode->getNodeId() != -1) {
LoadSU = &SUnits[LoadNode->getNodeId()];
isNewLoad = false;
} else {
LoadSU = CreateNewSUnit(LoadNode);
LoadNode->setNodeId(LoadSU->NodeNum);
InitNumRegDefsLeft(LoadSU);
computeLatency(LoadSU);
}
SUnit *NewSU = CreateNewSUnit(N);
assert(N->getNodeId() == -1 && "Node already inserted!");
N->setNodeId(NewSU->NodeNum);
const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
NewSU->isTwoAddress = true;
break;
}
}
if (MCID.isCommutable())
NewSU->isCommutable = true;
InitNumRegDefsLeft(NewSU);
computeLatency(NewSU);
// Record all the edges to and from the old SU, by category.
SmallVector<SDep, 4> ChainPreds;
SmallVector<SDep, 4> ChainSuccs;
SmallVector<SDep, 4> LoadPreds;
SmallVector<SDep, 4> NodePreds;
SmallVector<SDep, 4> NodeSuccs;
for (SDep &Pred : SU->Preds) {
if (Pred.isCtrl())
ChainPreds.push_back(Pred);
else if (isOperandOf(Pred.getSUnit(), LoadNode))
LoadPreds.push_back(Pred);
else
NodePreds.push_back(Pred);
}
for (SDep &Succ : SU->Succs) {
if (Succ.isCtrl())
ChainSuccs.push_back(Succ);
else
NodeSuccs.push_back(Succ);
}
// Now assign edges to the newly-created nodes.
for (const SDep &Pred : ChainPreds) {
RemovePred(SU, Pred);
if (isNewLoad)
AddPred(LoadSU, Pred);
}
for (const SDep &Pred : LoadPreds) {
RemovePred(SU, Pred);
if (isNewLoad)
AddPred(LoadSU, Pred);
}
for (const SDep &Pred : NodePreds) {
RemovePred(SU, Pred);
AddPred(NewSU, Pred);
}
for (SDep D : NodeSuccs) {
SUnit *SuccDep = D.getSUnit();
D.setSUnit(SU);
RemovePred(SuccDep, D);
D.setSUnit(NewSU);
AddPred(SuccDep, D);
// Balance register pressure.
if (AvailableQueue->tracksRegPressure() && SuccDep->isScheduled
&& !D.isCtrl() && NewSU->NumRegDefsLeft > 0)
--NewSU->NumRegDefsLeft;
}
for (SDep D : ChainSuccs) {
SUnit *SuccDep = D.getSUnit();
D.setSUnit(SU);
RemovePred(SuccDep, D);
if (isNewLoad) {
D.setSUnit(LoadSU);
AddPred(SuccDep, D);
}
}
// Add a data dependency to reflect that NewSU reads the value defined
// by LoadSU.
SDep D(LoadSU, SDep::Data, 0);
D.setLatency(LoadSU->Latency);
AddPred(NewSU, D);
if (isNewLoad)
AvailableQueue->addNode(LoadSU);
AvailableQueue->addNode(NewSU);
++NumUnfolds;
if (NewSU->NumSuccsLeft == 0) {
NewSU->isAvailable = true;
return NewSU;
}
SU = NewSU;
SU = UnfoldSU;
N = SU->getNode();
// If this can be scheduled don't bother duplicating and just return
if (SU->NumSuccsLeft == 0)
return SU;
}
DEBUG(dbgs() << " Duplicating SU #" << SU->NodeNum << "\n");

@ -0,0 +1,40 @@
; RUN: llc -o - %s | FileCheck %s
; CHECK-LABEL: @pr32610
; CHECK: movl L_b$non_lazy_ptr, [[BASEREG:%[a-z]+]]
; CHECK: cmpl ([[BASEREG]]), {{%[a-z]+}}
; CHECK: cmpl ([[BASEREG]]), {{%[a-z]+}}
target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
target triple = "i386-apple-macosx10.13.0"
@c = external local_unnamed_addr global i32, align 4
@b = external local_unnamed_addr global [1 x i32], align 4
@d = external local_unnamed_addr global i32, align 4
; Function Attrs: norecurse nounwind optsize ssp
define void @pr32610() local_unnamed_addr #0 {
entry:
%0 = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i32 0, i32 undef), align 4, !tbaa !1
%cmp = icmp eq i32 undef, %0
%conv = zext i1 %cmp to i32
%tobool1.i = icmp ne i32 undef, 0
%or.cond.i = and i1 %cmp, %tobool1.i
%cond.i = select i1 %or.cond.i, i32 %conv, i32 undef
store i32 %cond.i, i32* @c, align 4, !tbaa !1
%1 = load i32, i32* getelementptr inbounds ([1 x i32], [1 x i32]* @b, i32 0, i32 0), align 4
%tobool = icmp ne i32 %1, 0
%2 = select i1 %tobool, i32 %1, i32 undef
store i32 %2, i32* @d, align 4, !tbaa !1
ret void
}
attributes #0 = { norecurse nounwind optsize ssp "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.ident = !{!0}
!0 = !{!"clang version 5.0.0 (trunk 301507) (llvm/trunk 301505)"}
!1 = !{!2, !2, i64 0}
!2 = !{!"int", !3, i64 0}
!3 = !{!"omnipotent char", !4, i64 0}
!4 = !{!"Simple C/C++ TBAA"}