ScheduleDAGInstrs: Add condjump deps to addSchedBarrierDeps()

addSchedBarrierDeps() is supposed to add use operands to the ExitSU
node. The current implementation adds uses for calls/barrier instruction
and the MBB live-outs in all other cases. The use
operands of conditional jump instructions were missed.

Also added code to macrofusion to set the latencies between nodes to
zero to avoid problems with the fusing nodes lingering around in the
pending list now.

Differential Revision: https://reviews.llvm.org/D25140

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@286544 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Matthias Braun 2016-11-11 01:34:21 +00:00
parent 9295796ca7
commit ee5205bfae
15 changed files with 66 additions and 66 deletions

View File

@ -1019,8 +1019,7 @@ createStoreClusterDAGMutation(const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI);
std::unique_ptr<ScheduleDAGMutation>
createMacroFusionDAGMutation(const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI);
createMacroFusionDAGMutation(const TargetInstrInfo *TII);
std::unique_ptr<ScheduleDAGMutation>
createCopyConstrainDAGMutation(const TargetInstrInfo *TII,

View File

@ -1501,10 +1501,9 @@ namespace {
/// that may be fused by the processor into a single operation.
class MacroFusion : public ScheduleDAGMutation {
const TargetInstrInfo &TII;
const TargetRegisterInfo &TRI;
public:
MacroFusion(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI)
: TII(TII), TRI(TRI) {}
MacroFusion(const TargetInstrInfo &TII)
: TII(TII) {}
void apply(ScheduleDAGInstrs *DAGInstrs) override;
};
@ -1513,27 +1512,12 @@ public:
namespace llvm {
std::unique_ptr<ScheduleDAGMutation>
createMacroFusionDAGMutation(const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) {
return make_unique<MacroFusion>(*TII, *TRI);
createMacroFusionDAGMutation(const TargetInstrInfo *TII) {
return make_unique<MacroFusion>(*TII);
}
} // namespace llvm
/// Returns true if \p MI reads a register written by \p Other.
static bool HasDataDep(const TargetRegisterInfo &TRI, const MachineInstr &MI,
const MachineInstr &Other) {
for (const MachineOperand &MO : MI.uses()) {
if (!MO.isReg() || !MO.readsReg())
continue;
unsigned Reg = MO.getReg();
if (Other.modifiesRegister(Reg, &TRI))
return true;
}
return false;
}
/// \brief Callback from DAG postProcessing to create cluster edges to encourage
/// fused operations.
void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
@ -1545,16 +1529,12 @@ void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
if (!Branch)
return;
for (SUnit &SU : DAG->SUnits) {
// SUnits with successors can't be schedule in front of the ExitSU.
if (!SU.Succs.empty())
for (SDep &PredDep : ExitSU.Preds) {
if (PredDep.isWeak())
continue;
// We only care if the node writes to a register that the branch reads.
MachineInstr *Pred = SU.getInstr();
if (!HasDataDep(TRI, *Branch, *Pred))
continue;
if (!TII.shouldScheduleAdjacent(*Pred, *Branch))
SUnit &SU = *PredDep.getSUnit();
MachineInstr &Pred = *SU.getInstr();
if (!TII.shouldScheduleAdjacent(Pred, *Branch))
continue;
// Create a single weak edge from SU to ExitSU. The only effect is to cause
@ -1567,6 +1547,16 @@ void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
(void)Success;
assert(Success && "No DAG nodes should be reachable from ExitSU");
// Adjust latency of data deps between the nodes.
for (SDep &PredDep : ExitSU.Preds) {
if (PredDep.getSUnit() == &SU)
PredDep.setLatency(0);
}
for (SDep &SuccDep : SU.Succs) {
if (SuccDep.getSUnit() == &ExitSU)
SuccDep.setLatency(0);
}
DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n");
break;
}
@ -3128,7 +3118,7 @@ static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C) {
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
}
if (EnableMacroFusion)
DAG->addMutation(createMacroFusionDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
return DAG;
}

View File

@ -247,11 +247,8 @@ void ScheduleDAGInstrs::exitRegion() {
void ScheduleDAGInstrs::addSchedBarrierDeps() {
MachineInstr *ExitMI = RegionEnd != BB->end() ? &*RegionEnd : nullptr;
ExitSU.setInstr(ExitMI);
bool AllDepKnown = ExitMI &&
(ExitMI->isCall() || ExitMI->isBarrier());
if (ExitMI && AllDepKnown) {
// If it's a call or a barrier, add dependencies on the defs and uses of
// instruction.
// Add dependencies on the defs and uses of the instruction.
if (ExitMI) {
for (const MachineOperand &MO : ExitMI->operands()) {
if (!MO.isReg() || MO.isDef()) continue;
unsigned Reg = MO.getReg();
@ -261,10 +258,10 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {
addVRegUseDeps(&ExitSU, ExitMI->getOperandNo(&MO));
}
}
} else {
}
if (!ExitMI || (!ExitMI->isCall() && !ExitMI->isBarrier())) {
// For others, e.g. fallthrough, conditional branch, assume the exit
// uses all the registers that are livein to the successor blocks.
assert(Uses.empty() && "Uses in set before adding deps?");
for (const MachineBasicBlock *Succ : BB->successors()) {
for (const auto &LI : Succ->liveins()) {
if (!Uses.contains(LI.PhysReg))

View File

@ -78,8 +78,8 @@ declare i32 @doSomething(i32, i32*)
; Next BB.
; CHECK: [[LOOP:LBB[0-9_]+]]: ; %for.body
; CHECK: bl _something
; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
; CHECK-NEXT: sub [[IV]], [[IV]], #1
; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
; CHECK-NEXT: cbnz [[IV]], [[LOOP]]
;
; Next BB.
@ -144,8 +144,8 @@ declare i32 @something(...)
; Next BB.
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
; CHECK: bl _something
; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
; CHECK-NEXT: sub [[IV]], [[IV]], #1
; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
; CHECK-NEXT: cbnz [[IV]], [[LOOP_LABEL]]
; Next BB.
; CHECK: ; %for.end
@ -188,8 +188,8 @@ for.end: ; preds = %for.body
;
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
; CHECK: bl _something
; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
; CHECK-NEXT: sub [[IV]], [[IV]], #1
; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
; CHECK-NEXT: cbnz [[IV]], [[LOOP_LABEL]]
; Next BB.
; CHECK: bl _somethingElse
@ -259,8 +259,8 @@ declare void @somethingElse(...)
;
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
; CHECK: bl _something
; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
; CHECK-NEXT: sub [[IV]], [[IV]], #1
; CHECK-NEXT: add [[SUM]], w0, [[SUM]]
; CHECK-NEXT: cbnz [[IV]], [[LOOP_LABEL]]
; Next BB.
; CHECK: lsl w0, [[SUM]], #3
@ -333,32 +333,32 @@ entry:
;
; Sum is merged with the returned register.
; CHECK: add [[VA_BASE:x[0-9]+]], sp, #16
; CHECK-NEXT: str [[VA_BASE]], [sp, #8]
; CHECK-NEXT: cmp w1, #1
; CHECK-NEXT: str [[VA_BASE]], [sp, #8]
; CHECK-NEXT: mov [[SUM:w0]], wzr
; CHECK-NEXT: b.lt [[IFEND_LABEL:LBB[0-9_]+]]
; CHECK: mov [[SUM:w0]], wzr
;
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
; CHECK: ldr [[VA_ADDR:x[0-9]+]], [sp, #8]
; CHECK-NEXT: add [[NEXT_VA_ADDR:x[0-9]+]], [[VA_ADDR]], #8
; CHECK-NEXT: str [[NEXT_VA_ADDR]], [sp, #8]
; CHECK-NEXT: ldr [[VA_VAL:w[0-9]+]], {{\[}}[[VA_ADDR]]]
; CHECK-NEXT: add [[SUM]], [[SUM]], [[VA_VAL]]
; CHECK-NEXT: sub w1, w1, #1
; CHECK-NEXT: add [[SUM]], [[SUM]], [[VA_VAL]]
; CHECK-NEXT: cbnz w1, [[LOOP_LABEL]]
; DISABLE-NEXT: b [[IFEND_LABEL]]
;
; DISABLE-NEXT: b
; DISABLE: [[ELSE_LABEL]]: ; %if.else
; DISABLE: lsl w0, w1, #1
;
; ENABLE: [[ELSE_LABEL]]: ; %if.else
; ENABLE: lsl w0, w1, #1
; ENABLE-NEXT: ret
;
; CHECK: [[IFEND_LABEL]]:
; Epilogue code.
; CHECK: add sp, sp, #16
; CHECK-NEXT: ret
;
; ENABLE: [[ELSE_LABEL]]: ; %if.else
; ENABLE-NEXT: lsl w0, w1, #1
; ENABLE_NEXT: ret
define i32 @variadicFunc(i32 %cond, i32 %count, ...) #0 {
entry:
%ap = alloca i8*, align 8
@ -413,9 +413,9 @@ declare void @llvm.va_end(i8*)
;
; CHECK: [[LOOP_LABEL:LBB[0-9_]+]]: ; %for.body
; Inline asm statement.
; CHECK: add x19, x19, #1
; CHECK: sub [[IV]], [[IV]], #1
; CHECK-NEXT: cbnz [[IV]], [[LOOP_LABEL]]
; CHECK: add x19, x19, #1
; CHECK: cbnz [[IV]], [[LOOP_LABEL]]
; Next BB.
; CHECK: mov w0, wzr
; Epilogue code.

View File

@ -1,4 +1,4 @@
; RUN: llc -o - %s -mattr=+arith-cbz-fusion,+use-postra-scheduler | FileCheck %s
; RUN: llc -o - %s -mattr=+arith-cbz-fusion | FileCheck %s
; RUN: llc -o - %s -mcpu=cyclone | FileCheck %s
target triple = "arm64-apple-ios"

View File

@ -30,9 +30,9 @@ if.then3:
for.inc:
; CHECK_LABEL: %for.inc
; CHECK: add
; CHECK-NEXT: cmp
; CHECK: b.le
; CHECK: cmp
; CHECK-NEXT: add
; CHECK-NEXT: b.le
; CHECK_LABEL: %for.cond.cleanup
%inc = add nsw i32 %x.015, 1
%cmp1 = icmp sgt i32 %x.015, %px

View File

@ -171,6 +171,7 @@ bb3:
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_add_i32_e32 [[INC:v[0-9]+]], vcc, 1, [[LOOPIDX]]
; GCN-NEXT: v_cmp_gt_i32_e32 vcc, 10, [[INC]]
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: v_nop_e64
@ -178,7 +179,6 @@ bb3:
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_and_b64 vcc, exec, vcc
; GCN-NEXT: s_cbranch_vccz [[ENDBB:BB[0-9]+_[0-9]+]]
; GCN-NEXT: [[LONG_JUMP:BB[0-9]+_[0-9]+]]: ; %bb2
@ -426,6 +426,8 @@ endif:
; GCN-NEXT: s_setpc_b64 vcc
; GCN-NEXT: [[LOOP_BODY]]: ; %loop_body
; GCN: s_mov_b64 vcc, -1{{$}}
; GCN: ;;#ASMSTART
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: v_nop_e64
@ -433,7 +435,6 @@ endif:
; GCN: v_nop_e64
; GCN: v_nop_e64
; GCN: ;;#ASMEND
; GCN-NEXT: s_mov_b64 vcc, -1{{$}}
; GCN-NEXT: s_cbranch_vccz [[RET]]
; GCN-NEXT: [[LONGBB:BB[0-9]+_[0-9]+]]: ; %loop_body
@ -493,6 +494,7 @@ ret:
; GCN: [[LONG_BR_DEST0]]
; GCN: s_cmp_eq_u32
; GCN-NEXT: ; implicit-def
; GCN-NEXT: s_cbranch_scc0
; GCN: s_setpc_b64

View File

@ -7,7 +7,7 @@ declare i1 @llvm.amdgcn.class.f32(float, i32)
; GCN-LABEL: {{^}}vcc_shrink_vcc_def:
; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e64 v1, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
define void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
bb0:
%tmp = icmp sgt i32 %arg1, 4
@ -34,7 +34,7 @@ bb2:
; GCN-LABEL: {{^}}preserve_condition_undef_flag:
; GCN-NOT: vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e64 v1, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
define void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
bb0:
%tmp = icmp sgt i32 %arg1, 4

View File

@ -78,6 +78,8 @@ entry:
; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
; IDXMODE: v_mov_b32_e32 v2, 2
; IDXMODE: v_mov_b32_e32 v3, 3
; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
@ -95,6 +97,10 @@ entry:
; MOVREL: v_movrels_b32_e32 v{{[0-9]}}, v0
; IDXMODE: s_addk_i32 [[ADD_IDX:s[0-9]+]], 0xfe00{{$}}
; IDXMODE: v_mov_b32_e32 v0,
; IDXMODE: v_mov_b32_e32 v1,
; IDXMODE: v_mov_b32_e32 v2,
; IDXMODE: v_mov_b32_e32 v3,
; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
; IDXMODE-NEXT: s_set_gpr_idx_off
@ -572,12 +578,12 @@ bb7: ; preds = %bb4, %bb1
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41a80000
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x41b00000
; GCN-DAG: s_load_dword [[ARG:s[0-9]+]]
; IDXMODE-DAG: s_add_i32 [[ARG_ADD:s[0-9]+]], [[ARG]], -16
; MOVREL-DAG: s_add_i32 m0, [[ARG]], -16
; MOVREL: v_movreld_b32_e32 v[[VEC0_ELT0]], 4.0
; GCN-NOT: m0
; IDXMODE-DAG: s_add_i32 [[ARG_ADD:s[0-9]+]], [[ARG]], -16
; IDXMODE: s_set_gpr_idx_on [[ARG_ADD]], dst
; IDXMODE: v_mov_b32_e32 v[[VEC0_ELT0]], 4.0
; IDXMODE: s_set_gpr_idx_off

View File

@ -138,6 +138,7 @@ exit:
; CHECK-LABEL: {{^}}test_kill_control_flow_remainder:
; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0
; CHECK-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0
; CHECK-NEXT: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]
; CHECK-NEXT: ; BB#1: ; %bb

View File

@ -213,8 +213,8 @@ END:
;CHECK: image_sample
;CHECK: s_and_b64 exec, exec, [[ORIG]]
;CHECK: image_sample
;CHECK: store
;CHECK: v_cmp
;CHECK: store
define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, i32 %coord) {
main_body:
%tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)

View File

@ -644,6 +644,7 @@ declare double @llvm.pow.f64(double, double)
; CHECK: push
;
; DISABLE: tst{{(\.w)?}} r2, #1
; DISABLE-NEXT: vst1.64
; DISABLE-NEXT: beq [[BB13:LBB[0-9_]+]]
;
; CHECK: bl{{x?}} _pow

View File

@ -210,6 +210,8 @@ for.end: ; preds = %for.body
; CHECK: mflr {{[0-9]+}}
;
; DISABLE: cmplwi 0, 3, 0
; DISABLE-NEXT: std
; DISABLE-NEXT: std
; DISABLE-NEXT: beq 0, .[[ELSE_LABEL:LBB[0-9_]+]]
;
; Loop preheader
@ -290,6 +292,8 @@ declare void @somethingElse(...)
; CHECK: mflr {{[0-9]+}}
;
; DISABLE: cmplwi 0, 3, 0
; DISABLE-NEXT: std
; DISABLE-NEXT: std
; DISABLE-NEXT: beq 0, .[[ELSE_LABEL:LBB[0-9_]+]]
;
; CHECK: bl somethingElse
@ -377,8 +381,8 @@ entry:
; ENABLE-DAG: li [[IV:[0-9]+]], 10
; ENABLE-DAG: std 14, -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
;
; DISABLE: std 14, -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
; DISABLE: cmplwi 0, 3, 0
; DISABLE-NEXT: std 14, -[[STACK_OFFSET:[0-9]+]](1) # 8-byte Folded Spill
; DISABLE-NEXT: beq 0, .[[ELSE_LABEL:LBB[0-9_]+]]
; DISABLE: li [[IV:[0-9]+]], 10
;

View File

@ -56,9 +56,9 @@ bb5: ; preds = %bb, %entry
define i32 @test_inlineasm(i32 %a) nounwind {
entry:
;CHECK-LABEL: test_inlineasm:
;CHECK: cmp
;CHECK: sethi
;CHECK: !NO_APP
;CHECK-NEXT: cmp
;CHECK-NEXT: ble
;CHECK-NEXT: mov
tail call void asm sideeffect "sethi 0, %g0", ""() nounwind

View File

@ -29,8 +29,8 @@ exit:
define void @f2(i8 *%src) {
; CHECK-LABEL: f2:
; CHECK: llc [[REG:%r[0-5]]], 0(%r2)
; CHECK: mvi 0(%r2), 0
; CHECK: tmll [[REG]], 1
; CHECK: mvi 0(%r2), 0
; CHECK: ber %r14
; CHECK: br %r14
entry: