[ARM] Allow the scheduler to clone a node with glue to avoid a copy CPSR ↔ GPR.

In Thumb1, with the new ADDCARRY / SUBCARRY nodes, the scheduler may need to
copy between CPSR and GPRs, but not all Thumb1 targets implement such copies.

Before attempting a copy, the scheduler can try to clone the instruction
instead, but it does not currently do that for nodes with input glue. In this
patch we introduce a target hook that lets the target decide whether a glued
machine node is still eligible for cloning. For Thumb1 these are ARM::tADCS and
ARM::tSBCS.

As a follow-up to this change, we should actually implement the copies for the
Thumb1 targets that support them, and restrict the hook to the targets that
cannot do such a copy, because these clones are not ideal.

This change fixes PR35836.

Differential Revision: https://reviews.llvm.org/D42051

llvm-svn: 323857
This commit is contained in:
Roger Ferrer Ibanez 2018-01-31 09:23:43 +00:00
parent 4c0185c682
commit a8d946115e
6 changed files with 147 additions and 4 deletions

View File

@ -953,6 +953,10 @@ public:
/// Return true when a target supports MachineCombiner.
virtual bool useMachineCombiner() const { return false; }
/// Return true if the given SDNode can be copied (cloned) during scheduling
/// even if it has glue.
///
/// ScheduleDAGRRList::CopyAndMoveSuccessors consults this hook before giving
/// up on duplicating a node that has incoming glue or a glue-typed operand.
/// The default implementation returns false, so such nodes are not duplicated
/// unless a target overrides the hook.
virtual bool canCopyGluedNodeDuringSchedule(SDNode *N) const { return false; }
protected:
/// Target-dependent implementation for foldMemoryOperand.
/// Target-independent code in foldMemoryOperand will

View File

@ -1117,22 +1117,34 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
if (!N)
return nullptr;
if (SU->getNode()->getGluedNode())
DEBUG(dbgs() << "Considering duplicating the SU\n");
DEBUG(SU->dump(this));
if (N->getGluedNode() &&
!TII->canCopyGluedNodeDuringSchedule(N)) {
DEBUG(dbgs()
<< "Giving up because it has incoming glue and the target does not "
"want to copy it\n");
return nullptr;
}
SUnit *NewSU;
bool TryUnfold = false;
for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
MVT VT = N->getSimpleValueType(i);
if (VT == MVT::Glue)
if (VT == MVT::Glue) {
DEBUG(dbgs() << "Giving up because it has outgoing glue\n");
return nullptr;
else if (VT == MVT::Other)
} else if (VT == MVT::Other)
TryUnfold = true;
}
for (const SDValue &Op : N->op_values()) {
MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());
if (VT == MVT::Glue)
if (VT == MVT::Glue && !TII->canCopyGluedNodeDuringSchedule(N)) {
DEBUG(dbgs() << "Giving up because it one of the operands is glue and "
"the target does not want to copy it\n");
return nullptr;
}
}
// If possible unfold instruction.

View File

@ -141,3 +141,16 @@ void Thumb1InstrInfo::expandLoadStackGuard(
else
expandLoadStackGuardBase(MI, ARM::tLDRLIT_ga_abs, ARM::tLDRi);
}
/// Tell the scheduler whether it may clone \p N even though it has glue.
///
/// \param N the node the scheduler is considering for duplication.
/// \returns true only when \p N is a tADCS or tSBCS machine node; false for
/// every other node, including nodes that are not machine instructions.
bool Thumb1InstrInfo::canCopyGluedNodeDuringSchedule(SDNode *N) const {
  // In Thumb1 the scheduler may need to schedule a cross-copy between GPRs and
  // CPSR, but this is not always possible there, so allow the scheduler to
  // clone tADCS and tSBCS even if they have glue.
  // FIXME: Actually implement the cross-copy where it is possible (post v6)
  // because cloning these instructions entails more spilling.

  // The scheduler does not check that N is a machine node before asking, and
  // getMachineOpcode() asserts on anything else, so reject those up front.
  if (!N->isMachineOpcode())
    return false;

  switch (N->getMachineOpcode()) {
  case ARM::tADCS:
  case ARM::tSBCS:
    return true;
  default:
    return false;
  }
}

View File

@ -53,6 +53,7 @@ public:
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
bool canCopyGluedNodeDuringSchedule(SDNode *N) const override;
private:
void expandLoadStackGuard(MachineBasicBlock::iterator MI) const override;
};

View File

@ -0,0 +1,56 @@
; RUN: llc < %s | FileCheck %s
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv5e-none-linux-gnueabi"
; Function Attrs: norecurse nounwind optsize
; Test input for the thumbv5e triple declared above: a loop that never exits
; and chains several 32-bit additions through their carry. Each step
; zero-extends two i32 values to i64, adds them together with the high word of
; the previous sum, and feeds the new high word (lshr by 32) to the next step.
; NOTE(review): presumably the reduced reproducer for PR35836 - confirm.
define void @f(i32,i32,i32,i32,i32* %x4p, i32* %x5p, i32* %x6p) {
if.end:
br label %while.body
while.body:
; Loop-carried value: the high word produced by the last addition below.
%ll.0100 = phi i64 [ 0, %if.end ], [ %shr32, %while.body ]
; Both additions add zero, so %shr is simply the high word of the phi.
%add = add nuw nsw i64 %ll.0100, 0
%add3 = add nuw nsw i64 %add, 0
%shr = lshr i64 %add3, 32
; Step 1: argument 0 + argument 1 + incoming carry.
%conv7 = zext i32 %0 to i64
%conv9 = zext i32 %1 to i64
%add10 = add nuw nsw i64 %conv9, %conv7
%add11 = add nuw nsw i64 %add10, %shr
%shr14 = lshr i64 %add11, 32
; Step 2: argument 2 + argument 3 + carry of step 1; the low word is stored
; through %x6p.
%conv16 = zext i32 %2 to i64
%conv18 = zext i32 %3 to i64
%add19 = add nuw nsw i64 %conv18, %conv16
%add20 = add nuw nsw i64 %add19, %shr14
%conv21 = trunc i64 %add20 to i32
store i32 %conv21, i32* %x6p, align 4
%shr23 = lshr i64 %add20, 32
; Step 3: *%x4p + *%x5p + carry of step 2; its high word feeds the phi.
%x4 = load i32, i32* %x4p, align 4
%conv25 = zext i32 %x4 to i64
%x5 = load i32, i32* %x5p, align 4
%conv27 = zext i32 %x5 to i64
%add28 = add nuw nsw i64 %conv27, %conv25
%add29 = add nuw nsw i64 %add28, %shr23
%shr32 = lshr i64 %add29, 32
br label %while.body
}
; CHECK: adds r3, r0, r1
; CHECK: push {r5}
; CHECK: pop {r1}
; CHECK: adcs r1, r1
; CHECK: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK: ldr r2, [sp, #8] @ 4-byte Reload
; CHECK: adds r2, r0, r2
; CHECK: push {r5}
; CHECK: pop {r4}
; CHECK: adcs r4, r4
; CHECK: adds r0, r2, r5
; CHECK: push {r3}
; CHECK: pop {r0}
; CHECK: adcs r0, r4
; CHECK: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK: str r0, [r6]
; CHECK: ldr r0, [r7]
; CHECK: ldr r6, [sp] @ 4-byte Reload
; CHECK: ldr r6, [r6]
; CHECK: adds r0, r6, r0

View File

@ -0,0 +1,57 @@
; RUN: llc < %s | FileCheck %s
target datalayout = "e-m:e-p:64:64-i128:64-v128:64:128-a:0:64-n64-S64"
target triple = "thumbv6---gnueabi"
; Function Attrs: norecurse nounwind readonly
; Test input for the thumbv6 triple declared above: builds four i128 values
; from six i64 loads of %z, adds them pairwise and returns the product of the
; two sums. On a 32-bit target each i128 addition has to be split into several
; word-sized additions linked by carry.
; NOTE(review): presumably a second reproducer for PR35836 - confirm.
define i128 @a(i64* nocapture readonly %z) local_unnamed_addr #0 {
entry:
; %or = (z[2] << 64) | z[0]
%0 = load i64, i64* %z, align 4
%conv.i = zext i64 %0 to i128
%arrayidx1 = getelementptr inbounds i64, i64* %z, i64 2
%1 = load i64, i64* %arrayidx1, align 4
%conv.i38 = zext i64 %1 to i128
%shl.i39 = shl nuw i128 %conv.i38, 64
%or = or i128 %shl.i39, %conv.i
; %or7 = (z[3] << 64) | z[1]
%arrayidx3 = getelementptr inbounds i64, i64* %z, i64 1
%2 = load i64, i64* %arrayidx3, align 4
%conv.i37 = zext i64 %2 to i128
%arrayidx5 = getelementptr inbounds i64, i64* %z, i64 3
%3 = load i64, i64* %arrayidx5, align 4
%conv.i35 = zext i64 %3 to i128
%shl.i36 = shl nuw i128 %conv.i35, 64
%or7 = or i128 %shl.i36, %conv.i37
; %or12 = (z[4] << 64) | z[0]
%arrayidx10 = getelementptr inbounds i64, i64* %z, i64 4
%4 = load i64, i64* %arrayidx10, align 4
%conv.i64 = zext i64 %4 to i128
%shl.i33 = shl nuw i128 %conv.i64, 64
%or12 = or i128 %shl.i33, %conv.i
; %or17 = (z[5] << 64) | z[1]
%arrayidx15 = getelementptr inbounds i64, i64* %z, i64 5
%5 = load i64, i64* %arrayidx15, align 4
%conv.i30 = zext i64 %5 to i128
%shl.i = shl nuw i128 %conv.i30, 64
%or17 = or i128 %shl.i, %conv.i37
; Two i128 additions followed by an i128 multiplication of their results.
%add = add i128 %or7, %or
%add18 = add i128 %or17, %or12
%mul = mul i128 %add18, %add
ret i128 %mul
}
; CHECK: adds r4, r2, r7
; CHECK: mov r4, r1
; CHECK: adcs r4, r6
; CHECK: ldr r4, [sp, #20] @ 4-byte Reload
; CHECK: adcs r5, r4
; CHECK: ldr r4, [sp, #24] @ 4-byte Reload
; CHECK: adcs r3, r4
; CHECK: adds r4, r2, r7
; CHECK: adcs r1, r6
; CHECK: mov r2, sp
; CHECK: str r4, [r2]
; CHECK: str r1, [r2, #4]
; CHECK: ldr r6, [r0, #16]
; CHECK: ldr r7, [r0, #24]
; CHECK: adcs r7, r6
; CHECK: str r7, [r2, #8]
; CHECK: ldr r6, [r0, #20]
; CHECK: ldr r0, [r0, #28]
; CHECK: adcs r0, r6