mirror of
https://github.com/RPCS3/llvm.git
synced 2025-01-01 09:18:30 +00:00
[RegisterCoalescer] Add new subtarget hook allowing targets to opt-out of coalescing.
The coalescer is very aggressive in propagating constraints on register classes, and the register allocator doesn’t know how to split sub-registers later to recover. This patch provides an escape valve that lets targets that encounter this problem limit coalescing. It also implements the hook for ARM to lower register pressure when many large register classes are in use. This works around PR18825. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@213078 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
2dade438d7
commit
01d8611240
@ -115,6 +115,16 @@ public:
|
||||
|
||||
/// \brief Reset the features for the subtarget.
|
||||
virtual void resetSubtargetFeatures(const MachineFunction *MF) { }
|
||||
|
||||
/// \brief SrcRC and DstRC will be morphed into NewRC if this returns true.
|
||||
virtual bool shouldCoalesce(MachineInstr *MI,
|
||||
const TargetRegisterClass *SrcRC,
|
||||
unsigned SubReg,
|
||||
const TargetRegisterClass *DstRC,
|
||||
unsigned DstSubReg,
|
||||
const TargetRegisterClass *NewRC) const
|
||||
{ return true; }
|
||||
|
||||
};
|
||||
|
||||
} // End llvm namespace
|
||||
|
@ -1037,6 +1037,23 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (CP.getNewRC()) {
|
||||
const TargetSubtargetInfo &ST = TM->getSubtarget<TargetSubtargetInfo>();
|
||||
auto SrcRC = MRI->getRegClass(CP.getSrcReg());
|
||||
auto DstRC = MRI->getRegClass(CP.getDstReg());
|
||||
unsigned SrcIdx = CP.getSrcIdx();
|
||||
unsigned DstIdx = CP.getDstIdx();
|
||||
if (CP.isFlipped()) {
|
||||
std::swap(SrcIdx, DstIdx);
|
||||
std::swap(SrcRC, DstRC);
|
||||
}
|
||||
if (!ST.shouldCoalesce(CopyMI, SrcRC, SrcIdx, DstRC, DstIdx,
|
||||
CP.getNewRC())) {
|
||||
DEBUG(dbgs() << "\tSubtarget bailed on coalescing.\n");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Dead code elimination. This really should be handled by MachineDCE, but
|
||||
// sometimes dead copies slip through, and we can't generate invalid live
|
||||
// ranges.
|
||||
|
@ -19,6 +19,7 @@
|
||||
#include "llvm/CodeGen/MachineFunction.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Target/TargetRegisterInfo.h"
|
||||
#include "llvm/ADT/DenseMap.h"
|
||||
|
||||
namespace llvm {
|
||||
|
||||
@ -118,6 +119,10 @@ class ARMFunctionInfo : public MachineFunctionInfo {
|
||||
/// being passed on the stack
|
||||
unsigned ArgumentStackSize;
|
||||
|
||||
/// CoalescedWeights - mapping of basic blocks to the rolling counter of
|
||||
/// coalesced weights.
|
||||
DenseMap<const MachineBasicBlock*, unsigned> CoalescedWeights;
|
||||
|
||||
public:
|
||||
ARMFunctionInfo() :
|
||||
isThumb(false),
|
||||
@ -221,6 +226,15 @@ public:
|
||||
else
|
||||
return -1U;
|
||||
}
|
||||
|
||||
/// Returns an iterator to the rolling coalesced-weight counter for \p MBB,
/// creating the entry with a weight of zero the first time the block is seen.
///
/// A single insert() does both the lookup and the creation: DenseMap::insert
/// leaves an existing entry untouched and hands back an iterator to it, so no
/// separate find() is needed.
///
/// NOTE: use the returned iterator right away; DenseMap iterators are not
/// guaranteed to survive a later insertion into CoalescedWeights.
DenseMap<const MachineBasicBlock*, unsigned>::iterator getCoalescedWeight(
    MachineBasicBlock* MBB) {
  return CoalescedWeights.insert(std::make_pair(MBB, 0)).first;
}
|
||||
};
|
||||
} // End llvm namespace
|
||||
|
||||
|
@ -18,6 +18,7 @@
|
||||
#include "ARMJITInfo.h"
|
||||
#include "ARMSelectionDAGInfo.h"
|
||||
#include "ARMSubtarget.h"
|
||||
#include "ARMMachineFunctionInfo.h"
|
||||
#include "Thumb1FrameLowering.h"
|
||||
#include "Thumb1InstrInfo.h"
|
||||
#include "Thumb2InstrInfo.h"
|
||||
@ -27,6 +28,8 @@
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Target/TargetInstrInfo.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/Target/TargetRegisterInfo.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
@ -449,3 +452,51 @@ bool ARMSubtarget::useMovt(const MachineFunction &MF) const {
|
||||
!MF.getFunction()->getAttributes().hasAttribute(
|
||||
AttributeSet::FunctionIndex, Attribute::MinSize));
|
||||
}
|
||||
|
||||
/// \brief ARM policy for the coalescer hook: decide whether a copy may be
/// coalesced when doing so morphs SrcRC and DstRC into NewRC.
///
/// Coalescing is always allowed when the copy does not target a sub-register,
/// when all three classes are small (size below 32), or when NewRC weighs less
/// than SrcRC or DstRC. Otherwise each basic block gets a budget derived from
/// NewRC's weight limit, tracked in ARMFunctionInfo, and the copy is coalesced
/// only while that budget has not been used up.
///
/// \p SubReg is not consulted by this heuristic.
/// \returns true if the coalescer may go ahead, false to veto the coalescing.
bool ARMSubtarget::shouldCoalesce(MachineInstr *MI,
                                  const TargetRegisterClass *SrcRC,
                                  unsigned SubReg,
                                  const TargetRegisterClass *DstRC,
                                  unsigned DstSubReg,
                                  const TargetRegisterClass *NewRC) const {
  auto Block = MI->getParent();
  auto Func = Block->getParent();
  const MachineRegisterInfo &RegInfo = Func->getRegInfo();

  // A copy that does not write a sub-register should not force the register
  // to be split later, so it is always allowed.
  if (!DstSubReg)
    return true;

  // Small registers rarely cause a problem. The budget below only applies once
  // at least one of the three classes has a size of 32 or more.
  const bool AnyLargeClass = NewRC->getSize() >= 32 ||
                             DstRC->getSize() >= 32 ||
                             SrcRC->getSize() >= 32;
  if (!AnyLargeClass)
    return true;

  // Fetch the register info once and look up all three class weights.
  const auto *TRI = RegInfo.getTargetRegisterInfo();
  const auto &MergedWeight = TRI->getRegClassWeight(NewRC);
  const auto &SourceWeight = TRI->getRegClassWeight(SrcRC);
  const auto &TargetWeight = TRI->getRegClassWeight(DstRC);

  // If the merged class is cheaper than either the source class or the
  // destination class, the coalescing is probably profitable.
  if (SourceWeight.RegWeight > MergedWeight.RegWeight ||
      TargetWeight.RegWeight > MergedWeight.RegWeight)
    return true;

  // Coalescing would always be fine if the register allocator were not
  // constrained, but we cannot know that yet. Instead, restrict how much
  // expensive register weight may be coalesced within one basic block.
  auto AFI = Func->getInfo<ARMFunctionInfo>();
  auto Entry = AFI->getCoalescedWeight(Block);

  DEBUG(dbgs() << "\tARM::shouldCoalesce - Coalesced Weight: "
               << Entry->second << "\n");
  DEBUG(dbgs() << "\tARM::shouldCoalesce - Reg Weight: "
               << MergedWeight.RegWeight << "\n");

  // The budget is the class's weight limit multiplied by Block->size() / 100,
  // with the multiplier never dropping below 1.
  unsigned Scale = Block->size() / 100;
  if (Scale == 0)
    Scale = 1;

  // Budget exhausted for this block: veto the coalescing.
  if (Entry->second >= MergedWeight.WeightLimit * Scale)
    return false;

  // Charge this coalescing against the block's running total.
  Entry->second += MergedWeight.RegWeight;
  return true;
}
|
||||
|
@ -451,6 +451,14 @@ public:
|
||||
/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect
|
||||
/// symbol.
|
||||
bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const;
|
||||
|
||||
/// \brief SrcRC and DstRC will be morphed into NewRC if this returns true
|
||||
bool shouldCoalesce(MachineInstr *MI,
|
||||
const TargetRegisterClass *SrcRC,
|
||||
unsigned SubReg,
|
||||
const TargetRegisterClass *DstRC,
|
||||
unsigned DstSubReg,
|
||||
const TargetRegisterClass *NewRC) const override;
|
||||
};
|
||||
} // End llvm namespace
|
||||
|
||||
|
42
test/CodeGen/ARM/out-of-registers.ll
Normal file
42
test/CodeGen/ARM/out-of-registers.ll
Normal file
@ -0,0 +1,42 @@
|
||||
; RUN: llc -O3 %s -o - | FileCheck %s
|
||||
; ModuleID = 'fo.c'
|
||||
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n8:16:32-S64"
|
||||
target triple = "thumbv7-none-linux-gnueabi"
|
||||
|
||||
; CHECK: vpush
|
||||
; CHECK: vpop
|
||||
|
||||
; @foo loads four <4 x float> vectors from %A with one @llvm.arm.neon.vld4 call,
; replaces each of them by its element-wise reciprocal (1.0 / x), and writes the
; four results back to the same address with @llvm.arm.neon.vst4.
define void @foo(float* nocapture %A) #0 {
|
||||
%1= bitcast float* %A to i8*
|
||||
%2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4)
|
||||
%3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0
|
||||
%divp_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %3
|
||||
%4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 1
|
||||
%div3p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %4
|
||||
%5 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 2
|
||||
%div8p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %5
|
||||
%6 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 3
|
||||
%div13p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %6
|
||||
tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %divp_vec, <4 x float> %div3p_vec, <4 x float> %div8p_vec, <4 x float> %div13p_vec, i32 4)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: nounwind
|
||||
declare i32 @llvm.annotation.i32(i32, i8*, i8*, i32) #1
|
||||
|
||||
; Function Attrs: nounwind readonly
|
||||
|
||||
; Function Attrs: nounwind
|
||||
declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) #1
|
||||
declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32) #2
|
||||
|
||||
; Function Attrs: nounwind
|
||||
|
||||
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
|
||||
attributes #1 = { nounwind }
|
||||
attributes #2 = { nounwind readonly }
|
||||
|
||||
!llvm.ident = !{!0}
|
||||
|
||||
!0 = metadata !{metadata !"Snapdragon LLVM ARM Compiler 3.4"}
|
||||
!1 = metadata !{metadata !1}
|
34
test/CodeGen/ARM/vector-spilling.ll
Normal file
34
test/CodeGen/ARM/vector-spilling.ll
Normal file
@ -0,0 +1,34 @@
|
||||
; RUN: llc < %s -march=arm -mtriple=armv7-linux-gnueabihf -arm-atomic-cfg-tidy=0 -float-abi=hard -mcpu=cortex-a9 -O3 | FileCheck %s
|
||||
|
||||
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64"
|
||||
|
||||
; This test will generate spills/fills using vldmia instructions that access 24 bytes of memory.
|
||||
; Check that we don't crash when we generate these instructions on Cortex-A9.
|
||||
|
||||
; CHECK: test:
|
||||
; CHECK: vstmia
|
||||
; CHECK: vldmia
|
||||
; @test loads four consecutive <8 x i64> vectors from %src (indices 0 through 3),
; builds two more vectors by interleaving the low halves and the high halves of
; the first two loads, and passes all six vectors to @foo in a single tail call.
define void @test(<8 x i64>* %src) #0 {
|
||||
entry:
|
||||
%0 = getelementptr inbounds <8 x i64>* %src, i32 0
|
||||
%1 = load <8 x i64>* %0, align 8
|
||||
|
||||
%2 = getelementptr inbounds <8 x i64>* %src, i32 1
|
||||
%3 = load <8 x i64>* %2, align 8
|
||||
|
||||
%4 = getelementptr inbounds <8 x i64>* %src, i32 2
|
||||
%5 = load <8 x i64>* %4, align 8
|
||||
|
||||
%6 = getelementptr inbounds <8 x i64>* %src, i32 3
|
||||
%7 = load <8 x i64>* %6, align 8
|
||||
|
||||
%8 = shufflevector <8 x i64> %1, <8 x i64> %3, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
|
||||
%9 = shufflevector <8 x i64> %1, <8 x i64> %3, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
|
||||
|
||||
tail call void(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>)* @foo(<8 x i64> %1, <8 x i64> %3, <8 x i64> %5, <8 x i64> %7, <8 x i64> %8, <8 x i64> %9)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @foo(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>)
|
||||
|
||||
attributes #0 = { noredzone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
@ -2,12 +2,12 @@
|
||||
|
||||
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64"
|
||||
|
||||
; This test will generate spills/fills using vldmia instructions that access 64 bytes of memory.
|
||||
; Check that we don't crash when we generate these instructions on Cortex-A9.
|
||||
; This test used to test vector spilling using vstmia/vldmia instructions, but
|
||||
; the changes for PR:18825 prevent that spilling.
|
||||
|
||||
; CHECK: test:
|
||||
; CHECK: vstmia
|
||||
; CHECK: vldmia
|
||||
; CHECK-NOT: vstmia
|
||||
; CHECK-NOT: vldmia
|
||||
define void @test(i64* %src) #0 {
|
||||
entry:
|
||||
%arrayidx39 = getelementptr inbounds i64* %src, i32 13
|
||||
|
Loading…
Reference in New Issue
Block a user