mirror of
https://github.com/RPCS3/llvm.git
synced 2024-12-02 16:56:39 +00:00
Materialize global addresses via a movt/movw pair; this is always better
than doing the same via the constpool: 1. A load from the constpool costs 3 cycles on A9, while a movt/movw pair costs just 2. 2. A load from the constpool might stall for up to 300 cycles due to a cache miss. 3. Movt/movw does not use the load/store unit. 4. Fewer constpool entries => better compiler performance. This is only enabled on ELF systems, since Darwin does not have the needed relocations (yet). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@89720 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
6935efcb66
commit
5cdc3a949a
@ -162,6 +162,22 @@ namespace ARMII {
|
||||
I_BitShift = 25,
|
||||
CondShift = 28
|
||||
};
|
||||
|
||||
/// Target Operand Flag enum.
///
/// ARM-specific flags attached to a MachineOperand. Elsewhere in this change
/// the MO_LO16 / MO_HI16 values are OR-ed into an operand's existing target
/// flags and later tested with bitwise AND, so the enumerators must keep
/// distinct, non-overlapping bit values (MO_NO_FLAG is the first enumerator
/// and therefore 0).
|
||||
enum TOF {
|
||||
//===------------------------------------------------------------------===//
|
||||
// ARM Specific MachineOperand flags.
|
||||
|
||||
// MO_NO_FLAG - No target-specific flag is set on the operand.
MO_NO_FLAG,
|
||||
|
||||
/// MO_LO16 - On a symbol operand, this represents a relocation containing
|
||||
/// the lower 16 bits of the address. Used only via the movw instruction.
|
||||
MO_LO16,
|
||||
|
||||
/// MO_HI16 - On a symbol operand, this represents a relocation containing
|
||||
/// the upper 16 bits of the address. Used only via the movt instruction.
|
||||
MO_HI16
|
||||
};
|
||||
}
|
||||
|
||||
class ARMBaseInstrInfo : public TargetInstrInfoImpl {
|
||||
|
@ -75,17 +75,30 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
|
||||
}
|
||||
case ARM::t2MOVi32imm: {
|
||||
unsigned DstReg = MI.getOperand(0).getReg();
|
||||
unsigned Imm = MI.getOperand(1).getImm();
|
||||
unsigned Lo16 = Imm & 0xffff;
|
||||
unsigned Hi16 = (Imm >> 16) & 0xffff;
|
||||
if (!MI.getOperand(0).isDead()) {
|
||||
AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
|
||||
TII->get(ARM::t2MOVi16), DstReg)
|
||||
.addImm(Lo16));
|
||||
AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
|
||||
TII->get(ARM::t2MOVTi16))
|
||||
.addReg(DstReg, getDefRegState(true))
|
||||
.addReg(DstReg).addImm(Hi16));
|
||||
const MachineOperand &MO = MI.getOperand(1);
|
||||
MachineInstrBuilder LO16, HI16;
|
||||
|
||||
LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::t2MOVi16),
|
||||
DstReg);
|
||||
HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::t2MOVTi16))
|
||||
.addReg(DstReg, getDefRegState(true)).addReg(DstReg);
|
||||
|
||||
if (MO.isImm()) {
|
||||
unsigned Imm = MO.getImm();
|
||||
unsigned Lo16 = Imm & 0xffff;
|
||||
unsigned Hi16 = (Imm >> 16) & 0xffff;
|
||||
LO16 = LO16.addImm(Lo16);
|
||||
HI16 = HI16.addImm(Hi16);
|
||||
} else {
|
||||
GlobalValue *GV = MO.getGlobal();
|
||||
unsigned TF = MO.getTargetFlags();
|
||||
LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16);
|
||||
HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16);
|
||||
// FIXME: What about memoperands?
|
||||
}
|
||||
AddDefaultPred(LO16);
|
||||
AddDefaultPred(HI16);
|
||||
}
|
||||
MI.eraseFromParent();
|
||||
Modified = true;
|
||||
|
@ -261,7 +261,9 @@ bool ARMDAGToDAGISel::SelectAddrMode2(SDValue Op, SDValue N,
|
||||
if (N.getOpcode() == ISD::FrameIndex) {
|
||||
int FI = cast<FrameIndexSDNode>(N)->getIndex();
|
||||
Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
|
||||
} else if (N.getOpcode() == ARMISD::Wrapper) {
|
||||
} else if (N.getOpcode() == ARMISD::Wrapper &&
|
||||
!(Subtarget->useMovt() &&
|
||||
N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
|
||||
Base = N.getOperand(0);
|
||||
}
|
||||
Offset = CurDAG->getRegister(0, MVT::i32);
|
||||
@ -463,7 +465,9 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue Op, SDValue N,
|
||||
if (N.getOpcode() == ISD::FrameIndex) {
|
||||
int FI = cast<FrameIndexSDNode>(N)->getIndex();
|
||||
Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
|
||||
} else if (N.getOpcode() == ARMISD::Wrapper) {
|
||||
} else if (N.getOpcode() == ARMISD::Wrapper &&
|
||||
!(Subtarget->useMovt() &&
|
||||
N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
|
||||
Base = N.getOperand(0);
|
||||
}
|
||||
Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
|
||||
@ -558,7 +562,13 @@ ARMDAGToDAGISel::SelectThumbAddrModeRI5(SDValue Op, SDValue N,
|
||||
}
|
||||
|
||||
if (N.getOpcode() != ISD::ADD) {
|
||||
Base = (N.getOpcode() == ARMISD::Wrapper) ? N.getOperand(0) : N;
|
||||
if (N.getOpcode() == ARMISD::Wrapper &&
|
||||
!(Subtarget->useMovt() &&
|
||||
N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
|
||||
Base = N.getOperand(0);
|
||||
} else
|
||||
Base = N;
|
||||
|
||||
Offset = CurDAG->getRegister(0, MVT::i32);
|
||||
OffImm = CurDAG->getTargetConstant(0, MVT::i32);
|
||||
return true;
|
||||
@ -681,7 +691,9 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue Op, SDValue N,
|
||||
Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
|
||||
OffImm = CurDAG->getTargetConstant(0, MVT::i32);
|
||||
return true;
|
||||
} else if (N.getOpcode() == ARMISD::Wrapper) {
|
||||
} else if (N.getOpcode() == ARMISD::Wrapper &&
|
||||
!(Subtarget->useMovt() &&
|
||||
N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
|
||||
Base = N.getOperand(0);
|
||||
if (Base.getOpcode() == ISD::TargetConstantPool)
|
||||
return false; // We want to select t2LDRpci instead.
|
||||
|
@ -39,6 +39,7 @@
|
||||
#include "llvm/CodeGen/SelectionDAG.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/ADT/VectorExtras.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/MathExtras.h"
|
||||
#include <sstream>
|
||||
@ -1356,10 +1357,17 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
|
||||
PseudoSourceValue::getGOT(), 0);
|
||||
return Result;
|
||||
} else {
|
||||
SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
|
||||
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
|
||||
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
|
||||
PseudoSourceValue::getConstantPool(), 0);
|
||||
// If we have T2 ops, we can materialize the address directly via movt/movw
|
||||
// pair. This is always cheaper.
|
||||
if (Subtarget->useMovt()) {
|
||||
return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
|
||||
DAG.getTargetGlobalAddress(GV, PtrVT));
|
||||
} else {
|
||||
SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
|
||||
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
|
||||
return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
|
||||
PseudoSourceValue::getConstantPool(), 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -116,6 +116,10 @@ def IsNotDarwin : Predicate<"!Subtarget->isTargetDarwin()">;
|
||||
def CarryDefIsUnused : Predicate<"!N.getNode()->hasAnyUseOfValue(1)">;
|
||||
def CarryDefIsUsed : Predicate<"N.getNode()->hasAnyUseOfValue(1)">;
|
||||
|
||||
// FIXME: Eventually this will be just "hasV6T2Ops".
|
||||
def UseMovt : Predicate<"Subtarget->useMovt()">;
|
||||
def DontUseMovt : Predicate<"!Subtarget->useMovt()">;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// ARM Flag Definitions.
|
||||
|
||||
@ -204,7 +208,7 @@ def hi16 : SDNodeXForm<imm, [{
|
||||
def lo16AllZero : PatLeaf<(i32 imm), [{
|
||||
// Returns true if all low 16-bits are 0.
|
||||
return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0;
|
||||
}], hi16>;
|
||||
}], hi16>;
|
||||
|
||||
/// imm0_65535 predicate - True if the 32-bit immediate is in the range
|
||||
/// [0, 65535].
|
||||
@ -1002,7 +1006,7 @@ def MOVi16 : AI1<0b1000, (outs GPR:$dst), (ins i32imm:$src),
|
||||
let Constraints = "$src = $dst" in
|
||||
def MOVTi16 : AI1<0b1010, (outs GPR:$dst), (ins GPR:$src, i32imm:$imm),
|
||||
DPFrm, IIC_iMOVi,
|
||||
"movt", "\t$dst, $imm",
|
||||
"movt", "\t$dst, $imm",
|
||||
[(set GPR:$dst,
|
||||
(or (and GPR:$src, 0xffff),
|
||||
lo16AllZero:$imm))]>, UnaryDP,
|
||||
@ -1603,12 +1607,6 @@ let Defs =
|
||||
// Non-Instruction Patterns
|
||||
//
|
||||
|
||||
// ConstantPool, GlobalAddress, and JumpTable
|
||||
def : ARMPat<(ARMWrapper tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>;
|
||||
def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>;
|
||||
def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id),
|
||||
(LEApcrelJT tjumptable:$dst, imm:$id)>;
|
||||
|
||||
// Large immediate handling.
|
||||
|
||||
// Two piece so_imms.
|
||||
@ -1638,10 +1636,19 @@ def : ARMPat<(add GPR:$LHS, so_neg_imm2part:$RHS),
|
||||
// FIXME: Remove this when we can do generalized remat.
|
||||
let isReMaterializable = 1 in
|
||||
def MOVi32imm : AI1x2<(outs GPR:$dst), (ins i32imm:$src), Pseudo, IIC_iMOVi,
|
||||
"movw", "\t$dst, ${src:lo16}\n\tmovt${p} $dst, ${src:hi16}",
|
||||
"movw", "\t$dst, ${src:lo16}\n\tmovt${p}\t$dst, ${src:hi16}",
|
||||
[(set GPR:$dst, (i32 imm:$src))]>,
|
||||
Requires<[IsARM, HasV6T2]>;
|
||||
|
||||
// ConstantPool, GlobalAddress, and JumpTable
|
||||
def : ARMPat<(ARMWrapper tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>,
|
||||
Requires<[IsARM, DontUseMovt]>;
|
||||
def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>;
|
||||
def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>,
|
||||
Requires<[IsARM, UseMovt]>;
|
||||
def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id),
|
||||
(LEApcrelJT tjumptable:$dst, imm:$id)>;
|
||||
|
||||
// TODO: add,sub,and, 3-instr forms?
|
||||
|
||||
|
||||
|
@ -1181,12 +1181,6 @@ def : T2Pat<(add GPR:$LHS, t2_so_neg_imm2part:$RHS),
|
||||
(t2SUBri (t2SUBri GPR:$LHS, (t2_so_neg_imm2part_1 imm:$RHS)),
|
||||
(t2_so_neg_imm2part_2 imm:$RHS))>;
|
||||
|
||||
// ConstantPool, GlobalAddress, and JumpTable
|
||||
def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2LEApcrel tglobaladdr :$dst)>;
|
||||
def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>;
|
||||
def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id),
|
||||
(t2LEApcrelJT tjumptable:$dst, imm:$id)>;
|
||||
|
||||
// 32-bit immediate using movw + movt.
|
||||
// This is a single pseudo instruction to make it re-materializable. Remove
|
||||
// when we can do generalized remat.
|
||||
@ -1195,6 +1189,16 @@ def t2MOVi32imm : T2Ix2<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVi,
|
||||
"movw", "\t$dst, ${src:lo16}\n\tmovt${p}\t$dst, ${src:hi16}",
|
||||
[(set GPR:$dst, (i32 imm:$src))]>;
|
||||
|
||||
// ConstantPool, GlobalAddress, and JumpTable
|
||||
def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2LEApcrel tglobaladdr :$dst)>,
|
||||
Requires<[IsThumb2, DontUseMovt]>;
|
||||
def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>;
|
||||
def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>,
|
||||
Requires<[IsThumb2, UseMovt]>;
|
||||
|
||||
def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id),
|
||||
(t2LEApcrelJT tjumptable:$dst, imm:$id)>;
|
||||
|
||||
// Pseudo instruction that combines ldr from constpool and add pc. This should
|
||||
// be expanded into two instructions late to allow if-conversion and
|
||||
// scheduling.
|
||||
|
@ -27,6 +27,10 @@ UseNEONFP("arm-use-neon-fp",
|
||||
cl::desc("Use NEON for single-precision FP"),
|
||||
cl::init(false), cl::Hidden);
|
||||
|
||||
static cl::opt<bool>
|
||||
UseMOVT("arm-use-movt",
|
||||
cl::init(true), cl::Hidden);
|
||||
|
||||
ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS,
|
||||
bool isT)
|
||||
: ARMArchVersion(V4T)
|
||||
@ -36,6 +40,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS,
|
||||
, ThumbMode(Thumb1)
|
||||
, PostRAScheduler(false)
|
||||
, IsR9Reserved(ReserveR9)
|
||||
, UseMovt(UseMOVT)
|
||||
, stackAlignment(4)
|
||||
, CPUString("generic")
|
||||
, TargetType(isELF) // Default to ELF unless otherwise specified.
|
||||
|
@ -65,6 +65,10 @@ protected:
|
||||
/// IsR9Reserved - True if R9 is not available as a general purpose register.
|
||||
bool IsR9Reserved;
|
||||
|
||||
/// UseMovt - True if MOVT / MOVW pairs are used for materialization of 32-bit
|
||||
/// imms (including global addresses).
|
||||
bool UseMovt;
|
||||
|
||||
/// stackAlignment - The minimum alignment known to hold of the stack frame on
|
||||
/// entry to the function and which must be maintained by every function.
|
||||
unsigned stackAlignment;
|
||||
@ -130,8 +134,10 @@ protected:
|
||||
|
||||
bool isR9Reserved() const { return IsR9Reserved; }
|
||||
|
||||
bool useMovt() const { return UseMovt && hasV6T2Ops(); }
|
||||
|
||||
const std::string & getCPUString() const { return CPUString; }
|
||||
|
||||
|
||||
/// enablePostRAScheduler - True at 'More' optimization.
|
||||
bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
|
||||
TargetSubtarget::AntiDepBreakMode& Mode,
|
||||
|
@ -330,6 +330,8 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
|
||||
void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
|
||||
const char *Modifier) {
|
||||
const MachineOperand &MO = MI->getOperand(OpNum);
|
||||
unsigned TF = MO.getTargetFlags();
|
||||
|
||||
switch (MO.getType()) {
|
||||
default:
|
||||
assert(0 && "<unknown operand type>");
|
||||
@ -356,12 +358,12 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
|
||||
case MachineOperand::MO_Immediate: {
|
||||
int64_t Imm = MO.getImm();
|
||||
O << '#';
|
||||
if (Modifier) {
|
||||
if (strcmp(Modifier, "lo16") == 0)
|
||||
O << ":lower16:";
|
||||
else if (strcmp(Modifier, "hi16") == 0)
|
||||
O << ":upper16:";
|
||||
}
|
||||
if ((Modifier && strcmp(Modifier, "lo16") == 0) ||
|
||||
(TF & ARMII::MO_LO16))
|
||||
O << ":lower16:";
|
||||
else if ((Modifier && strcmp(Modifier, "hi16") == 0) ||
|
||||
(TF & ARMII::MO_HI16))
|
||||
O << ":upper16:";
|
||||
O << Imm;
|
||||
break;
|
||||
}
|
||||
@ -371,6 +373,13 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
|
||||
case MachineOperand::MO_GlobalAddress: {
|
||||
bool isCallOp = Modifier && !strcmp(Modifier, "call");
|
||||
GlobalValue *GV = MO.getGlobal();
|
||||
|
||||
if ((Modifier && strcmp(Modifier, "lo16") == 0) ||
|
||||
(TF & ARMII::MO_LO16))
|
||||
O << ":lower16:";
|
||||
else if ((Modifier && strcmp(Modifier, "hi16") == 0) ||
|
||||
(TF & ARMII::MO_HI16))
|
||||
O << ":upper16:";
|
||||
O << Mang->getMangledName(GV);
|
||||
|
||||
printOffset(MO.getOffset());
|
||||
|
@ -78,7 +78,7 @@ namespace {
|
||||
{ ARM::t2LSRri, ARM::tLSRri, 0, 5, 0, 1, 0, 0,0, 0 },
|
||||
{ ARM::t2LSRrr, 0, ARM::tLSRrr, 0, 0, 0, 1, 0,0, 0 },
|
||||
{ ARM::t2MOVi, ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0 },
|
||||
{ ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 0 },
|
||||
{ ARM::t2MOVi16,ARM::tMOVi8, 0, 8, 0, 1, 0, 0,0, 1 },
|
||||
// FIXME: Do we need the 16-bit 'S' variant?
|
||||
{ ARM::t2MOVr,ARM::tMOVgpr2gpr,0, 0, 0, 0, 0, 1,0, 0 },
|
||||
{ ARM::t2MOVCCr,0, ARM::tMOVCCr, 0, 0, 0, 0, 0,1, 0 },
|
||||
@ -413,6 +413,12 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
|
||||
if (MI->getOperand(2).getImm() == 0)
|
||||
return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
|
||||
break;
|
||||
case ARM::t2MOVi16:
|
||||
// Can convert only 'pure' immediate operands, not immediates obtained as
|
||||
// globals' addresses.
|
||||
if (MI->getOperand(1).isImm())
|
||||
return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
|
||||
break;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
20
test/CodeGen/ARM/movt-movw-global.ll
Normal file
20
test/CodeGen/ARM/movt-movw-global.ll
Normal file
@ -0,0 +1,20 @@
|
||||
; RUN: llc < %s | FileCheck %s
|
||||
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
|
||||
target triple = "armv7-eabi"
|
||||
|
||||
@foo = common global i32 0 ; <i32*> [#uses=1]
|
||||
|
||||
define arm_aapcs_vfpcc i32* @bar1() nounwind readnone {
|
||||
entry:
|
||||
; CHECK: movw r0, :lower16:foo
|
||||
; CHECK-NEXT: movt r0, :upper16:foo
|
||||
ret i32* @foo
|
||||
}
|
||||
|
||||
define arm_aapcs_vfpcc void @bar2(i32 %baz) nounwind {
|
||||
entry:
|
||||
; CHECK: movw r1, :lower16:foo
|
||||
; CHECK-NEXT: movt r1, :upper16:foo
|
||||
store i32 %baz, i32* @foo, align 4
|
||||
ret void
|
||||
}
|
Loading…
Reference in New Issue
Block a user