[ARM] generate armv6m eXecute Only (XO) code

[ARM] generate armv6m eXecute Only (XO) code for immediates, globals

Previously eXecute Only (XO) support was implemented for targets that support
MOVW/MOVT (~armv7+). See: https://reviews.llvm.org/D27449

XO prevents the compiler from generating data accesses to code sections. This
patch implements XO codegen for armv6-M, which does not support MOVW/MOVT, and
must resort to the following general pattern to avoid loads:

    movs    r3, :upper8_15:foo
    lsls    r3, #8
    adds    r3, :upper0_7:foo
    lsls    r3, #8
    adds    r3, :lower8_15:foo
    lsls    r3, #8
    adds    r3, :lower0_7:foo
    ldr     r3, [r3]

This is equivalent to the code pattern generated by GCC.

The above relocations are new to LLVM and have been implemented in a parent
patch: https://reviews.llvm.org/D149443.

This patch limits itself to implementing codegen for this pattern and enabling
XO for armv6-M in the backend.

Separate patches will follow for:
- switch tables
- replacing specific loads from constant islands which are spread out over the
  ARM backend codebase. Amongst others: FastISel, call lowering, stack frames.

Reviewed By: john.brawn

Differential Revision: https://reviews.llvm.org/D152795
This commit is contained in:
Ties Stuij 2023-06-23 10:16:48 +01:00
parent 2f7ab29624
commit 2273741ea2
12 changed files with 241 additions and 19 deletions

View File

@ -199,6 +199,15 @@ void ARMAsmPrinter::PrintSymbolOperand(const MachineOperand &MO,
O << ":lower16:";
else if (TF & ARMII::MO_HI16)
O << ":upper16:";
else if (TF & ARMII::MO_LO_0_7)
O << ":lower0_7:";
else if (TF & ARMII::MO_LO_8_15)
O << ":lower8_15:";
else if (TF & ARMII::MO_HI_0_7)
O << ":upper0_7:";
else if (TF & ARMII::MO_HI_8_15)
O << ":upper8_15:";
GetARMGVSymbol(MO.getGlobal(), TF)->print(O, MAI);
printOffset(MO.getOffset(), O);
}
@ -228,6 +237,14 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
O << ":lower16:";
else if (TF == ARMII::MO_HI16)
O << ":upper16:";
else if (TF == ARMII::MO_LO_0_7)
O << ":lower0_7:";
else if (TF == ARMII::MO_LO_8_15)
O << ":lower8_15:";
else if (TF == ARMII::MO_HI_0_7)
O << ":upper0_7:";
else if (TF == ARMII::MO_HI_8_15)
O << ":upper8_15:";
O << MO.getImm();
break;
}

View File

@ -3327,7 +3327,8 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
MachineRegisterInfo *MRI) const {
// Fold large immediates into add, sub, or, xor.
unsigned DefOpc = DefMI.getOpcode();
if (DefOpc != ARM::t2MOVi32imm && DefOpc != ARM::MOVi32imm)
if (DefOpc != ARM::t2MOVi32imm && DefOpc != ARM::MOVi32imm &&
DefOpc != ARM::tMOVi32imm)
return false;
if (!DefMI.getOperand(1).isImm())
// Could be t2MOVi32imm @xx
@ -5538,7 +5539,10 @@ ARMBaseInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
using namespace ARMII;
static const std::pair<unsigned, const char *> TargetFlags[] = {
{MO_LO16, "arm-lo16"}, {MO_HI16, "arm-hi16"}};
{MO_LO16, "arm-lo16"}, {MO_HI16, "arm-hi16"},
{MO_LO_0_7, "arm-lo-0-7"}, {MO_HI_0_7, "arm-hi-0-7"},
{MO_LO_8_15, "arm-lo-8-15"}, {MO_HI_8_15, "arm-hi-8-15"},
};
return ArrayRef(TargetFlags);
}

View File

@ -71,6 +71,8 @@ namespace {
void ExpandVTBL(MachineBasicBlock::iterator &MBBI,
unsigned Opc, bool IsExt);
void ExpandMQQPRLoadStore(MachineBasicBlock::iterator &MBBI);
void ExpandTMOV32BitImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI);
void ExpandMOV32BitImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI);
void CMSEClearGPRegs(MachineBasicBlock &MBB,
@ -969,6 +971,106 @@ static MachineOperand makeImplicit(const MachineOperand &MO) {
return NewMO;
}
/// Expand the tMOVi32imm pseudo at \p MBBI into a seven-instruction sequence
/// that assembles a 32-bit value in the destination register one byte at a
/// time, most significant byte first:
///
///     tMOVi8  Rd, <bits 24-31>
///     tLSLri  Rd, Rd, #8
///     tADDi8  Rd, <bits 16-23>
///     tLSLri  Rd, Rd, #8
///     tADDi8  Rd, <bits 8-15>
///     tLSLri  Rd, Rd, #8
///     tADDi8  Rd, <bits 0-7>
///
/// The source operand (operand 1 of the pseudo) may be an immediate, an
/// external symbol, or -- in the default case -- is treated as a global
/// address. For symbols, each byte is expressed through the MO_HI_8_15,
/// MO_HI_0_7, MO_LO_8_15 and MO_LO_0_7 target flags OR'ed into the operand's
/// existing flags.
///
/// \param MBB  Block containing the pseudo; the new instructions are inserted
///             before \p MBBI.
/// \param MBBI Iterator pointing at the pseudo, which is erased on return.
void ARMExpandPseudo::ExpandTMOV32BitImm(MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator &MBBI) {
  MachineInstr &MI = *MBBI;
  Register DstReg = MI.getOperand(0).getReg();
  bool DstIsDead = MI.getOperand(0).isDead();
  const MachineOperand &MO = MI.getOperand(1);
  MachineInstrBuilder Upper8_15, LSL_U8_15, Upper0_7, Lower8_15, Lower0_7;
  unsigned MIFlags = MI.getFlags();

  // Every instruction emitted below is a flag-setting form, so its CPSR
  // operand is written, not read. Nothing in the sequence consumes the flags,
  // hence the definition is also dead. Adding CPSR with RegState::Kill would
  // instead describe it as a killed *use*, which misstates liveness.
  // NOTE(review): assumes operand 1 of tMOVi8/tLSLri/tADDi8 is the optional
  // CPSR output (s_cc_out) -- confirm against ARMInstrThumb.td.
  const unsigned CPSRDefFlags = RegState::Define | RegState::Dead;

  LLVM_DEBUG(dbgs() << "Expanding: "; MI.dump());

  Upper8_15 =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tMOVi8), DstReg)
          .addReg(ARM::CPSR, CPSRDefFlags);

  LSL_U8_15 =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tLSLri), DstReg)
          .addReg(ARM::CPSR, CPSRDefFlags)
          .addReg(DstReg)
          .addImm(8)
          .add(predOps(ARMCC::AL))
          .setMIFlags(MIFlags);

  Upper0_7 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tADDi8), DstReg)
                 .addReg(ARM::CPSR, CPSRDefFlags)
                 .addReg(DstReg);

  // The remaining two shifts are identical to the first one, so clone it
  // rather than rebuilding the operand list.
  MachineInstr *LSL_U0_7 = MBB.getParent()->CloneMachineInstr(LSL_U8_15);
  MBB.insert(MBBI, LSL_U0_7);

  Lower8_15 =
      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tADDi8), DstReg)
          .addReg(ARM::CPSR, CPSRDefFlags)
          .addReg(DstReg);

  MachineInstr *LSL_L8_15 = MBB.getParent()->CloneMachineInstr(LSL_U8_15);
  MBB.insert(MBBI, LSL_L8_15);

  // Only the final write to DstReg inherits the pseudo's dead flag; the
  // earlier partial results are consumed by the following instruction.
  Lower0_7 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tADDi8))
                 .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
                 .addReg(ARM::CPSR, CPSRDefFlags)
                 .addReg(DstReg);

  Upper8_15.setMIFlags(MIFlags);
  Upper0_7.setMIFlags(MIFlags);
  Lower8_15.setMIFlags(MIFlags);
  Lower0_7.setMIFlags(MIFlags);

  // Append the per-byte source operand to the MOV and the three ADDs.
  switch (MO.getType()) {
  case MachineOperand::MO_Immediate: {
    unsigned Imm = MO.getImm();
    unsigned Hi8_15 = (Imm >> 24) & 0xff;
    unsigned Hi0_7 = (Imm >> 16) & 0xff;
    unsigned Lo8_15 = (Imm >> 8) & 0xff;
    unsigned Lo0_7 = Imm & 0xff;
    Upper8_15.addImm(Hi8_15);
    Upper0_7.addImm(Hi0_7);
    Lower8_15.addImm(Lo8_15);
    Lower0_7.addImm(Lo0_7);
    break;
  }
  case MachineOperand::MO_ExternalSymbol: {
    const char *ES = MO.getSymbolName();
    unsigned TF = MO.getTargetFlags();
    Upper8_15.addExternalSymbol(ES, TF | ARMII::MO_HI_8_15);
    Upper0_7.addExternalSymbol(ES, TF | ARMII::MO_HI_0_7);
    Lower8_15.addExternalSymbol(ES, TF | ARMII::MO_LO_8_15);
    Lower0_7.addExternalSymbol(ES, TF | ARMII::MO_LO_0_7);
    break;
  }
  default: {
    // Any other operand kind is treated as a global address.
    const GlobalValue *GV = MO.getGlobal();
    unsigned TF = MO.getTargetFlags();
    Upper8_15.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI_8_15);
    Upper0_7.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI_0_7);
    Lower8_15.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO_8_15);
    Lower0_7.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO_0_7);
    break;
  }
  }

  // The predicate operands come last on each instruction.
  Upper8_15.add(predOps(ARMCC::AL));
  Upper0_7.add(predOps(ARMCC::AL));
  Lower8_15.add(predOps(ARMCC::AL));
  Lower0_7.add(predOps(ARMCC::AL));

  MI.eraseFromParent();
  LLVM_DEBUG(dbgs() << "To:        "; Upper8_15.getInstr()->dump(););
  LLVM_DEBUG(dbgs() << "And:       "; LSL_U8_15.getInstr()->dump(););
  LLVM_DEBUG(dbgs() << "And:       "; Upper0_7.getInstr()->dump(););
  LLVM_DEBUG(dbgs() << "And:       "; LSL_U0_7->dump(););
  LLVM_DEBUG(dbgs() << "And:       "; Lower8_15.getInstr()->dump(););
  LLVM_DEBUG(dbgs() << "And:       "; LSL_L8_15->dump(););
  LLVM_DEBUG(dbgs() << "And:       "; Lower0_7.getInstr()->dump(););
}
void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
@ -2658,6 +2760,10 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
ExpandMOV32BitImm(MBB, MBBI);
return true;
case ARM::tMOVi32imm:
ExpandTMOV32BitImm(MBB, MBBI);
return true;
case ARM::SUBS_PC_LR: {
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::SUBri), ARM::PC)

View File

@ -3701,7 +3701,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
case ISD::Constant: {
unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
// If we can't materialize the constant we need to use a literal pool
if (ConstantMaterializationCost(Val, Subtarget) > 2) {
if (ConstantMaterializationCost(Val, Subtarget) > 2 &&
!Subtarget->genExecuteOnly()) {
SDValue CPIdx = CurDAG->getTargetConstantPool(
ConstantInt::get(Type::getInt32Ty(*CurDAG->getContext()), Val),
TLI->getPointerTy(CurDAG->getDataLayout()));

View File

@ -3956,9 +3956,12 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
}
// If we have T2 ops, we can materialize the address directly via movt/movw
// pair. This is always cheaper.
if (Subtarget->useMovt()) {
++NumMovwMovt;
// pair. This is always cheaper. If we need to generate Execute Only code and
// only have Thumb1 available, we can't use a constant pool and are forced to
// use immediate relocations.
if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {
if (Subtarget->useMovt())
++NumMovwMovt;
// FIXME: Once remat is capable of dealing with instructions with register
// operands, expand this into two nodes.
return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,

View File

@ -1603,7 +1603,22 @@ def tLDRLIT_ga_abs : PseudoInst<(outs tGPR:$dst), (ins i32imm:$src),
IIC_iLoad_i,
[(set tGPR:$dst,
(ARMWrapper tglobaladdr:$src))]>,
Requires<[IsThumb, DontUseMovt]>;
Requires<[IsThumb, DontUseMovt, DontGenExecuteOnly]>;
// 32-bit immediate using mov/add with the 4 :lower0_7: to :upper8_15:
// relocations.
// This is a single pseudo instruction to make it re-materializable.
// The pseudo is expanded into flag-setting movs/lsls/adds instructions, so it
// must be modelled as clobbering CPSR; otherwise passes that run before the
// expansion may keep the flags live across it.
// FIXME: Remove this when we can do generalized remat.
let Defs = [CPSR], isReMaterializable = 1, isMoveImm = 1, Size = 16,
    hasNoSchedulingInfo = 1 in
def tMOVi32imm : PseudoInst<(outs rGPR:$dst), (ins i32imm:$src), NoItinerary,
[(set rGPR:$dst, (i32 imm:$src))]>,
Requires<[IsThumb1Only, GenExecuteOnly, DontUseMovt]>;
// When the GenT1ExecuteOnly predicate holds, select wrapped global addresses
// and wrapped external symbols to the tMOVi32imm pseudo.
def : ARMPat<(ARMWrapper tglobaladdr :$dst), (tMOVi32imm tglobaladdr :$dst)>,
Requires<[GenT1ExecuteOnly]>;
def : ARMPat<(ARMWrapper texternalsym :$dst), (tMOVi32imm texternalsym :$dst)>,
Requires<[GenT1ExecuteOnly]>;
// TLS globals
def : Pat<(ARMWrapperPIC tglobaltlsaddr:$addr),

View File

@ -58,6 +58,22 @@ MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
MCSymbolRefExpr::create(Symbol, SymbolVariant, OutContext);
Expr = ARMMCExpr::createUpper16(Expr, OutContext);
break;
case ARMII::MO_LO_0_7:
Expr = MCSymbolRefExpr::create(Symbol, SymbolVariant, OutContext);
Expr = ARMMCExpr::createLower0_7(Expr, OutContext);
break;
case ARMII::MO_LO_8_15:
Expr = MCSymbolRefExpr::create(Symbol, SymbolVariant, OutContext);
Expr = ARMMCExpr::createLower8_15(Expr, OutContext);
break;
case ARMII::MO_HI_0_7:
Expr = MCSymbolRefExpr::create(Symbol, SymbolVariant, OutContext);
Expr = ARMMCExpr::createUpper0_7(Expr, OutContext);
break;
case ARMII::MO_HI_8_15:
Expr = MCSymbolRefExpr::create(Symbol, SymbolVariant, OutContext);
Expr = ARMMCExpr::createUpper8_15(Expr, OutContext);
break;
}
if (!MO.isJTI() && MO.getOffset())

View File

@ -224,6 +224,10 @@ let RecomputePerFunction = 1 in {
}
// True when the subtarget generates execute-only code.
def GenExecuteOnly : Predicate<"Subtarget->genExecuteOnly()">;
// Negation of GenExecuteOnly.
def DontGenExecuteOnly : Predicate<"!Subtarget->genExecuteOnly()">;
// Execute-only code generation restricted to Thumb1-only subtargets that do
// not have the v8-M Baseline operations.
def GenT1ExecuteOnly : Predicate<"Subtarget->genExecuteOnly() && "
"Subtarget->isThumb1Only() && "
"!Subtarget->hasV8MBaselineOps()">;
// Armv8.5-A extensions
def HasSB : Predicate<"Subtarget->hasSB()">,

View File

@ -187,10 +187,12 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
// Assert this for now to make the change obvious.
assert(hasV6T2Ops() || !hasThumb2());
// Execute only support requires movt support
if (genExecuteOnly()) {
NoMovt = false;
assert(hasV8MBaselineOps() && "Cannot generate execute-only code for this target");
// Execute only support for >= v8-M Baseline requires movt support
if (hasV8MBaselineOps())
NoMovt = false;
if (!hasV6MOps())
report_fatal_error("Cannot generate execute-only code for this target");
}
// Keep a pointer to static instruction cost data for the specified CPU.

View File

@ -255,7 +255,7 @@ namespace ARMII {
/// MO_OPTION_MASK - Most flags are mutually exclusive; this mask selects
/// just that part of the flag set.
MO_OPTION_MASK = 0x3,
MO_OPTION_MASK = 0xf03,
/// MO_COFFSTUB - On a symbol operand "FOO", this indicates that the
/// reference is actually to the ".refptr.FOO" symbol. This is used for
@ -287,11 +287,27 @@ namespace ARMII {
/// example).
MO_NONLAZY = 0x80,
// It's undefined behaviour if an enum overflows the range between its
// smallest and largest values, but since these are |ed together, it can
// happen. Put a sentinel in (values of this enum are stored as "unsigned
// char").
MO_UNUSED_MAXIMUM = 0xff
/// MO_LO_0_7 - On a symbol operand, this represents a relocation containing
/// bits 0 through 7 of the address. Used only with Thumb1 MOV and ADD
/// instructions.
MO_LO_0_7 = 0x100,
/// MO_LO_8_15 - On a symbol operand, this represents a relocation containing
/// bits 8 through 15 of the address. Used only with Thumb1 MOV and ADD
/// instructions.
MO_LO_8_15 = 0x200,
/// MO_HI_0_7 - On a symbol operand, this represents a relocation containing
/// bits 16 through 23 of the address. Used only with Thumb1 MOV and ADD
/// instructions.
MO_HI_0_7 = 0x400,
/// MO_HI_8_15 - On a symbol operand, this represents a relocation containing
/// bits 24 through 31 of the address. Used only with Thumb1 MOV and ADD
/// instructions.
MO_HI_8_15 = 0x800
};
enum {

View File

@ -1,3 +1,4 @@
; RUN: llc < %s -mtriple=thumbv6m -mattr=+execute-only %s -o - | FileCheck %s
; RUN: llc < %s -mtriple=thumbv7m -mattr=+execute-only %s -o - | FileCheck %s
; RUN: llc < %s -mtriple=thumbv8m.base -mattr=+execute-only %s -o - | FileCheck %s
; RUN: llc < %s -mtriple=thumbv8m.base -mcpu=cortex-m23 -mattr=+execute-only %s -o - | FileCheck %s

View File

@ -2,6 +2,7 @@
; RUN: llc -mtriple=thumbv8m.base-eabi -mcpu=cortex-m23 -mattr=+execute-only %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CHECK-T2BASE %s
; RUN: llc -mtriple=thumbv7m-eabi -mattr=+execute-only %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CHECK-T2 %s
; RUN: llc -mtriple=thumbv8m.main-eabi -mattr=+execute-only %s -o - | FileCheck --check-prefix=CHECK --check-prefix=CHECK-T2 %s
; RUN: llc -mtriple=thumbv6m-eabi -mattr=+execute-only %s -o - | FileCheck --check-prefix=CHECK-T1 %s
; CHECK-NOT: {{^ *}}.text{{$}}
; CHECK: .section .text,"axy",%progbits,unique,0
@ -11,7 +12,15 @@
define i32 @global() minsize {
; CHECK-LABEL: global:
; CHECK: movw [[GLOBDEST:r[0-9]+]], :lower16:var
; CHECK: movt [[GLOBDEST]], :upper16:var
; CHECK-NEXT: movt [[GLOBDEST]], :upper16:var
; CHECK-T1-LABEL: global:
; CHECK-T1: movs [[GLOBDEST:r[0-9]+]], :upper8_15:var
; CHECK-T1-NEXT: lsls [[GLOBDEST]], [[GLOBDEST]], #8
; CHECK-T1-NEXT: adds [[GLOBDEST]], :upper0_7:var
; CHECK-T1-NEXT: lsls [[GLOBDEST]], [[GLOBDEST]], #8
; CHECK-T1-NEXT: adds [[GLOBDEST]], :lower8_15:var
; CHECK-T1-NEXT: lsls [[GLOBDEST]], [[GLOBDEST]], #8
; CHECK-T1-NEXT: adds [[GLOBDEST]], :lower0_7:var
%val = load i32, ptr @var
ret i32 %val
@ -80,7 +89,35 @@ return: ; preds = %entry, %sw.bb8, %sw
define hidden ptr @string_literal() {
entry:
; CHECK-LABEL: string_literal:
; CHECK-NOT: .asciz
; CHECK: .fnend
; CHECK: movw [[STRLIT:r[0-9]+]], :lower16:.L.str
; CHECK-NEXT: movt [[STRLIT]], :upper16:.L.str
; CHECK-T1-LABEL: string_literal:
; CHECK-T1: movs [[STRLIT:r[0-9]+]], :upper8_15:.L.str
; CHECK-T1-NEXT: lsls [[STRLIT]], [[STRLIT]], #8
; CHECK-T1-NEXT: adds [[STRLIT]], :upper0_7:.L.str
; CHECK-T1-NEXT: lsls [[STRLIT]], [[STRLIT]], #8
; CHECK-T1-NEXT: adds [[STRLIT]], :lower8_15:.L.str
; CHECK-T1-NEXT: lsls [[STRLIT]], [[STRLIT]], #8
; CHECK-T1-NEXT: adds [[STRLIT]], :lower0_7:.L.str
ret ptr @.str
}
@external_global = external global i32
define i32 @test_external_global() {
entry:
; CHECK-LABEL: external_global:
; CHECK: movw [[EXTGLOB:r[0-9]+]], :lower16:external_global
; CHECK-NEXT: movt [[EXTGLOB]], :upper16:external_global
; CHECK-T1-LABEL: external_global:
; CHECK-T1: movs [[EXTGLOB:r[0-9]+]], :upper8_15:external_global
; CHECK-T1-NEXT: lsls [[EXTGLOB]], [[EXTGLOB]], #8
; CHECK-T1-NEXT: adds [[EXTGLOB]], :upper0_7:external_global
; CHECK-T1-NEXT: lsls [[EXTGLOB]], [[EXTGLOB]], #8
; CHECK-T1-NEXT: adds [[EXTGLOB]], :lower8_15:external_global
; CHECK-T1-NEXT: lsls [[EXTGLOB]], [[EXTGLOB]], #8
; CHECK-T1-NEXT: adds [[EXTGLOB]], :lower0_7:external_global
%v = load i32, ptr @external_global
ret i32 %v
}