mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-09 21:32:49 +00:00
[RISCV] Add peepholes for Global Address lowering patterns
Summary: Base and offset are always separated when a GlobalAddress node is lowered (rL332641) as an optimization to reduce instruction count. However, this optimization is not profitable if the Global Address ends up being used in only instruction. This patch adds peephole optimizations that merge an offset of an address calculation into the LUI %%hi and ADD %lo of the lowering sequence. The peephole handles three patterns: 1) ADDI (ADDI (LUI %hi(global)) %lo(global)), offset ---> ADDI (LUI %hi(global + offset)) %lo(global + offset). This generates: lui a0, hi (global + offset) add a0, a0, lo (global + offset) Instead of lui a0, hi (global) addi a0, hi (global) addi a0, offset This pattern is for cases when the offset is small enough to fit in the immediate filed of ADDI (less than 12 bits). 2) ADD ((ADDI (LUI %hi(global)) %lo(global)), (LUI hi_offset)) ---> offset = hi_offset << 12 ADDI (LUI %hi(global + offset)) %lo(global + offset) Which generates the ASM: lui a0, hi(global + offset) addi a0, lo(global + offset) Instead of: lui a0, hi(global) addi a0, lo(global) lui a1, (offset) add a0, a0, a1 This pattern is for cases when the offset doesn't fit in an immediate field of ADDI but the lower 12 bits are all zeros. 3) ADD ((ADDI (LUI %hi(global)) %lo(global)), (ADDI lo_offset, (LUI hi_offset))) ---> offset = global + offhi20<<12 + offlo12 ADDI (LUI %hi(global + offset)) %lo(global + offset) Which generates the ASM: lui a1, %hi(global + offset) addi a1, %lo(global + offset) Instead of: lui a0, hi(global) addi a0, lo(global) lui a1, (offhi20) addi a1, (offlo12) add a0, a0, a1 This pattern is for cases when the offset doesn't fit in an immediate field of ADDI and both the lower 1 bits and high 20 bits are non zero. Reviewers: asb Reviewed By: asb Subscribers: rbar, johnrusso, simoncook, jordy.potman.lists, apazos, niosHD, kito-cheng, shiva0217, zzheng, edward-jones, mgrang llvm-svn: 333455
This commit is contained in:
parent
6a4ca59d1b
commit
d114bbf2bb
@ -56,12 +56,14 @@ public:
|
||||
|
||||
private:
|
||||
void doPeepholeLoadStoreADDI();
|
||||
void doPeepholeGlobalAddiLuiOffset();
|
||||
void doPeepholeBuildPairF64SplitF64();
|
||||
};
|
||||
}
|
||||
|
||||
void RISCVDAGToDAGISel::PostprocessISelDAG() {
|
||||
doPeepholeLoadStoreADDI();
|
||||
doPeepholeGlobalAddiLuiOffset();
|
||||
doPeepholeBuildPairF64SplitF64();
|
||||
}
|
||||
|
||||
@ -128,6 +130,212 @@ bool RISCVDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Detect the pattern lui %hi(global) --> ADDI %lo(global)
|
||||
// HiLUI LoADDI
|
||||
static bool detectLuiAddiGlobal(SDNode *Tail, unsigned &Idx, SDValue &LoADDI,
|
||||
SDValue &HiLUI, GlobalAddressSDNode *&GAlo,
|
||||
GlobalAddressSDNode *&GAhi) {
|
||||
// Try to detect the pattern on every operand of the tail instruction.
|
||||
for (Idx = 0; Idx < Tail->getNumOperands(); Idx++) {
|
||||
LoADDI = Tail->getOperand(Idx);
|
||||
// LoADDI should only be used by one instruction (Tail).
|
||||
if (!LoADDI->isMachineOpcode() ||
|
||||
!(LoADDI->getMachineOpcode() == RISCV::ADDI) ||
|
||||
!isa<GlobalAddressSDNode>(LoADDI->getOperand(1)) ||
|
||||
!LoADDI->hasOneUse())
|
||||
continue;
|
||||
// Check for existence of %lo target flag.
|
||||
GAlo = cast<GlobalAddressSDNode>(LoADDI->getOperand(1));
|
||||
if (!(GAlo->getTargetFlags() == RISCVII::MO_LO) ||
|
||||
!(GAlo->getOffset() == 0))
|
||||
return false;
|
||||
// Check for existence of %hi target flag.
|
||||
HiLUI = LoADDI->getOperand(0);
|
||||
if (!HiLUI->isMachineOpcode() ||
|
||||
!(HiLUI->getMachineOpcode() == RISCV::LUI) ||
|
||||
!isa<GlobalAddressSDNode>(HiLUI->getOperand(0)) || !HiLUI->hasOneUse())
|
||||
return false;
|
||||
GAhi = cast<GlobalAddressSDNode>(HiLUI->getOperand(0));
|
||||
if (!(GAhi->getTargetFlags() == RISCVII::MO_HI) ||
|
||||
!(GAhi->getOffset() == 0))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool matchLuiOffset(SDValue &OffsetLUI, int64_t &Offset) {
|
||||
if (!OffsetLUI->isMachineOpcode() ||
|
||||
!(OffsetLUI->getMachineOpcode() == RISCV::LUI) ||
|
||||
!isa<ConstantSDNode>(OffsetLUI->getOperand(0)))
|
||||
return false;
|
||||
Offset = cast<ConstantSDNode>(OffsetLUI->getOperand(0))->getSExtValue();
|
||||
Offset = Offset << 12;
|
||||
LLVM_DEBUG(dbgs() << " Detected \" LUI Offset_hi\"\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool matchAddiLuiOffset(SDValue &OffsetLoADDI, int64_t &Offset) {
|
||||
// LoADDI should only be used by the tail instruction only.
|
||||
if (!OffsetLoADDI->isMachineOpcode() ||
|
||||
!(OffsetLoADDI->getMachineOpcode() == RISCV::ADDI) ||
|
||||
!isa<ConstantSDNode>(OffsetLoADDI->getOperand(1)) ||
|
||||
!OffsetLoADDI->hasOneUse())
|
||||
return false;
|
||||
int64_t OffLo =
|
||||
cast<ConstantSDNode>(OffsetLoADDI->getOperand(1))->getZExtValue();
|
||||
// HiLUI should only be used by the loADDI.
|
||||
SDValue OffsetHiLUI = (OffsetLoADDI->getOperand(0));
|
||||
if (!OffsetHiLUI->isMachineOpcode() ||
|
||||
!(OffsetHiLUI->getMachineOpcode() == RISCV::LUI) ||
|
||||
!isa<ConstantSDNode>(OffsetHiLUI->getOperand(0)) ||
|
||||
!OffsetHiLUI->hasOneUse())
|
||||
return false;
|
||||
int64_t OffHi =
|
||||
cast<ConstantSDNode>(OffsetHiLUI->getOperand(0))->getSExtValue();
|
||||
Offset = (OffHi << 12) + OffLo;
|
||||
LLVM_DEBUG(dbgs() << " Detected \" ADDI (LUI Offset_hi), Offset_lo\"\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
static void updateTailInstrUsers(SDNode *Tail, SelectionDAG *CurDAG,
|
||||
GlobalAddressSDNode *GAhi,
|
||||
GlobalAddressSDNode *GAlo,
|
||||
SDValue &GlobalHiLUI, SDValue &GlobalLoADDI,
|
||||
int64_t Offset) {
|
||||
// Update the offset in GAhi and GAlo.
|
||||
SDLoc DL(Tail->getOperand(1));
|
||||
SDValue GAHiNew = CurDAG->getTargetGlobalAddress(GAhi->getGlobal(), DL,
|
||||
GlobalHiLUI.getValueType(),
|
||||
Offset, RISCVII::MO_HI);
|
||||
SDValue GALoNew = CurDAG->getTargetGlobalAddress(GAlo->getGlobal(), DL,
|
||||
GlobalLoADDI.getValueType(),
|
||||
Offset, RISCVII::MO_LO);
|
||||
CurDAG->UpdateNodeOperands(GlobalHiLUI.getNode(), GAHiNew);
|
||||
CurDAG->UpdateNodeOperands(GlobalLoADDI.getNode(), GlobalHiLUI, GALoNew);
|
||||
// Update all uses of the Tail with the GlobalLoADDI. After
|
||||
// this Tail will be a dead node.
|
||||
SDValue From = SDValue(Tail, 0);
|
||||
CurDAG->ReplaceAllUsesOfValuesWith(&From, &GlobalLoADDI, 1);
|
||||
}
|
||||
|
||||
// TODO: This transformation might be better implemeted in a Machine Funtion
|
||||
// Pass as discussed here: https://reviews.llvm.org/D45748.
|
||||
//
|
||||
// Merge the offset of address calculation into the offset field
|
||||
// of a global address node in a global address lowering sequence ("LUI
|
||||
// %hi(global) --> add %lo(global)") under the following conditions: 1) The
|
||||
// offset field in the global address lowering sequence is zero. 2) The lowered
|
||||
// global address is only used in one node, referred to as "Tail".
|
||||
|
||||
// This peephole does the following transformations to merge the offset:
|
||||
|
||||
// 1) ADDI (ADDI (LUI %hi(global)) %lo(global)), offset
|
||||
// --->
|
||||
// ADDI (LUI %hi(global + offset)) %lo(global + offset).
|
||||
//
|
||||
// This generates:
|
||||
// lui a0, hi (global + offset)
|
||||
// add a0, a0, lo (global + offset)
|
||||
// Instead of
|
||||
// lui a0, hi (global)
|
||||
// addi a0, hi (global)
|
||||
// addi a0, offset
|
||||
// This pattern is for cases when the offset is small enough to fit in the
|
||||
// immediate filed of ADDI (less than 12 bits).
|
||||
|
||||
// 2) ADD ((ADDI (LUI %hi(global)) %lo(global)), (LUI hi_offset))
|
||||
// --->
|
||||
// offset = hi_offset << 12
|
||||
// ADDI (LUI %hi(global + offset)) %lo(global + offset)
|
||||
|
||||
// Which generates the ASM:
|
||||
// lui a0, hi(global + offset)
|
||||
// addi a0, lo(global + offset)
|
||||
// Instead of:
|
||||
// lui a0, hi(global)
|
||||
// addi a0, lo(global)
|
||||
// lui a1, (offset)
|
||||
// add a0, a0, a1
|
||||
|
||||
// This pattern is for cases when the offset doesn't fit in an immediate field
|
||||
// of ADDI but the lower 12 bits are all zeros.
|
||||
|
||||
// 3) ADD ((ADDI (LUI %hi(global)) %lo(global)), (ADDI lo_offset, (LUI
|
||||
// hi_offset)))
|
||||
// --->
|
||||
// ADDI (LUI %hi(global + offset)) %lo(global + offset)
|
||||
// Which generates the ASM:
|
||||
// lui a1, %hi(global + offhi20<<12 + offlo12)
|
||||
// addi a1, %lo(global + offhi20<<12 + offlo12)
|
||||
// Instead of:
|
||||
// lui a0, hi(global)
|
||||
// addi a0, lo(global)
|
||||
// lui a1, (offhi20)
|
||||
// addi a1, (offlo12)
|
||||
// add a0, a0, a1
|
||||
// This pattern is for cases when the offset doesn't fit in an immediate field
|
||||
// of ADDI and both the lower 1 bits and high 20 bits are non zero.
|
||||
void RISCVDAGToDAGISel::doPeepholeGlobalAddiLuiOffset() {
|
||||
SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
|
||||
++Position;
|
||||
SelectionDAG::allnodes_iterator Begin(CurDAG->allnodes_begin());
|
||||
while (Position != Begin) {
|
||||
SDNode *Tail = &*--Position;
|
||||
// Skip dead nodes and any non-machine opcodes.
|
||||
if (Tail->use_empty() || !Tail->isMachineOpcode())
|
||||
continue;
|
||||
// The tail instruction can be an ADD or an ADDI.
|
||||
if (!Tail->isMachineOpcode() || !(Tail->getMachineOpcode() == RISCV::ADD ||
|
||||
Tail->getMachineOpcode() == RISCV::ADDI))
|
||||
continue;
|
||||
// First detect the global address part of pattern:
|
||||
// (lui %hi(global) --> Addi %lo(global))
|
||||
unsigned GlobalLoADDiIdx;
|
||||
SDValue GlobalLoADDI;
|
||||
SDValue GlobalHiLUI;
|
||||
GlobalAddressSDNode *GAhi;
|
||||
GlobalAddressSDNode *GAlo;
|
||||
if (!detectLuiAddiGlobal(Tail, GlobalLoADDiIdx, GlobalLoADDI, GlobalHiLUI,
|
||||
GAlo, GAhi))
|
||||
continue;
|
||||
LLVM_DEBUG(dbgs() << " Detected \"ADDI LUI %hi(global), %lo(global)\n");
|
||||
// Detect the offset part for the address calculation by looking at the
|
||||
// other operand of the tail instruction:
|
||||
int64_t Offset;
|
||||
if (Tail->getMachineOpcode() == RISCV::ADD) {
|
||||
// If the Tail is an ADD instruction, the offset can be in two forms:
|
||||
// 1) LUI hi_Offset followed by:
|
||||
// ADDI lo_offset
|
||||
// This happens in case the offset has non zero bits in
|
||||
// both hi 20 and lo 12 bits.
|
||||
// 2) LUI (offset20)
|
||||
// This happens in case the lower 12 bits of the offset are zeros.
|
||||
SDValue OffsetVal = Tail->getOperand(1 - GlobalLoADDiIdx);
|
||||
if (!matchAddiLuiOffset(OffsetVal, Offset) &&
|
||||
!matchLuiOffset(OffsetVal, Offset))
|
||||
continue;
|
||||
} else
|
||||
// The Tail is an ADDI instruction:
|
||||
Offset = cast<ConstantSDNode>(Tail->getOperand(1 - GlobalLoADDiIdx))
|
||||
->getSExtValue();
|
||||
|
||||
LLVM_DEBUG(
|
||||
dbgs()
|
||||
<< " Fold offset value into global offset of LUI %hi and ADDI %lo\n");
|
||||
LLVM_DEBUG(dbgs() << "\tTail:");
|
||||
LLVM_DEBUG(Tail->dump(CurDAG));
|
||||
LLVM_DEBUG(dbgs() << "\tGlobalHiLUI:");
|
||||
LLVM_DEBUG(GlobalHiLUI->dump(CurDAG));
|
||||
LLVM_DEBUG(dbgs() << "\tGlobalLoADDI:");
|
||||
LLVM_DEBUG(GlobalLoADDI->dump(CurDAG));
|
||||
LLVM_DEBUG(dbgs() << "\n");
|
||||
updateTailInstrUsers(Tail, CurDAG, GAhi, GAlo, GlobalHiLUI, GlobalLoADDI,
|
||||
Offset);
|
||||
}
|
||||
CurDAG->RemoveDeadNodes();
|
||||
}
|
||||
|
||||
// Merge an ADDI into the offset of a load/store instruction where possible.
|
||||
// (load (add base, off), 0) -> (load base, off)
|
||||
// (store val, (add base, off)) -> (store val, base, off)
|
||||
|
@ -4,6 +4,8 @@
|
||||
%struct.S = type { [40 x i32], i32, i32, i32, [4100 x i32], i32, i32, i32 }
|
||||
@s = common dso_local global %struct.S zeroinitializer, align 4
|
||||
@foo = global [6 x i16] [i16 1, i16 2, i16 3, i16 4, i16 5, i16 0], align 2
|
||||
@g = global [1048576 x i8] zeroinitializer, align 1
|
||||
|
||||
|
||||
define dso_local void @multiple_stores() local_unnamed_addr {
|
||||
; CHECK-LABEL: multiple_stores:
|
||||
@ -21,8 +23,8 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
define dso_local void @control_flow() local_unnamed_addr #0 {
|
||||
; CHECK-LABEL: control_flow:
|
||||
define dso_local void @control_flow_with_mem_access() local_unnamed_addr #0 {
|
||||
; CHECK-LABEL: control_flow_with_mem_access:
|
||||
; CHECK: # %bb.0: # %entry
|
||||
; CHECK-NEXT: lui a0, %hi(s)
|
||||
; CHECK-NEXT: addi a0, a0, %lo(s)
|
||||
@ -47,37 +49,92 @@ if.end: ; preds = %if.then, %entry
|
||||
ret void
|
||||
}
|
||||
|
||||
;TODO: Offset shouln't be separated in this case. We get shorter sequence if it
|
||||
; is merged in the LUI %hi and the ADDI %lo.
|
||||
; This test checks for the case where the offset is only an LUI.
|
||||
; without peephole this generates:
|
||||
; lui a0, %hi(g)
|
||||
; addi a0, a0, %lo(g)
|
||||
; lui a1, 128 ---> offset
|
||||
; add a0, a0, a1 ---> base + offset.
|
||||
define i8* @big_offset_lui_tail() {
|
||||
; CHECK-LABEL: big_offset_lui_tail:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: lui a0, %hi(g+524288)
|
||||
; CHECK-NEXT: addi a0, a0, %lo(g+524288)
|
||||
; CHECK-NEXT: ret
|
||||
ret i8* getelementptr inbounds ([1048576 x i8], [1048576 x i8]* @g, i32 0, i32 524288)
|
||||
}
|
||||
|
||||
define dso_local i32* @big_offset_one_use() local_unnamed_addr {
|
||||
; CHECK-LABEL: big_offset_one_use:
|
||||
; CHECK: # %bb.0: # %entry
|
||||
; CHECK-NEXT: lui a0, 4
|
||||
; CHECK-NEXT: addi a0, a0, 188
|
||||
; CHECK-NEXT: lui a1, %hi(s)
|
||||
; CHECK-NEXT: addi a1, a1, %lo(s)
|
||||
; CHECK-NEXT: add a0, a1, a0
|
||||
; CHECK-NEXT: lui a0, %hi(s+16572)
|
||||
; CHECK-NEXT: addi a0, a0, %lo(s+16572)
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
ret i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 5)
|
||||
}
|
||||
|
||||
;TODO: Offset shouln't be separated in this case. We get shorter sequence if it
|
||||
; is merged in the LUI %hi and the ADDI %lo.
|
||||
define dso_local i32* @small_offset_one_use() local_unnamed_addr {
|
||||
; CHECK-LABEL: small_offset_one_use:
|
||||
; CHECK: # %bb.0: # %entry
|
||||
; CHECK-NEXT: lui a0, %hi(s)
|
||||
; CHECK-NEXT: addi a0, a0, %lo(s)
|
||||
; CHECK-NEXT: addi a0, a0, 160
|
||||
; CHECK-NEXT: lui a0, %hi(s+160)
|
||||
; CHECK-NEXT: addi a0, a0, %lo(s+160)
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
ret i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 1)
|
||||
}
|
||||
|
||||
; TODO: In this case we get a better sequence if the offset didn't get didn't
|
||||
; get merged back in %if.end and %if.then. The current peephole is not able to
|
||||
; detect the shared global address node across blocks.
|
||||
; Without the peephole we can generate:
|
||||
;# %bb.0: # %entry
|
||||
; lui a0, %hi(s)
|
||||
; addi a0, a0, %lo(s)
|
||||
; lw a1, 164(a0)
|
||||
; beqz a1, .LBB0_2
|
||||
;# %bb.1: # %if.end
|
||||
; addi a0, a0, 168
|
||||
; ret
|
||||
;.LBB0_2: # %if.then
|
||||
; addi a0, a0, 160
|
||||
; ret
|
||||
; Function Attrs: norecurse nounwind optsize readonly
|
||||
define dso_local i32* @control_flow_no_mem(i32 %n) local_unnamed_addr #1 {
|
||||
; CHECK-LABEL: control_flow_no_mem:
|
||||
; CHECK: # %bb.0: # %entry
|
||||
; CHECK-NEXT: lui a0, %hi(s)
|
||||
; CHECK-NEXT: addi a0, a0, %lo(s)
|
||||
; CHECK-NEXT: lw a0, 164(a0)
|
||||
; CHECK-NEXT: beqz a0, .LBB5_2
|
||||
; CHECK-NEXT: # %bb.1: # %if.end
|
||||
; CHECK-NEXT: lui a0, %hi(s+168)
|
||||
; CHECK-NEXT: addi a0, a0, %lo(s+168)
|
||||
; CHECK-NEXT: ret
|
||||
; CHECK-NEXT: .LBB5_2: # %if.then
|
||||
; CHECK-NEXT: lui a0, %hi(s+160)
|
||||
; CHECK-NEXT: addi a0, a0, %lo(s+160)
|
||||
; CHECK-NEXT: ret
|
||||
entry:
|
||||
%0 = load i32, i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 2), align 4
|
||||
%cmp = icmp eq i32 %0, 0
|
||||
br i1 %cmp, label %if.then, label %if.end
|
||||
if.then: ; preds = %entry
|
||||
ret i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 1)
|
||||
if.end: ; preds = %if.then, %entry
|
||||
ret i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 3)
|
||||
}
|
||||
|
||||
;TODO: Offset shouln't be separated in this case. We get shorter sequence if it
|
||||
; is merged in the LUI %hi and the ADDI %lo.
|
||||
; is merged in the LUI %hi and the ADDI %lo, the "ADDI" could be folded in the
|
||||
; immediate part of "lhu" genertating the sequence:
|
||||
; lui a0, %hi(foo +8)
|
||||
; lhu a0, %lo(foo+8)(a0)
|
||||
; instead of:
|
||||
; lui a0, %hi(foo)
|
||||
; addi a0, a0, %lo(foo)
|
||||
; lhu a0, 8(a0)
|
||||
|
||||
define dso_local i32 @load_half() nounwind {
|
||||
; CHECK-LABEL: load_half:
|
||||
; CHECK: # %bb.0: # %entry
|
||||
@ -87,13 +144,13 @@ define dso_local i32 @load_half() nounwind {
|
||||
; CHECK-NEXT: addi a0, a0, %lo(foo)
|
||||
; CHECK-NEXT: lhu a0, 8(a0)
|
||||
; CHECK-NEXT: addi a1, zero, 140
|
||||
; CHECK-NEXT: bne a0, a1, .LBB4_2
|
||||
; CHECK-NEXT: bne a0, a1, .LBB6_2
|
||||
; CHECK-NEXT: # %bb.1: # %if.end
|
||||
; CHECK-NEXT: mv a0, zero
|
||||
; CHECK-NEXT: lw ra, 12(sp)
|
||||
; CHECK-NEXT: addi sp, sp, 16
|
||||
; CHECK-NEXT: ret
|
||||
; CHECK-NEXT: .LBB4_2: # %if.then
|
||||
; CHECK-NEXT: .LBB6_2: # %if.then
|
||||
; CHECK-NEXT: call abort
|
||||
entry:
|
||||
%0 = load i16, i16* getelementptr inbounds ([6 x i16], [6 x i16]* @foo, i32 0, i32 4), align 2
|
||||
|
Loading…
Reference in New Issue
Block a user