[RISCV] Add peepholes for Global Address lowering patterns

Summary:
  Base and offset are always separated when a GlobalAddress node is lowered
  (rL332641) as an optimization to reduce instruction count. However, this
  optimization is not profitable if the Global Address ends up being used in only
  instruction.

  This patch adds peephole optimizations that merge an offset of
  an address calculation into the LUI %%hi and ADD %lo of the lowering sequence.

  The peephole handles three patterns:

 1) ADDI (ADDI (LUI %hi(global)) %lo(global)), offset
     --->
      ADDI (LUI %hi(global + offset)) %lo(global + offset).

   This generates:
   lui a0, hi (global + offset)
   add a0, a0, lo (global + offset)

   Instead of

   lui a0, hi (global)
   addi a0, hi (global)
   addi a0, offset

   This pattern is for cases when the offset is small enough to fit in the
   immediate filed of ADDI (less than 12 bits).

 2) ADD ((ADDI (LUI %hi(global)) %lo(global)), (LUI hi_offset))
     --->
      offset = hi_offset << 12
      ADDI (LUI %hi(global + offset)) %lo(global + offset)

   Which generates the ASM:

   lui  a0, hi(global + offset)
   addi a0, lo(global + offset)

   Instead of:

   lui  a0, hi(global)
   addi a0, lo(global)
   lui a1, (offset)
   add a0, a0, a1

   This pattern is for cases when the offset doesn't fit in an immediate field
   of ADDI but the lower 12 bits are all zeros.

 3) ADD ((ADDI (LUI %hi(global)) %lo(global)), (ADDI lo_offset, (LUI hi_offset)))
     --->
        offset = global + offhi20<<12 + offlo12
        ADDI (LUI %hi(global + offset)) %lo(global + offset)

   Which generates the ASM:

   lui  a1, %hi(global + offset)
   addi a1, %lo(global + offset)

   Instead of:

   lui  a0, hi(global)
   addi a0, lo(global)
   lui a1, (offhi20)
   addi a1, (offlo12)
   add a0, a0, a1

   This pattern is for cases when the offset doesn't fit in an immediate field
   of ADDI and both the lower 1 bits and high 20 bits are non zero.

    Reviewers: asb

    Reviewed By: asb

    Subscribers: rbar, johnrusso, simoncook, jordy.potman.lists, apazos,
  niosHD, kito-cheng, shiva0217, zzheng, edward-jones, mgrang

llvm-svn: 333455
This commit is contained in:
Sameer AbuAsal 2018-05-29 19:34:54 +00:00
parent 6a4ca59d1b
commit d114bbf2bb
2 changed files with 282 additions and 17 deletions

View File

@ -56,12 +56,14 @@ public:
private:
void doPeepholeLoadStoreADDI();
void doPeepholeGlobalAddiLuiOffset();
void doPeepholeBuildPairF64SplitF64();
};
}
void RISCVDAGToDAGISel::PostprocessISelDAG() {
doPeepholeLoadStoreADDI();
doPeepholeGlobalAddiLuiOffset();
doPeepholeBuildPairF64SplitF64();
}
@ -128,6 +130,212 @@ bool RISCVDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) {
return false;
}
// Detect the pattern lui %hi(global) --> ADDI %lo(global)
// HiLUI LoADDI
static bool detectLuiAddiGlobal(SDNode *Tail, unsigned &Idx, SDValue &LoADDI,
SDValue &HiLUI, GlobalAddressSDNode *&GAlo,
GlobalAddressSDNode *&GAhi) {
// Try to detect the pattern on every operand of the tail instruction.
for (Idx = 0; Idx < Tail->getNumOperands(); Idx++) {
LoADDI = Tail->getOperand(Idx);
// LoADDI should only be used by one instruction (Tail).
if (!LoADDI->isMachineOpcode() ||
!(LoADDI->getMachineOpcode() == RISCV::ADDI) ||
!isa<GlobalAddressSDNode>(LoADDI->getOperand(1)) ||
!LoADDI->hasOneUse())
continue;
// Check for existence of %lo target flag.
GAlo = cast<GlobalAddressSDNode>(LoADDI->getOperand(1));
if (!(GAlo->getTargetFlags() == RISCVII::MO_LO) ||
!(GAlo->getOffset() == 0))
return false;
// Check for existence of %hi target flag.
HiLUI = LoADDI->getOperand(0);
if (!HiLUI->isMachineOpcode() ||
!(HiLUI->getMachineOpcode() == RISCV::LUI) ||
!isa<GlobalAddressSDNode>(HiLUI->getOperand(0)) || !HiLUI->hasOneUse())
return false;
GAhi = cast<GlobalAddressSDNode>(HiLUI->getOperand(0));
if (!(GAhi->getTargetFlags() == RISCVII::MO_HI) ||
!(GAhi->getOffset() == 0))
return false;
return true;
}
return false;
}
static bool matchLuiOffset(SDValue &OffsetLUI, int64_t &Offset) {
if (!OffsetLUI->isMachineOpcode() ||
!(OffsetLUI->getMachineOpcode() == RISCV::LUI) ||
!isa<ConstantSDNode>(OffsetLUI->getOperand(0)))
return false;
Offset = cast<ConstantSDNode>(OffsetLUI->getOperand(0))->getSExtValue();
Offset = Offset << 12;
LLVM_DEBUG(dbgs() << " Detected \" LUI Offset_hi\"\n");
return true;
}
static bool matchAddiLuiOffset(SDValue &OffsetLoADDI, int64_t &Offset) {
// LoADDI should only be used by the tail instruction only.
if (!OffsetLoADDI->isMachineOpcode() ||
!(OffsetLoADDI->getMachineOpcode() == RISCV::ADDI) ||
!isa<ConstantSDNode>(OffsetLoADDI->getOperand(1)) ||
!OffsetLoADDI->hasOneUse())
return false;
int64_t OffLo =
cast<ConstantSDNode>(OffsetLoADDI->getOperand(1))->getZExtValue();
// HiLUI should only be used by the loADDI.
SDValue OffsetHiLUI = (OffsetLoADDI->getOperand(0));
if (!OffsetHiLUI->isMachineOpcode() ||
!(OffsetHiLUI->getMachineOpcode() == RISCV::LUI) ||
!isa<ConstantSDNode>(OffsetHiLUI->getOperand(0)) ||
!OffsetHiLUI->hasOneUse())
return false;
int64_t OffHi =
cast<ConstantSDNode>(OffsetHiLUI->getOperand(0))->getSExtValue();
Offset = (OffHi << 12) + OffLo;
LLVM_DEBUG(dbgs() << " Detected \" ADDI (LUI Offset_hi), Offset_lo\"\n");
return true;
}
static void updateTailInstrUsers(SDNode *Tail, SelectionDAG *CurDAG,
GlobalAddressSDNode *GAhi,
GlobalAddressSDNode *GAlo,
SDValue &GlobalHiLUI, SDValue &GlobalLoADDI,
int64_t Offset) {
// Update the offset in GAhi and GAlo.
SDLoc DL(Tail->getOperand(1));
SDValue GAHiNew = CurDAG->getTargetGlobalAddress(GAhi->getGlobal(), DL,
GlobalHiLUI.getValueType(),
Offset, RISCVII::MO_HI);
SDValue GALoNew = CurDAG->getTargetGlobalAddress(GAlo->getGlobal(), DL,
GlobalLoADDI.getValueType(),
Offset, RISCVII::MO_LO);
CurDAG->UpdateNodeOperands(GlobalHiLUI.getNode(), GAHiNew);
CurDAG->UpdateNodeOperands(GlobalLoADDI.getNode(), GlobalHiLUI, GALoNew);
// Update all uses of the Tail with the GlobalLoADDI. After
// this Tail will be a dead node.
SDValue From = SDValue(Tail, 0);
CurDAG->ReplaceAllUsesOfValuesWith(&From, &GlobalLoADDI, 1);
}
// TODO: This transformation might be better implemeted in a Machine Funtion
// Pass as discussed here: https://reviews.llvm.org/D45748.
//
// Merge the offset of address calculation into the offset field
// of a global address node in a global address lowering sequence ("LUI
// %hi(global) --> add %lo(global)") under the following conditions: 1) The
// offset field in the global address lowering sequence is zero. 2) The lowered
// global address is only used in one node, referred to as "Tail".
// This peephole does the following transformations to merge the offset:
// 1) ADDI (ADDI (LUI %hi(global)) %lo(global)), offset
// --->
// ADDI (LUI %hi(global + offset)) %lo(global + offset).
//
// This generates:
// lui a0, hi (global + offset)
// add a0, a0, lo (global + offset)
// Instead of
// lui a0, hi (global)
// addi a0, hi (global)
// addi a0, offset
// This pattern is for cases when the offset is small enough to fit in the
// immediate filed of ADDI (less than 12 bits).
// 2) ADD ((ADDI (LUI %hi(global)) %lo(global)), (LUI hi_offset))
// --->
// offset = hi_offset << 12
// ADDI (LUI %hi(global + offset)) %lo(global + offset)
// Which generates the ASM:
// lui a0, hi(global + offset)
// addi a0, lo(global + offset)
// Instead of:
// lui a0, hi(global)
// addi a0, lo(global)
// lui a1, (offset)
// add a0, a0, a1
// This pattern is for cases when the offset doesn't fit in an immediate field
// of ADDI but the lower 12 bits are all zeros.
// 3) ADD ((ADDI (LUI %hi(global)) %lo(global)), (ADDI lo_offset, (LUI
// hi_offset)))
// --->
// ADDI (LUI %hi(global + offset)) %lo(global + offset)
// Which generates the ASM:
// lui a1, %hi(global + offhi20<<12 + offlo12)
// addi a1, %lo(global + offhi20<<12 + offlo12)
// Instead of:
// lui a0, hi(global)
// addi a0, lo(global)
// lui a1, (offhi20)
// addi a1, (offlo12)
// add a0, a0, a1
// This pattern is for cases when the offset doesn't fit in an immediate field
// of ADDI and both the lower 1 bits and high 20 bits are non zero.
void RISCVDAGToDAGISel::doPeepholeGlobalAddiLuiOffset() {
SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
++Position;
SelectionDAG::allnodes_iterator Begin(CurDAG->allnodes_begin());
while (Position != Begin) {
SDNode *Tail = &*--Position;
// Skip dead nodes and any non-machine opcodes.
if (Tail->use_empty() || !Tail->isMachineOpcode())
continue;
// The tail instruction can be an ADD or an ADDI.
if (!Tail->isMachineOpcode() || !(Tail->getMachineOpcode() == RISCV::ADD ||
Tail->getMachineOpcode() == RISCV::ADDI))
continue;
// First detect the global address part of pattern:
// (lui %hi(global) --> Addi %lo(global))
unsigned GlobalLoADDiIdx;
SDValue GlobalLoADDI;
SDValue GlobalHiLUI;
GlobalAddressSDNode *GAhi;
GlobalAddressSDNode *GAlo;
if (!detectLuiAddiGlobal(Tail, GlobalLoADDiIdx, GlobalLoADDI, GlobalHiLUI,
GAlo, GAhi))
continue;
LLVM_DEBUG(dbgs() << " Detected \"ADDI LUI %hi(global), %lo(global)\n");
// Detect the offset part for the address calculation by looking at the
// other operand of the tail instruction:
int64_t Offset;
if (Tail->getMachineOpcode() == RISCV::ADD) {
// If the Tail is an ADD instruction, the offset can be in two forms:
// 1) LUI hi_Offset followed by:
// ADDI lo_offset
// This happens in case the offset has non zero bits in
// both hi 20 and lo 12 bits.
// 2) LUI (offset20)
// This happens in case the lower 12 bits of the offset are zeros.
SDValue OffsetVal = Tail->getOperand(1 - GlobalLoADDiIdx);
if (!matchAddiLuiOffset(OffsetVal, Offset) &&
!matchLuiOffset(OffsetVal, Offset))
continue;
} else
// The Tail is an ADDI instruction:
Offset = cast<ConstantSDNode>(Tail->getOperand(1 - GlobalLoADDiIdx))
->getSExtValue();
LLVM_DEBUG(
dbgs()
<< " Fold offset value into global offset of LUI %hi and ADDI %lo\n");
LLVM_DEBUG(dbgs() << "\tTail:");
LLVM_DEBUG(Tail->dump(CurDAG));
LLVM_DEBUG(dbgs() << "\tGlobalHiLUI:");
LLVM_DEBUG(GlobalHiLUI->dump(CurDAG));
LLVM_DEBUG(dbgs() << "\tGlobalLoADDI:");
LLVM_DEBUG(GlobalLoADDI->dump(CurDAG));
LLVM_DEBUG(dbgs() << "\n");
updateTailInstrUsers(Tail, CurDAG, GAhi, GAlo, GlobalHiLUI, GlobalLoADDI,
Offset);
}
CurDAG->RemoveDeadNodes();
}
// Merge an ADDI into the offset of a load/store instruction where possible.
// (load (add base, off), 0) -> (load base, off)
// (store val, (add base, off)) -> (store val, base, off)

View File

@ -4,6 +4,8 @@
%struct.S = type { [40 x i32], i32, i32, i32, [4100 x i32], i32, i32, i32 }
@s = common dso_local global %struct.S zeroinitializer, align 4
@foo = global [6 x i16] [i16 1, i16 2, i16 3, i16 4, i16 5, i16 0], align 2
@g = global [1048576 x i8] zeroinitializer, align 1
define dso_local void @multiple_stores() local_unnamed_addr {
; CHECK-LABEL: multiple_stores:
@ -21,8 +23,8 @@ entry:
ret void
}
define dso_local void @control_flow() local_unnamed_addr #0 {
; CHECK-LABEL: control_flow:
define dso_local void @control_flow_with_mem_access() local_unnamed_addr #0 {
; CHECK-LABEL: control_flow_with_mem_access:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lui a0, %hi(s)
; CHECK-NEXT: addi a0, a0, %lo(s)
@ -47,37 +49,92 @@ if.end: ; preds = %if.then, %entry
ret void
}
;TODO: Offset shouln't be separated in this case. We get shorter sequence if it
; is merged in the LUI %hi and the ADDI %lo.
; This test checks for the case where the offset is only an LUI.
; without peephole this generates:
; lui a0, %hi(g)
; addi a0, a0, %lo(g)
; lui a1, 128 ---> offset
; add a0, a0, a1 ---> base + offset.
define i8* @big_offset_lui_tail() {
; CHECK-LABEL: big_offset_lui_tail:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(g+524288)
; CHECK-NEXT: addi a0, a0, %lo(g+524288)
; CHECK-NEXT: ret
ret i8* getelementptr inbounds ([1048576 x i8], [1048576 x i8]* @g, i32 0, i32 524288)
}
define dso_local i32* @big_offset_one_use() local_unnamed_addr {
; CHECK-LABEL: big_offset_one_use:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lui a0, 4
; CHECK-NEXT: addi a0, a0, 188
; CHECK-NEXT: lui a1, %hi(s)
; CHECK-NEXT: addi a1, a1, %lo(s)
; CHECK-NEXT: add a0, a1, a0
; CHECK-NEXT: lui a0, %hi(s+16572)
; CHECK-NEXT: addi a0, a0, %lo(s+16572)
; CHECK-NEXT: ret
entry:
ret i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 5)
}
;TODO: Offset shouln't be separated in this case. We get shorter sequence if it
; is merged in the LUI %hi and the ADDI %lo.
define dso_local i32* @small_offset_one_use() local_unnamed_addr {
; CHECK-LABEL: small_offset_one_use:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lui a0, %hi(s)
; CHECK-NEXT: addi a0, a0, %lo(s)
; CHECK-NEXT: addi a0, a0, 160
; CHECK-NEXT: lui a0, %hi(s+160)
; CHECK-NEXT: addi a0, a0, %lo(s+160)
; CHECK-NEXT: ret
entry:
ret i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 1)
}
; TODO: In this case we get a better sequence if the offset didn't get didn't
; get merged back in %if.end and %if.then. The current peephole is not able to
; detect the shared global address node across blocks.
; Without the peephole we can generate:
;# %bb.0: # %entry
; lui a0, %hi(s)
; addi a0, a0, %lo(s)
; lw a1, 164(a0)
; beqz a1, .LBB0_2
;# %bb.1: # %if.end
; addi a0, a0, 168
; ret
;.LBB0_2: # %if.then
; addi a0, a0, 160
; ret
; Function Attrs: norecurse nounwind optsize readonly
define dso_local i32* @control_flow_no_mem(i32 %n) local_unnamed_addr #1 {
; CHECK-LABEL: control_flow_no_mem:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lui a0, %hi(s)
; CHECK-NEXT: addi a0, a0, %lo(s)
; CHECK-NEXT: lw a0, 164(a0)
; CHECK-NEXT: beqz a0, .LBB5_2
; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: lui a0, %hi(s+168)
; CHECK-NEXT: addi a0, a0, %lo(s+168)
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB5_2: # %if.then
; CHECK-NEXT: lui a0, %hi(s+160)
; CHECK-NEXT: addi a0, a0, %lo(s+160)
; CHECK-NEXT: ret
entry:
%0 = load i32, i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 2), align 4
%cmp = icmp eq i32 %0, 0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
ret i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 1)
if.end: ; preds = %if.then, %entry
ret i32* getelementptr inbounds (%struct.S, %struct.S* @s, i32 0, i32 3)
}
;TODO: Offset shouln't be separated in this case. We get shorter sequence if it
; is merged in the LUI %hi and the ADDI %lo.
; is merged in the LUI %hi and the ADDI %lo, the "ADDI" could be folded in the
; immediate part of "lhu" genertating the sequence:
; lui a0, %hi(foo +8)
; lhu a0, %lo(foo+8)(a0)
; instead of:
; lui a0, %hi(foo)
; addi a0, a0, %lo(foo)
; lhu a0, 8(a0)
define dso_local i32 @load_half() nounwind {
; CHECK-LABEL: load_half:
; CHECK: # %bb.0: # %entry
@ -87,13 +144,13 @@ define dso_local i32 @load_half() nounwind {
; CHECK-NEXT: addi a0, a0, %lo(foo)
; CHECK-NEXT: lhu a0, 8(a0)
; CHECK-NEXT: addi a1, zero, 140
; CHECK-NEXT: bne a0, a1, .LBB4_2
; CHECK-NEXT: bne a0, a1, .LBB6_2
; CHECK-NEXT: # %bb.1: # %if.end
; CHECK-NEXT: mv a0, zero
; CHECK-NEXT: lw ra, 12(sp)
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
; CHECK-NEXT: .LBB4_2: # %if.then
; CHECK-NEXT: .LBB6_2: # %if.then
; CHECK-NEXT: call abort
entry:
%0 = load i16, i16* getelementptr inbounds ([6 x i16], [6 x i16]* @foo, i32 0, i32 4), align 2