PPCDAGToDAGISel::PostprocessISelDAG()

This patch implements the PPCDAGToDAGISel::PostprocessISelDAG virtual
method to perform post-selection peephole optimizations on the DAG
representation.

One optimization is implemented here:  folds to clean up complex
addressing expressions for thread-local storage and medium code
model.  It will also be useful for large code model sequences when
those are added later.  I originally thought about doing this on the
MI representation prior to register assignment, but it's difficult to
do effective global dead code elimination at that point.  DCE is
trivial on the DAG representation.

A typical example of a candidate code sequence in assembly:

   addis 3, 2, globalvar@toc@ha
   addi  3, 3, globalvar@toc@l
   lwz   5, 0(3)

When the final instruction is a load or store with an immediate offset
of zero, the offset from the add-immediate can replace the zero,
provided the relocation information is carried along:

   addis 3, 2, globalvar@toc@ha
   lwz   5, globalvar@toc@l(3)

Since the addi can in general have multiple uses, we need to only
delete the instruction when the last use is removed.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@175697 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Bill Schmidt 2013-02-21 00:38:25 +00:00
parent 5c43245bf4
commit 421021157e
8 changed files with 342 additions and 1 deletions

View File

@ -67,6 +67,8 @@ namespace {
return true;
}
virtual void PostprocessISelDAG();
/// getI32Imm - Return a target constant with the specified value, of type
/// i32.
inline SDValue getI32Imm(unsigned Imm) {
@ -1398,6 +1400,159 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
return SelectCode(N);
}
/// PostProcessISelDAG - Perform some late peephole optimizations
/// on the DAG representation.
void PPCDAGToDAGISel::PostprocessISelDAG() {
// Skip peepholes at -O0.
if (TM.getOptLevel() == CodeGenOpt::None)
return;
// These optimizations are currently supported only for 64-bit SVR4.
if (PPCSubTarget.isDarwin() || !PPCSubTarget.isPPC64())
return;
SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
++Position;
while (Position != CurDAG->allnodes_begin()) {
SDNode *N = --Position;
// Skip dead nodes and any non-machine opcodes.
if (N->use_empty() || !N->isMachineOpcode())
continue;
unsigned FirstOp;
unsigned StorageOpcode = N->getMachineOpcode();
switch (StorageOpcode) {
default: continue;
case PPC::LBZ:
case PPC::LBZ8:
case PPC::LD:
case PPC::LFD:
case PPC::LFS:
case PPC::LHA:
case PPC::LHA8:
case PPC::LHZ:
case PPC::LHZ8:
case PPC::LWA:
case PPC::LWZ:
case PPC::LWZ8:
FirstOp = 0;
break;
case PPC::STB:
case PPC::STB8:
case PPC::STD:
case PPC::STFD:
case PPC::STFS:
case PPC::STH:
case PPC::STH8:
case PPC::STW:
case PPC::STW8:
FirstOp = 1;
break;
}
// If this is a load or store with a zero offset, we may be able to
// fold an add-immediate into the memory operation.
if (!isa<ConstantSDNode>(N->getOperand(FirstOp)) ||
N->getConstantOperandVal(FirstOp) != 0)
continue;
SDValue Base = N->getOperand(FirstOp + 1);
if (!Base.isMachineOpcode())
continue;
unsigned Flags = 0;
bool ReplaceFlags = true;
// When the feeding operation is an add-immediate of some sort,
// determine whether we need to add relocation information to the
// target flags on the immediate operand when we fold it into the
// load instruction.
//
// For something like ADDItocL, the relocation information is
// inferred from the opcode; when we process it in the AsmPrinter,
// we add the necessary relocation there. A load, though, can receive
// relocation from various flavors of ADDIxxx, so we need to carry
// the relocation information in the target flags.
switch (Base.getMachineOpcode()) {
default: continue;
case PPC::ADDI8:
case PPC::ADDI8L:
case PPC::ADDIL:
// In some cases (such as TLS) the relocation information
// is already in place on the operand, so copying the operand
// is sufficient.
ReplaceFlags = false;
// For these cases, the immediate may not be divisible by 4, in
// which case the fold is illegal for DS-form instructions. (The
// other cases provide aligned addresses and are always safe.)
if ((StorageOpcode == PPC::LWA ||
StorageOpcode == PPC::LD ||
StorageOpcode == PPC::STD) &&
(!isa<ConstantSDNode>(Base.getOperand(1)) ||
Base.getConstantOperandVal(1) % 4 != 0))
continue;
break;
case PPC::ADDIdtprelL:
Flags = PPCII::MO_DTPREL16_LO;
break;
case PPC::ADDItlsldL:
Flags = PPCII::MO_TLSLD16_LO;
break;
case PPC::ADDItocL:
Flags = PPCII::MO_TOC16_LO;
break;
}
// We found an opportunity. Reverse the operands from the add
// immediate and substitute them into the load or store. If
// needed, update the target flags for the immediate operand to
// reflect the necessary relocation information.
DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase: ");
DEBUG(Base->dump(CurDAG));
DEBUG(dbgs() << "\nN: ");
DEBUG(N->dump(CurDAG));
DEBUG(dbgs() << "\n");
SDValue ImmOpnd = Base.getOperand(1);
// If the relocation information isn't already present on the
// immediate operand, add it now.
if (ReplaceFlags) {
GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd);
if (GA) {
DebugLoc dl = GA->getDebugLoc();
const GlobalValue *GV = GA->getGlobal();
ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags);
} else {
ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(ImmOpnd);
if (CP) {
const Constant *C = CP->getConstVal();
ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64,
CP->getAlignment(),
0, Flags);
}
}
}
if (FirstOp == 1) // Store
(void)CurDAG->UpdateNodeOperands(N, N->getOperand(0), ImmOpnd,
Base.getOperand(0), N->getOperand(3));
else // Load
(void)CurDAG->UpdateNodeOperands(N, ImmOpnd, Base.getOperand(0),
N->getOperand(2));
// The add-immediate may now be dead, in which case remove it.
if (Base.getNode()->use_empty())
CurDAG->RemoveDeadNode(Base.getNode());
}
}
/// createPPCISelDag - This pass converts a legalized DAG into a

View File

@ -0,0 +1,25 @@
; RUN: llc -mcpu=pwr7 -O1 -code-model=medium <%s | FileCheck %s
; Test peephole optimization for medium code model (32-bit TOC offsets)
; for loading and storing a static variable scoped to a function.
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@test_fn_static.si = internal global i32 0, align 4
define signext i32 @test_fn_static() nounwind {
entry:
%0 = load i32* @test_fn_static.si, align 4
%inc = add nsw i32 %0, 1
store i32 %inc, i32* @test_fn_static.si, align 4
ret i32 %0
}
; CHECK: test_fn_static:
; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
; CHECK: lwz {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
; CHECK: stw {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
; CHECK: .type [[VAR]],@object
; CHECK: .local [[VAR]]
; CHECK: .comm [[VAR]],4,4

View File

@ -0,0 +1,27 @@
; RUN: llc -mcpu=pwr7 -O1 -code-model=medium <%s | FileCheck %s
; Test peephole optimization for medium code model (32-bit TOC offsets)
; for loading and storing a file-scope static variable.
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@gi = global i32 5, align 4
define signext i32 @test_file_static() nounwind {
entry:
%0 = load i32* @gi, align 4
%inc = add nsw i32 %0, 1
store i32 %inc, i32* @gi, align 4
ret i32 %0
}
; CHECK: test_file_static:
; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR:[a-z0-9A-Z_.]+]]@toc@ha
; CHECK: lwz {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
; CHECK: stw {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])
; CHECK: .type [[VAR]],@object
; CHECK: .data
; CHECK: .globl [[VAR]]
; CHECK: [[VAR]]:
; CHECK: .long 5

View File

@ -0,0 +1,18 @@
; RUN: llc -mcpu=pwr7 -O1 -code-model=medium <%s | FileCheck %s
; Test peephole optimization for medium code model (32-bit TOC offsets)
; for loading a value from the constant pool (TOC-relative).
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
define double @test_double_const() nounwind {
entry:
ret double 0x3F4FD4920B498CF0
}
; CHECK: [[VAR:[a-z0-9A-Z_.]+]]:
; CHECK: .quad 4562098671269285104
; CHECK: test_double_const:
; CHECK: addis [[REG1:[0-9]+]], 2, [[VAR]]@toc@ha
; CHECK: lfd {{[0-9]+}}, [[VAR]]@toc@l([[REG1]])

View File

@ -0,0 +1,77 @@
; RUN: llc -O1 -mcpu=pwr7 -code-model=medium -filetype=obj %s -o - | \
; RUN: elf-dump --dump-section-data | FileCheck %s
; FIXME: When asm-parse is available, could make this an assembly test.
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@test_fn_static.si = internal global i32 0, align 4
define signext i32 @test_fn_static() nounwind {
entry:
%0 = load i32* @test_fn_static.si, align 4
%inc = add nsw i32 %0, 1
store i32 %inc, i32* @test_fn_static.si, align 4
ret i32 %0
}
; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO for
; accessing function-scoped variable si.
;
; CHECK: Relocation 0
; CHECK-NEXT: 'r_offset'
; CHECK-NEXT: 'r_sym', 0x[[SYM2:[0-9]+]]
; CHECK-NEXT: 'r_type', 0x00000032
; CHECK: Relocation 1
; CHECK-NEXT: 'r_offset'
; CHECK-NEXT: 'r_sym', 0x[[SYM2]]
; CHECK-NEXT: 'r_type', 0x00000030
; CHECK: Relocation 2
; CHECK-NEXT: 'r_offset'
; CHECK-NEXT: 'r_sym', 0x[[SYM2]]
; CHECK-NEXT: 'r_type', 0x00000030
@gi = global i32 5, align 4
define signext i32 @test_file_static() nounwind {
entry:
%0 = load i32* @gi, align 4
%inc = add nsw i32 %0, 1
store i32 %inc, i32* @gi, align 4
ret i32 %0
}
; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO for
; accessing file-scope variable gi.
;
; CHECK: Relocation 3
; CHECK-NEXT: 'r_offset'
; CHECK-NEXT: 'r_sym', 0x[[SYM3:[0-9]+]]
; CHECK-NEXT: 'r_type', 0x00000032
; CHECK: Relocation 4
; CHECK-NEXT: 'r_offset'
; CHECK-NEXT: 'r_sym', 0x[[SYM3]]
; CHECK-NEXT: 'r_type', 0x00000030
; CHECK: Relocation 5
; CHECK-NEXT: 'r_offset'
; CHECK-NEXT: 'r_sym', 0x[[SYM3]]
; CHECK-NEXT: 'r_type', 0x00000030
define double @test_double_const() nounwind {
entry:
ret double 0x3F4FD4920B498CF0
}
; Verify generation of R_PPC64_TOC16_HA and R_PPC64_TOC16_LO for
; accessing a constant.
;
; CHECK: Relocation 6
; CHECK-NEXT: 'r_offset'
; CHECK-NEXT: 'r_sym', 0x[[SYM4:[0-9]+]]
; CHECK-NEXT: 'r_type', 0x00000032
; CHECK: Relocation 7
; CHECK-NEXT: 'r_offset'
; CHECK-NEXT: 'r_sym', 0x[[SYM4]]
; CHECK-NEXT: 'r_type', 0x00000030

View File

@ -0,0 +1,15 @@
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
target triple = "powerpc64-unknown-freebsd10.0"
; RUN: llc -O1 < %s -march=ppc64 | FileCheck %s
@a = thread_local global i32 0, align 4
;CHECK: localexec:
define i32 @localexec() nounwind {
entry:
;CHECK: addis [[REG1:[0-9]+]], 13, a@tprel@ha
;CHECK-NEXT: li [[REG2:[0-9]+]], 42
;CHECK-NEXT: stw [[REG2]], a@tprel@l([[REG1]])
store i32 42, i32* @a, align 4
ret i32 0
}

View File

@ -0,0 +1,24 @@
; RUN: llc -mcpu=pwr7 -O1 -relocation-model=pic < %s | FileCheck %s
; Test peephole optimization for thread-local storage using the
; local dynamic model.
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@a = hidden thread_local global i32 0, align 4
define signext i32 @main() nounwind {
entry:
%retval = alloca i32, align 4
store i32 0, i32* %retval
%0 = load i32* @a, align 4
ret i32 %0
}
; CHECK: addis [[REG:[0-9]+]], 2, a@got@tlsld@ha
; CHECK-NEXT: addi 3, [[REG]], a@got@tlsld@l
; CHECK-NEXT: bl __tls_get_addr(a@tlsld)
; CHECK-NEXT: nop
; CHECK-NEXT: addis [[REG2:[0-9]+]], 3, a@dtprel@ha
; CHECK-NEXT: lwa {{[0-9]+}}, a@dtprel@l([[REG2]])

View File

@ -1,6 +1,6 @@
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
target triple = "powerpc64-unknown-freebsd10.0"
; RUN: llc < %s -march=ppc64 | FileCheck %s
; RUN: llc -O0 < %s -march=ppc64 | FileCheck %s
@a = thread_local global i32 0, align 4