[PowerPC] Add an MI SSA peephole pass.

This patch adds a pass for doing PowerPC peephole optimizations at the
MI level while the code is still in SSA form.  This allows for easy
modifications to the instructions while depending on a subsequent pass
of DCE.  Both passes are very fast due to the characteristics of SSA.

At this time, the only peepholes added are for cleaning up various
redundancies involving the XXPERMDI instruction.  However, I would
expect this will be a useful place to add more peepholes for
inefficiencies generated during instruction selection.  The pass is
placed after VSX swap optimization, as it is best to let that pass
remove unnecessary swaps before performing any remaining clean-ups.

The utility of these clean-ups is demonstrated by changes to four
existing test cases, all of which now have tighter expected code
generation.  I've also added Eric Schweitz's bugpoint-reduced test from
PR25157, for which we now generate tight code.  One other test started
failing for me, and I've fixed it
(test/Transforms/PlaceSafepoints/finite-loops.ll) as well; this is not
related to my changes, and I'm not sure why it works before and not
after.  The problem is that the CHECK-NOT: of "statepoint" from test1
fails because of the "statepoint" in test2, and so forth.  Adding a
CHECK-LABEL in between keeps the different occurrences of that string
properly scoped.

llvm-svn: 252651
This commit is contained in:
Bill Schmidt 2015-11-10 21:38:26 +00:00
parent 3a5e0423d9
commit 4bb7c62dcb
11 changed files with 322 additions and 28 deletions

View File

@ -41,6 +41,7 @@ namespace llvm {
FunctionPass *createPPCVSXCopyPass();
FunctionPass *createPPCVSXFMAMutatePass();
FunctionPass *createPPCVSXSwapRemovalPass();
FunctionPass *createPPCMIPeepholePass();
FunctionPass *createPPCBranchSelectionPass();
FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
FunctionPass *createPPCTLSDynamicCallPass();

View File

@ -0,0 +1,230 @@
//===-------------- PPCMIPeephole.cpp - MI Peephole Cleanups -------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===---------------------------------------------------------------------===//
//
// This pass performs peephole optimizations to clean up ugly code
// sequences at the MachineInstruction layer. It runs at the end of
// the SSA phases, following VSX swap removal. A pass of dead code
// elimination follows this one for quick clean-up of any dead
// instructions introduced here. Although we could do this as callbacks
// from the generic peephole pass, this would have a couple of bad
// effects: it might remove optimization opportunities for VSX swap
// removal, and it would miss cleanups made possible following VSX
// swap removal.
//
//===---------------------------------------------------------------------===//
#include "PPCInstrInfo.h"
#include "PPC.h"
#include "PPCInstrBuilder.h"
#include "PPCTargetMachine.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
#define DEBUG_TYPE "ppc-mi-peepholes"
namespace llvm {
// Forward declaration of the pass-initialization routine that the
// PPCMIPeephole constructor calls with the global PassRegistry.
// NOTE(review): the definition is presumably produced by the
// INITIALIZE_PASS_* macros at the end of this file -- confirm.
void initializePPCMIPeepholePass(PassRegistry&);
}
namespace {
// Machine-function pass that performs PowerPC-specific peephole
// clean-ups on machine instructions.  All work is done by
// simplifyCode(); runOnMachineFunction() only refreshes the cached
// per-function state first.
struct PPCMIPeephole : public MachineFunctionPass {
  // Pass identifier handed to the MachineFunctionPass base class.
  static char ID;

  // Per-function state, (re)assigned by initialize() at the start of
  // every run.  The pointers are null-initialized so they are never
  // indeterminate between construction and the first run.
  const PPCInstrInfo *TII = nullptr;
  MachineFunction *MF = nullptr;
  MachineRegisterInfo *MRI = nullptr;

  PPCMIPeephole() : MachineFunctionPass(ID) {
    initializePPCMIPeepholePass(*PassRegistry::getPassRegistry());
  }

private:
  // Initialize class variables (MF, MRI and TII) from MFParm.
  void initialize(MachineFunction &MFParm);

  // Perform peepholes.  Returns true if any instruction was changed.
  bool simplifyCode(void);

  // Find the "true" register represented by SrcReg (following chains
  // of copies and subreg_to_reg operations).
  unsigned lookThruCopyLike(unsigned SrcReg);

public:
  // Main entry point for this pass.  Returns whatever simplifyCode()
  // reports, i.e. whether the function was modified.
  bool runOnMachineFunction(MachineFunction &MF) override {
    initialize(MF);
    return simplifyCode();
  }
};
// Cache the function being processed together with its register info
// and the PowerPC instruction info, then (in debug output only) print
// the pass banner followed by a dump of the function.
void PPCMIPeephole::initialize(MachineFunction &MFParm) {
  TII = MFParm.getSubtarget<PPCSubtarget>().getInstrInfo();
  MRI = &MFParm.getRegInfo();
  MF = &MFParm;

  DEBUG({
    dbgs() << "*** PowerPC MI peephole pass ***\n\n";
    MF->dump();
  });
}
// Perform peephole optimizations.
//
// Visits every instruction of every basic block in MF and simplifies
// redundant XXPERMDI (2x64 swap/splat) sequences.  An instruction that
// has been replaced by a COPY is erased here; a feeding instruction
// that becomes unused as a result is left in place.
//
// Returns true if any instruction was rewritten or replaced.
bool PPCMIPeephole::simplifyCode(void) {
  bool Simplified = false;

  // Instruction replaced during the previous iteration.  Erasure is
  // deferred until the loop has advanced to the next instruction (or
  // has left the block), so MI is never erased while it is the current
  // loop element.
  MachineInstr *ToErase = nullptr;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {

      // If the previous instruction was marked for elimination,
      // remove it now.
      if (ToErase) {
        ToErase->eraseFromParent();
        ToErase = nullptr;
      }

      // Ignore debug instructions.
      if (MI.isDebugValue())
        continue;

      // Per-opcode peepholes.
      switch (MI.getOpcode()) {

      default:
        break;

      case PPC::XXPERMDI: {
        // Perform simplifications of 2x64 vector swaps and splats.
        // A swap is identified by an immediate value of 2, and a splat
        // is identified by an immediate value of 0 or 3.
        int Immed = MI.getOperand(3).getImm();

        if (Immed != 1) {

          // For each of these simplifications, we need the two source
          // regs to match.  Unfortunately, MachineCSE ignores COPY and
          // SUBREG_TO_REG, so for example we can see
          //   XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), immed.
          // We have to look through chains of COPY and SUBREG_TO_REG
          // to find the real source values for comparison.
          unsigned TrueReg1 = lookThruCopyLike(MI.getOperand(1).getReg());
          unsigned TrueReg2 = lookThruCopyLike(MI.getOperand(2).getReg());

          if (TrueReg1 == TrueReg2
              && TargetRegisterInfo::isVirtualRegister(TrueReg1)) {
            MachineInstr *DefMI = MRI->getVRegDef(TrueReg1);

            // Only an XXPERMDI feeding this XXPERMDI is of interest.
            if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) {
              unsigned FeedImmed = DefMI->getOperand(3).getImm();
              unsigned FeedReg1
                = lookThruCopyLike(DefMI->getOperand(1).getReg());
              unsigned FeedReg2
                = lookThruCopyLike(DefMI->getOperand(2).getReg());

              // If this is a splat or a swap fed by another splat, we
              // can replace it with a copy.
              if ((FeedImmed == 0 || FeedImmed == 3) && FeedReg1 == FeedReg2) {
                DEBUG(dbgs()
                      << "Optimizing splat/swap or splat/splat "
                      "to splat/copy: ");
                DEBUG(MI.dump());
                BuildMI(MBB, &MI, MI.getDebugLoc(),
                        TII->get(PPC::COPY), MI.getOperand(0).getReg())
                  .addOperand(MI.getOperand(1));
                ToErase = &MI;
                Simplified = true;
              }

              // If this is a splat fed by a swap, we can simply modify
              // the splat to splat the other value from the swap's input
              // parameter.
              else if ((Immed == 0 || Immed == 3)
                       && FeedImmed == 2 && FeedReg1 == FeedReg2) {
                DEBUG(dbgs() << "Optimizing swap/splat => splat: ");
                DEBUG(MI.dump());
                unsigned SwapSrc1 = DefMI->getOperand(1).getReg();
                unsigned SwapSrc2 = DefMI->getOperand(2).getReg();
                MI.getOperand(1).setReg(SwapSrc1);
                MI.getOperand(2).setReg(SwapSrc2);
                // 3 - Immed turns a splat immediate of 0 into 3 and
                // vice versa, selecting the other doubleword.
                MI.getOperand(3).setImm(3 - Immed);
                // The swap's inputs are now also read by MI, which is
                // a different point than the swap itself, so kill
                // flags on those registers may no longer be accurate.
                // Drop them conservatively.
                MRI->clearKillFlags(SwapSrc1);
                MRI->clearKillFlags(SwapSrc2);
                Simplified = true;
              }

              // If this is a swap fed by a swap, we can replace it
              // with a copy from the first swap's input.
              else if (Immed == 2 && FeedImmed == 2 && FeedReg1 == FeedReg2) {
                DEBUG(dbgs() << "Optimizing swap/swap => copy: ");
                DEBUG(MI.dump());
                BuildMI(MBB, &MI, MI.getDebugLoc(),
                        TII->get(PPC::COPY), MI.getOperand(0).getReg())
                  .addOperand(DefMI->getOperand(1));
                // The new COPY reads the first swap's input at a
                // different point than the swap (and inherits that
                // operand's flags), so conservatively drop kill flags
                // on the register.
                MRI->clearKillFlags(DefMI->getOperand(1).getReg());
                ToErase = &MI;
                Simplified = true;
              }
            }
          }
        }
        break;
      }
      }
    }

    // If the last instruction was marked for elimination,
    // remove it now.
    if (ToErase) {
      ToErase->eraseFromParent();
      ToErase = nullptr;
    }
  }

  return Simplified;
}
// This is used to find the "true" source register for an
// XXPERMDI instruction, since MachineCSE does not handle the
// "copy-like" operations (Copy and SubregToReg).  Returns
// the original SrcReg unless it is the target of a copy-like
// operation, in which case we chain backwards through all
// such operations to the ultimate source register.  If a
// physical register is encountered, we stop the search; this
// includes the case where SrcReg itself is not a virtual register.
// A register without a defining instruction is likewise returned
// as-is.
unsigned PPCMIPeephole::lookThruCopyLike(unsigned SrcReg) {
  // Only virtual registers are handed to getVRegDef().
  while (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
    MachineInstr *MI = MRI->getVRegDef(SrcReg);

    // No definition, or a definition that is not copy-like: SrcReg is
    // already the register we are looking for.
    if (!MI || !MI->isCopyLike())
      return SrcReg;

    unsigned CopySrcReg;
    if (MI->isCopy())
      CopySrcReg = MI->getOperand(1).getReg();
    else {
      // For SUBREG_TO_REG the source register is read from operand 2.
      assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike");
      CopySrcReg = MI->getOperand(2).getReg();
    }

    // Continue from the copy's source; the loop condition ends the
    // walk as soon as that source is not a virtual register.
    SrcReg = CopySrcReg;
  }

  return SrcReg;
}
} // end anonymous namespace
// Pass registration.  The pass is registered under the string given by
// DEBUG_TYPE ("ppc-mi-peepholes") with the description
// "PowerPC MI Peephole Optimization".  Nothing is listed between the
// BEGIN and END macros.
// NOTE(review): the two trailing `false` arguments are presumably the
// usual "CFG only" / "is analysis" flags -- confirm against PassSupport.h.
INITIALIZE_PASS_BEGIN(PPCMIPeephole, DEBUG_TYPE,
"PowerPC MI Peephole Optimization", false, false)
INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE,
"PowerPC MI Peephole Optimization", false, false)
// Definition of the static pass identifier declared in PPCMIPeephole.
char PPCMIPeephole::ID = 0;
// Factory for the PowerPC MI peephole pass: allocates a new
// PPCMIPeephole and returns it through the FunctionPass interface.
FunctionPass *llvm::createPPCMIPeepholePass() {
  FunctionPass *Pass = new PPCMIPeephole();
  return Pass;
}

View File

@ -42,6 +42,10 @@ static cl::
opt<bool> DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden,
cl::desc("Disable VSX Swap Removal for PPC"));
static cl::
opt<bool> DisableMIPeephole("disable-ppc-peephole", cl::Hidden,
cl::desc("Disable machine peepholes for PPC"));
static cl::opt<bool>
EnableGEPOpt("ppc-gep-opt", cl::Hidden,
cl::desc("Enable optimizations on complex GEPs"),
@ -348,6 +352,12 @@ void PPCPassConfig::addMachineSSAOptimization() {
if (TM->getTargetTriple().getArch() == Triple::ppc64le &&
!DisableVSXSwapRemoval)
addPass(createPPCVSXSwapRemovalPass());
// Target-specific peephole cleanups performed after instruction
// selection.
if (!DisableMIPeephole) {
addPass(createPPCMIPeepholePass());
addPass(&DeadMachineInstructionElimID);
}
}
void PPCPassConfig::addPreRegAlloc() {

View File

@ -63,7 +63,7 @@ entry:
ret <2 x i64> %splat.splat
; CHECK: mtvsrd {{[0-9]+}}, 3
; CHECK-LE: mtvsrd [[REG1:[0-9]+]], 3
; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]]
; CHECK-LE: xxspltd [[REG1]], [[REG1]], 0
}
; Function Attrs: nounwind

View File

@ -0,0 +1,61 @@
; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
; Verify peephole simplification of splats and swaps. Bugpoint-reduced
; test from Eric Schweitz.
%struct.BSS38.51.4488.9911.14348.16813.20264.24701.28152.31603.35054.39491.44914.45407.46393.46886.47872.49351.49844.50830.51323.52309.53295.53788.54281.55267.55760.59211.61625 = type <{ [28 x i8] }>
%struct_main1_2_.491.4928.10351.14788.17253.20704.25141.28592.32043.35494.39931.45354.45847.46833.47326.48312.49791.50284.51270.51763.52749.53735.54228.54721.55707.56200.59651.61626 = type <{ [64 x i8] }>
@.BSS38 = external global %struct.BSS38.51.4488.9911.14348.16813.20264.24701.28152.31603.35054.39491.44914.45407.46393.46886.47872.49351.49844.50830.51323.52309.53295.53788.54281.55267.55760.59211.61625, align 32
@_main1_2_ = external global %struct_main1_2_.491.4928.10351.14788.17253.20704.25141.28592.32043.35494.39931.45354.45847.46833.47326.48312.49791.50284.51270.51763.52749.53735.54228.54721.55707.56200.59651.61626, section ".comm", align 16
define void @aercalc_() {
L.entry:
br i1 undef, label %L.LB38_2426, label %L.LB38_2911
L.LB38_2911:
br i1 undef, label %L.LB38_2140, label %L.LB38_2640
L.LB38_2640:
unreachable
L.LB38_2426:
br i1 undef, label %L.LB38_2438, label %L.LB38_2920
L.LB38_2920:
br i1 undef, label %L.LB38_2438, label %L.LB38_2921
L.LB38_2921:
br label %L.LB38_2140
L.LB38_2140:
ret void
L.LB38_2438:
br i1 undef, label %L.LB38_2451, label %L.LB38_2935
L.LB38_2935:
br i1 undef, label %L.LB38_2451, label %L.LB38_2936
L.LB38_2936:
unreachable
L.LB38_2451:
br i1 undef, label %L.LB38_2452, label %L.LB38_2937
L.LB38_2937:
unreachable
L.LB38_2452:
%0 = load float, float* bitcast (i8* getelementptr inbounds (%struct.BSS38.51.4488.9911.14348.16813.20264.24701.28152.31603.35054.39491.44914.45407.46393.46886.47872.49351.49844.50830.51323.52309.53295.53788.54281.55267.55760.59211.61625, %struct.BSS38.51.4488.9911.14348.16813.20264.24701.28152.31603.35054.39491.44914.45407.46393.46886.47872.49351.49844.50830.51323.52309.53295.53788.54281.55267.55760.59211.61625* @.BSS38, i64 0, i32 0, i64 16) to float*), align 16
%1 = fpext float %0 to double
%2 = insertelement <2 x double> undef, double %1, i32 1
store <2 x double> %2, <2 x double>* bitcast (i8* getelementptr inbounds (%struct_main1_2_.491.4928.10351.14788.17253.20704.25141.28592.32043.35494.39931.45354.45847.46833.47326.48312.49791.50284.51270.51763.52749.53735.54228.54721.55707.56200.59651.61626, %struct_main1_2_.491.4928.10351.14788.17253.20704.25141.28592.32043.35494.39931.45354.45847.46833.47326.48312.49791.50284.51270.51763.52749.53735.54228.54721.55707.56200.59651.61626* @_main1_2_, i64 0, i32 0, i64 32) to <2 x double>*), align 16
unreachable
}
; CHECK-LABEL: @aercalc_
; CHECK: lxsspx
; CHECK: xxspltd
; CHECK: stxvd2x
; CHECK-NOT: xxswapd

View File

@ -15,11 +15,11 @@ entry:
}
; CHECK-LABEL: @bar0
; CHECK-DAG: xxswapd {{[0-9]+}}, 1
; CHECK-DAG: lxvd2x [[REG1:[0-9]+]]
; CHECK-DAG: xxspltd [[REG2:[0-9]+]]
; CHECK: xxpermdi [[REG3:[0-9]+]], [[REG2]], [[REG1]], 1
; CHECK: stxvd2x [[REG3]]
; CHECK-NOT: xxswapd
define void @bar1(double %y) {
entry:
@ -30,11 +30,11 @@ entry:
}
; CHECK-LABEL: @bar1
; CHECK-DAG: xxswapd {{[0-9]+}}, 1
; CHECK-DAG: lxvd2x [[REG1:[0-9]+]]
; CHECK-DAG: xxspltd [[REG2:[0-9]+]]
; CHECK: xxmrghd [[REG3:[0-9]+]], [[REG1]], [[REG2]]
; CHECK: stxvd2x [[REG3]]
; CHECK-NOT: xxswapd
define void @baz0() {
entry:

View File

@ -20,8 +20,7 @@ entry:
; CHECK-LABEL: @bar0
; CHECK-DAG: lxvd2x [[REG1:[0-9]+]]
; CHECK-DAG: lxsdx [[REG2:[0-9]+]]
; CHECK: xxswapd [[REG3:[0-9]+]], [[REG2]]
; CHECK: xxspltd [[REG4:[0-9]+]], [[REG3]], 1
; CHECK: xxspltd [[REG4:[0-9]+]], [[REG2]], 0
; CHECK: xxpermdi [[REG5:[0-9]+]], [[REG4]], [[REG1]], 1
; CHECK: stxvd2x [[REG5]]
@ -37,8 +36,7 @@ entry:
; CHECK-LABEL: @bar1
; CHECK-DAG: lxvd2x [[REG1:[0-9]+]]
; CHECK-DAG: lxsdx [[REG2:[0-9]+]]
; CHECK: xxswapd [[REG3:[0-9]+]], [[REG2]]
; CHECK: xxspltd [[REG4:[0-9]+]], [[REG3]], 1
; CHECK: xxspltd [[REG4:[0-9]+]], [[REG2]], 0
; CHECK: xxmrghd [[REG5:[0-9]+]], [[REG1]], [[REG4]]
; CHECK: stxvd2x [[REG5]]

View File

@ -1228,9 +1228,8 @@ define <2 x i32> @test80(i32 %v) {
; CHECK-LE-LABEL: @test80
; CHECK-LE-DAG: mtvsrd [[R1:[0-9]+]], 3
; CHECK-LE-DAG: addi [[R2:[0-9]+]], {{[0-9]+}}, .LCPI
; CHECK-LE-DAG: xxswapd [[V1:[0-9]+]], [[R1]]
; CHECK-LE-DAG: lxvd2x [[V2:[0-9]+]], 0, [[R2]]
; CHECK-LE-DAG: xxspltd 34, [[V1]]
; CHECK-LE-DAG: xxspltd 34, [[R1]]
; CHECK-LE-DAG: xxswapd 35, [[V2]]
; CHECK-LE: vaddudm 2, 2, 3
; CHECK-LE: blr

View File

@ -33,12 +33,8 @@ define double @teste0(<2 x double>* %p1) {
%r = extractelement <2 x double> %v, i32 0
ret double %r
; FIXME: Swap optimization will collapse this into lxvd2x 1, 0, 3.
; CHECK-LABEL: teste0
; CHECK: lxvd2x 0, 0, 3
; CHECK: xxswapd 0, 0
; CHECK: xxswapd 1, 0
; CHECK: lxvd2x 1, 0, 3
}
define double @teste1(<2 x double>* %p1) {

View File

@ -8,8 +8,7 @@ define <2 x double> @test00(<2 x double>* %p1, <2 x double>* %p2) {
; CHECK-LABEL: test00
; CHECK: lxvd2x 0, 0, 3
; CHECK: xxswapd 0, 0
; CHECK: xxspltd 34, 0, 1
; CHECK: xxspltd 34, 0, 0
}
define <2 x double> @test01(<2 x double>* %p1, <2 x double>* %p2) {
@ -58,9 +57,7 @@ define <2 x double> @test10(<2 x double>* %p1, <2 x double>* %p2) {
ret <2 x double> %v3
; CHECK-LABEL: @test10
; CHECK: lxvd2x 0, 0, 3
; CHECK: xxswapd 0, 0
; CHECK: xxswapd 34, 0
; CHECK: lxvd2x 34, 0, 3
}
define <2 x double> @test11(<2 x double>* %p1, <2 x double>* %p2) {
@ -71,8 +68,7 @@ define <2 x double> @test11(<2 x double>* %p1, <2 x double>* %p2) {
; CHECK-LABEL: @test11
; CHECK: lxvd2x 0, 0, 3
; CHECK: xxswapd 0, 0
; CHECK: xxspltd 34, 0, 0
; CHECK: xxspltd 34, 0, 1
}
define <2 x double> @test12(<2 x double>* %p1, <2 x double>* %p2) {
@ -139,8 +135,7 @@ define <2 x double> @test22(<2 x double>* %p1, <2 x double>* %p2) {
; CHECK-LABEL: @test22
; CHECK: lxvd2x 0, 0, 4
; CHECK: xxswapd 0, 0
; CHECK: xxspltd 34, 0, 1
; CHECK: xxspltd 34, 0, 0
}
define <2 x double> @test23(<2 x double>* %p1, <2 x double>* %p2) {
@ -189,9 +184,7 @@ define <2 x double> @test32(<2 x double>* %p1, <2 x double>* %p2) {
ret <2 x double> %v3
; CHECK-LABEL: @test32
; CHECK: lxvd2x 0, 0, 4
; CHECK: xxswapd 0, 0
; CHECK: xxswapd 34, 0
; CHECK: lxvd2x 34, 0, 4
}
define <2 x double> @test33(<2 x double>* %p1, <2 x double>* %p2) {
@ -202,6 +195,5 @@ define <2 x double> @test33(<2 x double>* %p1, <2 x double>* %p2) {
; CHECK-LABEL: @test33
; CHECK: lxvd2x 0, 0, 4
; CHECK: xxswapd 0, 0
; CHECK: xxspltd 34, 0, 0
; CHECK: xxspltd 34, 0, 1
}

View File

@ -11,6 +11,7 @@ define void @test1(i32) gc "statepoint-example" {
; CHECK: statepoint
; CHECK-LABEL: loop
; CHECK-NOT: statepoint
; CHECK-LABEL: exit
entry:
br label %loop
@ -32,6 +33,7 @@ define void @test2(i32) gc "statepoint-example" {
; CHECK: statepoint
; CHECK-LABEL: loop
; CHECK-NOT: statepoint
; CHECK-LABEL: exit
entry:
br label %loop
@ -56,6 +58,7 @@ define void @test3(i8 %upper) gc "statepoint-example" {
; CHECK: statepoint
; CHECK-LABEL: loop
; CHECK-NOT: statepoint
; CHECK-LABEL: exit
entry:
br label %loop
@ -77,12 +80,14 @@ define void @test4(i64 %upper) gc "statepoint-example" {
; CHECK: statepoint
; CHECK-LABEL: loop
; CHECK: statepoint
; CHECK-LABEL: exit
; COUNTED-64-LABEL: test4
; COUNTED-64-LABEL: entry
; COUNTED-64: statepoint
; COUNTED-64-LABEL: loop
; COUNTED-64-NOT: statepoint
; COUNTED-64-LABEL: exit
entry:
br label %loop
@ -105,12 +110,14 @@ define void @test5(i64 %upper) gc "statepoint-example" {
; CHECK: statepoint
; CHECK-LABEL: loop
; CHECK: statepoint
; CHECK-LABEL: exit
; COUNTED-64-LABEL: test5
; COUNTED-64-LABEL: entry
; COUNTED-64: statepoint
; COUNTED-64-LABEL: loop
; COUNTED-64: statepoint
; COUNTED-64-LABEL: exit
entry:
br label %loop