[PowerPC] Add a DAGToDAG peephole to remove unnecessary zero-exts

On PPC64, we end up with lots of i32 -> i64 zero extensions, not only from all
of the usual places, but also from the ABI, which specifies that values passed
are zero extended. Almost all 32-bit PPC instructions in PPC64 mode are defined
to do *something* to the higher-order bits, and for some instructions, that
action clears those bits (thus providing a zero-extended result). This is
especially common after rotate-and-mask instructions. Adding an additional
instruction to zero-extend the results of these instructions is unnecessary.

This PPCISelDAGToDAG peephole optimization examines these zero-extensions, and
looks back through their operands to see if all instructions will implicitly
zero extend their results. If so, we convert these instructions to their 64-bit
variants (which is an internal change only, the actual encoding of these
instructions is the same as the original 32-bit ones) and remove the
unnecessary zero-extension (changing where the INSERT_SUBREG instructions are
to make everything internally consistent).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224169 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Hal Finkel 2014-12-12 23:59:36 +00:00
parent 3b7e6d27d2
commit 4e703f82f2
4 changed files with 342 additions and 5 deletions

View File

@ -205,6 +205,7 @@ private:
SDNode *SelectSETCC(SDNode *N);
void PeepholePPC64();
void PeepholePPC64ZExt();
void PeepholeCROps();
bool AllUsersSelectZero(SDNode *N);
@ -1628,6 +1629,7 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
PeepholePPC64();
PeepholeCROps();
PeepholePPC64ZExt();
}
// Check if all users of this node will become isel where the second operand
@ -2101,6 +2103,299 @@ void PPCDAGToDAGISel::PeepholeCROps() {
} while (IsModified);
}
// Gather the set of 32-bit operations that are known to have their
// higher-order 32 bits zero, where ToPromote contains all such operations.
static bool PeepholePPC64ZExtGather(SDValue Op32,
SmallPtrSetImpl<SDNode *> &ToPromote) {
if (!Op32.isMachineOpcode())
return false;
// First, check for the "frontier" instructions (those that will clear the
// higher-order 32 bits.
// For RLWINM and RLWNM, we need to make sure that the mask does not wrap
// around. If it does not, then these instructions will clear the
// higher-order bits.
if ((Op32.getMachineOpcode() == PPC::RLWINM ||
Op32.getMachineOpcode() == PPC::RLWNM) &&
Op32.getConstantOperandVal(2) <= Op32.getConstantOperandVal(3)) {
ToPromote.insert(Op32.getNode());
return true;
}
// SLW and SRW always clear the higher-order bits.
if (Op32.getMachineOpcode() == PPC::SLW ||
Op32.getMachineOpcode() == PPC::SRW) {
ToPromote.insert(Op32.getNode());
return true;
}
// For LI and LIS, we need the immediate to be positive (so that it is not
// sign extended).
if (Op32.getMachineOpcode() == PPC::LI ||
Op32.getMachineOpcode() == PPC::LIS) {
if (!isUInt<15>(Op32.getConstantOperandVal(0)))
return false;
ToPromote.insert(Op32.getNode());
return true;
}
// Next, check for those instructions we can look through.
// Assuming the mask does not wrap around, then the higher-order bits are
// taken directly from the first operand.
if (Op32.getMachineOpcode() == PPC::RLWIMI &&
Op32.getConstantOperandVal(3) <= Op32.getConstantOperandVal(4)) {
SmallPtrSet<SDNode *, 16> ToPromote1;
if (!PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1))
return false;
ToPromote.insert(Op32.getNode());
ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
return true;
}
// For OR, the higher-order bits are zero if that is true for both operands.
// For SELECT_I4, the same is true (but the relevant operand numbers are
// shifted by 1).
if (Op32.getMachineOpcode() == PPC::OR ||
Op32.getMachineOpcode() == PPC::SELECT_I4) {
unsigned B = Op32.getMachineOpcode() == PPC::SELECT_I4 ? 1 : 0;
SmallPtrSet<SDNode *, 16> ToPromote1;
if (!PeepholePPC64ZExtGather(Op32.getOperand(B+0), ToPromote1))
return false;
if (!PeepholePPC64ZExtGather(Op32.getOperand(B+1), ToPromote1))
return false;
ToPromote.insert(Op32.getNode());
ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
return true;
}
// For ORI and ORIS, we need the higher-order bits of the first operand to be
// zero, and also for the constant to be positive (so that it is not sign
// extended).
if (Op32.getMachineOpcode() == PPC::ORI ||
Op32.getMachineOpcode() == PPC::ORIS) {
SmallPtrSet<SDNode *, 16> ToPromote1;
if (!PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1))
return false;
if (!isUInt<15>(Op32.getConstantOperandVal(1)))
return false;
ToPromote.insert(Op32.getNode());
ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
return true;
}
// The higher-order bits of AND are zero if that is true for at least one of
// the operands.
if (Op32.getMachineOpcode() == PPC::AND) {
SmallPtrSet<SDNode *, 16> ToPromote1, ToPromote2;
bool Op0OK =
PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1);
bool Op1OK =
PeepholePPC64ZExtGather(Op32.getOperand(1), ToPromote2);
if (!Op0OK && !Op1OK)
return false;
ToPromote.insert(Op32.getNode());
if (Op0OK)
ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
if (Op1OK)
ToPromote.insert(ToPromote2.begin(), ToPromote2.end());
return true;
}
// For ANDI and ANDIS, the higher-order bits are zero if either that is true
// of the first operand, or if the second operand is positive (so that it is
// not sign extended).
if (Op32.getMachineOpcode() == PPC::ANDIo ||
Op32.getMachineOpcode() == PPC::ANDISo) {
SmallPtrSet<SDNode *, 16> ToPromote1;
bool Op0OK =
PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1);
bool Op1OK = isUInt<15>(Op32.getConstantOperandVal(1));
if (!Op0OK && !Op1OK)
return false;
ToPromote.insert(Op32.getNode());
if (Op0OK)
ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
return true;
}
return false;
}
void PPCDAGToDAGISel::PeepholePPC64ZExt() {
if (!PPCSubTarget->isPPC64())
return;
// When we zero-extend from i32 to i64, we use a pattern like this:
// def : Pat<(i64 (zext i32:$in)),
// (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32),
// 0, 32)>;
// There are several 32-bit shift/rotate instructions, however, that will
// clear the higher-order bits of their output, rendering the RLDICL
// unnecessary. When that happens, we remove it here, and redefine the
// relevant 32-bit operation to be a 64-bit operation.
SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
++Position;
bool MadeChange = false;
while (Position != CurDAG->allnodes_begin()) {
SDNode *N = --Position;
// Skip dead nodes and any non-machine opcodes.
if (N->use_empty() || !N->isMachineOpcode())
continue;
if (N->getMachineOpcode() != PPC::RLDICL)
continue;
if (N->getConstantOperandVal(1) != 0 ||
N->getConstantOperandVal(2) != 32)
continue;
SDValue ISR = N->getOperand(0);
if (!ISR.isMachineOpcode() ||
ISR.getMachineOpcode() != TargetOpcode::INSERT_SUBREG)
continue;
if (!ISR.hasOneUse())
continue;
if (ISR.getConstantOperandVal(2) != PPC::sub_32)
continue;
SDValue IDef = ISR.getOperand(0);
if (!IDef.isMachineOpcode() ||
IDef.getMachineOpcode() != TargetOpcode::IMPLICIT_DEF)
continue;
// We now know that we're looking at a canonical i32 -> i64 zext. See if we
// can get rid of it.
SDValue Op32 = ISR->getOperand(1);
if (!Op32.isMachineOpcode())
continue;
// There are some 32-bit instructions that always clear the high-order 32
// bits, there are also some instructions (like AND) that we can look
// through.
SmallPtrSet<SDNode *, 16> ToPromote;
if (!PeepholePPC64ZExtGather(Op32, ToPromote))
continue;
// If the ToPromote set contains nodes that have uses outside of the set
// (except for the original INSERT_SUBREG), then abort the transformation.
bool OutsideUse = false;
for (SDNode *PN : ToPromote) {
for (SDNode *UN : PN->uses()) {
if (!ToPromote.count(UN) && UN != ISR.getNode()) {
OutsideUse = true;
break;
}
}
if (OutsideUse)
break;
}
if (OutsideUse)
continue;
MadeChange = true;
// We now know that this zero extension can be removed by promoting to
// nodes in ToPromote to 64-bit operations, where for operations in the
// frontier of the set, we need to insert INSERT_SUBREGs for their
// operands.
for (SDNode *PN : ToPromote) {
unsigned NewOpcode;
switch (PN->getMachineOpcode()) {
default:
llvm_unreachable("Don't know the 64-bit variant of this instruction");
case PPC::RLWINM: NewOpcode = PPC::RLWINM8; break;
case PPC::RLWNM: NewOpcode = PPC::RLWNM8; break;
case PPC::SLW: NewOpcode = PPC::SLW8; break;
case PPC::SRW: NewOpcode = PPC::SRW8; break;
case PPC::LI: NewOpcode = PPC::LI8; break;
case PPC::LIS: NewOpcode = PPC::LIS8; break;
case PPC::RLWIMI: NewOpcode = PPC::RLWIMI8; break;
case PPC::OR: NewOpcode = PPC::OR8; break;
case PPC::SELECT_I4: NewOpcode = PPC::SELECT_I8; break;
case PPC::ORI: NewOpcode = PPC::ORI8; break;
case PPC::ORIS: NewOpcode = PPC::ORIS8; break;
case PPC::AND: NewOpcode = PPC::AND8; break;
case PPC::ANDIo: NewOpcode = PPC::ANDIo8; break;
case PPC::ANDISo: NewOpcode = PPC::ANDISo8; break;
}
// Note: During the replacement process, the nodes will be in an
// inconsistent state (some instructions will have operands with values
// of the wrong type). Once done, however, everything should be right
// again.
SmallVector<SDValue, 4> Ops;
for (const SDValue &V : PN->ops()) {
if (!ToPromote.count(V.getNode()) && V.getValueType() == MVT::i32 &&
!isa<ConstantSDNode>(V)) {
SDValue ReplOpOps[] = { ISR.getOperand(0), V, ISR.getOperand(2) };
SDNode *ReplOp =
CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, SDLoc(V),
ISR.getNode()->getVTList(), ReplOpOps);
Ops.push_back(SDValue(ReplOp, 0));
} else {
Ops.push_back(V);
}
}
// Because all to-be-promoted nodes only have users that are other
// promoted nodes (or the original INSERT_SUBREG), we can safely replace
// the i32 result value type with i64.
SmallVector<EVT, 2> NewVTs;
SDVTList VTs = PN->getVTList();
for (unsigned i = 0, ie = VTs.NumVTs; i != ie; ++i)
if (VTs.VTs[i] == MVT::i32)
NewVTs.push_back(MVT::i64);
else
NewVTs.push_back(VTs.VTs[i]);
DEBUG(dbgs() << "PPC64 ZExt Peephole morphing:\nOld: ");
DEBUG(PN->dump(CurDAG));
CurDAG->SelectNodeTo(PN, NewOpcode, CurDAG->getVTList(NewVTs), Ops);
DEBUG(dbgs() << "\nNew: ");
DEBUG(PN->dump(CurDAG));
DEBUG(dbgs() << "\n");
}
// Now we replace the original zero extend and its associated INSERT_SUBREG
// with the value feeding the INSERT_SUBREG (which has now been promoted to
// return an i64).
DEBUG(dbgs() << "PPC64 ZExt Peephole replacing:\nOld: ");
DEBUG(N->dump(CurDAG));
DEBUG(dbgs() << "\nNew: ");
DEBUG(Op32.getNode()->dump(CurDAG));
DEBUG(dbgs() << "\n");
ReplaceUses(N, Op32.getNode());
}
if (MadeChange)
CurDAG->RemoveDeadNodes();
}
void PPCDAGToDAGISel::PeepholePPC64() {
// These optimizations are currently supported only for 64-bit SVR4.
if (PPCSubTarget->isDarwin() || !PPCSubTarget->isPPC64())

View File

@ -547,6 +547,11 @@ defm EXTSB8 : XForm_11r<31, 954, (outs g8rc:$rA), (ins g8rc:$rS),
defm EXTSH8 : XForm_11r<31, 922, (outs g8rc:$rA), (ins g8rc:$rS),
"extsh", "$rA, $rS", IIC_IntSimple,
[(set i64:$rA, (sext_inreg i64:$rS, i16))]>;
defm SLW8 : XForm_6r<31, 24, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
"slw", "$rA, $rS, $rB", IIC_IntGeneral, []>;
defm SRW8 : XForm_6r<31, 536, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
"srw", "$rA, $rS, $rB", IIC_IntGeneral, []>;
} // Interpretation64Bit
// For fast-isel:
@ -645,7 +650,11 @@ defm RLWINM8 : MForm_2r<21, (outs g8rc:$rA),
"rlwinm", "$rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
[]>;
let isCommutable = 1 in {
defm RLWNM8 : MForm_2r<23, (outs g8rc:$rA),
(ins g8rc:$rS, g8rc:$rB, u5imm:$MB, u5imm:$ME),
"rlwnm", "$rA, $rS, $rB, $MB, $ME", IIC_IntGeneral,
[]>;
// RLWIMI can be commuted if the rotate amount is zero.
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm RLWIMI8 : MForm_2r<20, (outs g8rc:$rA),
@ -653,7 +662,6 @@ defm RLWIMI8 : MForm_2r<20, (outs g8rc:$rA),
u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME",
IIC_IntRotate, []>, PPC970_DGroup_Cracked,
RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">;
}
let isSelect = 1 in
def ISEL8 : AForm_4<31, 15,

View File

@ -230,10 +230,12 @@ PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
// Normal instructions can be commuted the obvious way.
if (MI->getOpcode() != PPC::RLWIMI &&
MI->getOpcode() != PPC::RLWIMIo &&
MI->getOpcode() != PPC::RLWIMI8 &&
MI->getOpcode() != PPC::RLWIMI8o)
MI->getOpcode() != PPC::RLWIMIo)
return TargetInstrInfo::commuteInstruction(MI, NewMI);
// Note that RLWIMI can be commuted as a 32-bit instruction, but not as a
// 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because
// changing the relative order of the mask operands might change what happens
// to the high-bits of the mask (and, thus, the result).
// Cannot commute if it has a non-zero rotate count.
if (MI->getOperand(3).getImm() != 0)

View File

@ -0,0 +1,32 @@
; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
; Function Attrs: nounwind readnone
define signext i32 @foo(i32 signext %a) #0 {
entry:
%mul = mul nsw i32 %a, %a
%shr2 = lshr i32 %mul, 5
ret i32 %shr2
; CHECK-LABEL @foo
; CHECK-NOT: rldicl 3, {{[0-9]+}}, 0, 32
; CHECK: blr
}
define zeroext i32 @test6(i32 zeroext %x) #0 {
entry:
%and = lshr i32 %x, 16
%shr = and i32 %and, 255
%and1 = shl i32 %x, 16
%shl = and i32 %and1, 16711680
%or = or i32 %shr, %shl
ret i32 %or
; CHECK-LABEL @test6
; CHECK-NOT: rldicl 3, {{[0-9]+}}, 0, 32
; CHECK: blr
}
attributes #0 = { nounwind readnone }