mirror of
https://github.com/RPCS3/llvm.git
synced 2025-02-04 01:26:41 +00:00
[PowerPC] Add a DAGToDAG peephole to remove unnecessary zero-exts
On PPC64, we end up with lots of i32 -> i64 zero extensions, not only from all of the usual places, but also from the ABI, which specifies that values passed are zero extended. Almost all 32-bit PPC instructions in PPC64 mode are defined to do *something* to the higher-order bits, and for some instructions, that action clears those bits (thus providing a zero-extended result). This is especially common after rotate-and-mask instructions. Adding an additional instruction to zero-extend the results of these instructions is unnecessary. This PPCISelDAGToDAG peephole optimization examines these zero-extensions, and looks back through their operands to see if all instructions will implicitly zero extend their results. If so, we convert these instructions to their 64-bit variants (which is an internal change only, the actual encoding of these instructions is the same as the original 32-bit ones) and remove the unnecessary zero-extension (changing where the INSERT_SUBREG instructions are to make everything internally consistent). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@224169 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
3b7e6d27d2
commit
4e703f82f2
@ -205,6 +205,7 @@ private:
|
||||
SDNode *SelectSETCC(SDNode *N);
|
||||
|
||||
void PeepholePPC64();
|
||||
void PeepholePPC64ZExt();
|
||||
void PeepholeCROps();
|
||||
|
||||
bool AllUsersSelectZero(SDNode *N);
|
||||
@ -1628,6 +1629,7 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
|
||||
|
||||
PeepholePPC64();
|
||||
PeepholeCROps();
|
||||
PeepholePPC64ZExt();
|
||||
}
|
||||
|
||||
// Check if all users of this node will become isel where the second operand
|
||||
@ -2101,6 +2103,299 @@ void PPCDAGToDAGISel::PeepholeCROps() {
|
||||
} while (IsModified);
|
||||
}
|
||||
|
||||
// Gather the set of 32-bit operations that are known to have their
|
||||
// higher-order 32 bits zero, where ToPromote contains all such operations.
|
||||
static bool PeepholePPC64ZExtGather(SDValue Op32,
|
||||
SmallPtrSetImpl<SDNode *> &ToPromote) {
|
||||
if (!Op32.isMachineOpcode())
|
||||
return false;
|
||||
|
||||
// First, check for the "frontier" instructions (those that will clear the
|
||||
// higher-order 32 bits.
|
||||
|
||||
// For RLWINM and RLWNM, we need to make sure that the mask does not wrap
|
||||
// around. If it does not, then these instructions will clear the
|
||||
// higher-order bits.
|
||||
if ((Op32.getMachineOpcode() == PPC::RLWINM ||
|
||||
Op32.getMachineOpcode() == PPC::RLWNM) &&
|
||||
Op32.getConstantOperandVal(2) <= Op32.getConstantOperandVal(3)) {
|
||||
ToPromote.insert(Op32.getNode());
|
||||
return true;
|
||||
}
|
||||
|
||||
// SLW and SRW always clear the higher-order bits.
|
||||
if (Op32.getMachineOpcode() == PPC::SLW ||
|
||||
Op32.getMachineOpcode() == PPC::SRW) {
|
||||
ToPromote.insert(Op32.getNode());
|
||||
return true;
|
||||
}
|
||||
|
||||
// For LI and LIS, we need the immediate to be positive (so that it is not
|
||||
// sign extended).
|
||||
if (Op32.getMachineOpcode() == PPC::LI ||
|
||||
Op32.getMachineOpcode() == PPC::LIS) {
|
||||
if (!isUInt<15>(Op32.getConstantOperandVal(0)))
|
||||
return false;
|
||||
|
||||
ToPromote.insert(Op32.getNode());
|
||||
return true;
|
||||
}
|
||||
|
||||
// Next, check for those instructions we can look through.
|
||||
|
||||
// Assuming the mask does not wrap around, then the higher-order bits are
|
||||
// taken directly from the first operand.
|
||||
if (Op32.getMachineOpcode() == PPC::RLWIMI &&
|
||||
Op32.getConstantOperandVal(3) <= Op32.getConstantOperandVal(4)) {
|
||||
SmallPtrSet<SDNode *, 16> ToPromote1;
|
||||
if (!PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1))
|
||||
return false;
|
||||
|
||||
ToPromote.insert(Op32.getNode());
|
||||
ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
|
||||
return true;
|
||||
}
|
||||
|
||||
// For OR, the higher-order bits are zero if that is true for both operands.
|
||||
// For SELECT_I4, the same is true (but the relevant operand numbers are
|
||||
// shifted by 1).
|
||||
if (Op32.getMachineOpcode() == PPC::OR ||
|
||||
Op32.getMachineOpcode() == PPC::SELECT_I4) {
|
||||
unsigned B = Op32.getMachineOpcode() == PPC::SELECT_I4 ? 1 : 0;
|
||||
SmallPtrSet<SDNode *, 16> ToPromote1;
|
||||
if (!PeepholePPC64ZExtGather(Op32.getOperand(B+0), ToPromote1))
|
||||
return false;
|
||||
if (!PeepholePPC64ZExtGather(Op32.getOperand(B+1), ToPromote1))
|
||||
return false;
|
||||
|
||||
ToPromote.insert(Op32.getNode());
|
||||
ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
|
||||
return true;
|
||||
}
|
||||
|
||||
// For ORI and ORIS, we need the higher-order bits of the first operand to be
|
||||
// zero, and also for the constant to be positive (so that it is not sign
|
||||
// extended).
|
||||
if (Op32.getMachineOpcode() == PPC::ORI ||
|
||||
Op32.getMachineOpcode() == PPC::ORIS) {
|
||||
SmallPtrSet<SDNode *, 16> ToPromote1;
|
||||
if (!PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1))
|
||||
return false;
|
||||
if (!isUInt<15>(Op32.getConstantOperandVal(1)))
|
||||
return false;
|
||||
|
||||
ToPromote.insert(Op32.getNode());
|
||||
ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
|
||||
return true;
|
||||
}
|
||||
|
||||
// The higher-order bits of AND are zero if that is true for at least one of
|
||||
// the operands.
|
||||
if (Op32.getMachineOpcode() == PPC::AND) {
|
||||
SmallPtrSet<SDNode *, 16> ToPromote1, ToPromote2;
|
||||
bool Op0OK =
|
||||
PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1);
|
||||
bool Op1OK =
|
||||
PeepholePPC64ZExtGather(Op32.getOperand(1), ToPromote2);
|
||||
if (!Op0OK && !Op1OK)
|
||||
return false;
|
||||
|
||||
ToPromote.insert(Op32.getNode());
|
||||
|
||||
if (Op0OK)
|
||||
ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
|
||||
|
||||
if (Op1OK)
|
||||
ToPromote.insert(ToPromote2.begin(), ToPromote2.end());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// For ANDI and ANDIS, the higher-order bits are zero if either that is true
|
||||
// of the first operand, or if the second operand is positive (so that it is
|
||||
// not sign extended).
|
||||
if (Op32.getMachineOpcode() == PPC::ANDIo ||
|
||||
Op32.getMachineOpcode() == PPC::ANDISo) {
|
||||
SmallPtrSet<SDNode *, 16> ToPromote1;
|
||||
bool Op0OK =
|
||||
PeepholePPC64ZExtGather(Op32.getOperand(0), ToPromote1);
|
||||
bool Op1OK = isUInt<15>(Op32.getConstantOperandVal(1));
|
||||
if (!Op0OK && !Op1OK)
|
||||
return false;
|
||||
|
||||
ToPromote.insert(Op32.getNode());
|
||||
|
||||
if (Op0OK)
|
||||
ToPromote.insert(ToPromote1.begin(), ToPromote1.end());
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void PPCDAGToDAGISel::PeepholePPC64ZExt() {
|
||||
if (!PPCSubTarget->isPPC64())
|
||||
return;
|
||||
|
||||
// When we zero-extend from i32 to i64, we use a pattern like this:
|
||||
// def : Pat<(i64 (zext i32:$in)),
|
||||
// (RLDICL (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $in, sub_32),
|
||||
// 0, 32)>;
|
||||
// There are several 32-bit shift/rotate instructions, however, that will
|
||||
// clear the higher-order bits of their output, rendering the RLDICL
|
||||
// unnecessary. When that happens, we remove it here, and redefine the
|
||||
// relevant 32-bit operation to be a 64-bit operation.
|
||||
|
||||
SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
|
||||
++Position;
|
||||
|
||||
bool MadeChange = false;
|
||||
while (Position != CurDAG->allnodes_begin()) {
|
||||
SDNode *N = --Position;
|
||||
// Skip dead nodes and any non-machine opcodes.
|
||||
if (N->use_empty() || !N->isMachineOpcode())
|
||||
continue;
|
||||
|
||||
if (N->getMachineOpcode() != PPC::RLDICL)
|
||||
continue;
|
||||
|
||||
if (N->getConstantOperandVal(1) != 0 ||
|
||||
N->getConstantOperandVal(2) != 32)
|
||||
continue;
|
||||
|
||||
SDValue ISR = N->getOperand(0);
|
||||
if (!ISR.isMachineOpcode() ||
|
||||
ISR.getMachineOpcode() != TargetOpcode::INSERT_SUBREG)
|
||||
continue;
|
||||
|
||||
if (!ISR.hasOneUse())
|
||||
continue;
|
||||
|
||||
if (ISR.getConstantOperandVal(2) != PPC::sub_32)
|
||||
continue;
|
||||
|
||||
SDValue IDef = ISR.getOperand(0);
|
||||
if (!IDef.isMachineOpcode() ||
|
||||
IDef.getMachineOpcode() != TargetOpcode::IMPLICIT_DEF)
|
||||
continue;
|
||||
|
||||
// We now know that we're looking at a canonical i32 -> i64 zext. See if we
|
||||
// can get rid of it.
|
||||
|
||||
SDValue Op32 = ISR->getOperand(1);
|
||||
if (!Op32.isMachineOpcode())
|
||||
continue;
|
||||
|
||||
// There are some 32-bit instructions that always clear the high-order 32
|
||||
// bits, there are also some instructions (like AND) that we can look
|
||||
// through.
|
||||
SmallPtrSet<SDNode *, 16> ToPromote;
|
||||
if (!PeepholePPC64ZExtGather(Op32, ToPromote))
|
||||
continue;
|
||||
|
||||
// If the ToPromote set contains nodes that have uses outside of the set
|
||||
// (except for the original INSERT_SUBREG), then abort the transformation.
|
||||
bool OutsideUse = false;
|
||||
for (SDNode *PN : ToPromote) {
|
||||
for (SDNode *UN : PN->uses()) {
|
||||
if (!ToPromote.count(UN) && UN != ISR.getNode()) {
|
||||
OutsideUse = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (OutsideUse)
|
||||
break;
|
||||
}
|
||||
if (OutsideUse)
|
||||
continue;
|
||||
|
||||
MadeChange = true;
|
||||
|
||||
// We now know that this zero extension can be removed by promoting to
|
||||
// nodes in ToPromote to 64-bit operations, where for operations in the
|
||||
// frontier of the set, we need to insert INSERT_SUBREGs for their
|
||||
// operands.
|
||||
for (SDNode *PN : ToPromote) {
|
||||
unsigned NewOpcode;
|
||||
switch (PN->getMachineOpcode()) {
|
||||
default:
|
||||
llvm_unreachable("Don't know the 64-bit variant of this instruction");
|
||||
case PPC::RLWINM: NewOpcode = PPC::RLWINM8; break;
|
||||
case PPC::RLWNM: NewOpcode = PPC::RLWNM8; break;
|
||||
case PPC::SLW: NewOpcode = PPC::SLW8; break;
|
||||
case PPC::SRW: NewOpcode = PPC::SRW8; break;
|
||||
case PPC::LI: NewOpcode = PPC::LI8; break;
|
||||
case PPC::LIS: NewOpcode = PPC::LIS8; break;
|
||||
case PPC::RLWIMI: NewOpcode = PPC::RLWIMI8; break;
|
||||
case PPC::OR: NewOpcode = PPC::OR8; break;
|
||||
case PPC::SELECT_I4: NewOpcode = PPC::SELECT_I8; break;
|
||||
case PPC::ORI: NewOpcode = PPC::ORI8; break;
|
||||
case PPC::ORIS: NewOpcode = PPC::ORIS8; break;
|
||||
case PPC::AND: NewOpcode = PPC::AND8; break;
|
||||
case PPC::ANDIo: NewOpcode = PPC::ANDIo8; break;
|
||||
case PPC::ANDISo: NewOpcode = PPC::ANDISo8; break;
|
||||
}
|
||||
|
||||
// Note: During the replacement process, the nodes will be in an
|
||||
// inconsistent state (some instructions will have operands with values
|
||||
// of the wrong type). Once done, however, everything should be right
|
||||
// again.
|
||||
|
||||
SmallVector<SDValue, 4> Ops;
|
||||
for (const SDValue &V : PN->ops()) {
|
||||
if (!ToPromote.count(V.getNode()) && V.getValueType() == MVT::i32 &&
|
||||
!isa<ConstantSDNode>(V)) {
|
||||
SDValue ReplOpOps[] = { ISR.getOperand(0), V, ISR.getOperand(2) };
|
||||
SDNode *ReplOp =
|
||||
CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, SDLoc(V),
|
||||
ISR.getNode()->getVTList(), ReplOpOps);
|
||||
Ops.push_back(SDValue(ReplOp, 0));
|
||||
} else {
|
||||
Ops.push_back(V);
|
||||
}
|
||||
}
|
||||
|
||||
// Because all to-be-promoted nodes only have users that are other
|
||||
// promoted nodes (or the original INSERT_SUBREG), we can safely replace
|
||||
// the i32 result value type with i64.
|
||||
|
||||
SmallVector<EVT, 2> NewVTs;
|
||||
SDVTList VTs = PN->getVTList();
|
||||
for (unsigned i = 0, ie = VTs.NumVTs; i != ie; ++i)
|
||||
if (VTs.VTs[i] == MVT::i32)
|
||||
NewVTs.push_back(MVT::i64);
|
||||
else
|
||||
NewVTs.push_back(VTs.VTs[i]);
|
||||
|
||||
DEBUG(dbgs() << "PPC64 ZExt Peephole morphing:\nOld: ");
|
||||
DEBUG(PN->dump(CurDAG));
|
||||
|
||||
CurDAG->SelectNodeTo(PN, NewOpcode, CurDAG->getVTList(NewVTs), Ops);
|
||||
|
||||
DEBUG(dbgs() << "\nNew: ");
|
||||
DEBUG(PN->dump(CurDAG));
|
||||
DEBUG(dbgs() << "\n");
|
||||
}
|
||||
|
||||
// Now we replace the original zero extend and its associated INSERT_SUBREG
|
||||
// with the value feeding the INSERT_SUBREG (which has now been promoted to
|
||||
// return an i64).
|
||||
|
||||
DEBUG(dbgs() << "PPC64 ZExt Peephole replacing:\nOld: ");
|
||||
DEBUG(N->dump(CurDAG));
|
||||
DEBUG(dbgs() << "\nNew: ");
|
||||
DEBUG(Op32.getNode()->dump(CurDAG));
|
||||
DEBUG(dbgs() << "\n");
|
||||
|
||||
ReplaceUses(N, Op32.getNode());
|
||||
}
|
||||
|
||||
if (MadeChange)
|
||||
CurDAG->RemoveDeadNodes();
|
||||
}
|
||||
|
||||
void PPCDAGToDAGISel::PeepholePPC64() {
|
||||
// These optimizations are currently supported only for 64-bit SVR4.
|
||||
if (PPCSubTarget->isDarwin() || !PPCSubTarget->isPPC64())
|
||||
|
@ -547,6 +547,11 @@ defm EXTSB8 : XForm_11r<31, 954, (outs g8rc:$rA), (ins g8rc:$rS),
|
||||
defm EXTSH8 : XForm_11r<31, 922, (outs g8rc:$rA), (ins g8rc:$rS),
|
||||
"extsh", "$rA, $rS", IIC_IntSimple,
|
||||
[(set i64:$rA, (sext_inreg i64:$rS, i16))]>;
|
||||
|
||||
defm SLW8 : XForm_6r<31, 24, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
|
||||
"slw", "$rA, $rS, $rB", IIC_IntGeneral, []>;
|
||||
defm SRW8 : XForm_6r<31, 536, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
|
||||
"srw", "$rA, $rS, $rB", IIC_IntGeneral, []>;
|
||||
} // Interpretation64Bit
|
||||
|
||||
// For fast-isel:
|
||||
@ -645,7 +650,11 @@ defm RLWINM8 : MForm_2r<21, (outs g8rc:$rA),
|
||||
"rlwinm", "$rA, $rS, $SH, $MB, $ME", IIC_IntGeneral,
|
||||
[]>;
|
||||
|
||||
let isCommutable = 1 in {
|
||||
defm RLWNM8 : MForm_2r<23, (outs g8rc:$rA),
|
||||
(ins g8rc:$rS, g8rc:$rB, u5imm:$MB, u5imm:$ME),
|
||||
"rlwnm", "$rA, $rS, $rB, $MB, $ME", IIC_IntGeneral,
|
||||
[]>;
|
||||
|
||||
// RLWIMI can be commuted if the rotate amount is zero.
|
||||
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
|
||||
defm RLWIMI8 : MForm_2r<20, (outs g8rc:$rA),
|
||||
@ -653,7 +662,6 @@ defm RLWIMI8 : MForm_2r<20, (outs g8rc:$rA),
|
||||
u5imm:$ME), "rlwimi", "$rA, $rS, $SH, $MB, $ME",
|
||||
IIC_IntRotate, []>, PPC970_DGroup_Cracked,
|
||||
RegConstraint<"$rSi = $rA">, NoEncode<"$rSi">;
|
||||
}
|
||||
|
||||
let isSelect = 1 in
|
||||
def ISEL8 : AForm_4<31, 15,
|
||||
|
@ -230,10 +230,12 @@ PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
|
||||
|
||||
// Normal instructions can be commuted the obvious way.
|
||||
if (MI->getOpcode() != PPC::RLWIMI &&
|
||||
MI->getOpcode() != PPC::RLWIMIo &&
|
||||
MI->getOpcode() != PPC::RLWIMI8 &&
|
||||
MI->getOpcode() != PPC::RLWIMI8o)
|
||||
MI->getOpcode() != PPC::RLWIMIo)
|
||||
return TargetInstrInfo::commuteInstruction(MI, NewMI);
|
||||
// Note that RLWIMI can be commuted as a 32-bit instruction, but not as a
|
||||
// 64-bit instruction (so we don't handle PPC::RLWIMI8 here), because
|
||||
// changing the relative order of the mask operands might change what happens
|
||||
// to the high-bits of the mask (and, thus, the result).
|
||||
|
||||
// Cannot commute if it has a non-zero rotate count.
|
||||
if (MI->getOperand(3).getImm() != 0)
|
||||
|
32
test/CodeGen/PowerPC/rm-zext.ll
Normal file
32
test/CodeGen/PowerPC/rm-zext.ll
Normal file
@ -0,0 +1,32 @@
|
||||
; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
|
||||
target datalayout = "E-m:e-i64:64-n32:64"
|
||||
target triple = "powerpc64-unknown-linux-gnu"
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
define signext i32 @foo(i32 signext %a) #0 {
entry:
  %mul = mul nsw i32 %a, %a
  %shr2 = lshr i32 %mul, 5
  ret i32 %shr2

; The value returned is the result of a 32-bit logical shift right, so no
; separate "rldicl 3, N, 0, 32" zero extension should appear between this
; function's label and its return.
; CHECK-LABEL: @foo
; CHECK-NOT: rldicl 3, {{[0-9]+}}, 0, 32
; CHECK: blr
}
|
||||
|
||||
define zeroext i32 @test6(i32 zeroext %x) #0 {
entry:
  %and = lshr i32 %x, 16
  %shr = and i32 %and, 255
  %and1 = shl i32 %x, 16
  %shl = and i32 %and1, 16711680
  %or = or i32 %shr, %shl
  ret i32 %or

; The returned value is an OR of two shifted-and-masked 32-bit values, so no
; separate "rldicl 3, N, 0, 32" zero extension should appear between this
; function's label and its return.
; CHECK-LABEL: @test6
; CHECK-NOT: rldicl 3, {{[0-9]+}}, 0, 32
; CHECK: blr
}
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
|
Loading…
x
Reference in New Issue
Block a user