Added an optimization that narrows load / op / store sequences where 'op' is a bit-twiddling instruction ('or', 'xor', or 'and') whose second operand is an immediate. If the bits touched by 'op' can be handled by a narrower instruction, the width of the load and store is reduced as well. This happens a lot with bitfield manipulation code.

e.g. orl $65536, 8(%rax)  =>  orb $1, 10(%rax)

Since narrowing is not always a win (e.g. i32 -> i16 is a loss on x86), the dag combiner consults the target before performing the optimization.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@72507 91177308-0d34-0410-b5e6-96231b3b80d8
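As an illustration (a sketch, not part of the commit; the struct layout and names are made up to mirror the new tests below), bitfield-style C++ source like the following lowers to a 32-bit load / or / store in which only one byte's worth of bits actually changes:

// Hypothetical source-level example of the pattern this combine targets.
struct BF {
  unsigned long long A;  // 8 bytes of other state, so Bits lands at offset 8
  unsigned int Bits;
};

void setBit16(BF &F) {
  // Only bit 16 of F.Bits changes. Without this combine x86-64 would use a
  // 32-bit read-modify-write such as orl $65536, 8(%rdi); with it, the
  // backend can emit the byte-wide form orb $1, 10(%rdi) instead.
  F.Bits |= 0x10000;
}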
This commit is contained in:
parent c2695eba57
commit 8b944d39b3
@@ -1420,6 +1420,13 @@ public:
    return false;
  }

  /// isNarrowingProfitable - Return true if it's profitable to narrow
  /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
  /// from i32 to i8 but not from i32 to i16.
  virtual bool isNarrowingProfitable(MVT VT1, MVT VT2) const {
    return true;
  }

  //===--------------------------------------------------------------------===//
  // Div utility functions
  //
@@ -41,6 +41,7 @@ using namespace llvm;
STATISTIC(NodesCombined   , "Number of dag nodes combined");
STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
STATISTIC(OpsNarrowed     , "Number of load/op/store narrowed");

namespace {
  static cl::opt<bool>

@@ -222,6 +223,7 @@ namespace {
    SDValue BuildUDIV(SDNode *N);
    SDNode *MatchRotate(SDValue LHS, SDValue RHS, DebugLoc DL);
    SDValue ReduceLoadWidth(SDNode *N);
    SDValue ReduceLoadOpStoreWidth(SDNode *N);

    SDValue GetDemandedBits(SDValue V, const APInt &Mask);
@@ -4900,6 +4902,96 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
  return SDValue();
}

/// ReduceLoadOpStoreWidth - Look for sequence of load / op / store where op is
/// one of 'or', 'xor', and 'and' of immediates. If 'op' is only touching some
/// of the loaded bits, try narrowing the load and store if it would end up
/// being a win for performance or code size.
SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
  StoreSDNode *ST = cast<StoreSDNode>(N);
  SDValue Chain = ST->getChain();
  SDValue Value = ST->getValue();
  SDValue Ptr = ST->getBasePtr();
  MVT VT = Value.getValueType();

  if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
    return SDValue(0, 0);

  unsigned Opc = Value.getOpcode();
  if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
      Value.getOperand(1).getOpcode() != ISD::Constant)
    return SDValue(0, 0);

  SDValue N0 = Value.getOperand(0);
  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
    LoadSDNode *LD = cast<LoadSDNode>(N0);
    if (LD->getBasePtr() != Ptr/* || Chain != N0.getValue(1)*/)
      return SDValue(0, 0);

    // Find the type to narrow the load / op / store to.
    SDValue N1 = Value.getOperand(1);
    unsigned BitWidth = N1.getValueSizeInBits();
    APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
    if (Opc == ISD::AND)
      Imm ^= APInt::getAllOnesValue(BitWidth);
    unsigned ShAmt = Imm.countTrailingZeros();
    unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
    unsigned NewBW = NextPowerOf2(MSB - ShAmt);
    MVT NewVT = MVT::getIntegerVT(NewBW);
    while (NewBW < BitWidth &&
           !(TLI.isTypeLegal(NewVT) &&
             TLI.isOperationLegalOrCustom(Opc, NewVT) &&
             TLI.isNarrowingProfitable(VT, NewVT))) {
      NewBW = NextPowerOf2(NewBW);
      NewVT = MVT::getIntegerVT(NewBW);
    }
    if (NewBW == BitWidth)
      return SDValue(0, 0);

    // If the lsb of the changed bits does not start at a NewBW-bit boundary,
    // start at the previous one.
    if (ShAmt % NewBW)
      ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
    APInt Mask = APInt::getBitsSet(BitWidth, ShAmt, ShAmt + NewBW);
    if ((Imm & Mask) == Imm) {
      APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
      if (Opc == ISD::AND)
        NewImm ^= APInt::getAllOnesValue(NewBW);
      uint64_t PtrOff = ShAmt / 8;
      // For big endian targets, we need to adjust the offset to the pointer to
      // load the correct bytes.
      if (TLI.isBigEndian())
        PtrOff = (BitWidth - NewBW) / 8 - PtrOff;

      unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff);
      SDValue NewPtr = DAG.getNode(ISD::ADD, LD->getDebugLoc(),
                                   Ptr.getValueType(), Ptr,
                                   DAG.getConstant(PtrOff, Ptr.getValueType()));
      SDValue NewLD = DAG.getLoad(NewVT, N0.getDebugLoc(),
                                  LD->getChain(), NewPtr,
                                  LD->getSrcValue(), LD->getSrcValueOffset(),
                                  LD->isVolatile(), NewAlign);
      SDValue NewVal = DAG.getNode(Opc, Value.getDebugLoc(), NewVT, NewLD,
                                   DAG.getConstant(NewImm, NewVT));
      SDValue NewST = DAG.getStore(Chain, N->getDebugLoc(),
                                   NewVal, NewPtr,
                                   ST->getSrcValue(), ST->getSrcValueOffset(),
                                   ST->isVolatile(), NewAlign);

      AddToWorkList(NewPtr.getNode());
      AddToWorkList(NewLD.getNode());
      AddToWorkList(NewVal.getNode());
      WorkListRemover DeadNodes(*this);
      DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1),
                                    &DeadNodes);
      ++OpsNarrowed;
      return NewST;
    }
  }

  return SDValue(0, 0);
}

SDValue DAGCombiner::visitSTORE(SDNode *N) {
  StoreSDNode *ST = cast<StoreSDNode>(N);
  SDValue Chain = ST->getChain();

@@ -5086,7 +5178,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
                             ST->isVolatile(), ST->getAlignment());
  }

-  return SDValue();
+  return ReduceLoadOpStoreWidth(N);
}

SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
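To make the width and offset arithmetic in ReduceLoadOpStoreWidth concrete, here is a self-contained C++ sketch (not from the commit) of the little-endian 'or' case, using plain integers instead of APInt and hard-coding the x86 policy that only i8 and i32 are acceptable widths; nextPow2 and narrowOrImm are made-up stand-ins for NextPowerOf2 and the relevant slice of the combine:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Next power of two strictly greater than V (simplified stand-in for LLVM's
// NextPowerOf2).
static unsigned nextPow2(unsigned V) {
  unsigned P = 1;
  while (P <= V)
    P <<= 1;
  return P;
}

// Decide whether 'or i32 <load>, Imm' followed by a store can be narrowed.
// On success, fills in the narrow width, the shifted immediate, and the byte
// offset to add to the pointer (little-endian).
static bool narrowOrImm(uint32_t Imm, unsigned &NewBW, uint32_t &NewImm,
                        unsigned &PtrOff) {
  const unsigned BitWidth = 32;
  if (Imm == 0)
    return false;
  unsigned ShAmt = __builtin_ctz(Imm);       // lowest changed bit
  unsigned MSB   = 31 - __builtin_clz(Imm);  // highest changed bit
  NewBW = nextPow2(MSB - ShAmt);
  while (NewBW < BitWidth && NewBW != 8)     // x86: only i8 is profitable below i32
    NewBW = nextPow2(NewBW);
  if (NewBW == BitWidth)
    return false;                            // nothing narrower worked
  ShAmt = (ShAmt / NewBW) * NewBW;           // snap down to a NewBW-bit boundary
  uint32_t Mask = ((1u << NewBW) - 1u) << ShAmt;
  if ((Imm & Mask) != Imm)
    return false;                            // changed bits straddle the boundary
  NewImm = (Imm & Mask) >> ShAmt;
  PtrOff = ShAmt / 8;
  return true;
}

int main() {
  unsigned BW, Off;
  uint32_t NewImm;
  // orl $65536, 8(%rax): only bit 16 changes -> orb $1, 10(%rax).
  assert(narrowOrImm(0x10000u, BW, NewImm, Off) && BW == 8 && NewImm == 1 && Off == 2);
  // or of 16842752 (0x1010000): bits 16 and 24 change, i16 would be needed but
  // is rejected, so the 32-bit operation is kept.
  assert(!narrowOrImm(0x1010000u, BW, NewImm, Off));
  std::puts("or-narrowing sketch OK");
  return 0;
}

The second assert corresponds to @t2 in the new narrow_op-1.ll test below: the touched bits straddle two bytes, i16 is vetoed by isNarrowingProfitable on x86, and the search falls back to the original 32-bit width.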
@@ -6877,6 +6877,11 @@ bool X86TargetLowering::isZExtFree(MVT VT1, MVT VT2) const {
  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
}

bool X86TargetLowering::isNarrowingProfitable(MVT VT1, MVT VT2) const {
  // i16 instructions are longer (0x66 prefix) and potentially slower.
  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}

/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
@@ -466,6 +466,11 @@ namespace llvm {
    virtual bool isZExtFree(const Type *Ty1, const Type *Ty2) const;
    virtual bool isZExtFree(MVT VT1, MVT VT2) const;

    /// isNarrowingProfitable - Return true if it's profitable to narrow
    /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
    /// from i32 to i8 but not from i32 to i16.
    virtual bool isNarrowingProfitable(MVT VT1, MVT VT2) const;

    /// isShuffleMaskLegal - Targets can use this to indicate that they only
    /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
    /// By default, if a target supports the VECTOR_SHUFFLE node, all mask
test/CodeGen/X86/narrow_op-1.ll (new file, 23 lines)
@@ -0,0 +1,23 @@
; RUN: llvm-as < %s | llc -march=x86-64 | grep orb | count 1
; RUN: llvm-as < %s | llc -march=x86-64 | grep orb | grep 1
; RUN: llvm-as < %s | llc -march=x86-64 | grep orl | count 1
; RUN: llvm-as < %s | llc -march=x86-64 | grep orl | grep 16842752

%struct.bf = type { i64, i16, i16, i32 }
@bfi = common global %struct.bf zeroinitializer, align 16

define void @t1() nounwind optsize ssp {
entry:
  %0 = load i32* bitcast (i16* getelementptr (%struct.bf* @bfi, i32 0, i32 1) to i32*), align 8
  %1 = or i32 %0, 65536
  store i32 %1, i32* bitcast (i16* getelementptr (%struct.bf* @bfi, i32 0, i32 1) to i32*), align 8
  ret void
}

define void @t2() nounwind optsize ssp {
entry:
  %0 = load i32* bitcast (i16* getelementptr (%struct.bf* @bfi, i32 0, i32 1) to i32*), align 8
  %1 = or i32 %0, 16842752
  store i32 %1, i32* bitcast (i16* getelementptr (%struct.bf* @bfi, i32 0, i32 1) to i32*), align 8
  ret void
}
test/CodeGen/X86/narrow_op-2.ll (new file, 23 lines)
@@ -0,0 +1,23 @@
; RUN: llvm-as < %s | llc -march=x86-64 | grep andb | count 2
; RUN: llvm-as < %s | llc -march=x86-64 | grep andb | grep 254
; RUN: llvm-as < %s | llc -march=x86-64 | grep andb | grep 253

%struct.bf = type { i64, i16, i16, i32 }
@bfi = external global %struct.bf*

define void @t1() nounwind ssp {
entry:
  %0 = load %struct.bf** @bfi, align 8
  %1 = getelementptr %struct.bf* %0, i64 0, i32 1
  %2 = bitcast i16* %1 to i32*
  %3 = load i32* %2, align 1
  %4 = and i32 %3, -65537
  store i32 %4, i32* %2, align 1
  %5 = load %struct.bf** @bfi, align 8
  %6 = getelementptr %struct.bf* %5, i64 0, i32 1
  %7 = bitcast i16* %6 to i32*
  %8 = load i32* %7, align 1
  %9 = and i32 %8, -131073
  store i32 %9, i32* %7, align 1
  ret void
}
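The 'and' cases above exercise the extra twist in the combine for ISD::AND: the immediate is complemented first so that set bits mark the bits being cleared, the same width search runs, and the narrowed immediate is complemented back. A self-contained sketch of just that path (again a simplification, not the commit's code; narrowAndImm is a made-up name), reusing the same x86-style policy as the 'or' sketch earlier:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Next power of two strictly greater than V.
static unsigned nextPow2(unsigned V) {
  unsigned P = 1;
  while (P <= V)
    P <<= 1;
  return P;
}

// Decide whether 'and i32 <load>, Imm' followed by a store can be narrowed on
// a little-endian target where only i8 and i32 are acceptable widths.
static bool narrowAndImm(uint32_t Imm, unsigned &NewBW, uint32_t &NewImm,
                         unsigned &PtrOff) {
  uint32_t Changed = ~Imm;                   // bits the 'and' actually clears
  if (Changed == 0)
    return false;
  unsigned ShAmt = __builtin_ctz(Changed);
  unsigned MSB   = 31 - __builtin_clz(Changed);
  NewBW = nextPow2(MSB - ShAmt);
  while (NewBW < 32 && NewBW != 8)           // only i8 is profitable below i32
    NewBW = nextPow2(NewBW);
  if (NewBW == 32)
    return false;
  ShAmt = (ShAmt / NewBW) * NewBW;           // snap down to a NewBW-bit boundary
  uint32_t Mask = ((1u << NewBW) - 1u) << ShAmt;
  if ((Changed & Mask) != Changed)
    return false;
  // Complement back within the narrow width to get the immediate to 'and' with.
  NewImm = ~((Changed & Mask) >> ShAmt) & ((1u << NewBW) - 1u);
  PtrOff = ShAmt / 8;
  return true;
}

int main() {
  unsigned BW, Off;
  uint32_t NewImm;
  // and i32 %x, -65537 clears bit 16 -> andb $254 at byte offset 2.
  assert(narrowAndImm(0xFFFEFFFFu, BW, NewImm, Off) && BW == 8 && NewImm == 254 && Off == 2);
  // and i32 %x, -131073 clears bit 17 -> andb $253 at byte offset 2.
  assert(narrowAndImm(0xFFFDFFFFu, BW, NewImm, Off) && BW == 8 && NewImm == 253 && Off == 2);
  std::puts("and-narrowing sketch OK");
  return 0;
}

Run against the two stores in @t1 above, this reproduces the immediates the RUN lines grep for: clearing bit 16 yields andb $254 and clearing bit 17 yields andb $253, both two bytes past the start of the loaded i32.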