Mirror of https://github.com/RPCS3/llvm-mirror.git (synced 2024-12-14 15:19:33 +00:00)
[DAGCombiner] Reapply load slicing (192471) with a test that explicitly sets sse4.2 support.
This should fix the buildbots.

Original commit message:

[DAGCombiner] Slice a big load in two loads when the elements are next to each other in memory and the target supports paired loads and performs post-isel load combining.

E.g., this optimization will transform something like this:
  a = load i64* addr
  b = trunc i64 a to i32
  c = lshr i64 a, 32
  d = trunc i64 c to i32
into:
  b = load i32* addr1
  d = load i32* addr2
where addr1 = addr2 +/- sizeof(i32), if the target supports paired loads and performs post-isel load combining.

One should overload TargetLowering::hasPairedLoad to provide this information. The default is false.

<rdar://problem/14477220>

llvm-svn: 192476
This commit is contained in:
Parent: fd0097531f
Commit: c02e5604f4
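As a rough illustration of the new hook (a hypothetical target called MyTarget; not part of this commit), a backend that can merge two adjacent 32-bit loads into one paired load might override the EVT overload declared in the header below along these lines:

  // Hypothetical override in MyTargetISelLowering.cpp: advertise paired loads
  // of two i32 values so that DAGCombiner slices i64 loads accordingly.
  bool MyTargetLowering::hasPairedLoad(EVT LoadedType,
                                       unsigned &RequiredAlignment) const {
    if (LoadedType != MVT::i32)
      return false;
    // The paired-load instruction is assumed to need 4-byte alignment.
    RequiredAlignment = 4;
    return true;
  }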
include/llvm/Target/TargetLowering.h
@@ -1183,6 +1183,35 @@ public:
    return false;
  }

  /// Return true if the target supplies and combines to a paired load
  /// two loaded values of type LoadedType next to each other in memory.
  /// RequiredAlignment gives the minimal alignment constraints that must be met
  /// to be able to select this paired load.
  ///
  /// This information is *not* used to generate actual paired loads, but it is
  /// used to generate a sequence of loads that is easier to combine into a
  /// paired load.
  /// For instance, something like this:
  ///   a = load i64* addr
  ///   b = trunc i64 a to i32
  ///   c = lshr i64 a, 32
  ///   d = trunc i64 c to i32
  /// will be optimized into:
  ///   b = load i32* addr1
  ///   d = load i32* addr2
  /// Where addr1 = addr2 +/- sizeof(i32).
  ///
  /// In other words, unless the target performs post-isel load combining, this
  /// information should not be provided because it will generate more loads.
  virtual bool hasPairedLoad(Type * /*LoadedType*/,
                             unsigned & /*RequiredAlignment*/) const {
    return false;
  }

  virtual bool hasPairedLoad(EVT /*LoadedType*/,
                             unsigned & /*RequiredAlignment*/) const {
    return false;
  }

  /// Return true if zero-extending the specific node Val to type VT2 is free
  /// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or
  /// because it's folded such as X86 zero-extending loads).
lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -35,6 +35,7 @@
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
using namespace llvm;
@@ -44,6 +45,7 @@ STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
STATISTIC(OpsNarrowed , "Number of load/op/store narrowed");
STATISTIC(LdStFP2Int , "Number of fp load/store pairs transformed to int");
STATISTIC(SlicedLoads, "Number of load sliced");

namespace {
  static cl::opt<bool>
@@ -54,6 +56,14 @@ namespace {
    CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
                     cl::desc("Include global information in alias analysis"));

  /// Hidden option to stress test load slicing, i.e., when this option
  /// is enabled, load slicing bypasses most of its profitability guards.
  static cl::opt<bool>
  StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
                    cl::desc("Bypass the profitability model of load "
                             "slicing"),
                    cl::init(false));

//------------------------------ DAGCombiner ---------------------------------//

  class DAGCombiner {
@@ -63,6 +73,7 @@ namespace {
    CodeGenOpt::Level OptLevel;
    bool LegalOperations;
    bool LegalTypes;
    bool ForCodeSize;

    // Worklist of all of the nodes that need to be simplified.
    //
@@ -145,6 +156,7 @@ namespace {

    bool CombineToPreIndexedLoadStore(SDNode *N);
    bool CombineToPostIndexedLoadStore(SDNode *N);
    bool SliceUpLoad(SDNode *N);

    void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
    SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
@@ -316,8 +328,15 @@ namespace {

  public:
    DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
      : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
        OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {}
      : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
        OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
      AttributeSet FnAttrs =
        DAG.getMachineFunction().getFunction()->getAttributes();
      ForCodeSize =
        FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
                             Attribute::OptimizeForSize) ||
        FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
    }

    /// Run - runs the dag combiner on all nodes in the work list
    void Run(CombineLevel AtLevel);
@@ -7579,9 +7598,562 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
  if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
    return SDValue(N, 0);

  // Try to slice up N to more direct loads if the slices are mapped to
  // different register banks or pairing can take place.
  if (SliceUpLoad(N))
    return SDValue(N, 0);

  return SDValue();
}

namespace {
/// \brief Helper structure used to slice a load in smaller loads.
/// Basically a slice is obtained from the following sequence:
/// Origin = load Ty1, Base
/// Shift = srl Ty1 Origin, CstTy Amount
/// Inst = trunc Shift to Ty2
///
/// Then, it will be rewritten into:
/// Slice = load SliceTy, Base + SliceOffset
/// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2
///
/// SliceTy is deduced from the number of bits that are actually used to
/// build Inst.
struct LoadedSlice {
  /// \brief Helper structure used to compute the cost of a slice.
  struct Cost {
    /// Are we optimizing for code size.
    bool ForCodeSize;
    /// Various costs.
    unsigned Loads;
    unsigned Truncates;
    unsigned CrossRegisterBanksCopies;
    unsigned ZExts;
    unsigned Shift;

    Cost(bool ForCodeSize = false)
        : ForCodeSize(ForCodeSize), Loads(0), Truncates(0),
          CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {}

    /// \brief Get the cost of one isolated slice.
    Cost(const LoadedSlice &LS, bool ForCodeSize = false)
        : ForCodeSize(ForCodeSize), Loads(1), Truncates(0),
          CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {
      EVT TruncType = LS.Inst->getValueType(0);
      EVT LoadedType = LS.getLoadedType();
      if (TruncType != LoadedType &&
          !LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType))
        ZExts = 1;
    }

    /// \brief Account for slicing gain in the current cost.
    /// Slicing provides a few gains like removing a shift or a
    /// truncate. This method allows growing the cost of the original
    /// load with the gain from this slice.
    void addSliceGain(const LoadedSlice &LS) {
      // Each slice saves a truncate.
      const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo();
      if (!TLI.isTruncateFree(LS.Inst->getValueType(0),
                              LS.Inst->getOperand(0).getValueType()))
        ++Truncates;
      // If there is a shift amount, this slice gets rid of it.
      if (LS.Shift)
        ++Shift;
      // If this slice can merge a cross register bank copy, account for it.
      if (LS.canMergeExpensiveCrossRegisterBankCopy())
        ++CrossRegisterBanksCopies;
    }

    Cost &operator+=(const Cost &RHS) {
      Loads += RHS.Loads;
      Truncates += RHS.Truncates;
      CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies;
      ZExts += RHS.ZExts;
      Shift += RHS.Shift;
      return *this;
    }

    bool operator==(const Cost &RHS) const {
      return Loads == RHS.Loads && Truncates == RHS.Truncates &&
             CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies &&
             ZExts == RHS.ZExts && Shift == RHS.Shift;
    }

    bool operator!=(const Cost &RHS) const { return !(*this == RHS); }

    bool operator<(const Cost &RHS) const {
      // Assume cross register banks copies are as expensive as loads.
      // FIXME: Do we want some more target hooks?
      unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies;
      unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies;
      // Unless we are optimizing for code size, consider the
      // expensive operation first.
      if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS)
        return ExpensiveOpsLHS < ExpensiveOpsRHS;
      return (Truncates + ZExts + Shift + ExpensiveOpsLHS) <
             (RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS);
    }

    bool operator>(const Cost &RHS) const { return RHS < *this; }

    bool operator<=(const Cost &RHS) const { return !(RHS < *this); }

    bool operator>=(const Cost &RHS) const { return !(*this < RHS); }
  };
  // The last instruction that represents the slice. This should be a
  // truncate instruction.
  SDNode *Inst;
  // The original load instruction.
  LoadSDNode *Origin;
  // The right shift amount in bits from the original load.
  unsigned Shift;
  // The DAG from which Origin came.
  // This is used to get some contextual information about legal types, etc.
  SelectionDAG *DAG;

  LoadedSlice(SDNode *Inst = NULL, LoadSDNode *Origin = NULL,
              unsigned Shift = 0, SelectionDAG *DAG = NULL)
      : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {}

  LoadedSlice(const LoadedSlice &LS)
      : Inst(LS.Inst), Origin(LS.Origin), Shift(LS.Shift), DAG(LS.DAG) {}

  /// \brief Get the bits used in a chunk of bits \p BitWidth large.
  /// \return Result is \p BitWidth and has used bits set to 1 and
  ///         not used bits set to 0.
  APInt getUsedBits() const {
    // Reproduce the trunc(lshr) sequence:
    // - Start from the truncated value.
    // - Zero extend to the desired bit width.
    // - Shift left.
    assert(Origin && "No original load to compare against.");
    unsigned BitWidth = Origin->getValueSizeInBits(0);
    assert(Inst && "This slice is not bound to an instruction");
    assert(Inst->getValueSizeInBits(0) <= BitWidth &&
           "Extracted slice is bigger than the whole type!");
    APInt UsedBits(Inst->getValueSizeInBits(0), 0);
    UsedBits.setAllBits();
    UsedBits = UsedBits.zext(BitWidth);
    UsedBits <<= Shift;
    return UsedBits;
  }

  /// \brief Get the size of the slice to be loaded in bytes.
  unsigned getLoadedSize() const {
    unsigned SliceSize = getUsedBits().countPopulation();
    assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
    return SliceSize / 8;
  }

  /// \brief Get the type that will be loaded for this slice.
  /// Note: This may not be the final type for the slice.
  EVT getLoadedType() const {
    assert(DAG && "Missing context");
    LLVMContext &Ctxt = *DAG->getContext();
    return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8);
  }

  /// \brief Get the alignment of the load used for this slice.
  unsigned getAlignment() const {
    unsigned Alignment = Origin->getAlignment();
    unsigned Offset = getOffsetFromBase();
    if (Offset != 0)
      Alignment = MinAlign(Alignment, Alignment + Offset);
    return Alignment;
  }

  /// \brief Check if this slice can be rewritten with legal operations.
  bool isLegal() const {
    // An invalid slice is not legal.
    if (!Origin || !Inst || !DAG)
      return false;

    // Offsets are for indexed loads only; we do not handle that.
    if (Origin->getOffset().getOpcode() != ISD::UNDEF)
      return false;

    const TargetLowering &TLI = DAG->getTargetLoweringInfo();

    // Check that the type is legal.
    EVT SliceType = getLoadedType();
    if (!TLI.isTypeLegal(SliceType))
      return false;

    // Check that the load is legal for this type.
    if (!TLI.isOperationLegal(ISD::LOAD, SliceType))
      return false;

    // Check that the offset can be computed.
    // 1. Check its type.
    EVT PtrType = Origin->getBasePtr().getValueType();
    if (PtrType == MVT::Untyped || PtrType.isExtended())
      return false;

    // 2. Check that it fits in the immediate.
    if (!TLI.isLegalAddImmediate(getOffsetFromBase()))
      return false;

    // 3. Check that the computation is legal.
    if (!TLI.isOperationLegal(ISD::ADD, PtrType))
      return false;

    // Check that the zext is legal if it needs one.
    EVT TruncateType = Inst->getValueType(0);
    if (TruncateType != SliceType &&
        !TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType))
      return false;

    return true;
  }

  /// \brief Get the offset in bytes of this slice in the original chunk of
  /// bits.
  /// \pre DAG != NULL.
  uint64_t getOffsetFromBase() const {
    assert(DAG && "Missing context.");
    bool IsBigEndian =
        DAG->getTargetLoweringInfo().getDataLayout()->isBigEndian();
    assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported.");
    uint64_t Offset = Shift / 8;
    unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8;
    assert(!(Origin->getValueSizeInBits(0) & 0x7) &&
           "The size of the original loaded type is not a multiple of a"
           " byte.");
    // If Offset is bigger than TySizeInBytes, it means we are loading all
    // zeros. This should have been optimized before in the process.
    assert(TySizeInBytes > Offset &&
           "Invalid shift amount for given loaded size");
    if (IsBigEndian)
      Offset = TySizeInBytes - Offset - getLoadedSize();
    return Offset;
  }
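  // Worked example (illustrative; not part of the original patch): for a
  // slice extracted as
  //   d = trunc (srl (load i64, addr), 32) to i32
  // we have Shift == 32 and getLoadedSize() == 4, so Shift / 8 == 4.
  // On a little-endian target the slice is therefore loaded from addr + 4.
  // On a big-endian target the bytes are numbered from the other end, so the
  // code above remaps the offset to
  //   TySizeInBytes - Offset - getLoadedSize() == 8 - 4 - 4 == 0,
  // i.e. the high 32 bits of the value live at the base address.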

  /// \brief Generate the sequence of instructions to load the slice
  /// represented by this object and redirect the uses of this slice to
  /// this new sequence of instructions.
  /// \pre this->Inst && this->Origin are valid Instructions and this
  /// object passed the legal check: LoadedSlice::isLegal returned true.
  /// \return The last instruction of the sequence used to load the slice.
  SDValue loadSlice() const {
    assert(Inst && Origin && "Unable to replace a non-existing slice.");
    const SDValue &OldBaseAddr = Origin->getBasePtr();
    SDValue BaseAddr = OldBaseAddr;
    // Get the offset in that chunk of bytes w.r.t. the endianness.
    int64_t Offset = static_cast<int64_t>(getOffsetFromBase());
    assert(Offset >= 0 && "Offset too big to fit in int64_t!");
    if (Offset) {
      // BaseAddr = BaseAddr + Offset.
      EVT ArithType = BaseAddr.getValueType();
      BaseAddr = DAG->getNode(ISD::ADD, SDLoc(Origin), ArithType, BaseAddr,
                              DAG->getConstant(Offset, ArithType));
    }

    // Create the type of the loaded slice according to its size.
    EVT SliceType = getLoadedType();

    // Create the load for the slice.
    SDValue LastInst = DAG->getLoad(
        SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr,
        Origin->getPointerInfo().getWithOffset(Offset), Origin->isVolatile(),
        Origin->isNonTemporal(), Origin->isInvariant(), getAlignment());
    // If the final type is not the same as the loaded type, this means that
    // we have to pad with zero. Create a zero extend for that.
    EVT FinalType = Inst->getValueType(0);
    if (SliceType != FinalType)
      LastInst =
          DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst);
    return LastInst;
  }

  /// \brief Check if this slice can be merged with an expensive cross register
  /// bank copy. E.g.,
  /// i = load i32
  /// f = bitcast i32 i to float
  bool canMergeExpensiveCrossRegisterBankCopy() const {
    if (!Inst || !Inst->hasOneUse())
      return false;
    SDNode *Use = *Inst->use_begin();
    if (Use->getOpcode() != ISD::BITCAST)
      return false;
    assert(DAG && "Missing context");
    const TargetLowering &TLI = DAG->getTargetLoweringInfo();
    EVT ResVT = Use->getValueType(0);
    const TargetRegisterClass *ResRC = TLI.getRegClassFor(ResVT.getSimpleVT());
    const TargetRegisterClass *ArgRC =
        TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT());
    if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT))
      return false;

    // At this point, we know that we perform a cross-register-bank copy.
    // Check if it is expensive.
    const TargetRegisterInfo *TRI = TLI.getTargetMachine().getRegisterInfo();
    // Assume bitcasts are cheap, unless both register classes do not
    // explicitly share a common sub class.
    if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC))
      return false;

    // Check if it will be merged with the load.
    // 1. Check the alignment constraint.
    unsigned RequiredAlignment = TLI.getDataLayout()->getABITypeAlignment(
        ResVT.getTypeForEVT(*DAG->getContext()));

    if (RequiredAlignment > getAlignment())
      return false;

    // 2. Check that the load is a legal operation for that type.
    if (!TLI.isOperationLegal(ISD::LOAD, ResVT))
      return false;

    // 3. Check that we do not have a zext in the way.
    if (Inst->getValueType(0) != getLoadedType())
      return false;

    return true;
  }
};
}

/// \brief Sorts LoadedSlice objects according to their offset.
struct LoadedSliceSorter {
  bool operator()(const LoadedSlice &LHS, const LoadedSlice &RHS) {
    assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
    return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
  }
};

/// \brief Check that all bits set in \p UsedBits form a dense region, i.e.,
/// \p UsedBits looks like 0..0 1..1 0..0.
static bool areUsedBitsDense(const APInt &UsedBits) {
  // If all the bits are one, this is dense!
  if (UsedBits.isAllOnesValue())
    return true;

  // Get rid of the unused bits on the right.
  APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros());
  // Get rid of the unused bits on the left.
  if (NarrowedUsedBits.countLeadingZeros())
    NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits());
  // Check that the chunk of bits is completely used.
  return NarrowedUsedBits.isAllOnesValue();
}

/// \brief Check whether or not \p First and \p Second are next to each other
/// in memory. This means that there is no hole between the bits loaded
/// by \p First and the bits loaded by \p Second.
static bool areSlicesNextToEachOther(const LoadedSlice &First,
                                     const LoadedSlice &Second) {
  assert(First.Origin == Second.Origin && First.Origin &&
         "Unable to match different memory origins.");
  APInt UsedBits = First.getUsedBits();
  assert((UsedBits & Second.getUsedBits()) == 0 &&
         "Slices are not supposed to overlap.");
  UsedBits |= Second.getUsedBits();
  return areUsedBitsDense(UsedBits);
}

/// \brief Adjust the \p GlobalLSCost according to the target
/// pairing capabilities and the layout of the slices.
/// \pre \p GlobalLSCost should account for at least as many loads as
/// there are in the slices in \p LoadedSlices.
static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
                                 LoadedSlice::Cost &GlobalLSCost) {
  unsigned NumberOfSlices = LoadedSlices.size();
  // If there are fewer than 2 elements, no pairing is possible.
  if (NumberOfSlices < 2)
    return;

  // Sort the slices so that elements that are likely to be next to each
  // other in memory are next to each other in the list.
  std::sort(LoadedSlices.begin(), LoadedSlices.end(), LoadedSliceSorter());
  const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo();
  // First (resp. Second) is the first (resp. second) potential candidate
  // to be placed in a paired load.
  const LoadedSlice *First = NULL;
  const LoadedSlice *Second = NULL;
  for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
                // Set the beginning of the pair.
                First = Second) {

    Second = &LoadedSlices[CurrSlice];

    // If First is NULL, it means we start a new pair.
    // Get to the next slice.
    if (!First)
      continue;

    EVT LoadedType = First->getLoadedType();

    // If the types of the slices are different, we cannot pair them.
    if (LoadedType != Second->getLoadedType())
      continue;

    // Check if the target supplies paired loads for this type.
    unsigned RequiredAlignment = 0;
    if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
      // Move to the next pair; this type is hopeless.
      Second = NULL;
      continue;
    }
    // Check if we meet the alignment requirement.
    if (RequiredAlignment > First->getAlignment())
      continue;

    // Check that both loads are next to each other in memory.
    if (!areSlicesNextToEachOther(*First, *Second))
      continue;

    assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!");
    --GlobalLSCost.Loads;
    // Move to the next pair.
    Second = NULL;
  }
}

/// \brief Check the profitability of all involved LoadedSlices.
/// Currently, it is considered profitable if there are exactly two
/// involved slices (1) which are (2) next to each other in memory, and
/// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3).
///
/// Note: The order of the elements in \p LoadedSlices may be modified, but not
/// the elements themselves.
///
/// FIXME: When the cost model is mature enough, we can relax
/// constraints (1) and (2).
static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices,
                                const APInt &UsedBits, bool ForCodeSize) {
  unsigned NumberOfSlices = LoadedSlices.size();
  if (StressLoadSlicing)
    return NumberOfSlices > 1;

  // Check (1).
  if (NumberOfSlices != 2)
    return false;

  // Check (2).
  if (!areUsedBitsDense(UsedBits))
    return false;

  // Check (3).
  LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize);
  // The original code has one big load.
  OrigCost.Loads = 1;
  for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice) {
    const LoadedSlice &LS = LoadedSlices[CurrSlice];
    // Accumulate the cost of all the slices.
    LoadedSlice::Cost SliceCost(LS, ForCodeSize);
    GlobalSlicingCost += SliceCost;

    // Account as cost in the original configuration the gain obtained
    // with the current slices.
    OrigCost.addSliceGain(LS);
  }

  // If the target supports paired loads, adjust the cost accordingly.
  adjustCostForPairing(LoadedSlices, GlobalSlicingCost);
  return OrigCost > GlobalSlicingCost;
}
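// Worked example of the cost comparison (illustrative; not part of the
// original patch): for the canonical i64 load split into two i32 slices,
// GlobalSlicingCost ends up with Loads == 2 while OrigCost starts with
// Loads == 1, so with no other gain slicing is rejected (2 expensive
// operations vs. 1). It becomes profitable when, for instance:
//  - each slice folds an expensive cross-register-bank copy (e.g. each trunc
//    feeds a bitcast to float, as in test t1 below): OrigCost is then charged
//    1 load + 2 copies = 3 expensive operations against 2 for the slices; or
//  - the target reports hasPairedLoad() for i32 and the slices are adjacent:
//    adjustCostForPairing credits one load back, the expensive operations tie
//    at 1 vs. 1, and the saved shift (plus any non-free truncates) breaks the
//    tie in favor of slicing.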

/// \brief If the given load, \p LI, is used only by trunc or trunc(lshr)
/// operations, split it into the various pieces being extracted.
///
/// This sort of thing is introduced by SROA.
/// This slicing takes care not to insert overlapping loads.
/// \pre LI is a simple load (i.e., not an atomic or volatile load).
bool DAGCombiner::SliceUpLoad(SDNode *N) {
  if (Level < AfterLegalizeDAG)
    return false;

  LoadSDNode *LD = cast<LoadSDNode>(N);
  if (LD->isVolatile() || !ISD::isNormalLoad(LD) ||
      !LD->getValueType(0).isInteger())
    return false;

  // Keep track of already used bits to detect overlapping values.
  // In that case, we will just abort the transformation.
  APInt UsedBits(LD->getValueSizeInBits(0), 0);

  SmallVector<LoadedSlice, 4> LoadedSlices;

  // Check if this load is used as several smaller chunks of bits.
  // Basically, look for uses in trunc or trunc(lshr) and record a new chain
  // of computation for each trunc.
  for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
       UI != UIEnd; ++UI) {
    // Skip the uses of the chain.
    if (UI.getUse().getResNo() != 0)
      continue;

    SDNode *User = *UI;
    unsigned Shift = 0;

    // Check if this is a trunc(lshr).
    if (User->getOpcode() == ISD::SRL && User->hasOneUse() &&
        isa<ConstantSDNode>(User->getOperand(1))) {
      Shift = cast<ConstantSDNode>(User->getOperand(1))->getZExtValue();
      User = *User->use_begin();
    }

    // At this point, User is a TRUNCATE iff we encountered either a trunc or
    // a trunc(lshr).
    if (User->getOpcode() != ISD::TRUNCATE)
      return false;

    // The width of the type must be a power of 2 and at least 8 bits.
    // Otherwise the load cannot be represented in LLVM IR.
    // Moreover, if we shifted with a non-8-bit multiple, the slice
    // will be across several bytes. We do not support that.
    unsigned Width = User->getValueSizeInBits(0);
    if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
      return 0;

    // Build the slice for this chain of computations.
    LoadedSlice LS(User, LD, Shift, &DAG);
    APInt CurrentUsedBits = LS.getUsedBits();

    // Check if this slice overlaps with another.
    if ((CurrentUsedBits & UsedBits) != 0)
      return false;
    // Update the bits used globally.
    UsedBits |= CurrentUsedBits;

    // Check if the new slice would be legal.
    if (!LS.isLegal())
      return false;

    // Record the slice.
    LoadedSlices.push_back(LS);
  }

  // Abort slicing if it does not seem to be profitable.
  if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize))
    return false;

  ++SlicedLoads;

  // Rewrite each chain to use an independent load.
  // By construction, each chain can be represented by a unique load.

  // Prepare the argument for the new token factor for all the slices.
  SmallVector<SDValue, 8> ArgChains;
  for (SmallVectorImpl<LoadedSlice>::const_iterator
           LSIt = LoadedSlices.begin(),
           LSItEnd = LoadedSlices.end();
       LSIt != LSItEnd; ++LSIt) {
    SDValue SliceInst = LSIt->loadSlice();
    CombineTo(LSIt->Inst, SliceInst, true);
    if (SliceInst.getNode()->getOpcode() != ISD::LOAD)
      SliceInst = SliceInst.getOperand(0);
    assert(SliceInst->getOpcode() == ISD::LOAD &&
           "It takes more than a zext to get to the loaded slice!!");
    ArgChains.push_back(SliceInst.getValue(1));
  }

  SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
                              &ArgChains[0], ArgChains.size());
  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
  return true;
}

/// CheckForMaskedLoad - Check to see if V is (and load (ptr), imm), where the
/// load is having specific bytes cleared out. If so, return the byte size
/// being masked out and the shift amount.
test/CodeGen/X86/load-slice.ll (new file, 140 lines)
@@ -0,0 +1,140 @@
; RUN: llc -mtriple x86_64-apple-macosx -mattr=+sse4.2 -combiner-stress-load-slicing < %s -o - | FileCheck %s --check-prefix=STRESS
; RUN: llc -mtriple x86_64-apple-macosx -mattr=+sse4.2 < %s -o - | FileCheck %s --check-prefix=REGULAR
;
; <rdar://problem/14477220>

%class.Complex = type { float, float }


; Check that independent slices lead to independent loads when the slices
; target different register files.
;
; The layout is:
; LSB 0 1 2 3 | 4 5 6 7 MSB
;       Low      High
; The base address points to 0 and is 8-bytes aligned.
; Low slice starts at 0 (base) and is 8-bytes aligned.
; High slice starts at 4 (base + 4-bytes) and is 4-bytes aligned.
;
; STRESS-LABEL: t1:
; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
; STRESS: vmovss 68([[BASE:[^)]+]]), [[OUT_Imm:%xmm[0-9]+]]
; Add high slice: out[out_start].imm, this is base + 4.
; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
; STRESS-NEXT: vmovss 64([[BASE]]), [[OUT_Real:%xmm[0-9]+]]
; Add low slice: out[out_start].real, this is base + 0.
; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
; Swap Imm and Real.
; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
; Put the results back into out[out_start].
; STRESS-NEXT: vmovq [[RES_Vec]], ([[BASE]])
;
; Same for REGULAR: we eliminate a register bank copy with each slice.
; REGULAR-LABEL: t1:
; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
; REGULAR: vmovss 68([[BASE:[^)]+]]), [[OUT_Imm:%xmm[0-9]+]]
; Add high slice: out[out_start].imm, this is base + 4.
; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
; REGULAR-NEXT: vmovss 64([[BASE]]), [[OUT_Real:%xmm[0-9]+]]
; Add low slice: out[out_start].real, this is base + 0.
; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
; Swap Imm and Real.
; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
; Put the results back into out[out_start].
; REGULAR-NEXT: vmovq [[RES_Vec]], ([[BASE]])
define void @t1(%class.Complex* nocapture %out, i64 %out_start) {
entry:
  %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
  %tmp = bitcast %class.Complex* %arrayidx to i64*
  %tmp1 = load i64* %tmp, align 8
  %t0.sroa.0.0.extract.trunc = trunc i64 %tmp1 to i32
  %tmp2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float
  %t0.sroa.2.0.extract.shift = lshr i64 %tmp1, 32
  %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
  %tmp3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float
  %add = add i64 %out_start, 8
  %arrayidx2 = getelementptr inbounds %class.Complex* %out, i64 %add
  %i.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 0
  %tmp4 = load float* %i.i, align 4
  %add.i = fadd float %tmp4, %tmp2
  %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0
  %r.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 1
  %tmp5 = load float* %r.i, align 4
  %add5.i = fadd float %tmp5, %tmp3
  %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1
  %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>*
  store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4
  ret void
}

; Function Attrs: nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1

; Function Attrs: nounwind
declare void @llvm.lifetime.start(i64, i8* nocapture)

; Function Attrs: nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture)

; Check that we do not read outside of the chunk of bits of the original loads.
;
; The 64 bits should have been split into one 32-bit and one 16-bit slice.
; The 16-bit slice should be zero-extended to match the final type.
;
; The memory layout is:
; LSB 0 1 2 3 | 4 5 | 6 7 MSB
;      Low            High
; The base address points to 0 and is 8-bytes aligned.
; Low slice starts at 0 (base) and is 8-bytes aligned.
; High slice starts at 6 (base + 6-bytes) and is 2-bytes aligned.
;
; STRESS-LABEL: t2:
; STRESS: movzwl 6([[BASE:[^)]+]]), %eax
; STRESS-NEXT: addl ([[BASE]]), %eax
; STRESS-NEXT: ret
;
; For the REGULAR heuristic, it is not profitable to slice things that are not
; next to each other in memory. Here we have a hole with bytes #4-5.
; REGULAR-LABEL: t2:
; REGULAR: shrq $48
define i32 @t2(%class.Complex* nocapture %out, i64 %out_start) {
  %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
  %bitcast = bitcast %class.Complex* %arrayidx to i64*
  %chunk64 = load i64* %bitcast, align 8
  %slice32_low = trunc i64 %chunk64 to i32
  %shift48 = lshr i64 %chunk64, 48
  %slice32_high = trunc i64 %shift48 to i32
  %res = add i32 %slice32_high, %slice32_low
  ret i32 %res
}

; Check that we do not optimize overlapping slices.
;
; The 64 bits should NOT have been split, as the slices are overlapping.
; First slice uses bytes numbered 0 to 3.
; Second slice uses bytes numbered 6 and 7.
; Third slice uses bytes numbered 4 to 7.
;
; STRESS-LABEL: t3:
; STRESS: shrq $48
; STRESS: shrq $32
;
; REGULAR-LABEL: t3:
; REGULAR: shrq $48
; REGULAR: shrq $32
define i32 @t3(%class.Complex* nocapture %out, i64 %out_start) {
  %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start
  %bitcast = bitcast %class.Complex* %arrayidx to i64*
  %chunk64 = load i64* %bitcast, align 8
  %slice32_low = trunc i64 %chunk64 to i32
  %shift48 = lshr i64 %chunk64, 48
  %slice32_high = trunc i64 %shift48 to i32
  %shift32 = lshr i64 %chunk64, 32
  %slice32_lowhigh = trunc i64 %shift32 to i32
  %tmpres = add i32 %slice32_high, %slice32_low
  %res = add i32 %slice32_lowhigh, %tmpres
  ret i32 %res
}