Mirror of https://github.com/capstone-engine/llvm-capstone.git
Reland [X86][CostModel] X86TTIImpl::getMemoryOpCost(): rewrite vector handling again
Instead of handling power-of-two sized vector chunks, try handling the large vector in a stream mode, decreasing the operational vector size once it no longer works for the elements left to process. Notably, this improves costs for overaligned loads - loading padding is fine. This more directly tracks when we need to insert/extract the YMM/XMM subvector; some costs fluctuate because of that.

This was initially landed in c02476f315, but reverted in 5fddc3312b, because the code made some very optimistic assumptions about invariants that didn't hold in practice.

Reviewed By: RKSimon, ABataev

Differential Revision: https://reviews.llvm.org/D100684
commit 05a4e4a89c (parent fd5cc41818)
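The stream-mode decomposition described above can be illustrated in isolation. Below is a minimal standalone sketch, not part of the patch: the `decompose` helper is invented for illustration, and it deliberately ignores type legalization, subvector insert/extract costs, and the aligned-overread special case that the real code handles.

#include <cstdio>
#include <vector>

// Decompose NumElt elements of EltTyBits bits each into a stream of
// load/store op sizes: start at the widest legal op and halve once the
// remainder no longer fills a whole op. Mirrors the loop structure of
// the patch, minus legalization, shuffle costs, and aligned overreads.
static std::vector<int> decompose(int NumElt, int EltTyBits,
                                  int MaxLegalOpSizeBytes) {
  std::vector<int> OpSizesBytes;
  int NumEltRemaining = NumElt;
  for (int CurrOpSizeBytes = MaxLegalOpSizeBytes;
       NumEltRemaining > 0 && CurrOpSizeBytes > 0; CurrOpSizeBytes /= 2) {
    int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
    // Issue as many full ops of the current size as the remainder allows.
    while (CurrNumEltPerOp > 0 && NumEltRemaining >= CurrNumEltPerOp) {
      OpSizesBytes.push_back(CurrOpSizeBytes);
      NumEltRemaining -= CurrNumEltPerOp;
    }
  }
  return OpSizesBytes;
}

int main() {
  // A hypothetical <47 x i16> (94 bytes) with 32-byte (YMM) legal ops:
  // prints 32, 32, 16, 8, 4, 2, one stream, each element touched once.
  for (int Sz : decompose(47, 16, 32))
    printf("%d-byte op\n", Sz);
  return 0;
}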
@@ -3290,50 +3290,131 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                   CostKind);
 
-  // Handle non-power-of-two vectors such as <3 x float> and <48 x i16>
-  if (auto *VTy = dyn_cast<FixedVectorType>(Src)) {
-    const unsigned NumElem = VTy->getNumElements();
-    if (!isPowerOf2_32(NumElem)) {
-      // Factorize NumElem into sum of power-of-two.
-      InstructionCost Cost = 0;
-      unsigned NumElemDone = 0;
-      for (unsigned NumElemLeft = NumElem, Factor;
-           Factor = PowerOf2Floor(NumElemLeft), NumElemLeft > 0;
-           NumElemLeft -= Factor) {
-        Type *SubTy = FixedVectorType::get(VTy->getScalarType(), Factor);
-        unsigned SubTyBytes = SubTy->getPrimitiveSizeInBits() / 8;
-
-        Cost +=
-            getMemoryOpCost(Opcode, SubTy, Alignment, AddressSpace, CostKind);
-
-        std::pair<InstructionCost, MVT> LST =
-            TLI->getTypeLegalizationCost(DL, SubTy);
-        if (!LST.second.isVector()) {
-          APInt DemandedElts =
-              APInt::getBitsSet(NumElem, NumElemDone, NumElemDone + Factor);
-          Cost += getScalarizationOverhead(VTy, DemandedElts,
-                                           Opcode == Instruction::Load,
-                                           Opcode == Instruction::Store);
-        }
-
-        NumElemDone += Factor;
-        Alignment = commonAlignment(Alignment.valueOrOne(), SubTyBytes);
-      }
-      assert(NumElemDone == NumElem && "Processed wrong element count?");
-      return Cost;
-    }
-  }
-
   // Legalize the type.
   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
 
-  // Each load/store unit costs 1.
-  InstructionCost Cost = LT.first * 1;
+  auto *VTy = dyn_cast<FixedVectorType>(Src);
 
-  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
-  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
-  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
-    Cost *= 2;
+  // Handle the simple case of non-vectors.
+  // NOTE: this assumes that legalization never creates vector from scalars!
+  if (!VTy || !LT.second.isVector())
+    // Each load/store unit costs 1.
+    return LT.first * 1;
+
+  bool IsLoad = Opcode == Instruction::Load;
+
+  Type *EltTy = VTy->getElementType();
+
+  const int EltTyBits = DL.getTypeSizeInBits(EltTy);
+
+  InstructionCost Cost = 0;
+
+  // Source of truth: how many elements were there in the original IR vector?
+  const unsigned SrcNumElt = VTy->getNumElements();
+
+  // How far have we gotten?
+  int NumEltRemaining = SrcNumElt;
+  // Note that we intentionally capture by-reference, NumEltRemaining changes.
+  auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
+
+  const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
+
+  // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
+  const unsigned XMMBits = 128;
+  if (XMMBits % EltTyBits != 0)
+    // Vector size must be a multiple of the element size. I.e. no padding.
+    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+                                  CostKind);
+  const int NumEltPerXMM = XMMBits / EltTyBits;
+
+  auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
+
+  for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
+       NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
+    // How many elements would a single op deal with at once?
+    if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
+      // Vector size must be a multiple of the element size. I.e. no padding.
+      return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+                                    CostKind);
+    int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
+
+    assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
+    assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
+            (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
+           "Unless we haven't halved the op size yet, "
+           "we have less than two op's sized units of work left.");
+
+    auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
+                          ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
+                          : XMMVecTy;
+
+    assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
+           "After halving sizes, the vector elt count is no longer a multiple "
+           "of number of elements per operation?");
+    auto *CoalescedVecTy =
+        CurrNumEltPerOp == 1
+            ? CurrVecTy
+            : FixedVectorType::get(
+                  IntegerType::get(Src->getContext(),
+                                   EltTyBits * CurrNumEltPerOp),
+                  CurrVecTy->getNumElements() / CurrNumEltPerOp);
+    assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
+               DL.getTypeSizeInBits(CurrVecTy) &&
+           "Coalescing elements doesn't change vector width.");
+
+    while (NumEltRemaining > 0) {
+      assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumption?");
+
+      // Can we use this vector size, as per the remaining element count?
+      // Iff the vector is naturally aligned, we can do a wide load regardless.
+      if (NumEltRemaining < CurrNumEltPerOp &&
+          (!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
+          CurrOpSizeBytes != 1)
+        break; // Try smaller vector size.
+
+      bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
+
+      // If we have fully processed the previous reg, we need to replenish it.
+      if (SubVecEltsLeft == 0) {
+        SubVecEltsLeft += CurrVecTy->getNumElements();
+        // And that's free only for the 0'th subvector of a legalized vector.
+        if (!Is0thSubVec)
+          Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
+                                        : TTI::ShuffleKind::SK_ExtractSubvector,
+                                 VTy, None, NumEltDone(), CurrVecTy);
+      }
+
+      // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
+      // for smaller widths (32/16/8) we have to insert/extract them separately.
+      // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
+      // but let's pretend that it is also true for 16/8 bit wide ops...)
+      if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
+        int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
+        assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
+        int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
+        APInt DemandedElts =
+            APInt::getBitsSet(CoalescedVecTy->getNumElements(),
+                              CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
+        assert(DemandedElts.countPopulation() == 1 && "Inserting single value");
+        Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
+                                         !IsLoad);
+      }
+
+      // This isn't exactly right. We're using slow unaligned 32-byte accesses
+      // as a proxy for a double-pumped AVX memory interface such as on
+      // Sandybridge.
+      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
+        Cost += 2;
+      else
+        Cost += 1;
+
+      SubVecEltsLeft -= CurrNumEltPerOp;
+      NumEltRemaining -= CurrNumEltPerOp;
+      Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
+    }
+  }
+
+  assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
 
   return Cost;
 }
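The overaligned-load improvement called out in the commit message hinges on the `(!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes)` clause in the inner loop above: a load whose alignment covers the full op width may use an op wider than the elements that remain, because reading the padding is fine, while a store never may. A minimal sketch of just that rule, with invented names, not the patch's code:

#include <cassert>

// When the remaining elements do not fill a whole op, a LOAD may still
// use the wide op if the access is aligned to the full op size: the
// extra "padding" lanes are then safe to read. A wide STORE would
// clobber them, so it must fall through to a narrower op instead.
static bool canUseWideOp(bool IsLoad, int NumEltRemaining,
                         int CurrNumEltPerOp, int AlignBytes,
                         int CurrOpSizeBytes) {
  if (NumEltRemaining >= CurrNumEltPerOp)
    return true; // Op completely filled; nothing special needed.
  if (CurrOpSizeBytes == 1)
    return true; // Single-byte ops terminate the halving cascade.
  return IsLoad && AlignBytes >= CurrOpSizeBytes;
}

int main() {
  // <3 x float> load, align 16, considering a 16-byte XMM op (4 floats):
  // the 4th lane is padding, but the alignment makes it safe to read.
  assert(canUseWideOp(/*IsLoad=*/true, 3, 4, /*AlignBytes=*/16,
                      /*CurrOpSizeBytes=*/16));
  // The matching store must not write the padding lane.
  assert(!canUseWideOp(/*IsLoad=*/false, 3, 4, 16, 16));
  return 0;
}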
@@ -9,8 +9,8 @@ target triple = "x86_64-unknown-linux-gnu"
 
 ; CHECK: LV: Checking a loop in "test"
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
-; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
-; CHECK: LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 31 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 58 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 171 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2
 
@@ -9,8 +9,8 @@ target triple = "x86_64-unknown-linux-gnu"
 
 ; CHECK: LV: Checking a loop in "test"
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, i16* %out2, align 2
-; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: store i16 %v2, i16* %out2, align 2
-; CHECK: LV: Found an estimated cost of 34 for VF 4 For instruction: store i16 %v2, i16* %out2, align 2
+; CHECK: LV: Found an estimated cost of 15 for VF 2 For instruction: store i16 %v2, i16* %out2, align 2
+; CHECK: LV: Found an estimated cost of 35 for VF 4 For instruction: store i16 %v2, i16* %out2, align 2
 ; CHECK: LV: Found an estimated cost of 66 for VF 8 For instruction: store i16 %v2, i16* %out2, align 2
 ; CHECK: LV: Found an estimated cost of 171 for VF 16 For instruction: store i16 %v2, i16* %out2, align 2
 
File diff suppressed because it is too large