Reland [X86][CostModel] X86TTIImpl::getMemoryOpCost(): rewrite vector handling again

Instead of splitting the vector into power-of-two sized chunks,
try handling the large vector in a streaming fashion,
decreasing the operational vector size
once it no longer fits the elements left to process.

Notably, this improves costs for overaligned loads, since loading the padding is fine.
This also tracks more directly when we need to insert/extract the YMM/XMM subvector,
and some costs fluctuate because of that.
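
As a rough illustration of that chunking order, here is a stand-alone sketch
(the helper name, the "one op = one unit" costing, and the example widths are
made up for illustration; the real patch additionally lets a sufficiently
aligned access use the wider op, and charges for subvector insert/extract and
scalarization):

// Illustrative model only, not the LLVM code: cover NumElt elements with
// progressively narrower memory ops, halving the op width once the remaining
// elements no longer fill it.
#include <cstdio>

static int countStreamedOps(int NumElt, int EltBits, int MaxOpBits) {
  int Ops = 0;
  int Remaining = NumElt;
  for (int OpBits = MaxOpBits; Remaining > 0 && OpBits >= EltBits;
       OpBits /= 2) {
    int EltPerOp = OpBits / EltBits; // elements a single op covers
    while (Remaining >= EltPerOp) {
      std::printf("  %4d-bit op covering %d element(s)\n", OpBits, EltPerOp);
      Remaining -= EltPerOp;
      ++Ops;
    }
  }
  return Ops;
}

int main() {
  std::printf("<48 x i16>, 256-bit (YMM) max op:\n"); // three 256-bit ops
  countStreamedOps(/*NumElt=*/48, /*EltBits=*/16, /*MaxOpBits=*/256);
  std::printf("<3 x float>, 128-bit (XMM) max op:\n"); // 64-bit + 32-bit op
  countStreamedOps(/*NumElt=*/3, /*EltBits=*/32, /*MaxOpBits=*/128);
  return 0;
}

Under this simplified model, <48 x i16> is covered by three 256-bit ops and an
unaligned <3 x float> by a 64-bit plus a 32-bit op, which matches the
decomposition the patch's loop picks in these cases (before the insert/extract
and slow-unaligned-32-byte adjustments).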

This was initially landed in c02476f315,
but reverted in 5fddc3312b,
because the code made some very optimistic assumptions about invariants
that didn't hold in practice.

Reviewed By: RKSimon, ABataev

Differential Revision: https://reviews.llvm.org/D100684
Roman Lebedev 2021-05-22 11:40:58 +03:00
parent fd5cc41818
commit 05a4e4a89c
4 changed files with 3157 additions and 3076 deletions

@@ -3290,50 +3290,131 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                   CostKind);
-  // Handle non-power-of-two vectors such as <3 x float> and <48 x i16>
-  if (auto *VTy = dyn_cast<FixedVectorType>(Src)) {
-    const unsigned NumElem = VTy->getNumElements();
-    if (!isPowerOf2_32(NumElem)) {
-      // Factorize NumElem into sum of power-of-two.
-      InstructionCost Cost = 0;
-      unsigned NumElemDone = 0;
-      for (unsigned NumElemLeft = NumElem, Factor;
-           Factor = PowerOf2Floor(NumElemLeft), NumElemLeft > 0;
-           NumElemLeft -= Factor) {
-        Type *SubTy = FixedVectorType::get(VTy->getScalarType(), Factor);
-        unsigned SubTyBytes = SubTy->getPrimitiveSizeInBits() / 8;
-        Cost +=
-            getMemoryOpCost(Opcode, SubTy, Alignment, AddressSpace, CostKind);
-        std::pair<InstructionCost, MVT> LST =
-            TLI->getTypeLegalizationCost(DL, SubTy);
-        if (!LST.second.isVector()) {
-          APInt DemandedElts =
-              APInt::getBitsSet(NumElem, NumElemDone, NumElemDone + Factor);
-          Cost += getScalarizationOverhead(VTy, DemandedElts,
-                                           Opcode == Instruction::Load,
-                                           Opcode == Instruction::Store);
-        }
-        NumElemDone += Factor;
-        Alignment = commonAlignment(Alignment.valueOrOne(), SubTyBytes);
-      }
-      assert(NumElemDone == NumElem && "Processed wrong element count?");
-      return Cost;
-    }
-  }
   // Legalize the type.
   std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
-  // Each load/store unit costs 1.
-  InstructionCost Cost = LT.first * 1;
+  auto *VTy = dyn_cast<FixedVectorType>(Src);
-  // This isn't exactly right. We're using slow unaligned 32-byte accesses as a
-  // proxy for a double-pumped AVX memory interface such as on Sandybridge.
-  if (LT.second.getStoreSize() == 32 && ST->isUnalignedMem32Slow())
-    Cost *= 2;
+  // Handle the simple case of non-vectors.
+  // NOTE: this assumes that legalization never creates vector from scalars!
+  if (!VTy || !LT.second.isVector())
+    // Each load/store unit costs 1.
+    return LT.first * 1;
+  bool IsLoad = Opcode == Instruction::Load;
+  Type *EltTy = VTy->getElementType();
+  const int EltTyBits = DL.getTypeSizeInBits(EltTy);
+  InstructionCost Cost = 0;
+  // Source of truth: how many elements were there in the original IR vector?
+  const unsigned SrcNumElt = VTy->getNumElements();
+  // How far have we gotten?
+  int NumEltRemaining = SrcNumElt;
+  // Note that we intentionally capture by-reference, NumEltRemaining changes.
+  auto NumEltDone = [&]() { return SrcNumElt - NumEltRemaining; };
+  const int MaxLegalOpSizeBytes = divideCeil(LT.second.getSizeInBits(), 8);
+  // Note that even if we can store 64 bits of an XMM, we still operate on XMM.
+  const unsigned XMMBits = 128;
+  if (XMMBits % EltTyBits != 0)
+    // Vector size must be a multiple of the element size. I.e. no padding.
+    return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+                                  CostKind);
+  const int NumEltPerXMM = XMMBits / EltTyBits;
+  auto *XMMVecTy = FixedVectorType::get(EltTy, NumEltPerXMM);
+  for (int CurrOpSizeBytes = MaxLegalOpSizeBytes, SubVecEltsLeft = 0;
+       NumEltRemaining > 0; CurrOpSizeBytes /= 2) {
+    // How many elements would a single op deal with at once?
+    if ((8 * CurrOpSizeBytes) % EltTyBits != 0)
+      // Vector size must be a multiple of the element size. I.e. no padding.
+      return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+                                    CostKind);
+    int CurrNumEltPerOp = (8 * CurrOpSizeBytes) / EltTyBits;
+    assert(CurrOpSizeBytes > 0 && CurrNumEltPerOp > 0 && "How'd we get here?");
+    assert((((NumEltRemaining * EltTyBits) < (2 * 8 * CurrOpSizeBytes)) ||
+            (CurrOpSizeBytes == MaxLegalOpSizeBytes)) &&
+           "Unless we haven't halved the op size yet, "
+           "we have less than two op's sized units of work left.");
+    auto *CurrVecTy = CurrNumEltPerOp > NumEltPerXMM
+                          ? FixedVectorType::get(EltTy, CurrNumEltPerOp)
+                          : XMMVecTy;
+    assert(CurrVecTy->getNumElements() % CurrNumEltPerOp == 0 &&
+           "After halving sizes, the vector elt count is no longer a multiple "
+           "of number of elements per operation?");
+    auto *CoalescedVecTy =
+        CurrNumEltPerOp == 1
+            ? CurrVecTy
+            : FixedVectorType::get(
+                  IntegerType::get(Src->getContext(),
+                                   EltTyBits * CurrNumEltPerOp),
+                  CurrVecTy->getNumElements() / CurrNumEltPerOp);
+    assert(DL.getTypeSizeInBits(CoalescedVecTy) ==
+               DL.getTypeSizeInBits(CurrVecTy) &&
"coalesciing elements doesn't change vector width.");
while (NumEltRemaining > 0) {
assert(SubVecEltsLeft >= 0 && "Subreg element count overconsumtion?");
// Can we use this vector size, as per the remaining element count?
// Iff the vector is naturally aligned, we can do a wide load regardless.
if (NumEltRemaining < CurrNumEltPerOp &&
(!IsLoad || Alignment.valueOrOne() < CurrOpSizeBytes) &&
CurrOpSizeBytes != 1)
break; // Try smalled vector size.
+      bool Is0thSubVec = (NumEltDone() % LT.second.getVectorNumElements()) == 0;
+      // If we have fully processed the previous reg, we need to replenish it.
+      if (SubVecEltsLeft == 0) {
+        SubVecEltsLeft += CurrVecTy->getNumElements();
+        // And that's free only for the 0'th subvector of a legalized vector.
+        if (!Is0thSubVec)
+          Cost += getShuffleCost(IsLoad ? TTI::ShuffleKind::SK_InsertSubvector
+                                        : TTI::ShuffleKind::SK_ExtractSubvector,
+                                 VTy, None, NumEltDone(), CurrVecTy);
+      }
+      // While we can directly load/store ZMM, YMM, and 64-bit halves of XMM,
+      // for smaller widths (32/16/8) we have to insert/extract them separately.
+      // Again, it's free for the 0'th subreg (if op is 32/64 bit wide,
+      // but let's pretend that it is also true for 16/8 bit wide ops...)
+      if (CurrOpSizeBytes <= 32 / 8 && !Is0thSubVec) {
+        int NumEltDoneInCurrXMM = NumEltDone() % NumEltPerXMM;
+        assert(NumEltDoneInCurrXMM % CurrNumEltPerOp == 0 && "");
+        int CoalescedVecEltIdx = NumEltDoneInCurrXMM / CurrNumEltPerOp;
+        APInt DemandedElts =
+            APInt::getBitsSet(CoalescedVecTy->getNumElements(),
+                              CoalescedVecEltIdx, CoalescedVecEltIdx + 1);
+        assert(DemandedElts.countPopulation() == 1 && "Inserting single value");
+        Cost += getScalarizationOverhead(CoalescedVecTy, DemandedElts, IsLoad,
+                                         !IsLoad);
+      }
+      // This isn't exactly right. We're using slow unaligned 32-byte accesses
+      // as a proxy for a double-pumped AVX memory interface such as on
+      // Sandybridge.
+      if (CurrOpSizeBytes == 32 && ST->isUnalignedMem32Slow())
+        Cost += 2;
+      else
+        Cost += 1;
+      SubVecEltsLeft -= CurrNumEltPerOp;
+      NumEltRemaining -= CurrNumEltPerOp;
+      Alignment = commonAlignment(Alignment.valueOrOne(), CurrOpSizeBytes);
+    }
+  }
+  assert(NumEltRemaining <= 0 && "Should have processed all the elements.");
   return Cost;
 }

@@ -9,8 +9,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %v0 = load i16, i16* %in0, align 2
-; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
-; CHECK: LV: Found an estimated cost of 30 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 15 for VF 2 For instruction: %v0 = load i16, i16* %in0, align 2
+; CHECK: LV: Found an estimated cost of 31 for VF 4 For instruction: %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 58 for VF 8 For instruction: %v0 = load i16, i16* %in0, align 2
 ; CHECK: LV: Found an estimated cost of 171 for VF 16 For instruction: %v0 = load i16, i16* %in0, align 2

@@ -9,8 +9,8 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK: LV: Checking a loop in "test"
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i16 %v2, i16* %out2, align 2
-; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: store i16 %v2, i16* %out2, align 2
-; CHECK: LV: Found an estimated cost of 34 for VF 4 For instruction: store i16 %v2, i16* %out2, align 2
+; CHECK: LV: Found an estimated cost of 15 for VF 2 For instruction: store i16 %v2, i16* %out2, align 2
+; CHECK: LV: Found an estimated cost of 35 for VF 4 For instruction: store i16 %v2, i16* %out2, align 2
 ; CHECK: LV: Found an estimated cost of 66 for VF 8 For instruction: store i16 %v2, i16* %out2, align 2
 ; CHECK: LV: Found an estimated cost of 171 for VF 16 For instruction: store i16 %v2, i16* %out2, align 2

File diff suppressed because it is too large.