[VectorCombine] allow vector loads with mismatched insert type

This is an enhancement to D81766 to allow loading the minimum target
vector type into an IR vector with a different number of elements.

In one of the motivating tests from PR16739, SLP creates <2 x float>
load ops mixed with <4 x float> insert ops, so we want to handle that
pattern in addition to the potentially oversized vectors created by the
vectorizers.
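
As a rough sketch of the rewrite this enables (value names here are
illustrative, and this assumes a 128-bit minimum vector register width and a
pointer known to be sufficiently aligned and dereferenceable for the wider
load, as in the x86 tests below), the PR16739 pattern becomes:

  ; before
  %s = load float, float* %p, align 4
  %r = insertelement <2 x float> undef, float %s, i32 0

  ; after
  %cast = bitcast float* %p to <4 x float>*
  %vec = load <4 x float>, <4 x float>* %cast, align 4
  %r = shufflevector <4 x float> %vec, <4 x float> undef, <2 x i32> <i32 0, i32 1>

The oversized-insert direction works the same way, except the identity
shuffle mask is padded with undef elements to grow the loaded vector (see the
<8 x i32> and <16 x float> tests below).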

For now, we assume that the insert/extract of a subvector with undef is
free because there is no exact corresponding TTI cost model for it.

Differential Revision: https://reviews.llvm.org/D86160
commit 4e9822e551
parent 260b1e427d
Author: Sanjay Patel
Date:   2020-09-02 08:09:24 -04:00

2 changed files with 34 additions and 30 deletions


@@ -100,36 +100,36 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   Type *ScalarTy = Scalar->getType();
   if (!Load || !Load->isSimple())
     return false;
+  auto *Ty = dyn_cast<FixedVectorType>(I.getType());
+  if (!Ty)
+    return false;

   // TODO: Extend this to match GEP with constant offsets.
   Value *PtrOp = Load->getPointerOperand()->stripPointerCasts();
   assert(isa<PointerType>(PtrOp->getType()) && "Expected a pointer type");

-  unsigned VectorSize = TTI.getMinVectorRegisterBitWidth();
+  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
   uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
-  if (!ScalarSize || !VectorSize || VectorSize % ScalarSize != 0)
+  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0)
     return false;

   // Check safety of replacing the scalar load with a larger vector load.
-  unsigned VecNumElts = VectorSize / ScalarSize;
-  auto *VectorTy = VectorType::get(ScalarTy, VecNumElts, false);
-  // TODO: Allow insert/extract subvector if the type does not match.
-  if (VectorTy != I.getType())
-    return false;
+  unsigned MinVecNumElts = MinVectorSize / ScalarSize;
+  auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
   Align Alignment = Load->getAlign();
   const DataLayout &DL = I.getModule()->getDataLayout();
-  if (!isSafeToLoadUnconditionally(PtrOp, VectorTy, Alignment, DL, Load, &DT))
+  if (!isSafeToLoadUnconditionally(PtrOp, MinVecTy, Alignment, DL, Load, &DT))
     return false;

   unsigned AS = Load->getPointerAddressSpace();

   // Original pattern: insertelt undef, load [free casts of] ScalarPtr, 0
   int OldCost = TTI.getMemoryOpCost(Instruction::Load, ScalarTy, Alignment, AS);
-  APInt DemandedElts = APInt::getOneBitSet(VecNumElts, 0);
-  OldCost += TTI.getScalarizationOverhead(VectorTy, DemandedElts, true, false);
+  APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
+  OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts, true, false);

   // New pattern: load VecPtr
-  int NewCost = TTI.getMemoryOpCost(Instruction::Load, VectorTy, Alignment, AS);
+  int NewCost = TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);

   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
@@ -139,8 +139,18 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // It is safe and potentially profitable to load a vector directly:
   // inselt undef, load Scalar, 0 --> load VecPtr
   IRBuilder<> Builder(Load);
-  Value *CastedPtr = Builder.CreateBitCast(PtrOp, VectorTy->getPointerTo(AS));
-  LoadInst *VecLd = Builder.CreateAlignedLoad(VectorTy, CastedPtr, Alignment);
+  Value *CastedPtr = Builder.CreateBitCast(PtrOp, MinVecTy->getPointerTo(AS));
+  Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
+
+  // If the insert type does not match the target's minimum vector type,
+  // use an identity shuffle to shrink/grow the vector.
+  if (Ty != MinVecTy) {
+    unsigned OutputNumElts = Ty->getNumElements();
+    SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
+    for (unsigned i = 0; i < OutputNumElts && i < MinVecNumElts; ++i)
+      Mask[i] = i;
+    VecLd = Builder.CreateShuffleVector(VecLd, UndefValue::get(MinVecTy), Mask);
+  }
   replaceValue(I, *VecLd);
   ++NumVecLoad;
   return true;


@@ -346,12 +346,11 @@ define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(1
   ret <4 x float> %r
 }
-; TODO: Should load v4i32.
 define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_i32_insert_v8i32(
-; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT: ret <8 x i32> [[R]]
 ;
   %s = load i32, i32* %p, align 4
@@ -359,13 +358,10 @@ define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
   ret <8 x i32> %r
 }
-; TODO: Should load v4i32.
 define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @casted_load_i32_insert_v8i32(
-; CHECK-NEXT: [[B:%.*]] = bitcast <4 x i32>* [[P:%.*]] to i32*
-; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[B]], align 4
-; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT: ret <8 x i32> [[R]]
 ;
   %b = bitcast <4 x i32>* %p to i32*
@@ -374,12 +370,11 @@ define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceabl
   ret <8 x i32> %r
 }
-; TODO: Should load v4f32.
 define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v16f32(
-; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4
-; CHECK-NEXT: [[R:%.*]] = insertelement <16 x float> undef, float [[S]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT: ret <16 x float> [[R]]
 ;
   %s = load float, float* %p, align 4
@@ -387,12 +382,11 @@ define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16)
   ret <16 x float> %r
 }
-; TODO: Should load v4f32.
 define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v2f32(
-; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4
-; CHECK-NEXT: [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT: ret <2 x float> [[R]]
 ;
   %s = load float, float* %p, align 4