[VectorCombine] allow vector loads with mismatched insert type

This is an enhancement to D81766 to allow loading the minimum target
vector type into an IR vector with a different number of elements.

In one of the motivating tests from PR16739, SLP creates <2 x float>
load ops mixed with <4 x float> insert ops, so we want to handle that
pattern in addition to the potentially oversized vectors created by the
vectorizers.
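
As a rough sketch of the rewrite this enables (value names here are
illustrative, and this assumes a 128-bit minimum vector register width and a
pointer known to be sufficiently aligned and dereferenceable for the wider
load, as in the x86 tests below), the PR16739 pattern becomes:

  ; before
  %s = load float, float* %p, align 4
  %r = insertelement <2 x float> undef, float %s, i32 0

  ; after
  %cast = bitcast float* %p to <4 x float>*
  %vec = load <4 x float>, <4 x float>* %cast, align 4
  %r = shufflevector <4 x float> %vec, <4 x float> undef, <2 x i32> <i32 0, i32 1>

The oversized-insert direction works the same way, except the identity
shuffle mask is padded with undef elements to grow the loaded vector (see the
<8 x i32> and <16 x float> tests below).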

For now, we assume that the insert/extract of a subvector with undef is
free because there is no exact corresponding TTI cost model for it.

Differential Revision: https://reviews.llvm.org/D86160
commit 4e9822e551
parent 260b1e427d
Author: Sanjay Patel
Date:   2020-09-02 08:09:24 -04:00

2 changed files with 34 additions and 30 deletions


@@ -100,36 +100,36 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   Type *ScalarTy = Scalar->getType();
   if (!Load || !Load->isSimple())
     return false;
+  auto *Ty = dyn_cast<FixedVectorType>(I.getType());
+  if (!Ty)
+    return false;

   // TODO: Extend this to match GEP with constant offsets.
   Value *PtrOp = Load->getPointerOperand()->stripPointerCasts();
   assert(isa<PointerType>(PtrOp->getType()) && "Expected a pointer type");

-  unsigned VectorSize = TTI.getMinVectorRegisterBitWidth();
+  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
   uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
-  if (!ScalarSize || !VectorSize || VectorSize % ScalarSize != 0)
+  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0)
     return false;

   // Check safety of replacing the scalar load with a larger vector load.
-  unsigned VecNumElts = VectorSize / ScalarSize;
-  auto *VectorTy = VectorType::get(ScalarTy, VecNumElts, false);
-  // TODO: Allow insert/extract subvector if the type does not match.
-  if (VectorTy != I.getType())
-    return false;
+  unsigned MinVecNumElts = MinVectorSize / ScalarSize;
+  auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
   Align Alignment = Load->getAlign();
   const DataLayout &DL = I.getModule()->getDataLayout();
-  if (!isSafeToLoadUnconditionally(PtrOp, VectorTy, Alignment, DL, Load, &DT))
+  if (!isSafeToLoadUnconditionally(PtrOp, MinVecTy, Alignment, DL, Load, &DT))
     return false;

   unsigned AS = Load->getPointerAddressSpace();

   // Original pattern: insertelt undef, load [free casts of] ScalarPtr, 0
   int OldCost = TTI.getMemoryOpCost(Instruction::Load, ScalarTy, Alignment, AS);
-  APInt DemandedElts = APInt::getOneBitSet(VecNumElts, 0);
-  OldCost += TTI.getScalarizationOverhead(VectorTy, DemandedElts, true, false);
+  APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
+  OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts, true, false);

   // New pattern: load VecPtr
-  int NewCost = TTI.getMemoryOpCost(Instruction::Load, VectorTy, Alignment, AS);
+  int NewCost = TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);

   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
@@ -139,8 +139,18 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // It is safe and potentially profitable to load a vector directly:
   // inselt undef, load Scalar, 0 --> load VecPtr
   IRBuilder<> Builder(Load);
-  Value *CastedPtr = Builder.CreateBitCast(PtrOp, VectorTy->getPointerTo(AS));
-  LoadInst *VecLd = Builder.CreateAlignedLoad(VectorTy, CastedPtr, Alignment);
+  Value *CastedPtr = Builder.CreateBitCast(PtrOp, MinVecTy->getPointerTo(AS));
+  Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
+
+  // If the insert type does not match the target's minimum vector type,
+  // use an identity shuffle to shrink/grow the vector.
+  if (Ty != MinVecTy) {
+    unsigned OutputNumElts = Ty->getNumElements();
+    SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
+    for (unsigned i = 0; i < OutputNumElts && i < MinVecNumElts; ++i)
+      Mask[i] = i;
+    VecLd = Builder.CreateShuffleVector(VecLd, UndefValue::get(MinVecTy), Mask);
+  }
   replaceValue(I, *VecLd);
   ++NumVecLoad;
   return true;


@@ -346,12 +346,11 @@ define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(1
   ret <4 x float> %r
 }
-; TODO: Should load v4i32.
 define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_i32_insert_v8i32(
-; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT: ret <8 x i32> [[R]]
 ;
   %s = load i32, i32* %p, align 4
@@ -359,13 +358,10 @@ define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
   ret <8 x i32> %r
 }
-; TODO: Should load v4i32.
 define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @casted_load_i32_insert_v8i32(
-; CHECK-NEXT: [[B:%.*]] = bitcast <4 x i32>* [[P:%.*]] to i32*
-; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[B]], align 4
-; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT: ret <8 x i32> [[R]]
 ;
   %b = bitcast <4 x i32>* %p to i32*
@@ -374,12 +370,11 @@ define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceabl
   ret <8 x i32> %r
 }
-; TODO: Should load v4f32.
 define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v16f32(
-; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4
-; CHECK-NEXT: [[R:%.*]] = insertelement <16 x float> undef, float [[S]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT: ret <16 x float> [[R]]
 ;
   %s = load float, float* %p, align 4
@@ -387,12 +382,11 @@ define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16)
   ret <16 x float> %r
 }
-; TODO: Should load v4f32.
 define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v2f32(
-; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4
-; CHECK-NEXT: [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT: ret <2 x float> [[R]]
 ;
   %s = load float, float* %p, align 4