[VectorCombine] allow vector loads with mismatched insert type
This is an enhancement to D81766 to allow loading the minimum target vector type into an IR vector with a different number of elements.

In one of the motivating tests from PR16739, SLP creates <2 x float> load ops mixed with <4 x float> insert ops, so we want to handle that pattern in addition to potential oversized vectors created by the vectorizers.

For now, we are assuming the insert/extract subvector with undef is free because there is no exact corresponding TTI modeling for that.

Differential Revision: https://reviews.llvm.org/D86160
commit 4e9822e551
parent 260b1e427d
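To make the change concrete before the diff, here is a hedged sketch of the new shrink case (the function name, alignment, and dereferenceable size are illustrative only; the checked-in tests further down are authoritative). On an x86/SSE-level target the minimum vector register type for float is <4 x float>, so a scalar load feeding a <2 x float> insert can now become a <4 x float> load plus an identity shuffle:

; Before: scalar load + insert into a vector narrower than the target's
; minimum vector register type.
define <2 x float> @narrow_insert_example(float* align 16 dereferenceable(16) %p) {
  %s = load float, float* %p, align 4
  %r = insertelement <2 x float> undef, float %s, i32 0
  ret <2 x float> %r
}

; After (sketch): load the minimum vector type, then shuffle down to the
; insert type, keeping only the low two lanes.
define <2 x float> @narrow_insert_example(float* align 16 dereferenceable(16) %p) {
  %b = bitcast float* %p to <4 x float>*
  %v = load <4 x float>, <4 x float>* %b, align 4
  %r = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 0, i32 1>
  ret <2 x float> %r
}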
@@ -100,36 +100,36 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   Type *ScalarTy = Scalar->getType();
   if (!Load || !Load->isSimple())
     return false;
+  auto *Ty = dyn_cast<FixedVectorType>(I.getType());
+  if (!Ty)
+    return false;
 
   // TODO: Extend this to match GEP with constant offsets.
   Value *PtrOp = Load->getPointerOperand()->stripPointerCasts();
   assert(isa<PointerType>(PtrOp->getType()) && "Expected a pointer type");
 
-  unsigned VectorSize = TTI.getMinVectorRegisterBitWidth();
+  unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
   uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
-  if (!ScalarSize || !VectorSize || VectorSize % ScalarSize != 0)
+  if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0)
     return false;
 
   // Check safety of replacing the scalar load with a larger vector load.
-  unsigned VecNumElts = VectorSize / ScalarSize;
-  auto *VectorTy = VectorType::get(ScalarTy, VecNumElts, false);
-  // TODO: Allow insert/extract subvector if the type does not match.
-  if (VectorTy != I.getType())
-    return false;
+  unsigned MinVecNumElts = MinVectorSize / ScalarSize;
+  auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
   Align Alignment = Load->getAlign();
   const DataLayout &DL = I.getModule()->getDataLayout();
-  if (!isSafeToLoadUnconditionally(PtrOp, VectorTy, Alignment, DL, Load, &DT))
+  if (!isSafeToLoadUnconditionally(PtrOp, MinVecTy, Alignment, DL, Load, &DT))
     return false;
 
   unsigned AS = Load->getPointerAddressSpace();
 
   // Original pattern: insertelt undef, load [free casts of] ScalarPtr, 0
   int OldCost = TTI.getMemoryOpCost(Instruction::Load, ScalarTy, Alignment, AS);
-  APInt DemandedElts = APInt::getOneBitSet(VecNumElts, 0);
-  OldCost += TTI.getScalarizationOverhead(VectorTy, DemandedElts, true, false);
+  APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
+  OldCost += TTI.getScalarizationOverhead(MinVecTy, DemandedElts, true, false);
 
   // New pattern: load VecPtr
-  int NewCost = TTI.getMemoryOpCost(Instruction::Load, VectorTy, Alignment, AS);
+  int NewCost = TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS);
 
   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
@@ -139,8 +139,18 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // It is safe and potentially profitable to load a vector directly:
   // inselt undef, load Scalar, 0 --> load VecPtr
   IRBuilder<> Builder(Load);
-  Value *CastedPtr = Builder.CreateBitCast(PtrOp, VectorTy->getPointerTo(AS));
-  LoadInst *VecLd = Builder.CreateAlignedLoad(VectorTy, CastedPtr, Alignment);
+  Value *CastedPtr = Builder.CreateBitCast(PtrOp, MinVecTy->getPointerTo(AS));
+  Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
+
+  // If the insert type does not match the target's minimum vector type,
+  // use an identity shuffle to shrink/grow the vector.
+  if (Ty != MinVecTy) {
+    unsigned OutputNumElts = Ty->getNumElements();
+    SmallVector<int, 16> Mask(OutputNumElts, UndefMaskElem);
+    for (unsigned i = 0; i < OutputNumElts && i < MinVecNumElts; ++i)
+      Mask[i] = i;
+    VecLd = Builder.CreateShuffleVector(VecLd, UndefValue::get(MinVecTy), Mask);
+  }
   replaceValue(I, *VecLd);
   ++NumVecLoad;
   return true;
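The shuffle created above uses a plain identity mask padded with undef: the first min(OutputNumElts, MinVecNumElts) lanes are copied through and any remaining output lanes stay undef, so the same code handles both shrinking and growing. As a hedged sketch of the grow direction (illustrative function name; the test updates below check the same shape), an <8 x i32> insert on a target whose minimum vector register is <4 x i32> ends up as:

; After the transform (sketch): a <4 x i32> load widened to <8 x i32> by an
; identity shuffle whose four high lanes are undef.
define <8 x i32> @widen_insert_example(i32* align 16 dereferenceable(16) %p) {
  %b = bitcast i32* %p to <4 x i32>*
  %v = load <4 x i32>, <4 x i32>* %b, align 4
  %r = shufflevector <4 x i32> %v, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i32> %r
}

As the commit message notes, this subvector-style shuffle is currently assumed to be free in the cost comparison because TTI has no exact hook for modeling it.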
@@ -346,12 +346,11 @@ define <4 x float> @load_f32_insert_v4f32_deref(float* align 4 dereferenceable(1
   ret <4 x float> %r
 }
 
-; TODO: Should load v4i32.
-
 define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_i32_insert_v8i32(
-; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT: ret <8 x i32> [[R]]
 ;
   %s = load i32, i32* %p, align 4
@@ -359,13 +358,10 @@ define <8 x i32> @load_i32_insert_v8i32(i32* align 16 dereferenceable(16) %p) {
   ret <8 x i32> %r
 }
 
-; TODO: Should load v4i32.
-
 define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceable(16) %p) {
 ; CHECK-LABEL: @casted_load_i32_insert_v8i32(
-; CHECK-NEXT: [[B:%.*]] = bitcast <4 x i32>* [[P:%.*]] to i32*
-; CHECK-NEXT: [[S:%.*]] = load i32, i32* [[B]], align 4
-; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[P:%.*]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT: ret <8 x i32> [[R]]
 ;
   %b = bitcast <4 x i32>* %p to i32*
@@ -374,12 +370,11 @@ define <8 x i32> @casted_load_i32_insert_v8i32(<4 x i32>* align 4 dereferenceabl
   ret <8 x i32> %r
 }
 
-; TODO: Should load v4f32.
-
 define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v16f32(
-; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4
-; CHECK-NEXT: [[R:%.*]] = insertelement <16 x float> undef, float [[S]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT: ret <16 x float> [[R]]
 ;
   %s = load float, float* %p, align 4
@@ -387,12 +382,11 @@ define <16 x float> @load_f32_insert_v16f32(float* align 16 dereferenceable(16)
   ret <16 x float> %r
 }
 
-; TODO: Should load v4f32.
-
 define <2 x float> @load_f32_insert_v2f32(float* align 16 dereferenceable(16) %p) {
 ; CHECK-LABEL: @load_f32_insert_v2f32(
-; CHECK-NEXT: [[S:%.*]] = load float, float* [[P:%.*]], align 4
-; CHECK-NEXT: [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <2 x i32> <i32 0, i32 1>
 ; CHECK-NEXT: ret <2 x float> [[R]]
 ;
   %s = load float, float* %p, align 4