mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-12-15 15:48:38 +00:00
[CostModel] Add more realistic SK_InsertSubvector generic costs.
Instead of defaulting to a cost = 1, expand to element extract/insert like we do for other shuffles. llvm-svn: 346662
This commit is contained in:
parent
bff9319409
commit
71b92e1ae4
@ -140,6 +140,28 @@ private:
|
||||
return Cost;
|
||||
}
|
||||
|
||||
/// Estimate a cost of subvector insertion as a sequence of extract and
|
||||
/// insert operations.
|
||||
unsigned getInsertSubvectorOverhead(Type *Ty, int Index, Type *SubTy) {
|
||||
assert(Ty && Ty->isVectorTy() && SubTy && SubTy->isVectorTy() &&
|
||||
"Can only insert subvectors into vectors");
|
||||
int NumSubElts = SubTy->getVectorNumElements();
|
||||
assert((Index + NumSubElts) <= (int)Ty->getVectorNumElements() &&
|
||||
"SK_InsertSubvector index out of range");
|
||||
|
||||
unsigned Cost = 0;
|
||||
// Subvector insertion cost is equal to the cost of extracting element from
|
||||
// the source type plus the cost of inserting them into the result vector
|
||||
// type.
|
||||
for (int i = 0; i != NumSubElts; ++i) {
|
||||
Cost += static_cast<T *>(this)->getVectorInstrCost(
|
||||
Instruction::ExtractElement, SubTy, i);
|
||||
Cost += static_cast<T *>(this)->getVectorInstrCost(
|
||||
Instruction::InsertElement, Ty, i + Index);
|
||||
}
|
||||
return Cost;
|
||||
}
|
||||
|
||||
/// Local query method delegates up to T which *must* implement this!
|
||||
const TargetSubtargetInfo *getST() const {
|
||||
return static_cast<const T *>(this)->getST();
|
||||
@ -603,9 +625,10 @@ public:
|
||||
return getPermuteShuffleOverhead(Tp);
|
||||
case TTI::SK_ExtractSubvector:
|
||||
return getExtractSubvectorOverhead(Tp, Index, SubTp);
|
||||
default:
|
||||
return 1;
|
||||
case TTI::SK_InsertSubvector:
|
||||
return getInsertSubvectorOverhead(Tp, Index, SubTp);
|
||||
}
|
||||
llvm_unreachable("Unknown TTI::ShuffleKind");
|
||||
}
|
||||
|
||||
unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
|
||||
|
@ -112,22 +112,22 @@ define <8 x float> @test4(<8 x i32> %trigger, <8 x float>* %addr, <8 x float> %d
|
||||
define void @test5(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %val) {
|
||||
; AVX2-LABEL: 'test5'
|
||||
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
|
||||
; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
|
||||
; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
|
||||
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||
;
|
||||
; SKL-LABEL: 'test5'
|
||||
; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
|
||||
; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
|
||||
; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
|
||||
; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||
;
|
||||
; KNL-LABEL: 'test5'
|
||||
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
|
||||
; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
|
||||
; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
|
||||
; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||
;
|
||||
; SKX-LABEL: 'test5'
|
||||
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
|
||||
; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
|
||||
; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
|
||||
; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
|
||||
;
|
||||
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
|
||||
@ -164,22 +164,22 @@ define void @test6(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %val) {
|
||||
define <2 x float> @test7(<2 x i32> %trigger, <2 x float>* %addr, <2 x float> %dst) {
|
||||
; AVX2-LABEL: 'test7'
|
||||
; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
|
||||
; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
|
||||
; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
|
||||
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
|
||||
;
|
||||
; SKL-LABEL: 'test7'
|
||||
; SKL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
|
||||
; SKL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
|
||||
; SKL-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
|
||||
; SKL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
|
||||
;
|
||||
; KNL-LABEL: 'test7'
|
||||
; KNL-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
|
||||
; KNL-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
|
||||
; KNL-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
|
||||
; KNL-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
|
||||
;
|
||||
; SKX-LABEL: 'test7'
|
||||
; SKX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %mask = icmp eq <2 x i32> %trigger, zeroinitializer
|
||||
; SKX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
|
||||
; SKX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %res = call <2 x float> @llvm.masked.load.v2f32.p0v2f32(<2 x float>* %addr, i32 4, <2 x i1> %mask, <2 x float> %dst)
|
||||
; SKX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x float> %res
|
||||
;
|
||||
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
|
||||
|
@ -95,65 +95,68 @@ define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger
|
||||
; FVW2-NEXT: entry:
|
||||
; FVW2-NEXT: br label [[VECTOR_BODY:%.*]]
|
||||
; FVW2: vector.body:
|
||||
; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
||||
; FVW2-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
|
||||
; FVW2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]]
|
||||
; FVW2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <2 x i32>*
|
||||
; FVW2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
|
||||
; FVW2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2
|
||||
; FVW2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
|
||||
; FVW2-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
|
||||
; FVW2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
|
||||
; FVW2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
|
||||
; FVW2-NEXT: [[WIDE_LOAD11:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4
|
||||
; FVW2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6
|
||||
; FVW2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
|
||||
; FVW2-NEXT: [[WIDE_LOAD12:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4
|
||||
; FVW2-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
|
||||
; FVW2-NEXT: [[TMP9:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD10]], zeroinitializer
|
||||
; FVW2-NEXT: [[TMP10:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD11]], zeroinitializer
|
||||
; FVW2-NEXT: [[TMP11:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD12]], zeroinitializer
|
||||
; FVW2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]]
|
||||
; FVW2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
|
||||
; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP13]], i32 4, <2 x i1> [[TMP8]], <2 x i32> undef)
|
||||
; FVW2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 2
|
||||
; FVW2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
|
||||
; FVW2-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP15]], i32 4, <2 x i1> [[TMP9]], <2 x i32> undef)
|
||||
; FVW2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 4
|
||||
; FVW2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
|
||||
; FVW2-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP17]], i32 4, <2 x i1> [[TMP10]], <2 x i32> undef)
|
||||
; FVW2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 6
|
||||
; FVW2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>*
|
||||
; FVW2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP19]], i32 4, <2 x i1> [[TMP11]], <2 x i32> undef)
|
||||
; FVW2-NEXT: [[TMP20:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64>
|
||||
; FVW2-NEXT: [[TMP21:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD13]] to <2 x i64>
|
||||
; FVW2-NEXT: [[TMP22:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD14]] to <2 x i64>
|
||||
; FVW2-NEXT: [[TMP23:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD15]] to <2 x i64>
|
||||
; FVW2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <2 x i64> [[TMP20]]
|
||||
; FVW2-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP21]]
|
||||
; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP22]]
|
||||
; FVW2-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP23]]
|
||||
; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP24]], i32 4, <2 x i1> [[TMP8]], <2 x float> undef)
|
||||
; FVW2-NEXT: [[WIDE_MASKED_GATHER16:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP25]], i32 4, <2 x i1> [[TMP9]], <2 x float> undef)
|
||||
; FVW2-NEXT: [[WIDE_MASKED_GATHER17:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP26]], i32 4, <2 x i1> [[TMP10]], <2 x float> undef)
|
||||
; FVW2-NEXT: [[WIDE_MASKED_GATHER18:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP27]], i32 4, <2 x i1> [[TMP11]], <2 x float> undef)
|
||||
; FVW2-NEXT: [[TMP28:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
|
||||
; FVW2-NEXT: [[TMP29:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER16]], <float 5.000000e-01, float 5.000000e-01>
|
||||
; FVW2-NEXT: [[TMP30:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER17]], <float 5.000000e-01, float 5.000000e-01>
|
||||
; FVW2-NEXT: [[TMP31:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER18]], <float 5.000000e-01, float 5.000000e-01>
|
||||
; FVW2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]]
|
||||
; FVW2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <2 x float>*
|
||||
; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP28]], <2 x float>* [[TMP33]], i32 4, <2 x i1> [[TMP8]])
|
||||
; FVW2-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 2
|
||||
; FVW2-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <2 x float>*
|
||||
; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP29]], <2 x float>* [[TMP35]], i32 4, <2 x i1> [[TMP9]])
|
||||
; FVW2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 4
|
||||
; FVW2-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <2 x float>*
|
||||
; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP30]], <2 x float>* [[TMP37]], i32 4, <2 x i1> [[TMP10]])
|
||||
; FVW2-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP32]], i64 6
|
||||
; FVW2-NEXT: [[TMP2:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], zeroinitializer
|
||||
; FVW2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]]
|
||||
; FVW2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
|
||||
; FVW2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP4]], i32 4, <2 x i1> [[TMP2]], <2 x i32> undef)
|
||||
; FVW2-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD]] to <2 x i64>
|
||||
; FVW2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <2 x i64> [[TMP5]]
|
||||
; FVW2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP6]], i32 4, <2 x i1> [[TMP2]], <2 x float> undef)
|
||||
; FVW2-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER]], <float 5.000000e-01, float 5.000000e-01>
|
||||
; FVW2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]]
|
||||
; FVW2-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <2 x float>*
|
||||
; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP7]], <2 x float>* [[TMP9]], i32 4, <2 x i1> [[TMP2]])
|
||||
; FVW2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX6]], 2
|
||||
; FVW2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
|
||||
; FVW2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <2 x i32>*
|
||||
; FVW2-NEXT: [[WIDE_LOAD_1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP11]], align 4
|
||||
; FVW2-NEXT: [[TMP12:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_1]], zeroinitializer
|
||||
; FVW2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT]]
|
||||
; FVW2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <2 x i32>*
|
||||
; FVW2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP14]], i32 4, <2 x i1> [[TMP12]], <2 x i32> undef)
|
||||
; FVW2-NEXT: [[TMP15:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_1]] to <2 x i64>
|
||||
; FVW2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP15]]
|
||||
; FVW2-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP16]], i32 4, <2 x i1> [[TMP12]], <2 x float> undef)
|
||||
; FVW2-NEXT: [[TMP17:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_1]], <float 5.000000e-01, float 5.000000e-01>
|
||||
; FVW2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT]]
|
||||
; FVW2-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <2 x float>*
|
||||
; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP17]], <2 x float>* [[TMP19]], i32 4, <2 x i1> [[TMP12]])
|
||||
; FVW2-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX6]], 4
|
||||
; FVW2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_1]]
|
||||
; FVW2-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
|
||||
; FVW2-NEXT: [[WIDE_LOAD_2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP21]], align 4
|
||||
; FVW2-NEXT: [[TMP22:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_2]], zeroinitializer
|
||||
; FVW2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_1]]
|
||||
; FVW2-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <2 x i32>*
|
||||
; FVW2-NEXT: [[WIDE_MASKED_LOAD_2:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP24]], i32 4, <2 x i1> [[TMP22]], <2 x i32> undef)
|
||||
; FVW2-NEXT: [[TMP25:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_2]] to <2 x i64>
|
||||
; FVW2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP25]]
|
||||
; FVW2-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP26]], i32 4, <2 x i1> [[TMP22]], <2 x float> undef)
|
||||
; FVW2-NEXT: [[TMP27:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_2]], <float 5.000000e-01, float 5.000000e-01>
|
||||
; FVW2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_1]]
|
||||
; FVW2-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <2 x float>*
|
||||
; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP27]], <2 x float>* [[TMP29]], i32 4, <2 x i1> [[TMP22]])
|
||||
; FVW2-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX6]], 6
|
||||
; FVW2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_2]]
|
||||
; FVW2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <2 x i32>*
|
||||
; FVW2-NEXT: [[WIDE_LOAD_3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP31]], align 4
|
||||
; FVW2-NEXT: [[TMP32:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD_3]], zeroinitializer
|
||||
; FVW2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_2]]
|
||||
; FVW2-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <2 x i32>*
|
||||
; FVW2-NEXT: [[WIDE_MASKED_LOAD_3:%.*]] = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* [[TMP34]], i32 4, <2 x i1> [[TMP32]], <2 x i32> undef)
|
||||
; FVW2-NEXT: [[TMP35:%.*]] = sext <2 x i32> [[WIDE_MASKED_LOAD_3]] to <2 x i64>
|
||||
; FVW2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[IN]], <2 x i64> [[TMP35]]
|
||||
; FVW2-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> [[TMP36]], i32 4, <2 x i1> [[TMP32]], <2 x float> undef)
|
||||
; FVW2-NEXT: [[TMP37:%.*]] = fadd <2 x float> [[WIDE_MASKED_GATHER_3]], <float 5.000000e-01, float 5.000000e-01>
|
||||
; FVW2-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_2]]
|
||||
; FVW2-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <2 x float>*
|
||||
; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP31]], <2 x float>* [[TMP39]], i32 4, <2 x i1> [[TMP11]])
|
||||
; FVW2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX6]], 8
|
||||
; FVW2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
|
||||
; FVW2-NEXT: call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> [[TMP37]], <2 x float>* [[TMP39]], i32 4, <2 x i1> [[TMP32]])
|
||||
; FVW2-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX6]], 8
|
||||
; FVW2-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 4096
|
||||
; FVW2-NEXT: br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
|
||||
; FVW2: for.end:
|
||||
; FVW2-NEXT: ret void
|
||||
|
Loading…
Reference in New Issue
Block a user