mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-28 22:00:30 +00:00
[ARM][MVE] Tail-predication: remove the BTC + 1 overflow checks
This adapts tail-predication to the new semantics of get.active.lane.mask as defined in D86147. This means that: - we can remove the BTC + 1 overflow checks because now the loop tripcount is passed in to the intrinsic, - we can immediately use that value to setup a counter for the number of elements processed by the loop and don't need to materialize BTC + 1. Differential Revision: https://reviews.llvm.org/D86303
This commit is contained in:
parent
2a33728d72
commit
bd571cfbf0
@ -33,8 +33,8 @@
|
||||
/// This pass:
|
||||
/// 1) Checks if the predicates of the masked load/store instructions are
|
||||
/// generated by intrinsic @llvm.get.active.lanes(). This intrinsic consumes
|
||||
/// the Backedge Taken Count (BTC) of the scalar loop as its second argument,
|
||||
/// which we extract to set up the number of elements processed by the loop.
|
||||
/// the scalar loop tripcount as its second argument, which we extract
|
||||
/// to set up the number of elements processed by the loop.
|
||||
/// 2) Intrinsic @llvm.get.active.lanes() is then replaced by the MVE target
|
||||
/// specific VCTP intrinsic to represent the effect of tail predication.
|
||||
/// This will be picked up by the ARM Low-overhead loop pass, which performs
|
||||
@ -352,14 +352,14 @@ static void Cleanup(SetVector<Instruction*> &MaybeDead, Loop *L) {
|
||||
|
||||
// The active lane intrinsic has this form:
|
||||
//
|
||||
// @llvm.get.active.lane.mask(IV, BTC)
|
||||
// @llvm.get.active.lane.mask(IV, TC)
|
||||
//
|
||||
// Here we perform checks that this intrinsic behaves as expected,
|
||||
// which means:
|
||||
//
|
||||
// 1) The element count, which is calculated with BTC + 1, cannot overflow.
|
||||
// 2) The element count needs to be sufficiently large that the decrement of
|
||||
// element counter doesn't overflow, which means that we need to prove:
|
||||
// 1) Check that the TripCount (TC) belongs to this loop (originally).
|
||||
// 2) The element count (TC) needs to be sufficiently large that the decrement
|
||||
// of element counter doesn't overflow, which means that we need to prove:
|
||||
// ceil(ElementCount / VectorWidth) >= TripCount
|
||||
// by rounding up ElementCount up:
|
||||
// ((ElementCount + (VectorWidth - 1)) / VectorWidth
|
||||
@ -373,29 +373,10 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
|
||||
EnableTailPredication == TailPredication::ForceEnabledNoReductions ||
|
||||
EnableTailPredication == TailPredication::ForceEnabled;
|
||||
|
||||
// 1) Test whether entry to the loop is protected by a conditional
|
||||
// BTC + 1 < 0. In other words, if the scalar trip count overflows,
|
||||
// becomes negative, we shouldn't enter the loop and creating
|
||||
// tripcount expression BTC + 1 is not safe. So, check that BTC
|
||||
// isn't max. This is evaluated in unsigned, because the semantics
|
||||
// of @get.active.lane.mask is a ULE comparison.
|
||||
auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
|
||||
auto *BTC = SE->getSCEV(BackedgeTakenCount);
|
||||
auto *MaxBTC = SE->getConstantMaxBackedgeTakenCount(L);
|
||||
|
||||
if (isa<SCEVCouldNotCompute>(MaxBTC)) {
|
||||
LLVM_DEBUG(dbgs() << "ARM TP: Can't compute SCEV BTC expression: ";
|
||||
BTC->dump());
|
||||
return false;
|
||||
}
|
||||
|
||||
APInt MaxInt = APInt(BTC->getType()->getScalarSizeInBits(), ~0);
|
||||
if (cast<SCEVConstant>(MaxBTC)->getAPInt().eq(MaxInt) &&
|
||||
!ForceTailPredication) {
|
||||
LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible, BTC can be int max: ";
|
||||
BTC->dump());
|
||||
return false;
|
||||
}
|
||||
// 1) TODO: Check that the TripCount (TC) belongs to this loop (originally).
|
||||
// The scalar tripcount corresponds to the number of elements processed by the
|
||||
// loop, so we will refer to that from this point on.
|
||||
auto *ElemCountVal = ActiveLaneMask->getOperand(1);
|
||||
|
||||
// 2) Prove that the sub expression is non-negative, i.e. it doesn't overflow:
|
||||
//
|
||||
@ -415,8 +396,10 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
|
||||
auto *TC = SE->getSCEV(TripCount);
|
||||
unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits();
|
||||
int VectorWidth = VecTy->getNumElements();
|
||||
auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
|
||||
auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
|
||||
uint64_t MaxMinusVW = Diff.getZExtValue();
|
||||
// FIXME: since ranges can be negative we work with signed ranges here, but
|
||||
// we shouldn't extract the zext'ed values for them.
|
||||
uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue();
|
||||
|
||||
if (UpperboundTC > MaxMinusVW && !ForceTailPredication) {
|
||||
@ -434,7 +417,7 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
|
||||
//
|
||||
// where Ceil = (ElementCount + (VW-1)) / VW. If Ceil and TC are runtime
|
||||
// values (and not constants), we have to compensate for the lowerbound value
|
||||
// range to be off by 1. The reason is that BTC lives in the preheader in
|
||||
// range to be off by 1. The reason is that the TC lives in the preheader in
|
||||
// this form:
|
||||
//
|
||||
// %trip.count.minus = add nsw nuw i32 %N, -1
|
||||
@ -449,9 +432,7 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
|
||||
// 1. Thus, if the ranges of Ceil and TC are not a single constant but a set,
|
||||
// we first add 0 to TC such that we can do the <= comparison on both sets.
|
||||
//
|
||||
auto *One = SE->getOne(TripCount->getType());
|
||||
// ElementCount = BTC + 1
|
||||
auto *ElementCount = SE->getAddExpr(BTC, One);
|
||||
auto *ElementCount = SE->getSCEV(ElemCountVal);
|
||||
// Tmp = ElementCount + (VW-1)
|
||||
auto *ECPlusVWMinus1 = SE->getAddExpr(ElementCount,
|
||||
SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1)));
|
||||
@ -504,38 +485,6 @@ bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
|
||||
return false;
|
||||
}
|
||||
|
||||
// Materialize NumElements in the preheader block.
|
||||
static Value *getNumElements(BasicBlock *Preheader, Value *BTC) {
|
||||
// First, check the preheader if it not already exist:
|
||||
//
|
||||
// preheader:
|
||||
// %BTC = add i32 %N, -1
|
||||
// ..
|
||||
// vector.body:
|
||||
//
|
||||
// if %BTC already exists. We don't need to emit %NumElems = %BTC + 1,
|
||||
// but instead can just return %N.
|
||||
for (auto &I : *Preheader) {
|
||||
if (I.getOpcode() != Instruction::Add || &I != BTC)
|
||||
continue;
|
||||
ConstantInt *MinusOne = nullptr;
|
||||
if (!(MinusOne = dyn_cast<ConstantInt>(I.getOperand(1))))
|
||||
continue;
|
||||
if (MinusOne->getSExtValue() == -1) {
|
||||
LLVM_DEBUG(dbgs() << "ARM TP: Found num elems: " << I << "\n");
|
||||
return I.getOperand(0);
|
||||
}
|
||||
}
|
||||
|
||||
// But we do need to materialise BTC if it is not already there,
|
||||
// e.g. if it is a constant.
|
||||
IRBuilder<> Builder(Preheader->getTerminator());
|
||||
Value *NumElements = Builder.CreateAdd(BTC,
|
||||
ConstantInt::get(BTC->getType(), 1), "num.elements");
|
||||
LLVM_DEBUG(dbgs() << "ARM TP: Created num elems: " << *NumElements << "\n");
|
||||
return NumElements;
|
||||
}
|
||||
|
||||
void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
|
||||
Value *TripCount, FixedVectorType *VecTy) {
|
||||
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
|
||||
@ -543,23 +492,15 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
|
||||
Type *Ty = IntegerType::get(M->getContext(), 32);
|
||||
unsigned VectorWidth = VecTy->getNumElements();
|
||||
|
||||
// The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand,
|
||||
// is one less than the trip count. So we need to find or create
|
||||
// %num.elements = %BTC + 1 in the preheader.
|
||||
Value *BTC = ActiveLaneMask->getOperand(1);
|
||||
Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
|
||||
Value *NumElements = getNumElements(L->getLoopPreheader(), BTC);
|
||||
|
||||
// Insert a phi to count the number of elements processed by the loop.
|
||||
Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI() );
|
||||
PHINode *Processed = Builder.CreatePHI(Ty, 2);
|
||||
Processed->addIncoming(NumElements, L->getLoopPreheader());
|
||||
Processed->addIncoming(ActiveLaneMask->getOperand(1), L->getLoopPreheader());
|
||||
|
||||
// Replace @llvm.get.active.mask() with the ARM specific VCTP intrinic, and thus
|
||||
// represent the effect of tail predication.
|
||||
// Replace @llvm.get.active.lane.mask() with the ARM specific VCTP intrinsic, and
|
||||
// thus represent the effect of tail predication.
|
||||
Builder.SetInsertPoint(ActiveLaneMask);
|
||||
ConstantInt *Factor =
|
||||
ConstantInt::get(cast<IntegerType>(Ty), VectorWidth);
|
||||
ConstantInt *Factor = ConstantInt::get(cast<IntegerType>(Ty), VectorWidth);
|
||||
|
||||
Intrinsic::ID VCTPID;
|
||||
switch (VectorWidth) {
|
||||
|
@ -37,7 +37,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%tmp = getelementptr inbounds i8, i8* %a, i32 %index
|
||||
|
||||
; %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%tmp2 = bitcast i8* %tmp to <16 x i8>*
|
||||
%wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
|
||||
@ -94,7 +94,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%tmp = getelementptr inbounds i16, i16* %a, i32 %index
|
||||
|
||||
; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%tmp2 = bitcast i16* %tmp to <8 x i16>*
|
||||
%wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
|
||||
@ -150,7 +150,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
|
||||
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%tmp2 = bitcast i32* %tmp to <4 x i32>*
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
%wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
|
||||
%tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
|
||||
%tmp4 = bitcast i32* %tmp3 to <4 x i32>*
|
||||
@ -204,7 +204,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
|
||||
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
|
||||
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
%tmp2 = bitcast i32* %tmp to <4 x i32>*
|
||||
%wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
|
||||
%extract.1.low = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, < 2 x i32> < i32 0, i32 2>
|
||||
@ -264,7 +264,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
|
||||
|
||||
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
|
||||
%tmp2 = bitcast i32* %tmp to <4 x i32>*
|
||||
@ -323,7 +323,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
|
||||
|
||||
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
|
||||
%tmp2 = bitcast i32* %tmp to <4 x i32>*
|
||||
@ -352,10 +352,10 @@ for.cond.cleanup: ; preds = %vector.body, %entry
|
||||
;
|
||||
; CHECK-LABEL: interleave4
|
||||
; CHECK: vector.body:
|
||||
; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1)
|
||||
; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1)
|
||||
; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1)
|
||||
; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
|
||||
; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
|
||||
; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)
|
||||
;
|
||||
define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
|
||||
entry:
|
||||
@ -386,13 +386,13 @@ vector.body:
|
||||
%lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
|
||||
%lsr.iv3133 = bitcast i32* %lsr.iv31 to <4 x i32>*
|
||||
%lsr.iv26 = bitcast i32* %lsr.iv to <4 x i32>*
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
%v7 = add i32 %index, 4
|
||||
%active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1)
|
||||
%active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
|
||||
%v8 = add i32 %v7, 4
|
||||
%active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1)
|
||||
%active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
|
||||
%v9 = add i32 %v8, 4
|
||||
%active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1)
|
||||
%active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)
|
||||
%scevgep42 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -2
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep42, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
|
||||
%scevgep43 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -1
|
||||
|
@ -23,13 +23,12 @@ define hidden i32 @_Z4loopPiPjiS0_i(i32* noalias nocapture readonly %s1, i32* no
|
||||
; CHECK-NEXT: [[BROADCAST_SPLATINSERT71:%.*]] = insertelement <4 x i32> undef, i32 [[X]], i32 0
|
||||
; CHECK-NEXT: [[BROADCAST_SPLAT72:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT71]], <4 x i32> undef, <4 x i32> zeroinitializer
|
||||
; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP3]])
|
||||
; CHECK-NEXT: [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_183]], 1
|
||||
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
||||
; CHECK: vector.body:
|
||||
; CHECK-NEXT: [[LSR_IV9:%.*]] = phi i32* [ [[SCEVGEP10:%.*]], [[VECTOR_BODY]] ], [ [[D:%.*]], [[VECTOR_PH]] ]
|
||||
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[N]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[LSR_IV911:%.*]] = bitcast i32* [[LSR_IV9]] to <4 x i32>*
|
||||
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
|
||||
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
|
||||
@ -108,7 +107,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
|
||||
%5 = insertelement <4 x i32> undef, i32 %trip.count.minus.183, i32 0
|
||||
%6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer
|
||||
%7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.183)
|
||||
%7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %broadcast.splat72, <4 x i32>* %lsr.iv911, i32 4, <4 x i1> %7)
|
||||
%index.next = add i32 %index, 4
|
||||
%scevgep10 = getelementptr i32, i32* %lsr.iv9, i32 4
|
||||
|
@ -64,7 +64,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
|
||||
|
||||
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
|
||||
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%tmp2 = bitcast i32* %tmp to <4 x i32>*
|
||||
%wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
|
||||
@ -166,7 +166,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
|
||||
|
||||
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
|
||||
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%tmp2 = bitcast i32* %tmp to <4 x i32>*
|
||||
%wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
|
||||
@ -268,7 +268,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
|
||||
|
||||
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
|
||||
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%tmp2 = bitcast i32* %tmp to <4 x i32>*
|
||||
%wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
|
||||
@ -367,7 +367,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
|
||||
|
||||
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
|
||||
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%tmp2 = bitcast i32* %tmp to <4 x i32>*
|
||||
%wide.masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
|
||||
@ -441,7 +441,7 @@ bb9: ; preds = %bb9, %bb3
|
||||
%tmp14 = getelementptr inbounds i32, i32* %arg1, i32 %tmp10
|
||||
|
||||
; %tmp15 = icmp ule <4 x i32> %tmp13, %tmp8
|
||||
%tmp15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp10, i32 %tmp6)
|
||||
%tmp15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp10, i32 %arg2)
|
||||
|
||||
%tmp16 = bitcast i32* %tmp14 to <4 x i32>*
|
||||
%tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp16, i32 4, <4 x i1> %tmp15, <4 x i32> undef)
|
||||
@ -505,7 +505,7 @@ bb12: ; preds = %bb12, %bb4
|
||||
%tmp17 = getelementptr inbounds i32, i32* %arg, i32 %tmp13
|
||||
|
||||
; %tmp18 = icmp ule <4 x i32> %tmp16, %tmp9
|
||||
%tmp18= call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp13, i32 %tmp7)
|
||||
%tmp18= call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %tmp13, i32 %arg3)
|
||||
|
||||
%tmp19 = bitcast i32* %tmp17 to <4 x i32>*
|
||||
%tmp20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp19, i32 4, <4 x i1> %tmp18, <4 x i32> undef)
|
||||
|
@ -40,7 +40,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i8, i8* %b, i32 %index
|
||||
|
||||
; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11
|
||||
%1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i8* %0 to <8 x i8>*
|
||||
%wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef)
|
||||
@ -99,7 +99,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i8, i8* %b, i32 %index
|
||||
|
||||
; %1 = icmp ule <8 x i32> %induction, %broadcast.splat11
|
||||
%1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i8* %0 to <8 x i8>*
|
||||
%wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef)
|
||||
@ -158,7 +158,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i16, i16* %b, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i16* %0 to <4 x i16>*
|
||||
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
|
||||
@ -217,7 +217,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i16, i16* %b, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i16* %0 to <4 x i16>*
|
||||
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
|
||||
|
@ -139,7 +139,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%2 = getelementptr inbounds float, float* %b, i32 %index
|
||||
|
||||
; %3 = icmp ule <4 x i32> %induction, %broadcast.splat22
|
||||
%3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%4 = bitcast float* %2 to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %3, <4 x float> undef)
|
||||
@ -280,7 +280,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds float, float* %b, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast float* %0 to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
|
||||
|
@ -55,7 +55,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i8, i8* %b, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i8* %0 to <4 x i8>*
|
||||
%wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef)
|
||||
@ -130,7 +130,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i16, i16* %b, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i16* %0 to <4 x i16>*
|
||||
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
|
||||
@ -205,7 +205,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i8, i8* %b, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i8* %0 to <4 x i8>*
|
||||
%wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef)
|
||||
@ -280,7 +280,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i16, i16* %b, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i16* %0 to <4 x i16>*
|
||||
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
|
||||
@ -354,7 +354,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i32, i32* %b, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i32* %0 to <4 x i32>*
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
|
||||
@ -514,7 +514,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%2 = getelementptr inbounds i8, i8* %a, i32 %index
|
||||
|
||||
; %3 = icmp ule <4 x i32> %induction, %broadcast.splat20
|
||||
%3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%4 = bitcast i8* %2 to <4 x i8>*
|
||||
%wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef)
|
||||
@ -653,7 +653,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i16, i16* %a, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat13
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i16* %0 to <4 x i16>*
|
||||
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
|
||||
@ -815,7 +815,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%2 = getelementptr inbounds i8, i8* %a, i32 %index
|
||||
|
||||
; %3 = icmp ule <4 x i32> %induction, %broadcast.splat20
|
||||
%3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%4 = bitcast i8* %2 to <4 x i8>*
|
||||
%wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef)
|
||||
@ -954,7 +954,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i16, i16* %a, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat13
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i16* %0 to <4 x i16>*
|
||||
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
|
||||
@ -1115,7 +1115,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%2 = getelementptr inbounds i32, i32* %a, i32 %index
|
||||
|
||||
; %3 = icmp ule <4 x i32> %induction, %broadcast.splat22
|
||||
%3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%3 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%4 = bitcast i32* %2 to <4 x i32>*
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %3, <4 x i32> undef)
|
||||
@ -1238,7 +1238,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i8, i8* %b, i32 %index
|
||||
|
||||
; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13
|
||||
%1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i8* %0 to <8 x i8>*
|
||||
%wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %2, i32 1, <8 x i1> %1, <8 x i8> undef)
|
||||
|
@ -24,13 +24,12 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon
|
||||
; CHECK-NEXT: [[ARRAYIDX8_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX8_US]], align 4
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX8_PROMOTED_US]], i32 0
|
||||
; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]])
|
||||
; CHECK-NEXT: [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_1]], 1
|
||||
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
||||
; CHECK: vector.body:
|
||||
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
|
||||
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
|
||||
@ -95,7 +94,7 @@ vector.body: ; preds = %vector.body, %for.c
|
||||
%tmp6 = getelementptr inbounds i16, i16* %tmp3, i32 %index
|
||||
|
||||
; %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat29
|
||||
%tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%tmp8 = bitcast i16* %tmp6 to <4 x i16>*
|
||||
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %tmp8, i32 2, <4 x i1> %tmp7, <4 x i16> undef)
|
||||
@ -146,13 +145,12 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B
|
||||
; CHECK-NEXT: [[ARRAYIDX7_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX7_US]], align 4
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX7_PROMOTED_US]], i32 0
|
||||
; CHECK-NEXT: call void @llvm.set.loop.iterations.i32(i32 [[TMP2]])
|
||||
; CHECK-NEXT: [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_1]], 1
|
||||
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
|
||||
; CHECK: vector.body:
|
||||
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
|
||||
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
|
||||
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
|
||||
@ -215,7 +213,7 @@ vector.body: ; preds = %vector.body, %for.c
|
||||
%tmp6 = getelementptr inbounds i32, i32* %tmp3, i32 %index
|
||||
|
||||
; %tmp7 = icmp ule <4 x i32> %induction, %broadcast.splat28
|
||||
%tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%tmp7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%tmp8 = bitcast i32* %tmp6 to <4 x i32>*
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp8, i32 4, <4 x i1> %tmp7, <4 x i32> undef)
|
||||
|
@ -49,7 +49,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %i5, %vector.body ]
|
||||
%i = getelementptr inbounds i8, i8* %a, i32 %index
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
|
||||
%i1 = bitcast i8* %i to <16 x i8>*
|
||||
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %i1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
|
||||
%i2 = getelementptr inbounds i8, i8* %b, i32 %index
|
||||
@ -119,7 +119,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ]
|
||||
%i = getelementptr inbounds i8, i8* %a, i32 %index
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
|
||||
%i1 = bitcast i8* %i to <8 x i8>*
|
||||
%wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
|
||||
%i2 = zext <8 x i8> %wide.masked.load to <8 x i16>
|
||||
@ -180,7 +180,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %i5, %vector.body ]
|
||||
%i = getelementptr inbounds i8, i8* %a, i32 %index
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
|
||||
%i1 = bitcast i8* %i to <16 x i8>*
|
||||
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %i1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
|
||||
%i2 = getelementptr inbounds i8, i8* %b, i32 %index
|
||||
@ -239,7 +239,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ]
|
||||
%i = getelementptr inbounds i8, i8* %a, i32 %index
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
|
||||
%i1 = bitcast i8* %i to <8 x i8>*
|
||||
%wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
|
||||
%i2 = zext <8 x i8> %wide.masked.load to <8 x i16>
|
||||
@ -300,7 +300,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %i5, %vector.body ]
|
||||
%i = getelementptr inbounds i8, i8* %a, i32 %index
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
|
||||
%i1 = bitcast i8* %i to <16 x i8>*
|
||||
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %i1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
|
||||
%i2 = getelementptr inbounds i8, i8* %b, i32 %index
|
||||
@ -359,7 +359,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ]
|
||||
%i = getelementptr inbounds i8, i8* %a, i32 %index
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
|
||||
%i1 = bitcast i8* %i to <8 x i8>*
|
||||
%wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
|
||||
%i2 = zext <8 x i8> %wide.masked.load to <8 x i16>
|
||||
@ -450,7 +450,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ]
|
||||
%i = getelementptr inbounds i8, i8* %a, i32 %index
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
%i1 = bitcast i8* %i to <4 x i8>*
|
||||
%wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %i1, i32 1, <4 x i1> %active.lane.mask, <4 x i8> undef)
|
||||
%i2 = zext <4 x i8> %wide.masked.load to <4 x i32>
|
||||
@ -480,7 +480,7 @@ vector.body46: ; preds = %vector.body46, %vec
|
||||
%index51 = phi i32 [ 0, %vector.ph47 ], [ %index.next52, %vector.body46 ]
|
||||
%vec.phi60 = phi <4 x i32> [ %i11, %vector.ph47 ], [ %i19, %vector.body46 ]
|
||||
%i12 = getelementptr inbounds i8, i8* %a, i32 %index51
|
||||
%active.lane.mask61 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index51, i32 %trip.count.minus.154)
|
||||
%active.lane.mask61 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index51, i32 %N)
|
||||
%i13 = bitcast i8* %i12 to <4 x i8>*
|
||||
%wide.masked.load62 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %i13, i32 1, <4 x i1> %active.lane.mask61, <4 x i8> undef)
|
||||
%i14 = zext <4 x i8> %wide.masked.load62 to <4 x i32>
|
||||
@ -564,7 +564,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i8, %vector.body ]
|
||||
%vec.phi.1 = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i9, %vector.body ]
|
||||
%i = getelementptr inbounds i8, i8* %a, i32 %index
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
|
||||
%i1 = bitcast i8* %i to <8 x i8>*
|
||||
%wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
|
||||
%i2 = zext <8 x i8> %wide.masked.load to <8 x i16>
|
||||
|
@ -53,7 +53,7 @@ vector.body:
|
||||
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
|
||||
|
||||
; %1 = icmp ult <4 x i32> %induction, <i32 32002, i32 32002, i32 32002, i32 32002>
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
|
||||
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
|
||||
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
|
||||
@ -388,7 +388,7 @@ vector.body:
|
||||
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
|
||||
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
|
||||
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
|
||||
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
|
||||
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
|
||||
@ -432,7 +432,7 @@ vector.body:
|
||||
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
|
||||
|
||||
; The induction variable %D is not an IV:
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32002)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32003)
|
||||
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
|
||||
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
|
||||
@ -474,7 +474,7 @@ vector.body:
|
||||
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
|
||||
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
|
||||
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
|
||||
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
|
||||
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
|
||||
@ -519,7 +519,7 @@ vector.body:
|
||||
%broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
|
||||
%broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
|
||||
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
|
||||
%wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
|
||||
%2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
|
||||
|
@ -34,7 +34,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%next.gep = getelementptr i16, i16* %pSrcA, i32 %index
|
||||
%next.gep20 = getelementptr i16, i16* %pDst, i32 %index
|
||||
%next.gep21 = getelementptr i16, i16* %pSrcB, i32 %index
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %blockSize)
|
||||
%0 = bitcast i16* %next.gep to <8 x i16>*
|
||||
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
|
||||
%1 = bitcast i16* %next.gep21 to <8 x i16>*
|
||||
@ -83,7 +83,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%next.gep = getelementptr i16, i16* %pSrcA, i32 %index
|
||||
%next.gep20 = getelementptr i16, i16* %pDst, i32 %index
|
||||
%next.gep21 = getelementptr i16, i16* %pSrcB, i32 %index
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %blockSize)
|
||||
%0 = bitcast i16* %next.gep to <8 x i16>*
|
||||
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
|
||||
%1 = bitcast i16* %next.gep21 to <8 x i16>*
|
||||
|
@ -32,7 +32,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%next.gep = getelementptr float, float* %pDst, i32 %index
|
||||
%next.gep13 = getelementptr float, float* %pSrcA, i32 %index
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %blockSize)
|
||||
%0 = bitcast float* %next.gep13 to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
|
||||
%1 = call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %wide.masked.load)
|
||||
|
@ -32,7 +32,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%next.gep = getelementptr float, float* %pSrcA, i32 %index
|
||||
%next.gep14 = getelementptr float, float* %pDst, i32 %index
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
%0 = bitcast float* %next.gep to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
|
||||
%1 = call fast <4 x float> @llvm.round.v4f32(<4 x float> %wide.masked.load)
|
||||
@ -77,7 +77,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%next.gep = getelementptr float, float* %pSrcA, i32 %index
|
||||
%next.gep14 = getelementptr float, float* %pDst, i32 %index
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
%0 = bitcast float* %next.gep to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
|
||||
%1 = call fast <4 x float> @llvm.rint.v4f32(<4 x float> %wide.masked.load)
|
||||
@ -122,7 +122,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%next.gep = getelementptr float, float* %pSrcA, i32 %index
|
||||
%next.gep14 = getelementptr float, float* %pDst, i32 %index
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
%0 = bitcast float* %next.gep to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
|
||||
%1 = call fast <4 x float> @llvm.trunc.v4f32(<4 x float> %wide.masked.load)
|
||||
@ -167,7 +167,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%next.gep = getelementptr float, float* %pSrcA, i32 %index
|
||||
%next.gep14 = getelementptr float, float* %pDst, i32 %index
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
%0 = bitcast float* %next.gep to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
|
||||
%1 = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.masked.load)
|
||||
@ -212,7 +212,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%next.gep = getelementptr float, float* %pSrcA, i32 %index
|
||||
%next.gep14 = getelementptr float, float* %pDst, i32 %index
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
%0 = bitcast float* %next.gep to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
|
||||
%1 = call fast <4 x float> @llvm.floor.v4f32(<4 x float> %wide.masked.load)
|
||||
@ -236,22 +236,21 @@ define arm_aapcs_vfpcc void @nearbyint(float* noalias nocapture readonly %pSrcA,
|
||||
; CHECK-NEXT: it eq
|
||||
; CHECK-NEXT: popeq {r7, pc}
|
||||
; CHECK-NEXT: adds r3, r2, #3
|
||||
; CHECK-NEXT: vdup.32 q1, r2
|
||||
; CHECK-NEXT: bic r3, r3, #3
|
||||
; CHECK-NEXT: sub.w r12, r3, #4
|
||||
; CHECK-NEXT: movs r3, #1
|
||||
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
|
||||
; CHECK-NEXT: adr r3, .LCPI5_0
|
||||
; CHECK-NEXT: sub.w r12, r2, #1
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r3]
|
||||
; CHECK-NEXT: movs r2, #0
|
||||
; CHECK-NEXT: vdup.32 q1, r12
|
||||
; CHECK-NEXT: mov.w r12, #0
|
||||
; CHECK-NEXT: dls lr, lr
|
||||
; CHECK-NEXT: .LBB5_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vadd.i32 q2, q0, r2
|
||||
; CHECK-NEXT: vdup.32 q3, r2
|
||||
; CHECK-NEXT: vadd.i32 q2, q0, r12
|
||||
; CHECK-NEXT: vdup.32 q3, r12
|
||||
; CHECK-NEXT: vcmp.u32 hi, q3, q2
|
||||
; CHECK-NEXT: adds r2, #4
|
||||
; CHECK-NEXT: add.w r12, r12, #4
|
||||
; CHECK-NEXT: vpnot
|
||||
; CHECK-NEXT: vpstt
|
||||
; CHECK-NEXT: vcmpt.u32 cs, q1, q2
|
||||
@ -286,7 +285,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%next.gep = getelementptr float, float* %pSrcA, i32 %index
|
||||
%next.gep14 = getelementptr float, float* %pDst, i32 %index
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
%0 = bitcast float* %next.gep to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
|
||||
%1 = call fast <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.masked.load)
|
||||
|
@ -9,6 +9,7 @@ define arm_aapcs_vfpcc void @usub_sat(i16* noalias nocapture readonly %pSrcA, i1
|
||||
; CHECK-NEXT: cmp r3, #0
|
||||
; CHECK-NEXT: it eq
|
||||
; CHECK-NEXT: popeq {r7, pc}
|
||||
; CHECK-NEXT: subs r3, #1
|
||||
; CHECK-NEXT: dlstp.16 lr, r3
|
||||
; CHECK-NEXT: .LBB0_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
@ -58,6 +59,7 @@ define arm_aapcs_vfpcc void @ssub_sat(i16* noalias nocapture readonly %pSrcA, i1
|
||||
; CHECK-NEXT: cmp r3, #0
|
||||
; CHECK-NEXT: it eq
|
||||
; CHECK-NEXT: popeq {r7, pc}
|
||||
; CHECK-NEXT: subs r3, #1
|
||||
; CHECK-NEXT: dlstp.16 lr, r3
|
||||
; CHECK-NEXT: .LBB1_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
|
@ -29,7 +29,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%tmp = getelementptr inbounds i16, i16* %a, i32 %index
|
||||
|
||||
; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
|
||||
%tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%tmp2 = bitcast i16* %tmp to <8 x i16>*
|
||||
%wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
|
||||
@ -89,7 +89,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%tmp = getelementptr inbounds i16, i16* %a, i32 %index
|
||||
|
||||
; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
|
||||
%tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%tmp1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%tmp2 = bitcast i16* %tmp to <8 x i16>*
|
||||
%wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
|
||||
@ -151,7 +151,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%tmp = getelementptr inbounds i32, i32* %a, i32 %index
|
||||
|
||||
; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%tmp1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%tmp2 = bitcast i32* %tmp to <4 x i32>*
|
||||
%wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
|
||||
|
@ -36,7 +36,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
|
||||
|
||||
; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
|
||||
%tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %tmp)
|
||||
%tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%tmp4 = bitcast i16* %tmp2 to <8 x i16>*
|
||||
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
|
||||
@ -107,7 +107,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
|
||||
|
||||
; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
|
||||
%tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %tmp)
|
||||
%tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%tmp4 = bitcast i16* %tmp2 to <8 x i16>*
|
||||
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
|
||||
@ -170,7 +170,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%tmp2 = getelementptr inbounds i16, i16* %A, i32 %index
|
||||
|
||||
; %tmp3 = icmp ule <8 x i32> %induction, %broadcast.splat2
|
||||
%tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %tmp)
|
||||
%tmp3 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%tmp4 = bitcast i16* %tmp2 to <8 x i16>*
|
||||
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp3, <8 x i16> undef)
|
||||
@ -238,7 +238,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%9 = phi i32 [ %7, %vector.ph ], [ %17, %vector.body ]
|
||||
%lsr.iv4850 = bitcast i16* %lsr.iv48 to <4 x i16>*
|
||||
%lsr.iv45 = bitcast i16* %lsr.iv to <4 x i16>*
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %8)
|
||||
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv45, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef)
|
||||
%10 = sext <4 x i16> %wide.masked.load to <4 x i32>
|
||||
%wide.masked.load42 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv4850, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef)
|
||||
|
@ -138,7 +138,6 @@ define dso_local void @varying_outer_2d_reduction(i16* nocapture readonly %Input
|
||||
; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17
|
||||
; NOREDUCTIONS-NEXT: add sp, #4
|
||||
; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
|
||||
;
|
||||
entry:
|
||||
%conv = sext i16 %N to i32
|
||||
%cmp36 = icmp sgt i16 %N, 0
|
||||
@ -178,7 +177,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%i9 = phi i32 [ %i7, %vector.ph ], [ %i17, %vector.body ]
|
||||
%lsr.iv4850 = bitcast i16* %lsr.iv48 to <4 x i16>*
|
||||
%lsr.iv45 = bitcast i16* %lsr.iv to <4 x i16>*
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i8)
|
||||
%wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv45, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef)
|
||||
%i10 = sext <4 x i16> %wide.masked.load to <4 x i32>
|
||||
%wide.masked.load42 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %lsr.iv4850, i32 2, <4 x i1> %active.lane.mask, <4 x i16> undef)
|
||||
|
@ -45,7 +45,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i32, i32* %a, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat12
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i32* %0 to <4 x i32>*
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
|
||||
@ -110,7 +110,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i32, i32* %a, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i32* %0 to <4 x i32>*
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
|
||||
@ -171,7 +171,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i32, i32* %a, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i32* %0 to <4 x i32>*
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
|
||||
@ -230,7 +230,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i32, i32* %b, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i32* %0 to <4 x i32>*
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
|
||||
@ -286,7 +286,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i32, i32* %b, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat9
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i32* %0 to <4 x i32>*
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
|
||||
@ -341,7 +341,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i8, i8* %b, i32 %index
|
||||
|
||||
; %1 = icmp ule <16 x i32> %induction, %broadcast.splat13
|
||||
%1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i8* %0 to <16 x i8>*
|
||||
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %2, i32 1, <16 x i1> %1, <16 x i8> undef)
|
||||
@ -400,7 +400,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds i16, i16* %b, i32 %index
|
||||
|
||||
; %1 = icmp ule <8 x i32> %induction, %broadcast.splat13
|
||||
%1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%2 = bitcast i16* %0 to <8 x i16>*
|
||||
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %2, i32 2, <8 x i1> %1, <8 x i16> undef)
|
||||
|
@ -49,7 +49,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
|
||||
|
||||
; %7 = icmp ule <4 x i32> %induction, %broadcast.splat12
|
||||
%7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%7 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
|
||||
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %7, <4 x i32> undef)
|
||||
%wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %7, <4 x i32> undef)
|
||||
|
@ -44,7 +44,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds float, float* %x, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
|
||||
%2 = bitcast float* %0 to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
|
||||
@ -106,7 +106,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds float, float* %x, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
|
||||
%2 = bitcast float* %0 to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
|
||||
@ -169,7 +169,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds float, float* %x, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
|
||||
%2 = bitcast float* %0 to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
|
||||
@ -231,7 +231,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds float, float* %x, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
|
||||
%2 = bitcast float* %0 to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
|
||||
@ -296,7 +296,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds float, float* %x, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
|
||||
%2 = bitcast float* %0 to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
|
||||
@ -361,7 +361,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds float, float* %x, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
|
||||
%2 = bitcast float* %0 to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
|
||||
@ -426,7 +426,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds float, float* %x, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
|
||||
%2 = bitcast float* %0 to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
|
||||
@ -491,7 +491,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds float, float* %x, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
|
||||
%2 = bitcast float* %0 to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
|
||||
@ -556,7 +556,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds float, float* %x, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
|
||||
%2 = bitcast float* %0 to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
|
||||
@ -619,7 +619,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds float, float* %x, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
|
||||
%2 = bitcast float* %0 to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
|
||||
@ -683,7 +683,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds float, float* %x, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
|
||||
%2 = bitcast float* %0 to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
|
||||
@ -747,7 +747,7 @@ vector.body: ; preds = %vector.body, %vecto
|
||||
%0 = getelementptr inbounds float, float* %x, i32 %index
|
||||
|
||||
; %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
|
||||
%2 = bitcast float* %0 to <4 x float>*
|
||||
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> undef)
|
||||
|
@ -4,16 +4,15 @@
|
||||
define dso_local void @mve_gather_qi_wb(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %n, i32 %m, i32 %l) {
|
||||
; CHECK-LABEL: mve_gather_qi_wb:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, lr}
|
||||
; CHECK-NEXT: push {r4, lr}
|
||||
; CHECK-NEXT: adr r4, .LCPI0_0
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: add.w r12, r0, r3, lsl #2
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r4]
|
||||
; CHECK-NEXT: adds r0, r3, #1
|
||||
; CHECK-NEXT: adr r0, .LCPI0_0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0]
|
||||
; CHECK-NEXT: vmov.i32 q2, #0x0
|
||||
; CHECK-NEXT: dlstp.32 lr, r3
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r1
|
||||
; CHECK-NEXT: adds r1, r3, #4
|
||||
; CHECK-NEXT: dlstp.32 lr, r0
|
||||
; CHECK-NEXT: .LBB0_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r12], #16
|
||||
@ -25,7 +24,7 @@ define dso_local void @mve_gather_qi_wb(i32* noalias nocapture readonly %A, i32*
|
||||
; CHECK-NEXT: vmov q0, q2
|
||||
; CHECK-NEXT: vaddv.u32 r0, q0
|
||||
; CHECK-NEXT: str.w r0, [r2, r1, lsl #2]
|
||||
; CHECK-NEXT: pop {r4, pc}
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
; CHECK-NEXT: .p2align 4
|
||||
; CHECK-NEXT: @ %bb.3:
|
||||
; CHECK-NEXT: .LCPI0_0:
|
||||
@ -74,18 +73,17 @@ define dso_local void @mve_gatherscatter_offset(i32* noalias nocapture readonly
|
||||
; CHECK-NEXT: push {r4, lr}
|
||||
; CHECK-NEXT: .vsave {d8, d9}
|
||||
; CHECK-NEXT: vpush {d8, d9}
|
||||
; CHECK-NEXT: adr r4, .LCPI1_0
|
||||
; CHECK-NEXT: add.w r12, r0, r3, lsl #2
|
||||
; CHECK-NEXT: adds r0, r3, #1
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r4]
|
||||
; CHECK-NEXT: adds r3, #4
|
||||
; CHECK-NEXT: add.w r4, r0, r3, lsl #2
|
||||
; CHECK-NEXT: adr r0, .LCPI1_0
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
||||
; CHECK-NEXT: add.w r12, r3, #4
|
||||
; CHECK-NEXT: vmov.i32 q3, #0x0
|
||||
; CHECK-NEXT: vmov.i32 q0, #0x14
|
||||
; CHECK-NEXT: dlstp.32 lr, r0
|
||||
; CHECK-NEXT: dlstp.32 lr, r3
|
||||
; CHECK-NEXT: .LBB1_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrw.u32 q2, [r1, q1, uxtw #2]
|
||||
; CHECK-NEXT: vldrw.u32 q4, [r12], #16
|
||||
; CHECK-NEXT: vldrw.u32 q4, [r4], #16
|
||||
; CHECK-NEXT: vmul.i32 q2, q2, q4
|
||||
; CHECK-NEXT: vstrw.32 q2, [r1, q1, uxtw #2]
|
||||
; CHECK-NEXT: vadd.i32 q1, q1, q0
|
||||
@ -94,7 +92,7 @@ define dso_local void @mve_gatherscatter_offset(i32* noalias nocapture readonly
|
||||
; CHECK-NEXT: @ %bb.2: @ %middle.block
|
||||
; CHECK-NEXT: vmov q0, q3
|
||||
; CHECK-NEXT: vaddv.u32 r0, q0
|
||||
; CHECK-NEXT: str.w r0, [r2, r3, lsl #2]
|
||||
; CHECK-NEXT: str.w r0, [r2, r12, lsl #2]
|
||||
; CHECK-NEXT: vpop {d8, d9}
|
||||
; CHECK-NEXT: pop {r4, pc}
|
||||
; CHECK-NEXT: .p2align 4
|
||||
@ -141,17 +139,16 @@ end: ; preds = %middle.block
|
||||
define dso_local void @mve_scatter_qi(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %n, i32 %m, i32 %l) {
|
||||
; CHECK-LABEL: mve_scatter_qi:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: .save {r4, lr}
|
||||
; CHECK-NEXT: push {r4, lr}
|
||||
; CHECK-NEXT: adr r4, .LCPI2_0
|
||||
; CHECK-NEXT: .save {r7, lr}
|
||||
; CHECK-NEXT: push {r7, lr}
|
||||
; CHECK-NEXT: add.w r12, r0, r3, lsl #2
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r4]
|
||||
; CHECK-NEXT: adds r0, r3, #1
|
||||
; CHECK-NEXT: adr r0, .LCPI2_0
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0]
|
||||
; CHECK-NEXT: vmov.i32 q3, #0x0
|
||||
; CHECK-NEXT: vmov.i32 q2, #0x3
|
||||
; CHECK-NEXT: vadd.i32 q0, q0, r1
|
||||
; CHECK-NEXT: adds r1, r3, #4
|
||||
; CHECK-NEXT: vmov.i32 q2, #0x3
|
||||
; CHECK-NEXT: dlstp.32 lr, r0
|
||||
; CHECK-NEXT: dlstp.32 lr, r3
|
||||
; CHECK-NEXT: .LBB2_1: @ %vector.body
|
||||
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: vldrw.u32 q1, [r12], #16
|
||||
@ -163,7 +160,7 @@ define dso_local void @mve_scatter_qi(i32* noalias nocapture readonly %A, i32* n
|
||||
; CHECK-NEXT: vmov q0, q3
|
||||
; CHECK-NEXT: vaddv.u32 r0, q0
|
||||
; CHECK-NEXT: str.w r0, [r2, r1, lsl #2]
|
||||
; CHECK-NEXT: pop {r4, pc}
|
||||
; CHECK-NEXT: pop {r7, pc}
|
||||
; CHECK-NEXT: .p2align 4
|
||||
; CHECK-NEXT: @ %bb.3:
|
||||
; CHECK-NEXT: .LCPI2_0:
|
||||
|
@ -1730,7 +1730,7 @@ vector.ph: ; preds = %entry
|
||||
vector.body: ; preds = %vector.body, %vector.ph
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi i32 [ 0, %vector.ph ], [ %4, %vector.body ]
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
%0 = getelementptr inbounds i32, i32* %x, i32 %index
|
||||
%1 = bitcast i32* %0 to <4 x i32>*
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
|
||||
@ -1781,7 +1781,7 @@ vector.ph: ; preds = %entry
|
||||
vector.body: ; preds = %vector.body, %vector.ph
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi i32 [ 0, %vector.ph ], [ %7, %vector.body ]
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
%0 = getelementptr inbounds i32, i32* %x, i32 %index
|
||||
%1 = bitcast i32* %0 to <4 x i32>*
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
|
||||
@ -1835,7 +1835,7 @@ vector.ph: ; preds = %entry
|
||||
vector.body: ; preds = %vector.body, %vector.ph
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
|
||||
%0 = getelementptr inbounds i16, i16* %x, i32 %index
|
||||
%1 = bitcast i16* %0 to <8 x i16>*
|
||||
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
|
||||
@ -1887,7 +1887,7 @@ vector.ph: ; preds = %entry
|
||||
vector.body: ; preds = %vector.body, %vector.ph
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ]
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
|
||||
%0 = getelementptr inbounds i16, i16* %x, i32 %index
|
||||
%1 = bitcast i16* %0 to <8 x i16>*
|
||||
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
|
||||
@ -1943,7 +1943,7 @@ vector.ph: ; preds = %entry
|
||||
vector.body: ; preds = %vector.body, %vector.ph
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
|
||||
%0 = getelementptr inbounds i8, i8* %x, i32 %index
|
||||
%1 = bitcast i8* %0 to <16 x i8>*
|
||||
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
|
||||
@ -1995,7 +1995,7 @@ vector.ph: ; preds = %entry
|
||||
vector.body: ; preds = %vector.body, %vector.ph
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ]
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
|
||||
%0 = getelementptr inbounds i8, i8* %x, i32 %index
|
||||
%1 = bitcast i8* %0 to <16 x i8>*
|
||||
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
|
||||
@ -2051,7 +2051,7 @@ vector.ph: ; preds = %entry
|
||||
vector.body: ; preds = %vector.body, %vector.ph
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi i16 [ 0, %vector.ph ], [ %4, %vector.body ]
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
|
||||
%0 = getelementptr inbounds i16, i16* %x, i32 %index
|
||||
%1 = bitcast i16* %0 to <8 x i16>*
|
||||
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
|
||||
@ -2102,7 +2102,7 @@ vector.ph: ; preds = %entry
|
||||
vector.body: ; preds = %vector.body, %vector.ph
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi i16 [ 0, %vector.ph ], [ %7, %vector.body ]
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
|
||||
%0 = getelementptr inbounds i16, i16* %x, i32 %index
|
||||
%1 = bitcast i16* %0 to <8 x i16>*
|
||||
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
|
||||
@ -2156,7 +2156,7 @@ vector.ph: ; preds = %entry
|
||||
vector.body: ; preds = %vector.body, %vector.ph
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi i16 [ 0, %vector.ph ], [ %5, %vector.body ]
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
|
||||
%0 = getelementptr inbounds i8, i8* %x, i32 %index
|
||||
%1 = bitcast i8* %0 to <16 x i8>*
|
||||
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
|
||||
@ -2208,7 +2208,7 @@ vector.ph: ; preds = %entry
|
||||
vector.body: ; preds = %vector.body, %vector.ph
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi i16 [ 0, %vector.ph ], [ %9, %vector.body ]
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
|
||||
%0 = getelementptr inbounds i8, i8* %x, i32 %index
|
||||
%1 = bitcast i8* %0 to <16 x i8>*
|
||||
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
|
||||
@ -2264,7 +2264,7 @@ vector.ph: ; preds = %entry
|
||||
vector.body: ; preds = %vector.body, %vector.ph
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi i8 [ 0, %vector.ph ], [ %4, %vector.body ]
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
|
||||
%0 = getelementptr inbounds i8, i8* %x, i32 %index
|
||||
%1 = bitcast i8* %0 to <16 x i8>*
|
||||
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
|
||||
@ -2315,7 +2315,7 @@ vector.ph: ; preds = %entry
|
||||
vector.body: ; preds = %vector.body, %vector.ph
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi i8 [ 0, %vector.ph ], [ %7, %vector.body ]
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
|
||||
%0 = getelementptr inbounds i8, i8* %x, i32 %index
|
||||
%1 = bitcast i8* %0 to <16 x i8>*
|
||||
%wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
|
||||
@ -2371,7 +2371,7 @@ vector.ph: ; preds = %entry
|
||||
vector.body: ; preds = %vector.body, %vector.ph
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi i64 [ 0, %vector.ph ], [ %5, %vector.body ]
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
%0 = getelementptr inbounds i32, i32* %x, i32 %index
|
||||
%1 = bitcast i32* %0 to <4 x i32>*
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
|
||||
@ -2425,7 +2425,7 @@ vector.ph: ; preds = %entry
|
||||
vector.body: ; preds = %vector.body, %vector.ph
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ]
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
|
||||
%0 = getelementptr inbounds i32, i32* %x, i32 %index
|
||||
%1 = bitcast i32* %0 to <4 x i32>*
|
||||
%wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
|
||||
@ -2484,7 +2484,7 @@ vector.ph: ; preds = %entry
|
||||
vector.body: ; preds = %vector.body, %vector.ph
|
||||
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
%vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ]
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
%active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
|
||||
%0 = getelementptr inbounds i16, i16* %x, i32 %index
|
||||
%1 = bitcast i16* %0 to <8 x i16>*
|
||||
%wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
|
||||
|
Loading…
Reference in New Issue
Block a user