mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2025-03-04 08:27:50 +00:00
[LV] Strip wrap flags from vectorized reductions
A sequence of additions or multiplications that is known not to wrap may wrap if its order is changed (i.e., reassociated). Therefore, when vectorizing integer sum or product reductions, their no-wrap flags need to be removed. Fixes PR43828. Patch by Denis Antrushin. Differential Revision: https://reviews.llvm.org/D69563
This commit is contained in:
parent
04329dbfa6
commit
e498be5738
@ -531,6 +531,9 @@ protected:
|
||||
/// vectorizing this phi node.
|
||||
void fixReduction(PHINode *Phi);
|
||||
|
||||
/// Clear NSW/NUW flags from reduction instructions if necessary.
|
||||
void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
|
||||
|
||||
/// The Loop exit block may have single value PHI nodes with some
|
||||
/// incoming value. While vectorizing we only handled real values
|
||||
/// that were defined inside the loop and we should have one value for
|
||||
@ -3711,16 +3714,20 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
|
||||
}
|
||||
}
|
||||
|
||||
// Wrap flags are in general invalid after vectorization, clear them.
|
||||
clearReductionWrapFlags(RdxDesc);
|
||||
|
||||
// Fix the vector-loop phi.
|
||||
|
||||
// Reductions do not have to start at zero. They can start with
|
||||
// any loop invariant values.
|
||||
BasicBlock *Latch = OrigLoop->getLoopLatch();
|
||||
Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
|
||||
|
||||
for (unsigned Part = 0; Part < UF; ++Part) {
|
||||
Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
|
||||
Value *Val = getOrCreateVectorValue(LoopVal, Part);
|
||||
// Make sure to add the reduction stat value only to the
|
||||
// Make sure to add the reduction start value only to the
|
||||
// first unroll part.
|
||||
Value *StartVal = (Part == 0) ? VectorStart : Identity;
|
||||
cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
|
||||
@ -3857,6 +3864,37 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
|
||||
Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
|
||||
}
|
||||
|
||||
/// Drop poison-generating flags from the vectorized instructions of an
/// integer add or multiply reduction described by \p RdxDesc.
///
/// Any other recurrence kind is left untouched. Starting at the reduction's
/// loop-exit instruction, the use graph is walked with a worklist; for every
/// visited instruction that is an OverflowingBinaryOperator, the flags are
/// dropped on the vector value of each of the UF unroll parts.
void InnerLoopVectorizer::clearReductionWrapFlags(
|
||||
RecurrenceDescriptor &RdxDesc) {
|
||||
RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
|
||||
// Only integer add and integer multiply recurrences are processed.
if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
|
||||
RK != RecurrenceDescriptor::RK_IntegerMult)
|
||||
return;
|
||||
|
||||
Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
|
||||
assert(LoopExitInstr && "null loop exit instruction");
|
||||
// Worklist drives the traversal; Visited ensures each instruction is queued
// at most once.
SmallVector<Instruction *, 8> Worklist;
|
||||
SmallPtrSet<Instruction *, 8> Visited;
|
||||
Worklist.push_back(LoopExitInstr);
|
||||
Visited.insert(LoopExitInstr);
|
||||
|
||||
while (!Worklist.empty()) {
|
||||
Instruction *Cur = Worklist.pop_back_val();
|
||||
// The flags are dropped on the widened value of every unroll part, not on
// the original scalar instruction Cur.
if (isa<OverflowingBinaryOperator>(Cur))
|
||||
for (unsigned Part = 0; Part < UF; ++Part) {
|
||||
Value *V = getOrCreateVectorValue(Cur, Part);
|
||||
cast<Instruction>(V)->dropPoisonGeneratingFlags();
|
||||
}
|
||||
|
||||
for (User *U : Cur->users()) {
|
||||
// Every user is required to be an Instruction (cast<> asserts otherwise).
Instruction *UI = cast<Instruction>(U);
|
||||
// Users of the loop-exit instruction are followed only when they are inside
// OrigLoop; users of any other visited instruction are followed
// unconditionally.
if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
|
||||
Visited.insert(UI).second)
|
||||
Worklist.push_back(UI);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void InnerLoopVectorizer::fixLCSSAPHIs() {
|
||||
for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
|
||||
if (LCSSAPhi.getNumIncomingValues() == 1) {
|
||||
|
@ -15,15 +15,15 @@ target triple = "aarch64--linux-gnueabi"
|
||||
; CHECK: load <4 x i32>, <4 x i32>*
|
||||
; CHECK: mul nsw <4 x i32>
|
||||
; CHECK: mul nsw <4 x i32>
|
||||
; CHECK: add nsw <4 x i32>
|
||||
; CHECK: add nsw <4 x i32>
|
||||
; CHECK: add <4 x i32>
|
||||
; CHECK: add <4 x i32>
|
||||
; CHECK: %index.next = add i64 %index, 8
|
||||
; CHECK: icmp eq i64 %index.next, 512
|
||||
|
||||
; FORCE-VEC-LABEL: @ind_plus2(
|
||||
; FORCE-VEC: %wide.load = load <2 x i32>, <2 x i32>*
|
||||
; FORCE-VEC: mul nsw <2 x i32>
|
||||
; FORCE-VEC: add nsw <2 x i32>
|
||||
; FORCE-VEC: add <2 x i32>
|
||||
; FORCE-VEC: %index.next = add i64 %index, 2
|
||||
; FORCE-VEC: icmp eq i64 %index.next, 512
|
||||
define i32 @ind_plus2(i32* %A) {
|
||||
@ -59,15 +59,15 @@ for.end: ; preds = %for.body
|
||||
; CHECK: load <4 x i32>, <4 x i32>*
|
||||
; CHECK: mul nsw <4 x i32>
|
||||
; CHECK: mul nsw <4 x i32>
|
||||
; CHECK: add nsw <4 x i32>
|
||||
; CHECK: add nsw <4 x i32>
|
||||
; CHECK: add <4 x i32>
|
||||
; CHECK: add <4 x i32>
|
||||
; CHECK: %index.next = add i64 %index, 8
|
||||
; CHECK: icmp eq i64 %index.next, 512
|
||||
|
||||
; FORCE-VEC-LABEL: @ind_minus2(
|
||||
; FORCE-VEC: %wide.load = load <2 x i32>, <2 x i32>*
|
||||
; FORCE-VEC: mul nsw <2 x i32>
|
||||
; FORCE-VEC: add nsw <2 x i32>
|
||||
; FORCE-VEC: add <2 x i32>
|
||||
; FORCE-VEC: %index.next = add i64 %index, 2
|
||||
; FORCE-VEC: icmp eq i64 %index.next, 512
|
||||
define i32 @ind_minus2(i32* %A) {
|
||||
@ -110,8 +110,8 @@ for.end: ; preds = %for.body
|
||||
; CHECK: shufflevector <8 x i32> %[[V1]], <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
|
||||
; CHECK: mul nsw <4 x i32>
|
||||
; CHECK: mul nsw <4 x i32>
|
||||
; CHECK: add nsw <4 x i32>
|
||||
; CHECK: add nsw <4 x i32>
|
||||
; CHECK: add <4 x i32>
|
||||
; CHECK: add <4 x i32>
|
||||
; CHECK: %index.next = add i64 %index, 8
|
||||
; CHECK: icmp eq i64 %index.next, 1024
|
||||
|
||||
@ -120,7 +120,7 @@ for.end: ; preds = %for.body
|
||||
; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> <i32 0, i32 2>
|
||||
; FORCE-VEC: shufflevector <4 x i32> %[[V]], <4 x i32> undef, <2 x i32> <i32 1, i32 3>
|
||||
; FORCE-VEC: mul nsw <2 x i32>
|
||||
; FORCE-VEC: add nsw <2 x i32>
|
||||
; FORCE-VEC: add <2 x i32>
|
||||
; FORCE-VEC: %index.next = add i64 %index, 2
|
||||
; FORCE-VEC: icmp eq i64 %index.next, 1024
|
||||
define i32 @ptr_ind_plus2(i32* %A) {
|
||||
|
@ -96,7 +96,7 @@ define i32 @reduction_i32(i32* nocapture readonly %A, i32* nocapture readonly %B
|
||||
; CHECK: [[LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* {{.*}}, i32 4, <8 x i1> [[ICMPULE]], <8 x i32> undef)
|
||||
; CHECK: [[LOAD2:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* {{.*}}, i32 4, <8 x i1> [[ICMPULE]], <8 x i32> undef)
|
||||
; CHECK-NEXT: [[ADD:%.*]] = add nsw <8 x i32> [[LOAD2]], [[LOAD1]]
|
||||
; CHECK-NEXT: [[ACCUM]] = add nuw nsw <8 x i32> [[ADD]], [[ACCUM_PHI]]
|
||||
; CHECK-NEXT: [[ACCUM]] = add <8 x i32> [[ADD]], [[ACCUM_PHI]]
|
||||
; CHECK: [[LIVEOUT:%.*]] = select <8 x i1> [[ICMPULE]], <8 x i32> [[ACCUM]], <8 x i32> [[ACCUM_PHI]]
|
||||
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
|
||||
; CHECK: middle.block:
|
||||
|
@ -248,8 +248,8 @@ define void @bug18724(i1 %cond) {
|
||||
; UNROLL-NOSIMPLIFY-NEXT: store i32 2, i32* [[TMP1]], align 4
|
||||
; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE4]]
|
||||
; UNROLL-NOSIMPLIFY: pred.store.continue4:
|
||||
; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = add nsw i32 [[VEC_PHI]], 1
|
||||
; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = add nsw i32 [[VEC_PHI2]], 1
|
||||
; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = add i32 [[VEC_PHI]], 1
|
||||
; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = add i32 [[VEC_PHI2]], 1
|
||||
; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI]] = select i1 undef, i32 [[VEC_PHI]], i32 [[TMP4]]
|
||||
; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI5]] = select i1 undef, i32 [[VEC_PHI2]], i32 [[TMP5]]
|
||||
; UNROLL-NOSIMPLIFY-NEXT: [[OFFSET_IDX6:%.*]] = add i64 undef, [[INDEX]]
|
||||
|
@ -139,9 +139,9 @@ for.end: ; preds = %for.body
|
||||
; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
|
||||
; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
|
||||
; CHECK: shufflevector <16 x i32> %wide.vec, <16 x i32> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
|
||||
; CHECK: add nsw <4 x i32>
|
||||
; CHECK: add <4 x i32>
|
||||
; CHECK: sub <4 x i32>
|
||||
; CHECK: add nsw <4 x i32>
|
||||
; CHECK: add <4 x i32>
|
||||
; CHECK: sub <4 x i32>
|
||||
|
||||
%struct.ST4 = type { i32, i32, i32, i32 }
|
||||
@ -529,7 +529,7 @@ for.body: ; preds = %for.body, %entry
|
||||
; CHECK: %[[V0:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
; CHECK: %[[V1:.*]] = shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
|
||||
; CHECK: bitcast <4 x i32> %[[V1]] to <4 x float>
|
||||
; CHECK: add nsw <4 x i32>
|
||||
; CHECK: add <4 x i32>
|
||||
; CHECK: fadd fast <4 x float>
|
||||
|
||||
%struct.IntFloat = type { i32, float }
|
||||
@ -645,7 +645,7 @@ for.end:
|
||||
; CHECK: store i32 %[[X4:.+]], {{.*}}
|
||||
; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
|
||||
; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]
|
||||
; CHECK: add <4 x i32> %[[S1]], %[[Phi]]
|
||||
|
||||
define i32 @PR27626_1(%pair.i32 *%p, i64 %n) {
|
||||
entry:
|
||||
@ -746,7 +746,7 @@ for.end:
|
||||
; CHECK: store i32 %[[X4:.+]], {{.*}}
|
||||
; CHECK: %[[L2:.+]] = load <8 x i32>, <8 x i32>* {{.*}}
|
||||
; CHECK: %[[S1:.+]] = shufflevector <8 x i32> %[[L2]], <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
|
||||
; CHECK: add nsw <4 x i32> %[[S1]], %[[Phi]]
|
||||
; CHECK: add <4 x i32> %[[S1]], %[[Phi]]
|
||||
|
||||
define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) {
|
||||
entry:
|
||||
|
@ -10,7 +10,7 @@ target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-
|
||||
;CHECK: phi i64
|
||||
;CHECK: phi <4 x i32>
|
||||
;CHECK: load <4 x i32>
|
||||
;CHECK: add nsw <4 x i32>
|
||||
;CHECK: add <4 x i32>
|
||||
;CHECK: ret i32
|
||||
define i32 @sum_array(i32* %A, i32 %n) nounwind uwtable readonly noinline ssp {
|
||||
%1 = sext i32 %n to i64
|
||||
@ -37,7 +37,7 @@ _ZSt10accumulateIPiiET0_T_S2_S1_.exit: ; preds = %.lr.ph.i, %0
|
||||
;CHECK: phi i16
|
||||
;CHECK: phi <4 x i32>
|
||||
;CHECK: load <4 x i32>
|
||||
;CHECK: add nsw <4 x i32>
|
||||
;CHECK: add <4 x i32>
|
||||
;CHECK: ret i32
|
||||
define i32 @sum_array_as1(i32 addrspace(1)* %A, i32 %n) nounwind uwtable readonly noinline ssp {
|
||||
%1 = sext i32 %n to i64
|
||||
|
58
llvm/test/Transforms/LoopVectorize/nuw.ll
Normal file
58
llvm/test/Transforms/LoopVectorize/nuw.ll
Normal file
@ -0,0 +1,58 @@
|
||||
; RUN: opt %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -S | FileCheck %s
|
||||
|
||||
; Fixes PR43828
|
||||
|
||||
; Nested-loop case: the inner loop repeatedly subtracts %local_4 from
; %local_3 using a sub that carries both nuw and nsw. The expectations below
; look for two flag-free <4 x i32> subs after the vector.body label
; (presumably one per interleaved part, given the RUN line's
; interleave=2 / width=4 settings - confirm).
define void @test(i32* %B) {
|
||||
; CHECK-LABEL: @test(
|
||||
; CHECK: vector.body:
|
||||
; CHECK-COUNT-2: sub <4 x i32>
|
||||
entry:
|
||||
br label %outer_loop
|
||||
|
||||
outer_loop:
|
||||
; Outer counter: starts at 2 and is incremented by one in outer_tail.
%local_4 = phi i32 [ 2, %entry ], [ %4, %outer_tail]
|
||||
br label %inner_loop
|
||||
|
||||
inner_loop:
|
||||
; %local_2 is the inner counter; %local_3 is the running value, starting at
; -104 and updated by the flagged sub below.
%local_2 = phi i32 [ 0, %outer_loop ], [ %1, %inner_loop ]
|
||||
%local_3 = phi i32 [ -104, %outer_loop ], [ %0, %inner_loop ]
|
||||
%0 = sub nuw nsw i32 %local_3, %local_4
|
||||
%1 = add nuw nsw i32 %local_2, 1
|
||||
%2 = icmp ugt i32 %local_2, 126
|
||||
br i1 %2, label %outer_tail, label %inner_loop
|
||||
|
||||
outer_tail:
|
||||
; Single-incoming phi that carries the final inner-loop value out of the loop;
; it is then stored to %B.
%3 = phi i32 [ %0, %inner_loop ]
|
||||
store atomic i32 %3, i32 * %B unordered, align 8
|
||||
%4 = add i32 %local_4, 1
|
||||
%5 = icmp slt i32 %4, 6
|
||||
br i1 %5, label %outer_loop, label %exit
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
||||
|
||||
; Reduction built from a chain of two adds per iteration (%2 then %3), both
; carrying nuw and nsw. The expectations below look for four flag-free
; <4 x i32> adds after the vector.body label (presumably two chained adds
; times two interleaved parts, given the RUN line's settings - confirm).
define i32 @multi-instr(i32* noalias nocapture %A, i32* noalias nocapture %B, i32 %inc) {
|
||||
; CHECK-LABEL: @multi-instr(
|
||||
; CHECK: vector.body:
|
||||
; CHECK-COUNT-4: add <4 x i32>
|
||||
entry:
|
||||
br label %loop
|
||||
|
||||
loop:
|
||||
; %iv is the loop counter; %redu is the running sum, fed back from %3.
%iv = phi i32 [0, %entry], [%iv_inc, %loop]
|
||||
%redu = phi i32 [0, %entry], [%3, %loop]
|
||||
%gepa = getelementptr inbounds i32, i32* %A, i32 %iv
|
||||
%gepb = getelementptr inbounds i32, i32* %B, i32 %iv
|
||||
%0 = load i32, i32* %gepa
|
||||
%1 = load i32, i32* %gepb
|
||||
; Two-step update of the running sum: first add the element loaded from %A,
; then the element loaded from %B.
%2 = add nuw nsw i32 %redu, %0
|
||||
%3 = add nuw nsw i32 %2, %1
|
||||
%iv_inc = add nuw nsw i32 %iv, 1
|
||||
%4 = icmp ult i32 %iv_inc, 128
|
||||
br i1 %4, label %loop, label %exit
|
||||
|
||||
exit:
|
||||
; Single-incoming phi that carries the final sum out of the loop.
%lcssa = phi i32 [%3, %loop]
|
||||
ret i32 %lcssa
|
||||
}
|
@ -8,7 +8,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
|
||||
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP17:%.*]], %[[LATCH]] ]
|
||||
; CHECK: [[LATCH]]:
|
||||
; CHECK: [[TMP13:%.*]] = and <4 x i32> [[VEC_PHI]], <i32 255, i32 255, i32 255, i32 255>
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw <4 x i32> [[TMP13]], {{.*}}
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], {{.*}}
|
||||
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
|
||||
; CHECK: [[TMP16:%.*]] = trunc <4 x i32> [[TMP14]] to <4 x i8>
|
||||
; CHECK-NEXT: [[TMP17]] = zext <4 x i8> [[TMP16]] to <4 x i32>
|
||||
|
@ -300,7 +300,7 @@ for.end: ; preds = %for.body, %entry
|
||||
; In this test the reduction variable is on the LHS and we can vectorize it.
|
||||
;CHECK-LABEL: @reduction_sub_lhs(
|
||||
;CHECK: phi <4 x i32>
|
||||
;CHECK: sub nsw <4 x i32>
|
||||
;CHECK: sub <4 x i32>
|
||||
;CHECK: ret i32
|
||||
define i32 @reduction_sub_lhs(i32 %n, i32* noalias nocapture %A) nounwind uwtable readonly {
|
||||
entry:
|
||||
|
Loading…
x
Reference in New Issue
Block a user