Fixed consecutive memory access detection in Loop Vectorizer.
It did not correctly handle cases without a GEP. The following loop wasn't
vectorized:

  for (int i = 0; i < len; i++)
    *to++ = *from++;

I use getPtrStride() to find the stride of a memory access and return 0 if
the stride is not 1 or -1.

Re-commit of rL273257.

Differential revision: http://reviews.llvm.org/D20789

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@273864 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
commit 1abadbff39 (parent c4cd97e86f)
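For context, an illustrative pair of copy loops (C, not part of the patch) showing why the pointer-bumping form had no GEP for the old analysis to inspect: indexing with the induction variable produces a GEP keyed on the induction value, while bumping the pointers makes each accessed address a loop-carried PHI.

    // Illustrative only: function names and comments are ours, not the patch's.
    void copy_indexed(float *to, const float *from, int len) {
      for (int i = 0; i < len; i++)
        to[i] = from[i];   // access address: getelementptr %to, i64 %i
    }

    void copy_bumped(float *to, const float *from, int len) {
      for (int i = 0; i < len; i++)
        *to++ = *from++;   // access address: a pointer PHI; the GEPs only
                           // advance the pointers for the next iteration
    }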
include/llvm/Analysis/LoopAccessAnalysis.h

@@ -679,9 +679,11 @@ const SCEV *replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
 /// to \p PtrToStride and therefore add further predicates to \p PSE.
 /// The \p Assume parameter indicates if we are allowed to make additional
 /// run-time assumptions.
+/// The \p ShouldCheckWrap indicates that we should ensure that address
+/// calculation does not wrap.
 int getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp,
                  const ValueToValueMap &StridesMap = ValueToValueMap(),
-                 bool Assume = false);
+                 bool Assume = false, bool ShouldCheckWrap = true);

 /// \brief Returns true if the memory operations \p A and \p B are consecutive.
 /// This is a simple API that does not depend on the analysis pass.
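A minimal usage sketch for the extended signature (hypothetical caller; PSE, Ptr, and L are assumed to be in scope). Passing ShouldCheckWrap=false skips the no-wrap verification, which is exactly what the rewritten isConsecutivePtr() further down does.

    // Sketch, not from the patch: query the stride with no symbolic strides
    // and with wrap checking deferred to the caller.
    const ValueToValueMap EmptyStrides;
    int Stride = getPtrStride(PSE, Ptr, L, EmptyStrides,
                              /*Assume=*/true, /*ShouldCheckWrap=*/false);
    if (Stride == 1 || Stride == -1) {
      // Consecutive access: one element per iteration, forward or reverse.
    }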
lib/Analysis/LoopAccessAnalysis.cpp

@@ -866,7 +866,7 @@ static bool isNoWrapAddRec(Value *Ptr, const SCEVAddRecExpr *AR,
 /// \brief Check whether the access through \p Ptr has a constant stride.
 int llvm::getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr,
                        const Loop *Lp, const ValueToValueMap &StridesMap,
-                       bool Assume) {
+                       bool Assume, bool ShouldCheckWrap) {
   Type *Ty = Ptr->getType();
   assert(Ty->isPointerTy() && "Unexpected non-ptr");

@@ -905,9 +905,9 @@ int llvm::getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr,
   // to access the pointer value "0" which is undefined behavior in address
   // space 0, therefore we can also vectorize this case.
   bool IsInBoundsGEP = isInBoundsGep(Ptr);
-  bool IsNoWrapAddRec =
-      PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW) ||
-      isNoWrapAddRec(Ptr, AR, PSE, Lp);
+  bool IsNoWrapAddRec = !ShouldCheckWrap ||
+                        PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW) ||
+                        isNoWrapAddRec(Ptr, AR, PSE, Lp);
   bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0;
   if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) {
     if (Assume) {
lib/Transforms/Vectorize/LoopVectorize.cpp

@@ -2242,87 +2242,13 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx,
 }

 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
-  assert(Ptr->getType()->isPointerTy() && "Unexpected non-ptr");
-  auto *SE = PSE.getSE();
-  // Make sure that the pointer does not point to structs.
-  if (Ptr->getType()->getPointerElementType()->isAggregateType())
-    return 0;
-
-  // If this value is a pointer induction variable, we know it is consecutive.
-  PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
-  if (Phi && Inductions.count(Phi)) {
-    InductionDescriptor II = Inductions[Phi];
-    return II.getConsecutiveDirection();
-  }
-
-  GetElementPtrInst *Gep = getGEPInstruction(Ptr);
-  if (!Gep)
-    return 0;
-
-  unsigned NumOperands = Gep->getNumOperands();
-  Value *GpPtr = Gep->getPointerOperand();
-  // If this GEP value is a consecutive pointer induction variable and all of
-  // the indices are constant, then we know it is consecutive.
-  Phi = dyn_cast<PHINode>(GpPtr);
-  if (Phi && Inductions.count(Phi)) {
-
-    // Make sure that the pointer does not point to structs.
-    PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
-    if (GepPtrType->getElementType()->isAggregateType())
-      return 0;
-
-    // Make sure that all of the index operands are loop invariant.
-    for (unsigned i = 1; i < NumOperands; ++i)
-      if (!SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop))
-        return 0;
-
-    InductionDescriptor II = Inductions[Phi];
-    return II.getConsecutiveDirection();
-  }
-
-  unsigned InductionOperand = getGEPInductionOperand(Gep);
-
-  // Check that all of the gep indices are uniform except for our induction
-  // operand.
-  for (unsigned i = 0; i != NumOperands; ++i)
-    if (i != InductionOperand &&
-        !SE->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)), TheLoop))
-      return 0;
-
-  // We can emit wide load/stores only if the last non-zero index is the
-  // induction variable.
-  const SCEV *Last = nullptr;
-  if (!getSymbolicStrides() || !getSymbolicStrides()->count(Gep))
-    Last = PSE.getSCEV(Gep->getOperand(InductionOperand));
-  else {
-    // Because of the multiplication by a stride we can have a s/zext cast.
-    // We are going to replace this stride by 1 so the cast is safe to ignore.
-    //
-    // %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-    // %0 = trunc i64 %indvars.iv to i32
-    // %mul = mul i32 %0, %Stride1
-    // %idxprom = zext i32 %mul to i64 << Safe cast.
-    // %arrayidx = getelementptr inbounds i32* %B, i64 %idxprom
-    //
-    Last = replaceSymbolicStrideSCEV(PSE, *getSymbolicStrides(),
-                                     Gep->getOperand(InductionOperand), Gep);
-    if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(Last))
-      Last =
-          (C->getSCEVType() == scSignExtend || C->getSCEVType() == scZeroExtend)
-              ? C->getOperand()
-              : Last;
-  }
-  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
-    const SCEV *Step = AR->getStepRecurrence(*SE);
-
-    // The memory is consecutive because the last index is consecutive
-    // and all other indices are loop invariant.
-    if (Step->isOne())
-      return 1;
-    if (Step->isAllOnesValue())
-      return -1;
-  }
+  const ValueToValueMap &Strides = getSymbolicStrides() ? *getSymbolicStrides() :
+                                   ValueToValueMap();
+
+  int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, true, false);
+  if (Stride == 1 || Stride == -1)
+    return Stride;
   return 0;
 }

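Why delegating to getPtrStride() handles the no-GEP case: the pointer PHI created by *p++ has the SCEV add recurrence {%p0,+,4} (for a 4-byte element), so the stride can be read off the recurrence no matter which instruction computes the address. A simplified, hypothetical sketch of that SCEV-side computation (for exposition only; the actual logic lives in getPtrStride() in LoopAccessAnalysis.cpp and additionally handles symbolic strides, predicates, and wrap checking):

    // Hypothetical helper, not part of the patch. Assumes the usual LLVM
    // includes (ScalarEvolutionExpressions.h, DataLayout.h).
    static int strideFromSCEV(PredicatedScalarEvolution &PSE, Value *Ptr,
                              const Loop *Lp, const DataLayout &DL) {
      const auto *AR = dyn_cast<SCEVAddRecExpr>(PSE.getSCEV(Ptr));
      if (!AR || AR->getLoop() != Lp || !AR->isAffine())
        return 0;                            // address is not a simple recurrence
      const auto *Step =
          dyn_cast<SCEVConstant>(AR->getStepRecurrence(*PSE.getSE()));
      if (!Step)
        return 0;                            // non-constant byte step
      int64_t ElemSize = DL.getTypeAllocSize(
          cast<PointerType>(Ptr->getType())->getElementType());
      int64_t StepBytes = Step->getAPInt().getSExtValue();
      if (ElemSize == 0 || StepBytes % ElemSize != 0)
        return 0;
      return StepBytes / ElemSize;           // +1 / -1 means consecutive
    }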
@@ -2658,7 +2584,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
   // Handle consecutive loads/stores.
   GetElementPtrInst *Gep = getGEPInstruction(Ptr);
   if (ConsecutiveStride) {
-    if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
+    if (Gep &&
+        !PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()),
+                                      OrigLoop)) {
       setDebugLocFromInst(Builder, Gep);
       Value *PtrOperand = Gep->getPointerOperand();
       Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
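For illustration (C, not from the patch), a loop that reaches the changed branch: the access p[1] lowers to a GEP whose base pointer is the loop-carried PHI of p, so the base is not loop-invariant and the widened access must be built from the vectorized base pointer rather than a single scalar base. The old condition only recognized bases that were known induction variables; loop-invariance of the base's SCEV is the more general test.

    // Illustrative only.
    int sum_ahead(int *p, int n) {
      int acc = 0;
      for (int i = 0; i < n; ++i, ++p)
        acc += p[1];   // GEP (i32* %p.phi, i64 1): base varies per iteration
      return acc;
    }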
@@ -2671,9 +2599,6 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
       Ptr = Builder.Insert(Gep2);
     } else if (Gep) {
       setDebugLocFromInst(Builder, Gep);
-      assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()),
-                                          OrigLoop) &&
-             "Base ptr must be invariant");
       // The last index does not have to be the induction. It can be
       // consecutive and be a function of the index. For example A[I+1];
       unsigned NumOperands = Gep->getNumOperands();
@@ -2702,8 +2627,6 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
       }
       Ptr = Builder.Insert(Gep2);
     } else { // No GEP
-      // Use the induction element ptr.
-      assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
       setDebugLocFromInst(Builder, Ptr);
       VectorParts &PtrVal = getVectorValue(Ptr);
       Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
test/Transforms/LoopVectorize/consec_no_gep.ll (new file, 43 lines)
@@ -0,0 +1,43 @@
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -S | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

;; Check consecutive memory access without preceding GEP instruction

; for (int i=0; i<len; i++) {
;   *to++ = *from++;
; }

; CHECK-LABEL: @consecutive_no_gep(
; CHECK: vector.body
; CHECK: %[[index:.*]] = phi i64 [ 0, %vector.ph ]
; CHECK: getelementptr float, float* %{{.*}}, i64 %[[index]]
; CHECK: load <4 x float>

define void @consecutive_no_gep(float* noalias nocapture readonly %from, float* noalias nocapture %to, i32 %len) #0 {
entry:
  %cmp2 = icmp sgt i32 %len, 0
  br i1 %cmp2, label %for.body.preheader, label %for.end

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.05 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %from.addr.04 = phi float* [ %incdec.ptr, %for.body ], [ %from, %for.body.preheader ]
  %to.addr.03 = phi float* [ %incdec.ptr1, %for.body ], [ %to, %for.body.preheader ]
  %incdec.ptr = getelementptr inbounds float, float* %from.addr.04, i64 1
  %val = load float, float* %from.addr.04, align 4
  %incdec.ptr1 = getelementptr inbounds float, float* %to.addr.03, i64 1
  store float %val, float* %to.addr.03, align 4
  %inc = add nsw i32 %i.05, 1
  %cmp = icmp slt i32 %inc, %len
  br i1 %cmp, label %for.body, label %for.end.loopexit

for.end.loopexit:                                 ; preds = %for.body
  br label %for.end

for.end:                                          ; preds = %for.end.loopexit, %entry
  ret void
}
test/Transforms/LoopVectorize/consec_no_gep2.ll (new file, 34 lines)
@@ -0,0 +1,34 @@
; RUN: opt < %s -loop-vectorize -S | FileCheck %s
target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64-unknown-linux-gnu"

; CHECK-LABEL: @img2buf
; CHECK: store <4 x i32>
; Function Attrs: nounwind
define void @img2buf(i64 %val, i8* nocapture %buf, i32 %N) local_unnamed_addr #0 {
entry:
  br label %l2

l2:
  br label %for.body57.us

for.body57.us:
  %indvars.iv24 = phi i64 [ %val, %l2 ], [ %indvars.iv.next25, %for.body57.us ]
  %0 = trunc i64 %indvars.iv24 to i32
  %add77.us = add i32 5, %0
  %mul78.us = shl nsw i32 %add77.us, 2
  %idx.ext79.us = sext i32 %mul78.us to i64
  %add.ptr80.us = getelementptr inbounds i8, i8* %buf, i64 %idx.ext79.us
  %ui32.0.add.ptr80.sroa_cast.us = bitcast i8* %add.ptr80.us to i32*
  store i32 0, i32* %ui32.0.add.ptr80.sroa_cast.us, align 1
  %indvars.iv.next25 = add nsw i64 %indvars.iv24, 1
  %lftr.wideiv26 = trunc i64 %indvars.iv.next25 to i32
  %exitcond27 = icmp eq i32 %lftr.wideiv26, %N
  br i1 %exitcond27, label %l3, label %for.body57.us

l3:
  ret void
}

attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64" "target-features"="+altivec,-bpermd,-crypto,-direct-move,-extdiv,-power8-vector,-qpx,-vsx" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -18,6 +18,7 @@ while.body.preheader:                    ; preds = %entry
 while.body:                                       ; preds = %while.body.preheader, %while.body
   %a.pn = phi i32* [ %incdec.ptr8, %while.body ], [ %a, %while.body.preheader ]
   %acc.07 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
+  %a1.pn = getelementptr inbounds i32, i32* %a.pn, i64 0
   %incdec.ptr8 = getelementptr inbounds i32, i32* %a.pn, i64 1
   %0 = load i32, i32* %incdec.ptr8, align 1
   %add = add nuw nsw i32 %0, %acc.07