diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index eeda593a698..67c4e42db63 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -54,6 +54,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
@@ -408,10 +409,11 @@ bool LoadHoisting::canHoistAllLoads() {
 class LoopVectorizationLegality {
 public:
   LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
-                            DominatorTree *DT, TargetLibraryInfo *TLI)
-    : TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI),
+                            DominatorTree *DT, TargetTransformInfo* TTI,
+                            AliasAnalysis *AA, TargetLibraryInfo *TLI)
+    : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI),
       Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
-      MaxSafeDepDistBytes(-1U), LoadSpeculation(L, DT) {}
+      LoadSpeculation(L, DT) {}
 
   /// This enum represents the kinds of reductions that we support.
   enum ReductionKind {
@@ -498,8 +500,7 @@ public:
     }
 
     /// Insert a pointer and calculate the start and end SCEVs.
-    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr,
-                unsigned DepSetId);
+    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr);
 
     /// This flag indicates if we need to add the runtime check.
     bool Need;
@@ -511,9 +512,6 @@ public:
     SmallVector<const SCEV*, 2> Ends;
     /// Holds the information if this pointer is used for writing to memory.
     SmallVector<bool, 2> IsWritePtr;
-    /// Holds the id of the set of pointers that could be dependent because of a
-    /// shared underlying object.
-    SmallVector<unsigned, 2> DependencySetId;
   };
 
   /// A POD for saving information about induction variables.
@@ -534,6 +532,11 @@ public:
   /// induction descriptor.
   typedef MapVector<PHINode*, InductionInfo> InductionList;
 
+  /// Alias(Multi)Map stores the values (GEPs or underlying objects and their
+  /// respective Store/Load instruction(s) to calculate aliasing.
+  typedef MapVector<Value*, Instruction*> AliasMap;
+  typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap;
+
   /// Returns true if it is legal to vectorize this loop.
   /// This does not mean that it is profitable to vectorize this
   /// loop, only that it is legal to do so.
@@ -580,9 +583,6 @@ public:
   /// This function returns the identity element (or neutral element) for
   /// the operation K.
   static Constant *getReductionIdentity(ReductionKind K, Type *Tp);
-
-  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
-
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -623,6 +623,16 @@ private:
   /// Returns the induction kind of Phi. This function may return NoInduction
   /// if the PHI is not an induction variable.
   InductionKind isInductionVariable(PHINode *Phi);
+  /// Return true if can compute the address bounds of Ptr within the loop.
+  bool hasComputableBounds(Value *Ptr);
+  /// Return true if there is the chance of write reorder.
+  bool hasPossibleGlobalWriteReorder(Value *Object,
+                                     Instruction *Inst,
+                                     AliasMultiMap &WriteObjects,
+                                     unsigned MaxByteWidth);
+  /// Return the AA location for a load or a store.
+  AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst);
+
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -632,6 +642,10 @@ private:
   DataLayout *DL;
   /// Dominators.
   DominatorTree *DT;
+  /// Target Info.
+  TargetTransformInfo *TTI;
+  /// Alias Analysis.
+  AliasAnalysis *AA;
   /// Target Library Info.
   TargetLibraryInfo *TLI;
 
@@ -661,8 +675,6 @@ private:
   /// Can we assume the absence of NaNs.
   bool HasFunNoNaNAttr;
 
-  unsigned MaxSafeDepDistBytes;
-
   /// Utility to determine whether loads can be speculated.
   LoadHoisting LoadSpeculation;
 };
@@ -891,6 +903,7 @@ struct LoopVectorize : public LoopPass {
   LoopInfo *LI;
   TargetTransformInfo *TTI;
   DominatorTree *DT;
+  AliasAnalysis *AA;
   TargetLibraryInfo *TLI;
 
   virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
@@ -903,6 +916,7 @@ struct LoopVectorize : public LoopPass {
     LI = &getAnalysis<LoopInfo>();
     TTI = &getAnalysis<TargetTransformInfo>();
    DT = &getAnalysis<DominatorTree>();
+    AA = getAnalysisIfAvailable<AliasAnalysis>();
    TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
 
     if (DL == NULL) {
@@ -921,7 +935,7 @@ struct LoopVectorize : public LoopPass {
     }
 
     // Check if it is legal to vectorize the loop.
-    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI);
+    LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI);
     if (!LVL.canVectorize()) {
       DEBUG(dbgs() << "LV: Not vectorizing.\n");
       return false;
     }
@@ -996,8 +1010,7 @@ struct LoopVectorize : public LoopPass {
 void
 LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
                                                        Loop *Lp, Value *Ptr,
-                                                       bool WritePtr,
-                                                       unsigned DepSetId) {
+                                                       bool WritePtr) {
   const SCEV *Sc = SE->getSCEV(Ptr);
   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
   assert(AR && "Invalid addrec expression");
@@ -1007,7 +1020,6 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
   Starts.push_back(AR->getStart());
   Ends.push_back(ScEnd);
   IsWritePtr.push_back(WritePtr);
-  DependencySetId.push_back(DepSetId);
 }
 
 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
@@ -1345,9 +1357,10 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
   if (!PtrRtCheck->Need)
     return NULL;
 
+  Instruction *MemoryRuntimeCheck = 0;
   unsigned NumPointers = PtrRtCheck->Pointers.size();
-  SmallVector<TrackingVH<Value> , 2> Starts;
-  SmallVector<TrackingVH<Value> , 2> Ends;
+  SmallVector<Value*, 2> Starts;
+  SmallVector<Value*, 2> Ends;
 
   SCEVExpander Exp(*SE, "induction");
 
@@ -1374,18 +1387,13 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
   }
 
   IRBuilder<> ChkBuilder(Loc);
-  // Our instructions might fold to a constant.
-  Value *MemoryRuntimeCheck = 0;
+
   for (unsigned i = 0; i < NumPointers; ++i) {
     for (unsigned j = i+1; j < NumPointers; ++j) {
       // No need to check if two readonly pointers intersect.
       if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
         continue;
 
-      // Only need to check pointers between two different dependency sets.
-      if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
-        continue;
-
       Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
       Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
       Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc");
@@ -1397,18 +1405,12 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
       if (MemoryRuntimeCheck)
         IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
                                          "conflict.rdx");
-      MemoryRuntimeCheck = IsConflict;
+
+      MemoryRuntimeCheck = cast<Instruction>(IsConflict);
     }
   }
 
-  // We have to do this trickery because the IRBuilder might fold the check to a
-  // constant expression in which case there is no Instruction anchored in a
-  // the block.
-  LLVMContext &Ctx = Loc->getContext();
-  Instruction * Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
-                                                  ConstantInt::getTrue(Ctx));
-  ChkBuilder.Insert(Check, "memcheck.conflict");
-  return Check;
+  return MemoryRuntimeCheck;
 }
 
 void
@@ -2979,7 +2981,7 @@ bool AccessAnalysis::canCheckPtrAtRT(
       // Each access has its own dependence set.
       DepId = RunningDepId++;
 
-      RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId);
+      //RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId);
 
       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr <<"\n");
     } else {
@@ -3461,29 +3463,53 @@ MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
   return true;
 }
 
+AliasAnalysis::Location
+LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) {
+  if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
+    return AA->getLocation(Store);
+  else if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
+    return AA->getLocation(Load);
+
+  llvm_unreachable("Should be either load or store instruction");
+}
+
+bool
+LoopVectorizationLegality::hasPossibleGlobalWriteReorder(
+                                            Value *Object,
+                                            Instruction *Inst,
+                                            AliasMultiMap& WriteObjects,
+                                            unsigned MaxByteWidth) {
+
+  AliasAnalysis::Location ThisLoc = getLoadStoreLocation(Inst);
+
+  std::vector<Instruction*>::iterator
+              it = WriteObjects[Object].begin(),
+              end = WriteObjects[Object].end();
+
+  for (; it != end; ++it) {
+    Instruction* I = *it;
+    if (I == Inst)
+      continue;
+
+    AliasAnalysis::Location ThatLoc = getLoadStoreLocation(I);
+    if (AA->alias(ThisLoc.getWithNewSize(MaxByteWidth),
+                  ThatLoc.getWithNewSize(MaxByteWidth)))
+      return true;
+  }
+  return false;
+}
+
 bool LoopVectorizationLegality::canVectorizeMemory() {
   typedef SmallVector<Value*, 16> ValueVector;
   typedef SmallPtrSet<Value*, 16> ValueSet;
-
-  // Stores a pair of memory access location and whether the access is a store
-  // (true) or a load (false).
-  typedef std::pair<Value*, bool> MemAccessInfo;
-  typedef DenseSet<MemAccessInfo> PtrAccessSet;
-
   // Holds the Load and Store *instructions*.
   ValueVector Loads;
   ValueVector Stores;
-
-  // Holds all the different accesses in the loop.
-  unsigned NumReads = 0;
-  unsigned NumReadWrites = 0;
-
   PtrRtCheck.Pointers.clear();
   PtrRtCheck.Need = false;
 
   const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
-  MemoryDepChecker DepChecker(SE, DL, TheLoop);
 
   // For each block.
   for (Loop::block_iterator bb = TheLoop->block_begin(),
@@ -3504,7 +3530,6 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
         return false;
       }
       Loads.push_back(Ld);
-      DepChecker.addAccess(Ld);
       continue;
     }
 
@@ -3517,7 +3542,6 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
         return false;
       }
       Stores.push_back(St);
-      DepChecker.addAccess(St);
     }
   } // next instr.
   } // next block.
@@ -3532,8 +3556,10 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     return true;
   }
 
-  AccessAnalysis::DepCandidates DependentAccesses;
-  AccessAnalysis Accesses(DL, DependentAccesses);
+  // Holds the read and read-write *pointers* that we find. These maps hold
+  // unique values for pointers (so no need for multi-map).
+  AliasMap Reads;
+  AliasMap ReadWrites;
 
   // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
   // multiple times on the same object. If the ptr is accessed twice, once
@@ -3552,12 +3578,10 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       return false;
    }
 
-    // If we did *not* see this pointer before, insert it to the read-write
-    // list. At this phase it is only a 'write' list.
-    if (Seen.insert(Ptr)) {
-      ++NumReadWrites;
-      Accesses.addStore(Ptr);
-    }
+    // If we did *not* see this pointer before, insert it to
+    // the read-write list. At this phase it is only a 'write' list.
+    if (Seen.insert(Ptr))
+      ReadWrites.insert(std::make_pair(Ptr, ST));
   }
 
   if (IsAnnotatedParallel) {
@@ -3567,7 +3591,6 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     return true;
   }
 
-  SmallPtrSet<Value*, 16> ReadOnlyPtr;
   for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
     LoadInst *LD = cast<LoadInst>(*I);
     Value* Ptr = LD->getPointerOperand();
@@ -3579,44 +3602,51 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     // If the address of i is unknown (for example A[B[i]]) then we may
     // read a few words, modify, and write a few words, and some of the
     // words may be written to the same address.
-    bool IsReadOnlyPtr = false;
-    if (Seen.insert(Ptr) || !isStridedPtr(SE, DL, Ptr, TheLoop)) {
-      ++NumReads;
-      IsReadOnlyPtr = true;
-    }
-    Accesses.addLoad(Ptr, IsReadOnlyPtr);
+    if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr))
+      Reads.insert(std::make_pair(Ptr, LD));
   }
 
   // If we write (or read-write) to a single destination and there are no
   // other reads in this loop then is it safe to vectorize.
-  if (NumReadWrites == 1 && NumReads == 0) {
+  if (ReadWrites.size() == 1 && Reads.size() == 0) {
     DEBUG(dbgs() << "LV: Found a write-only loop!\n");
     return true;
   }
 
-  // Build dependence sets and check whether we need a runtime pointer bounds
-  // check.
-  Accesses.buildDependenceSets();
-  bool NeedRTCheck = Accesses.isRTCheckNeeded();
+  unsigned NumReadPtrs = 0;
+  unsigned NumWritePtrs = 0;
 
   // Find pointers with computable bounds. We are going to use this information
   // to place a runtime bound check.
-  unsigned NumComparisons = 0;
-  bool CanDoRT = false;
-  if (NeedRTCheck)
-    CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop);
+  bool CanDoRT = true;
+  AliasMap::iterator MI, ME;
+  for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
+    Value *V = (*MI).first;
+    if (hasComputableBounds(V)) {
+      PtrRtCheck.insert(SE, TheLoop, V, true);
+      NumWritePtrs++;
+      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
+    } else {
+      CanDoRT = false;
+      break;
+    }
+  }
+  for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
+    Value *V = (*MI).first;
+    if (hasComputableBounds(V)) {
+      PtrRtCheck.insert(SE, TheLoop, V, false);
+      NumReadPtrs++;
+      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
+    } else {
+      CanDoRT = false;
+      break;
+    }
+  }
-
-  DEBUG(dbgs() << "LV: We need to do " << NumComparisons <<
-        " pointer comparisons.\n");
-
-  // If we only have one set of dependences to check pointers among we don't
-  // need a runtime check.
-  if (NumComparisons == 0 && NeedRTCheck)
-    NeedRTCheck = false;
-
-  // Check that we did not collect too many pointers or found a unsizeable
-  // pointer.
+  // Check that we did not collect too many pointers or found a
+  // unsizeable pointer.
+  unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1));
+  DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n");
   if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
     PtrRtCheck.reset();
     CanDoRT = false;
@@ -3626,6 +3656,113 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
   }
 
+  bool NeedRTCheck = false;
+
+  // Biggest vectorized access possible, vector width * unroll factor.
+  // TODO: We're being very pessimistic here, find a way to know the
+  // real access width before getting here.
+  unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) *
+                          TTI->getMaximumUnrollFactor();
+  // Now that the pointers are in two lists (Reads and ReadWrites), we
+  // can check that there are no conflicts between each of the writes and
+  // between the writes to the reads.
+  // Note that WriteObjects duplicates the stores (indexed now by underlying
+  // objects) to avoid pointing to elements inside ReadWrites.
+  // TODO: Maybe create a new type where they can interact without duplication.
+  AliasMultiMap WriteObjects;
+  ValueVector TempObjects;
+
+  // Check that the read-writes do not conflict with other read-write
+  // pointers.
+  bool AllWritesIdentified = true;
+  for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
+    Value *Val = (*MI).first;
+    Instruction *Inst = (*MI).second;
+
+    GetUnderlyingObjects(Val, TempObjects, DL);
+    for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
+         UI != UE; ++UI) {
+      if (!isIdentifiedObject(*UI)) {
+        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **UI <<"\n");
+        NeedRTCheck = true;
+        AllWritesIdentified = false;
+      }
+
+      // Never seen it before, can't alias.
+      if (WriteObjects[*UI].empty()) {
+        DEBUG(dbgs() << "LV: Adding Underlying value:" << **UI <<"\n");
+        WriteObjects[*UI].push_back(Inst);
+        continue;
+      }
+      // Direct alias found.
+      if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
+        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
+              << **UI <<"\n");
+        return false;
+      }
+      DEBUG(dbgs() << "LV: Found a conflicting global value:"
+            << **UI <<"\n");
+      DEBUG(dbgs() << "LV: While examining store:" << *Inst <<"\n");
+      DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
+
+      // If global alias, make sure they do alias.
+      if (hasPossibleGlobalWriteReorder(*UI,
+                                        Inst,
+                                        WriteObjects,
+                                        MaxByteWidth)) {
+        DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **UI
+              << "\n");
+        return false;
+      }
+
+      // Didn't alias, insert into map for further reference.
+      WriteObjects[*UI].push_back(Inst);
+    }
+    TempObjects.clear();
+  }
+
+  /// Check that the reads don't conflict with the read-writes.
+  for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
+    Value *Val = (*MI).first;
+    GetUnderlyingObjects(Val, TempObjects, DL);
+    for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
+         UI != UE; ++UI) {
+      // If all of the writes are identified then we don't care if the read
+      // pointer is identified or not.
+      if (!AllWritesIdentified && !isIdentifiedObject(*UI)) {
+        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **UI <<"\n");
+        NeedRTCheck = true;
+      }
+
+      // Never seen it before, can't alias.
+      if (WriteObjects[*UI].empty())
+        continue;
+      // Direct alias found.
+      if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
+        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
+              << **UI <<"\n");
+        return false;
+      }
+      DEBUG(dbgs() << "LV: Found a global value: "
+            << **UI <<"\n");
+      Instruction *Inst = (*MI).second;
+      DEBUG(dbgs() << "LV: While examining load:" << *Inst <<"\n");
+      DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
+
+      // If global alias, make sure they do alias.
+      if (hasPossibleGlobalWriteReorder(*UI,
+                                        Inst,
+                                        WriteObjects,
+                                        MaxByteWidth)) {
+        DEBUG(dbgs() << "LV: Found a possible read-write reorder:" << **UI
+              << "\n");
+        return false;
+      }
+    }
+    TempObjects.clear();
+  }
+
+  PtrRtCheck.Need = NeedRTCheck;
   if (NeedRTCheck && !CanDoRT) {
     DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
           "the array bounds.\n");
@@ -3633,20 +3770,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     return false;
   }
 
-  PtrRtCheck.Need = NeedRTCheck;
-
-  bool CanVecMem = true;
-  if (Accesses.isDependencyCheckNeeded()) {
-    DEBUG(dbgs() << "LV: Checking memory dependencies\n");
-    CanVecMem = DepChecker.areDepsSafe(DependentAccesses,
-                                       Accesses.getDependenciesToCheck());
-    MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
-  }
-
   DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
         " need a runtime memory check.\n");
-
-  return CanVecMem;
+  return true;
 }
 
 static bool hasMultipleUsesOf(Instruction *I,
@@ -3999,6 +4125,15 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
   return true;
 }
 
+bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
+  const SCEV *PhiScev = SE->getSCEV(Ptr);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+  if (!AR)
+    return false;
+
+  return AR->isAffine();
+}
+
 LoopVectorizationCostModel::VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
                                                       unsigned UserVF) {
@@ -4015,10 +4150,6 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
   unsigned WidestType = getWidestType();
   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
-  unsigned MaxSafeDepDist = -1U;
-  if (Legal->getMaxSafeDepDistBytes() != -1U)
-    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
-  WidestRegister = WidestRegister < MaxSafeDepDist ? WidestRegister : MaxSafeDepDist;
   unsigned MaxVectorSize = WidestRegister / WidestType;
   DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
   DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n");
@@ -4152,10 +4283,6 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
   if (OptForSize)
     return 1;
 
-  // We used the distance for the unroll factor.
-  if (Legal->getMaxSafeDepDistBytes() != -1U)
-    return 1;
-
   // Do not unroll loops with a relatively small trip count.
   unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
@@ -4552,6 +4679,7 @@ Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
 char LoopVectorize::ID = 0;
 static const char lv_name[] = "Loop Vectorization";
 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
diff --git a/test/Transforms/LoopVectorize/12-12-11-if-conv.ll b/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
index 6ef101074de..bab6300f2e7 100644
--- a/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
+++ b/test/Transforms/LoopVectorize/12-12-11-if-conv.ll
@@ -30,7 +30,7 @@ if.then:                                          ; preds = %for.body
 if.end:                                           ; preds = %for.body, %if.then
   %z.0 = phi i32 [ %add1, %if.then ], [ 9, %for.body ]
   store i32 %z.0, i32* %arrayidx, align 4
-  %indvars.iv.next = add nsw i64 %indvars.iv, 1
+  %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp eq i32 %lftr.wideiv, %x
   br i1 %exitcond, label %for.end, label %for.body
diff --git a/test/Transforms/LoopVectorize/memdep.ll b/test/Transforms/LoopVectorize/memdep.ll
deleted file mode 100644
index 56e86a4c0dc..00000000000
--- a/test/Transforms/LoopVectorize/memdep.ll
+++ /dev/null
@@ -1,222 +0,0 @@
-; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-unroll=1 -S | FileCheck %s
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -S | FileCheck %s -check-prefix=WIDTH
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-
-; Vectorization with dependence checks.
-
-; No plausible dependence - can be vectorized.
-; for (i = 0; i < 1024; ++i)
-;   A[i] = A[i + 1] + 1;
-
-; CHECK: f1_vec
-; CHECK: <2 x i32>
-
-define void @f1_vec(i32* %A) {
-entry:
-  br label %for.body
-
-for.body:
-  %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %indvars.iv.next = add i32 %indvars.iv, 1
-  %arrayidx = getelementptr inbounds i32* %A, i32 %indvars.iv.next
-  %0 = load i32* %arrayidx, align 4
-  %add1 = add nsw i32 %0, 1
-  %arrayidx3 = getelementptr inbounds i32* %A, i32 %indvars.iv
-  store i32 %add1, i32* %arrayidx3, align 4
-  %exitcond = icmp ne i32 %indvars.iv.next, 1024
-  br i1 %exitcond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
-
-; Plausible dependence of distance 1 - can't be vectorized.
-; for (i = 0; i < 1024; ++i)
-;   A[i+1] = A[i] + 1;
-
-; CHECK: f2_novec
-; CHECK-NOT: <2 x i32>
-
-define void @f2_novec(i32* %A) {
-entry:
-  br label %for.body
-
-for.body:
-  %indvars.iv = phi i32 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32* %A, i32 %indvars.iv
-  %0 = load i32* %arrayidx, align 4
-  %add = add nsw i32 %0, 1
-  %indvars.iv.next = add i32 %indvars.iv, 1
-  %arrayidx3 = getelementptr inbounds i32* %A, i32 %indvars.iv.next
-  store i32 %add, i32* %arrayidx3, align 4
-  %exitcond = icmp ne i32 %indvars.iv.next, 1024
-  br i1 %exitcond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
-
-; Plausible dependence of distance 2 - can be vectorized with a width of 2.
-; for (i = 0; i < 1024; ++i)
-;   A[i+2] = A[i] + 1;
-
-; CHECK: f3_vec_len
-; CHECK: <2 x i32>
-
-; WIDTH: f3_vec_len
-; WIDTH-NOT: <4 x i32>
-
-define void @f3_vec_len(i32* %A) {
-entry:
-  br label %for.body
-
-for.body:
-  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
-  %idxprom = sext i32 %i.01 to i64
-  %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom
-  %0 = load i32* %arrayidx, align 4
-  %add = add nsw i32 %0, 1
-  %add1 = add nsw i32 %i.01, 2
-  %idxprom2 = sext i32 %add1 to i64
-  %arrayidx3 = getelementptr inbounds i32* %A, i64 %idxprom2
-  store i32 %add, i32* %arrayidx3, align 4
-  %inc = add nsw i32 %i.01, 1
-  %cmp = icmp slt i32 %inc, 1024
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
-
-; Plausible dependence of distance 1 - cannot be vectorized (without reordering
-; accesses).
-; for (i = 0; i < 1024; ++i) {
-;   B[i] = A[i];
-;   A[i] = B[i + 1];
-; }
-
-; CHECK: f5
-; CHECK-NOT: <2 x i32>
-
-define void @f5(i32* %A, i32* %B) {
-entry:
-  br label %for.body
-
-for.body:
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
-  %0 = load i32* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv
-  store i32 %0, i32* %arrayidx2, align 4
-  %indvars.iv.next = add nsw i64 %indvars.iv, 1
-  %arrayidx4 = getelementptr inbounds i32* %B, i64 %indvars.iv.next
-  %1 = load i32* %arrayidx4, align 4
-  store i32 %1, i32* %arrayidx, align 4
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp ne i32 %lftr.wideiv, 1024
-  br i1 %exitcond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
-
-; Dependence through a phi node - must not vectorize.
-; for (i = 0; i < 1024; ++i) {
-;   a[i+1] = tmp;
-;   tmp = a[i];
-; }
-
-; CHECK: f6
-; CHECK-NOT: <2 x i32>
-
-define i32 @f6(i32* %a, i32 %tmp) {
-entry:
-  br label %for.body
-
-for.body:
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %tmp.addr.08 = phi i32 [ %tmp, %entry ], [ %0, %for.body ]
-  %indvars.iv.next = add nsw i64 %indvars.iv, 1
-  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv.next
-  store i32 %tmp.addr.08, i32* %arrayidx, align 4
-  %arrayidx3 = getelementptr inbounds i32* %a, i64 %indvars.iv
-  %0 = load i32* %arrayidx3, align 4
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp ne i32 %lftr.wideiv, 1024
-  br i1 %exitcond, label %for.body, label %for.end
-
-for.end:
-  ret i32 undef
-}
-
-; Don't vectorize true loop carried dependencies that are not a multiple of the
-; vector width.
-; Example:
-; for (int i = ...; ++i) {
-;   a[i] = a[i-3] + ...;
-; It is a bad idea to vectorize this loop because store-load forwarding will not
-; happen.
-;
-
-; CHECK: @nostoreloadforward
-; CHECK-NOT: <2 x i32>
-
-define void @nostoreloadforward(i32* %A) {
-entry:
-  br label %for.body
-
-for.body:
-  %indvars.iv = phi i64 [ 16, %entry ], [ %indvars.iv.next, %for.body ]
-  %0 = add nsw i64 %indvars.iv, -3
-  %arrayidx = getelementptr inbounds i32* %A, i64 %0
-  %1 = load i32* %arrayidx, align 4
-  %2 = add nsw i64 %indvars.iv, 4
-  %arrayidx2 = getelementptr inbounds i32* %A, i64 %2
-  %3 = load i32* %arrayidx2, align 4
-  %add3 = add nsw i32 %3, %1
-  %arrayidx5 = getelementptr inbounds i32* %A, i64 %indvars.iv
-  store i32 %add3, i32* %arrayidx5, align 4
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp ne i32 %lftr.wideiv, 128
-  br i1 %exitcond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
-
-; Example:
-; for (int i = ...; ++i) {
-;   a[i] = b[i];
-;   c[i] = a[i-3] + ...;
-; It is a bad idea to vectorize this loop because store-load forwarding will not
-; happen.
-;
-
-; CHECK: @nostoreloadforward2
-; CHECK-NOT: <2 x i32>
-
-define void @nostoreloadforward2(i32* noalias %A, i32* noalias %B, i32* noalias %C) {
-entry:
-  br label %for.body
-
-for.body:
-  %indvars.iv = phi i64 [ 16, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx = getelementptr inbounds i32* %B, i64 %indvars.iv
-  %0 = load i32* %arrayidx, align 4
-  %arrayidx2 = getelementptr inbounds i32* %A, i64 %indvars.iv
-  store i32 %0, i32* %arrayidx2, align 4
-  %1 = add nsw i64 %indvars.iv, -3
-  %arrayidx4 = getelementptr inbounds i32* %A, i64 %1
-  %2 = load i32* %arrayidx4, align 4
-  %arrayidx6 = getelementptr inbounds i32* %C, i64 %indvars.iv
-  store i32 %2, i32* %arrayidx6, align 4
-  %indvars.iv.next = add i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp ne i32 %lftr.wideiv, 128
-  br i1 %exitcond, label %for.body, label %for.end
-
-for.end:
-  ret void
-}
diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll
index 47722566e17..014c4fc48f8 100644
--- a/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/test/Transforms/LoopVectorize/runtime-check.ll
@@ -12,7 +12,7 @@ target triple = "x86_64-apple-macosx10.9.0"
 ;CHECK: for.body.preheader:
 ;CHECK: br i1 %cmp.zero, label %middle.block, label %vector.memcheck
 ;CHECK: vector.memcheck:
-;CHECK: br i1 %memcheck.conflict, label %middle.block, label %vector.ph
+;CHECK: br i1 %found.conflict, label %middle.block, label %vector.ph
 ;CHECK: load <4 x float>
 define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable ssp {
 entry: