diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 2b07fa2c015..91c04633244 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -99,7 +99,10 @@ struct SLPVectorizer : public FunctionPass { } // Try to hoist some of the scalarization code to the preheader. - if (BBChanged) hoistGatherSequence(LI, BB, R); + if (BBChanged) { + hoistGatherSequence(LI, BB, R); + Changed |= vectorizeUsingGatherHints(R.getGatherSeqInstructions()); + } Changed |= BBChanged; } @@ -130,8 +133,10 @@ private: /// \brief Try to vectorize a chain that starts at two arithmetic instrs. bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); - /// \brief Try to vectorize a list of operands. - bool tryToVectorizeList(ArrayRef VL, BoUpSLP &R); + /// \brief Try to vectorize a list of operands. If \p NeedExtracts is true + /// then we calculate the cost of extracting the scalars from the vector. + /// \returns true if a value was vectorized. + bool tryToVectorizeList(ArrayRef VL, BoUpSLP &R, bool NeedExtracts); /// \brief Try to vectorize a chain that may start at the operands of \V; bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); @@ -143,6 +148,13 @@ private: /// all of the sources are loop invariant. void hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, BoUpSLP &R); + /// \brief Try to vectorize additional sequences in different basic blocks + /// based on values that we gathered in previous blocks. The list \p Gathers + /// holds the gather InsertElement instructions that were generated during + /// vectorization. + /// \returns True if some code was vectorized. + bool vectorizeUsingGatherHints(BoUpSLP::InstrList &Gathers); + /// \brief Scan the basic block and look for patterns that are likely to start /// a vectorization chain. bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R); @@ -179,10 +191,11 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { if (!A || !B) return false; Value *VL[] = { A, B }; - return tryToVectorizeList(VL, R); + return tryToVectorizeList(VL, R, true); } -bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R) { +bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, + bool NeedExtracts) { if (VL.size() < 2) return false; @@ -204,7 +217,7 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R) { } int Cost = R.getTreeCost(VL); - int ExtrCost = R.getScalarizationCost(VL); + int ExtrCost = NeedExtracts ? R.getScalarizationCost(VL) : 0; DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost << " Cost of extract:" << ExtrCost << ".\n"); if ((Cost+ExtrCost) >= -SLPCostThreshold) return false; @@ -307,7 +320,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } if (Incoming.size() > 1) - Changed |= tryToVectorizeList(Incoming, R); + Changed |= tryToVectorizeList(Incoming, R, true); } return Changed; @@ -329,6 +342,51 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { return Changed; } +bool SLPVectorizer::vectorizeUsingGatherHints(BoUpSLP::InstrList &Gathers) { + SmallVector Seq; + bool Changed = false; + for (int i = 0, e = Gathers.size(); i < e; ++i) { + InsertElementInst *IEI = dyn_cast_or_null(Gathers[i]); + + if (IEI) { + if (Instruction *I = dyn_cast(IEI->getOperand(1))) + Seq.push_back(I); + } else { + + if (!Seq.size()) + continue; + + Instruction *I = cast(Seq[0]); + BasicBlock *BB = I->getParent(); + + DEBUG(dbgs()<<"SLP: Inspecting a gather list of size " << Seq.size() << + " in " << BB->getName() << ".\n"); + + // Check if the gathered values have multiple uses. If they only have one + // user then we know that the insert/extract pair will go away. + bool HasMultipleUsers = false; + for (int i=0; e = Seq.size(), i < e; ++i) { + if (!Seq[i]->hasOneUse()) { + HasMultipleUsers = true; + break; + } + } + + BoUpSLP BO(BB, SE, DL, TTI, AA, LI->getLoopFor(BB)); + + if (tryToVectorizeList(Seq, BO, HasMultipleUsers)) { + DEBUG(dbgs()<<"SLP: Vectorized a gather list of len " << Seq.size() << + " in " << BB->getName() << ".\n"); + Changed = true; + } + + Seq.clear(); + } + } + + return Changed; +} + void SLPVectorizer::hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, BoUpSLP &R) { // Check if this block is inside a loop. @@ -344,12 +402,14 @@ void SLPVectorizer::hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, // Mark the insertion point for the block. Instruction *Location = PreHeader->getTerminator(); - BoUpSLP::ValueList &Gathers = R.getGatherSeqInstructions(); - for (BoUpSLP::ValueList::iterator it = Gathers.begin(), e = Gathers.end(); + BoUpSLP::InstrList &Gathers = R.getGatherSeqInstructions(); + for (BoUpSLP::InstrList::iterator it = Gathers.begin(), e = Gathers.end(); it != e; ++it) { - InsertElementInst *Insert = dyn_cast(*it); + InsertElementInst *Insert = dyn_cast_or_null(*it); // The InsertElement sequence can be simplified into a constant. + // Also Ignore NULL pointers because they are only here to separate + // sequences. if (!Insert) continue; diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp index e79f08a56dd..88c457d1176 100644 --- a/lib/Transforms/Vectorize/VecUtils.cpp +++ b/lib/Transforms/Vectorize/VecUtils.cpp @@ -731,9 +731,13 @@ Value *BoUpSLP::Scalarize(ArrayRef VL, VectorType *Ty) { // Remember that this instruction is used as part of a 'gather' sequence. // The caller of the bottom-up slp vectorizer can try to hoist the sequence // if the users are outside of the basic block. - GatherInstructions.push_back(Vec); + if (InsertElementInst *IEI = dyn_cast(Vec)) + GatherInstructions.push_back(IEI); } + // Mark the end of the gather sequence. + GatherInstructions.push_back(0); + for (unsigned i = 0; i < Ty->getNumElements(); ++i) VectorizedValues[VL[i]] = Vec; diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h index f76ae8413d7..28a61e3c0dd 100644 --- a/lib/Transforms/Vectorize/VecUtils.h +++ b/lib/Transforms/Vectorize/VecUtils.h @@ -34,6 +34,7 @@ class Loop; /// Bottom Up SLP vectorization utility class. struct BoUpSLP { typedef SmallVector ValueList; + typedef SmallVector InstrList; typedef SmallPtrSet ValueSet; typedef SmallVector StoreList; static const int max_cost = 1<<20; @@ -78,7 +79,7 @@ struct BoUpSLP { /// \returns the list of new instructions that were added in order to collect /// scalars into vectors. This list can be used to further optimize the gather /// sequences. - ValueList &getGatherSeqInstructions() {return GatherInstructions; } + InstrList &getGatherSeqInstructions() {return GatherInstructions; } private: /// \brief This method contains the recursive part of getTreeCost. @@ -166,7 +167,9 @@ private: /// A list of instructions that are used when gathering scalars into vectors. /// In many cases these instructions can be hoisted outside of the BB. /// Iterating over this list is faster than calling LICM. - ValueList GatherInstructions; + /// Notice: We insert NULL ptrs to separate between the different gather + /// sequences. + InstrList GatherInstructions; /// Instruction builder to construct the vectorized tree. IRBuilder<> Builder; diff --git a/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll b/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll new file mode 100644 index 00000000000..8246453519b --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll @@ -0,0 +1,54 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; int foo(double *A, float *B, int g) { +; float B0 = B[0]; +; float B1 = B[1]; <----- BasicBlock #1 +; B0 += 5; +; B1 += 8; +; +; if (g) bar(); +; +; A[0] += B0; <------- BasicBlock #3 +; A[1] += B1; +; } + + +;CHECK: @foo +;CHECK: load <2 x float> +;CHECK: fadd <2 x float> +;CHECK: call i32 +;CHECK: load <2 x double> +;CHECK: fadd <2 x double> +;CHECK: store <2 x double> +;CHECK: ret +define i32 @foo(double* nocapture %A, float* nocapture %B, i32 %g) { +entry: + %0 = load float* %B, align 4 + %arrayidx1 = getelementptr inbounds float* %B, i64 1 + %1 = load float* %arrayidx1, align 4 + %add = fadd float %0, 5.000000e+00 + %add2 = fadd float %1, 8.000000e+00 + %tobool = icmp eq i32 %g, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: + %call = tail call i32 (...)* @bar() + br label %if.end + +if.end: + %conv = fpext float %add to double + %2 = load double* %A, align 8 + %add4 = fadd double %conv, %2 + store double %add4, double* %A, align 8 + %conv5 = fpext float %add2 to double + %arrayidx6 = getelementptr inbounds double* %A, i64 1 + %3 = load double* %arrayidx6, align 8 + %add7 = fadd double %conv5, %3 + store double %add7, double* %arrayidx6, align 8 + ret i32 undef +} + +declare i32 @bar(...)