diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7f45c7a3571..963dad5ff20 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1648,58 +1648,12 @@ public:
 
   bool hasStride(Value *V) { return LAI->hasStride(V); }
 
-  /// Returns true if the target machine supports masked store operation
-  /// for the given \p DataType and kind of access to \p Ptr.
-  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
-    return isConsecutivePtr(Ptr) && TTI->isLegalMaskedStore(DataType);
-  }
-
-  /// Returns true if the target machine supports masked load operation
-  /// for the given \p DataType and kind of access to \p Ptr.
-  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
-    return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
-  }
-
-  /// Returns true if the target machine supports masked scatter operation
-  /// for the given \p DataType.
-  bool isLegalMaskedScatter(Type *DataType) {
-    return TTI->isLegalMaskedScatter(DataType);
-  }
-
-  /// Returns true if the target machine supports masked gather operation
-  /// for the given \p DataType.
-  bool isLegalMaskedGather(Type *DataType) {
-    return TTI->isLegalMaskedGather(DataType);
-  }
-
-  /// Returns true if the target machine can represent \p V as a masked gather
-  /// or scatter operation.
-  bool isLegalGatherOrScatter(Value *V) {
-    auto *LI = dyn_cast<LoadInst>(V);
-    auto *SI = dyn_cast<StoreInst>(V);
-    if (!LI && !SI)
-      return false;
-    auto *Ptr = getPointerOperand(V);
-    auto *Ty = cast<PointerType>(Ptr->getType())->getElementType();
-    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
-  }
-
   /// Returns true if vector representation of the instruction \p I
   /// requires mask.
   bool isMaskRequired(const Instruction *I) { return (MaskedOp.count(I) != 0); }
 
   unsigned getNumStores() const { return LAI->getNumStores(); }
   unsigned getNumLoads() const { return LAI->getNumLoads(); }
-  unsigned getNumPredStores() const { return NumPredStores; }
-
-  /// Returns true if \p I is an instruction that will be scalarized with
-  /// predication. Such instructions include conditional stores and
-  /// instructions that may divide by zero.
-  bool isScalarWithPredication(Instruction *I);
-
-  /// Returns true if \p I is a memory instruction with consecutive memory
-  /// access that can be widened.
-  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
 
   // Returns true if the NoNaN attribute is set on the function.
   bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
@@ -1753,8 +1707,6 @@ private:
     return LAI ? &LAI->getSymbolicStrides() : nullptr;
   }
 
-  unsigned NumPredStores = 0;
-
   /// The loop that we evaluate.
   Loop *TheLoop;
 
@@ -2060,7 +2012,53 @@ public:
     collectLoopScalars(VF);
   }
 
+  /// Returns true if the target machine supports masked store operation
+  /// for the given \p DataType and kind of access to \p Ptr.
+  bool isLegalMaskedStore(Type *DataType, Value *Ptr) {
+    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedStore(DataType);
+  }
+
+  /// Returns true if the target machine supports masked load operation
+  /// for the given \p DataType and kind of access to \p Ptr.
+  bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
+    return Legal->isConsecutivePtr(Ptr) && TTI.isLegalMaskedLoad(DataType);
+  }
+
+  /// Returns true if the target machine supports masked scatter operation
+  /// for the given \p DataType.
+  bool isLegalMaskedScatter(Type *DataType) {
+    return TTI.isLegalMaskedScatter(DataType);
+  }
+
+  /// Returns true if the target machine supports masked gather operation
+  /// for the given \p DataType.
+  bool isLegalMaskedGather(Type *DataType) {
+    return TTI.isLegalMaskedGather(DataType);
+  }
+
+  /// Returns true if the target machine can represent \p V as a masked gather
+  /// or scatter operation.
+  bool isLegalGatherOrScatter(Value *V) {
+    bool LI = isa<LoadInst>(V);
+    bool SI = isa<StoreInst>(V);
+    if (!LI && !SI)
+      return false;
+    auto *Ty = getMemInstValueType(V);
+    return (LI && isLegalMaskedGather(Ty)) || (SI && isLegalMaskedScatter(Ty));
+  }
+
+  /// Returns true if \p I is an instruction that will be scalarized with
+  /// predication. Such instructions include conditional stores and
+  /// instructions that may divide by zero.
+  bool isScalarWithPredication(Instruction *I);
+
+  /// Returns true if \p I is a memory instruction with consecutive memory
+  /// access that can be widened.
+  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+
 private:
+  unsigned NumPredStores = 0;
+
   /// \return An upper bound for the vectorization factor, larger than zero.
   /// One is returned if vectorization should best be avoided due to cost.
   unsigned computeFeasibleMaxVF(bool OptForSize, unsigned ConstTripCount);
@@ -2112,6 +2110,10 @@ private:
   /// as a vector operation.
   bool isConsecutiveLoadOrStore(Instruction *I);
 
+  /// Returns true if an artificially high cost for emulated masked memrefs
+  /// should be used.
+  bool useEmulatedMaskMemRefHack(Instruction *I);
+
   /// Create an analysis remark that explains why vectorization failed
   ///
   /// \p RemarkName is the identifier for the remark. \return the remark object
@@ -5421,14 +5423,22 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
   Scalars[VF].insert(Worklist.begin(), Worklist.end());
 }
 
-bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
-  if (!blockNeedsPredication(I->getParent()))
+bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) {
+  if (!Legal->blockNeedsPredication(I->getParent()))
     return false;
   switch(I->getOpcode()) {
   default:
     break;
-  case Instruction::Store:
-    return !isMaskRequired(I);
+  case Instruction::Load:
+  case Instruction::Store: {
+    if (!Legal->isMaskRequired(I))
+      return false;
+    auto *Ptr = getPointerOperand(I);
+    auto *Ty = getMemInstValueType(I);
+    return isa<LoadInst>(I) ?
+        !(isLegalMaskedLoad(Ty, Ptr) || isLegalMaskedGather(Ty))
+      : !(isLegalMaskedStore(Ty, Ptr) || isLegalMaskedScatter(Ty));
+  }
   case Instruction::UDiv:
   case Instruction::SDiv:
   case Instruction::SRem:
@@ -5438,8 +5448,8 @@ bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
   return false;
 }
 
-bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
-                                                              unsigned VF) {
+bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
+                                                               unsigned VF) {
   // Get and ensure we have a valid memory instruction.
   LoadInst *LI = dyn_cast<LoadInst>(I);
   StoreInst *SI = dyn_cast<StoreInst>(I);
@@ -5448,7 +5458,7 @@ bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
   auto *Ptr = getPointerOperand(I);
 
   // In order to be widened, the pointer should be consecutive, first of all.
-  if (!isConsecutivePtr(Ptr))
+  if (!Legal->isConsecutivePtr(Ptr))
     return false;
 
   // If the instruction is a store located in a predicated block, it will be
@@ -5703,39 +5713,26 @@ bool LoopVectorizationLegality::blockCanBePredicated(
       if (!LI)
         return false;
       if (!SafePtrs.count(LI->getPointerOperand())) {
-        if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) ||
-            isLegalMaskedGather(LI->getType())) {
-          MaskedOp.insert(LI);
-          continue;
-        }
         // !llvm.mem.parallel_loop_access implies if-conversion safety.
-        if (IsAnnotatedParallel)
-          continue;
-        return false;
+        // Otherwise, record that the load needs (real or emulated) masking
+        // and let the cost model decide.
+        if (!IsAnnotatedParallel)
+          MaskedOp.insert(LI);
+        continue;
       }
     }
 
     if (I.mayWriteToMemory()) {
      auto *SI = dyn_cast<StoreInst>(&I);
-      // We only support predication of stores in basic blocks with one
-      // predecessor.
      if (!SI)
        return false;
-
-      // Build a masked store if it is legal for the target.
-      if (isLegalMaskedStore(SI->getValueOperand()->getType(),
-                             SI->getPointerOperand()) ||
-          isLegalMaskedScatter(SI->getValueOperand()->getType())) {
-        MaskedOp.insert(SI);
-        continue;
-      }
-
-      bool isSafePtr = (SafePtrs.count(SI->getPointerOperand()) != 0);
-      bool isSinglePredecessor = SI->getParent()->getSinglePredecessor();
-
-      if (++NumPredStores > NumberOfStoresToPredicate || !isSafePtr ||
-          !isSinglePredecessor)
-        return false;
+      // Predicated store requires some form of masking:
+      // 1) masked store HW instruction,
+      // 2) emulation via load-blend-store (only if safe and legal to do so,
+      //    be aware of the race conditions), or
+      // 3) element-by-element predicate check and scalar store.
+      MaskedOp.insert(SI);
+      continue;
    }
    if (I.mayThrow())
      return false;
@@ -6050,13 +6047,6 @@ void InterleavedAccessInfo::analyzeInterleaving(
 }
 
 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
-  if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
-    ORE->emit(createMissedAnalysis("ConditionalStore")
-              << "store that is conditionally executed prevents vectorization");
-    DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
-    return None;
-  }
-
   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
     // TODO: It may be useful to do since it's still likely to be dynamically
     // uniform if the target can skip.
@@ -6183,9 +6173,7 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
 
 VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
   float Cost = expectedCost(1).first;
-#ifndef NDEBUG
   const float ScalarCost = Cost;
-#endif /* NDEBUG */
   unsigned Width = 1;
   DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
@@ -6216,6 +6204,14 @@ LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
     }
   }
 
+  if (!EnableCondStoresVectorization && NumPredStores) {
+    ORE->emit(createMissedAnalysis("ConditionalStore")
+              << "store that is conditionally executed prevents vectorization");
+    DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
+    Width = 1;
+    Cost = ScalarCost;
+  }
+
   DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
         << "LV: Vectorization seems to be not beneficial, "
         << "but was forced by a user.\n");
@@ -6267,7 +6263,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
       //        optimization to non-pointer types.
       //
       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
-          !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I))
+          !Legal->isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
         continue;
 
       MinWidth = std::min(MinWidth,
@@ -6592,6 +6588,22 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
   return RUs;
 }
 
+bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
+  // TODO: Cost model for emulated masked load/store is completely
+  // broken. This hack guides the cost model to use an artificially
+  // high enough value to practically disable vectorization with such
+  // operations, except where the previously deployed legality hack allowed
+  // using very low cost values. This is to avoid regressions coming simply
+  // from moving the "masked load/store" check from legality to cost model.
+  // Masked Load/Gather emulation was previously never allowed.
+  // Limited number of Masked Store/Scatter emulation was allowed.
+  assert(isScalarWithPredication(I) &&
+         "Expecting a scalar emulated instruction");
+  return isa<LoadInst>(I) ||
+         (isa<StoreInst>(I) &&
+          NumPredStores > NumberOfStoresToPredicate);
+}
+
 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
   // If we aren't vectorizing the loop, or if we've already collected the
   // instructions to scalarize, there's nothing to do. Collection may already
@@ -6612,11 +6624,13 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
     if (!Legal->blockNeedsPredication(BB))
       continue;
     for (Instruction &I : *BB)
-      if (Legal->isScalarWithPredication(&I)) {
+      if (isScalarWithPredication(&I)) {
         ScalarCostsTy ScalarCosts;
-        if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
+        // Do not apply discount logic if hacked cost is needed
+        // for emulated masked memrefs.
+        if (!useEmulatedMaskMemRefHack(&I) &&
+            computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
-
         // Remember that BB will remain after vectorization.
         PredicatedBBsAfterVectorization.insert(BB);
       }
@@ -6651,7 +6665,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
 
     // If the instruction is scalar with predication, it will be analyzed
     // separately. We ignore it within the context of PredInst.
-    if (Legal->isScalarWithPredication(I))
+    if (isScalarWithPredication(I))
       return false;
 
     // If any of the instruction's operands are uniform after vectorization,
@@ -6705,7 +6719,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
 
     // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
-    if (Legal->isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
      ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
                                                 true, false);
      ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
@@ -6848,9 +6862,15 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
 
   // If we have a predicated store, it may not be executed for each vector
   // lane. Scale the cost by the probability of executing the predicated
   // block.
-  if (Legal->isScalarWithPredication(I))
+  if (isScalarWithPredication(I)) {
     Cost /= getReciprocalPredBlockProb();
 
+    if (useEmulatedMaskMemRefHack(I))
+      // Artificially setting to a high enough value to practically disable
+      // vectorization with such operations.
+      Cost = 3000000;
+  }
+
   return Cost;
 }
@@ -6975,6 +6995,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
   if (VF == 1)
     return;
+  NumPredStores = 0;
   for (BasicBlock *BB : TheLoop->blocks()) {
     // For each instruction in the old loop.
     for (Instruction &I : *BB) {
@@ -6982,6 +7003,8 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
       if (!Ptr)
         continue;
 
+      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
+        NumPredStores++;
       if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
         // Scalar load + broadcast
         unsigned Cost = getUniformMemOpCost(&I, VF);
@@ -6990,7 +7013,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
       }
 
       // We assume that widening is the best solution when possible.
-      if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
+      if (memoryInstructionCanBeWidened(&I, VF)) {
        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride = Legal->isConsecutivePtr(getPointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
@@ -7017,7 +7040,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
       }
 
       unsigned GatherScatterCost =
-          Legal->isLegalGatherOrScatter(&I)
+          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : std::numeric_limits<unsigned>::max();
@@ -7178,7 +7201,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     // vector lane. Get the scalarization cost and scale this amount by the
     // probability of executing the predicated block. If the instruction is not
     // predicated, we fall through to the next case.
-    if (VF > 1 && Legal->isScalarWithPredication(I)) {
+    if (VF > 1 && isScalarWithPredication(I)) {
       unsigned Cost = 0;
 
       // These instructions have a non-void type, so account for the phi nodes
@@ -7799,7 +7822,7 @@ LoopVectorizationPlanner::tryToBlend(Instruction *I, VPlanPtr &Plan) {
 
 bool LoopVectorizationPlanner::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
                                           VFRange &Range) {
-  if (Legal->isScalarWithPredication(I))
+  if (CM.isScalarWithPredication(I))
     return false;
 
   auto IsVectorizableOpcode = [](unsigned Opcode) {
@@ -7906,7 +7929,7 @@ VPBasicBlock *LoopVectorizationPlanner::handleReplication(
       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
       Range);
 
-  bool IsPredicated = Legal->isScalarWithPredication(I);
+  bool IsPredicated = CM.isScalarWithPredication(I);
   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
 
   // Find if I uses a predicated instruction. If so, it will use its scalar
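For readers unfamiliar with the masking strategies listed in the new blockCanBePredicated() comment above, the standalone C++ sketch below illustrates what option 2 (load-blend-store emulation) and option 3 (element-by-element predication) amount to for a conditional store of the form "if (cond[i]) a[i] = x[i]". It is illustrative only and not code from this patch; the function names, array names, and the fixed width of 4 lanes are invented for the example.

#include <cstddef>

// Option 2: emulate a masked vector store with load-blend-store.
// Every lane of a[i..i+3] is read and rewritten, including lanes whose
// predicate is false, which is why the patch comment warns about race
// conditions: a concurrent write to a masked-off lane could be clobbered.
void blend_store(float *a, const float *x, const bool *cond, std::size_t i) {
  float old[4], merged[4];
  for (int l = 0; l < 4; ++l)   // vector load of the current values
    old[l] = a[i + l];
  for (int l = 0; l < 4; ++l)   // blend new and old values per lane
    merged[l] = cond[i + l] ? x[i + l] : old[l];
  for (int l = 0; l < 4; ++l)   // unconditional vector store
    a[i + l] = merged[l];
}

// Option 3: element-by-element predicate check and scalar store.
// Masked-off lanes are never touched, but every lane pays for a branch,
// which is the cost the relocated checks now have to account for.
void scalarized_store(float *a, const float *x, const bool *cond, std::size_t i) {
  for (int l = 0; l < 4; ++l)
    if (cond[i + l])
      a[i + l] = x[i + l];
}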
diff --git a/test/Transforms/LoopVectorize/conditional-assignment.ll b/test/Transforms/LoopVectorize/conditional-assignment.ll
index 25d6111d65a..7d1071b05d8 100644
--- a/test/Transforms/LoopVectorize/conditional-assignment.ll
+++ b/test/Transforms/LoopVectorize/conditional-assignment.ll
@@ -1,7 +1,7 @@
 ; RUN: opt < %s -enable-cond-stores-vec=false -loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
 ; RUN: opt < %s -enable-cond-stores-vec=false -passes=loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
 
-; CHECK: remark: source.c:2:8: loop not vectorized: store that is conditionally executed prevents vectorization
+; CHECK: remark: source.c:2:8: the cost-model indicates that vectorization is not beneficial
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
diff --git a/test/Transforms/LoopVectorize/hoist-loads.ll b/test/Transforms/LoopVectorize/hoist-loads.ll
index a20b0f6cdf3..db4774d8ba9 100644
--- a/test/Transforms/LoopVectorize/hoist-loads.ll
+++ b/test/Transforms/LoopVectorize/hoist-loads.ll
@@ -37,8 +37,9 @@ for.end:
 }
 
 ; However, we can't hoist loads whose address we have not seen unconditionally
-; accessed.
+; accessed. One wide load is fine, but not the second.
 ; CHECK-LABEL: @dont_hoist_cond_load(
+; CHECK: load <2 x float>
 ; CHECK-NOT: load <2 x float>
 define void @dont_hoist_cond_load() {
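The conditional-assignment.ll change above reflects that, with -enable-cond-stores-vec=false, the rejection now comes from the cost model rather than from legality, so the expected remark text changes. A C++ loop of roughly the following shape contains the kind of conditionally executed store the test guards; this is a hypothetical reconstruction for illustration only, since the test ships just the IR, and the names below are invented.

// Hypothetical source with a conditionally executed store; only the
// control-flow shape matters, not the particular condition or arithmetic.
void conditional_assignment(int *out, const int *in, int n) {
  for (int i = 0; i < n; ++i)
    if (in[i] & 1)        // the store below runs only on some iterations
      out[i] = in[i] * 2;
}

With this patch, such a loop is no longer rejected up front; whether it is vectorized is decided by the cost assigned to the predicated store, including the artificially high cost from useEmulatedMaskMemRefHack() when no real masked store is available.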