From 90be3c24a7162a488f68f7cce159017c10408133 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 6 Apr 2020 15:59:45 +0100 Subject: [PATCH] [VPlan] Introduce new VPWidenCallRecipe (NFC). This patch moves calls to their own recipe, to simplify the transition to VPUser for operands of VPWidenRecipe, as discussed in D76992. Subsequently additional information can be added to the recipe rather than computing it during the execute step. Reviewers: rengolin, Ayal, gilr, hsaito Reviewed By: gilr Differential Revision: https://reviews.llvm.org/D77467 --- .../Transforms/Vectorize/LoopVectorize.cpp | 170 ++++++++++-------- .../Transforms/Vectorize/VPRecipeBuilder.h | 5 + llvm/lib/Transforms/Vectorize/VPlan.cpp | 6 + llvm/lib/Transforms/Vectorize/VPlan.h | 25 +++ 4 files changed, 132 insertions(+), 74 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 58358eb5d662..eecad0c3d3b5 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -409,6 +409,9 @@ public: /// Widen a single instruction within the innermost loop. void widenInstruction(Instruction &I); + /// Widen a single call instruction within the innermost loop. + void widenCallInstruction(CallInst &I); + /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(); @@ -4224,6 +4227,7 @@ static bool mayDivideByZero(Instruction &I) { void InnerLoopVectorizer::widenInstruction(Instruction &I) { switch (I.getOpcode()) { + case Instruction::Call: case Instruction::Br: case Instruction::PHI: case Instruction::GetElementPtr: @@ -4348,54 +4352,62 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { } break; } + default: + // This instruction is not vectorized by simple widening. + LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); + llvm_unreachable("Unhandled instruction!"); + } // end of switch. +} - case Instruction::Call: { - // Ignore dbg intrinsics. - if (isa(I)) - break; - setDebugLocFromInst(Builder, &I); +void InnerLoopVectorizer::widenCallInstruction(CallInst &I) { + // Ignore dbg intrinsics. + // TODO: Debug intrinsics should be skipped/handled during VPlan construction + // rather than dropping them here. + if (isa(I)) + return; + setDebugLocFromInst(Builder, &I); - Module *M = I.getParent()->getParent()->getParent(); - auto *CI = cast(&I); + Module *M = I.getParent()->getParent()->getParent(); + auto *CI = cast(&I); - SmallVector Tys; - for (Value *ArgOperand : CI->arg_operands()) - Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); + SmallVector Tys; + for (Value *ArgOperand : CI->arg_operands()) + Tys.push_back(ToVectorTy(ArgOperand->getType(), VF)); - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - // The flag shows whether we use Intrinsic or a usual Call for vectorized - // version of the instruction. - // Is it beneficial to perform intrinsic call compared to lib call? - bool NeedToScalarize = false; - unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); - bool UseVectorIntrinsic = - ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; - assert((UseVectorIntrinsic || !NeedToScalarize) && - "Instruction should be scalarized elsewhere."); + // The flag shows whether we use Intrinsic or a usual Call for vectorized + // version of the instruction. + // Is it beneficial to perform intrinsic call compared to lib call? + bool NeedToScalarize = false; + unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize); + bool UseVectorIntrinsic = + ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost; + assert((UseVectorIntrinsic || !NeedToScalarize) && + "Instruction should be scalarized elsewhere."); - for (unsigned Part = 0; Part < UF; ++Part) { - SmallVector Args; - for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { - Value *Arg = CI->getArgOperand(i); - // Some intrinsics have a scalar argument - don't replace it with a - // vector. - if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) - Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); - Args.push_back(Arg); - } + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector Args; + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + Value *Arg = CI->getArgOperand(i); + // Some intrinsics have a scalar argument - don't replace it with a + // vector. + if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i)) + Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part); + Args.push_back(Arg); + } - Function *VectorF; - if (UseVectorIntrinsic) { - // Use vector version of the intrinsic. - Type *TysForDecl[] = {CI->getType()}; - if (VF > 1) - TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); - VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); - } else { - // Use vector version of the function call. - const VFShape Shape = - VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); + Function *VectorF; + if (UseVectorIntrinsic) { + // Use vector version of the intrinsic. + Type *TysForDecl[] = {CI->getType()}; + if (VF > 1) + TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF); + VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl); + } else { + // Use vector version of the function call. + const VFShape Shape = + VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/); #ifndef NDEBUG const SmallVector Infos = VFDatabase::getMappings(*CI); assert(std::find_if(Infos.begin(), Infos.end(), @@ -4405,7 +4417,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { "Vector function shape is missing from the database."); #endif VectorF = VFDatabase(*CI).getVectorizedFunction(Shape); - } + } assert(VectorF && "Can't create vector function."); SmallVector OpBundles; @@ -4417,16 +4429,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) { VectorLoopValueMap.setVectorValue(&I, Part, V); addMetadata(V, &I); - } - - break; } - - default: - // This instruction is not vectorized by simple widening. - LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I); - llvm_unreachable("Unhandled instruction!"); - } // end of switch. } void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { @@ -6884,11 +6887,45 @@ VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) { return new VPBlendRecipe(Phi, Masks); } -VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) { +VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(Instruction *I, + VFRange &Range) { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); + CallInst *CI = dyn_cast(I); + if (IsPredicated || !CI) + return nullptr; + + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || + ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) + return nullptr; + + auto willWiden = [&](unsigned VF) -> bool { + Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); + // The following case may be scalarized depending on the VF. + // The flag shows whether we use Intrinsic or a usual Call for vectorized + // version of the instruction. + // Is it beneficial to perform intrinsic call compared to lib call? + bool NeedToScalarize; + unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); + bool UseVectorIntrinsic = + ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; + return UseVectorIntrinsic || !NeedToScalarize; + }; + + if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) + return nullptr; + + // Success: widen this call. + return new VPWidenCallRecipe(*CI); +} + +VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) { + bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( + [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range); + if (IsPredicated) return nullptr; @@ -6899,7 +6936,6 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) { case Instruction::AShr: case Instruction::BitCast: case Instruction::Br: - case Instruction::Call: case Instruction::FAdd: case Instruction::FCmp: case Instruction::FDiv: @@ -6941,29 +6977,10 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) { if (!IsVectorizableOpcode(I->getOpcode())) return nullptr; - if (CallInst *CI = dyn_cast(I)) { - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end || - ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect)) - return nullptr; - } - auto willWiden = [&](unsigned VF) -> bool { if (!isa(I) && (CM.isScalarAfterVectorization(I, VF) || CM.isProfitableToScalarize(I, VF))) return false; - if (CallInst *CI = dyn_cast(I)) { - Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); - // The following case may be scalarized depending on the VF. - // The flag shows whether we use Intrinsic or a usual Call for vectorized - // version of the instruction. - // Is it beneficial to perform intrinsic call compared to lib call? - bool NeedToScalarize; - unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize); - bool UseVectorIntrinsic = - ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost; - return UseVectorIntrinsic || !NeedToScalarize; - } if (isa(I) || isa(I)) { assert(CM.getWideningDecision(I, VF) == LoopVectorizationCostModel::CM_Scalarize && @@ -7052,9 +7069,10 @@ bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan, VPBasicBlock *VPBB) { VPRecipeBase *Recipe = nullptr; - // First, check for specific widening recipes that deal with memory + // First, check for specific widening recipes that deal with calls, memory // operations, inductions and Phi nodes. - if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) || + if ((Recipe = tryToWidenCall(Instr, Range)) || + (Recipe = tryToWidenMemory(Instr, Range, Plan)) || (Recipe = tryToOptimizeInduction(Instr, Range)) || (Recipe = tryToBlend(Instr, Plan)) || (isa(Instr) && @@ -7371,6 +7389,10 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\""; } +void VPWidenCallRecipe::execute(VPTransformState &State) { + State.ILV->widenCallInstruction(Ingredient); +} + void VPWidenRecipe::execute(VPTransformState &State) { State.ILV->widenInstruction(Ingredient); } diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index ab9cd774f428..d5435c23e65c 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -107,6 +107,11 @@ public: /// full if-conversion. VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan); + /// Handle call instruction. If \p I is a call that can be widened for \p + /// Range.Start, return a new VPWidenCallRecipe. Range.End may be decreased to + /// ensure same decision from \p Range.Start to \p Range.End. + VPWidenCallRecipe *tryToWidenCall(Instruction *I, VFRange &Range); + /// Check if \p I can be widened within the given VF \p Range. If \p I can be /// widened for \p Range.Start, build a new VPWidenRecipe and return it. /// Range.End may be decreased to ensure same decision from \p Range.Start to diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 7944cf540538..17b4a3da6a06 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -712,6 +712,12 @@ void VPlanPrinter::printAsIngredient(raw_ostream &O, Value *V) { O << DOT::EscapeString(IngredientString); } +void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << " +\n" + << Indent << "\"WIDEN-CALL " << VPlanIngredient(&Ingredient) << "\\l\""; +} + void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << " +\n" << Indent << "\"WIDEN\\l\""; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 68928d02d56d..7e1f74735918 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -609,6 +609,7 @@ public: VPInterleaveSC, VPPredInstPHISC, VPReplicateSC, + VPWidenCallSC, VPWidenGEPSC, VPWidenIntOrFpInductionSC, VPWidenMemoryInstructionSC, @@ -783,6 +784,30 @@ public: VPSlotTracker &SlotTracker) const override; }; +/// A recipe for widening Call instructions. +class VPWidenCallRecipe : public VPRecipeBase { +private: + /// Hold the call to be widened. + CallInst &Ingredient; + +public: + VPWidenCallRecipe(CallInst &I) : VPRecipeBase(VPWidenCallSC), Ingredient(I) {} + + ~VPWidenCallRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPRecipeBase *V) { + return V->getVPRecipeID() == VPRecipeBase::VPWidenCallSC; + } + + /// Produce a widened version of the call instruction. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +}; + /// A recipe for handling GEP instructions. class VPWidenGEPRecipe : public VPRecipeBase { private: