Mirror of https://github.com/capstone-engine/llvm-capstone.git, synced 2025-01-27 03:48:33 +00:00
[VPlan] Introduce new VPWidenCallRecipe (NFC).
This patch moves calls to their own recipe, to simplify the transition to
VPUser for operands of VPWidenRecipe, as discussed in D76992. Subsequently,
additional information can be added to the recipe rather than computing it
during the execute step.

Reviewers: rengolin, Ayal, gilr, hsaito

Reviewed By: gilr

Differential Revision: https://reviews.llvm.org/D77467
This commit is contained in:
parent d6ea82d11c
commit 90be3c24a7
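The direction the message describes — decide once, while the VPlan is built, how a call should be widened, and keep that information on the recipe so execute() only replays it — can be illustrated with a minimal standalone sketch. This is plain C++ with invented names (WidenCallSketch, UseVectorIntrinsic), not the LLVM API, and it mirrors the idea rather than the patch itself:

#include <iostream>
#include <string>
#include <utility>

// Simplified analogue of a call-widening recipe: the decision of *how* to
// widen a call (vector intrinsic vs. vectorized library call) is made once,
// when the recipe is constructed, and stored on the recipe.
struct WidenCallSketch {
  std::string Callee;
  bool UseVectorIntrinsic; // decision captured at construction time

  WidenCallSketch(std::string Callee, bool UseIntrinsic)
      : Callee(std::move(Callee)), UseVectorIntrinsic(UseIntrinsic) {}

  // execute() only replays the stored decision; it does not re-run any
  // cost-model queries.
  void execute() const {
    std::cout << (UseVectorIntrinsic ? "widen as intrinsic: "
                                     : "widen as libcall: ")
              << Callee << "\n";
  }
};

int main() {
  // Plan construction decides; execution replays.
  WidenCallSketch Recipe("llvm.sqrt.f32", /*UseIntrinsic=*/true);
  Recipe.execute(); // prints: widen as intrinsic: llvm.sqrt.f32
  return 0;
}

In the patch itself the new recipe still defers to InnerLoopVectorizer::widenCallInstruction, which re-queries the cost model during execution; splitting the recipe out is what lets that information migrate into the recipe later, as the message notes.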
@@ -409,6 +409,9 @@ public:
   /// Widen a single instruction within the innermost loop.
   void widenInstruction(Instruction &I);
 
+  /// Widen a single call instruction within the innermost loop.
+  void widenCallInstruction(CallInst &I);
+
   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
   void fixVectorizedLoop();
 
@@ -4224,6 +4227,7 @@ static bool mayDivideByZero(Instruction &I) {
 
 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
   switch (I.getOpcode()) {
+  case Instruction::Call:
   case Instruction::Br:
   case Instruction::PHI:
   case Instruction::GetElementPtr:
@@ -4348,54 +4352,62 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
     }
     break;
   }
   default:
     // This instruction is not vectorized by simple widening.
     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
     llvm_unreachable("Unhandled instruction!");
   } // end of switch.
 }
 
-  case Instruction::Call: {
-    // Ignore dbg intrinsics.
-    if (isa<DbgInfoIntrinsic>(I))
-      break;
-    setDebugLocFromInst(Builder, &I);
+void InnerLoopVectorizer::widenCallInstruction(CallInst &I) {
+  // Ignore dbg intrinsics.
+  // TODO: Debug intrinsics should be skipped/handled during VPlan construction
+  // rather than dropping them here.
+  if (isa<DbgInfoIntrinsic>(I))
+    return;
+  setDebugLocFromInst(Builder, &I);
 
   Module *M = I.getParent()->getParent()->getParent();
   auto *CI = cast<CallInst>(&I);
 
   SmallVector<Type *, 4> Tys;
   for (Value *ArgOperand : CI->arg_operands())
     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
 
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
 
   // The flag shows whether we use Intrinsic or a usual Call for vectorized
   // version of the instruction.
   // Is it beneficial to perform intrinsic call compared to lib call?
   bool NeedToScalarize = false;
   unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
   bool UseVectorIntrinsic =
       ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
   assert((UseVectorIntrinsic || !NeedToScalarize) &&
          "Instruction should be scalarized elsewhere.");
 
   for (unsigned Part = 0; Part < UF; ++Part) {
     SmallVector<Value *, 4> Args;
     for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
       Value *Arg = CI->getArgOperand(i);
       // Some intrinsics have a scalar argument - don't replace it with a
       // vector.
       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
         Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
       Args.push_back(Arg);
     }
 
     Function *VectorF;
     if (UseVectorIntrinsic) {
       // Use vector version of the intrinsic.
       Type *TysForDecl[] = {CI->getType()};
       if (VF > 1)
         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
     } else {
       // Use vector version of the function call.
       const VFShape Shape =
           VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
 #ifndef NDEBUG
       const SmallVector<VFInfo, 8> Infos = VFDatabase::getMappings(*CI);
       assert(std::find_if(Infos.begin(), Infos.end(),
@@ -4405,7 +4417,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
              "Vector function shape is missing from the database.");
 #endif
       VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
     }
     assert(VectorF && "Can't create vector function.");
 
     SmallVector<OperandBundleDef, 1> OpBundles;
@@ -4417,16 +4429,7 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
 
     VectorLoopValueMap.setVectorValue(&I, Part, V);
     addMetadata(V, &I);
   }
-
-    break;
-  }
-
-  default:
-    // This instruction is not vectorized by simple widening.
-    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
-    llvm_unreachable("Unhandled instruction!");
-  } // end of switch.
 }
 
 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
@@ -6884,11 +6887,45 @@ VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
   return new VPBlendRecipe(Phi, Masks);
 }
 
-VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) {
+VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(Instruction *I,
+                                                   VFRange &Range) {
   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
 
+  CallInst *CI = dyn_cast<CallInst>(I);
+  if (IsPredicated || !CI)
+    return nullptr;
+
+  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
+    return nullptr;
+
+  auto willWiden = [&](unsigned VF) -> bool {
+    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+    // The following case may be scalarized depending on the VF.
+    // The flag shows whether we use Intrinsic or a usual Call for vectorized
+    // version of the instruction.
+    // Is it beneficial to perform intrinsic call compared to lib call?
+    bool NeedToScalarize;
+    unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
+    bool UseVectorIntrinsic =
+        ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
+    return UseVectorIntrinsic || !NeedToScalarize;
+  };
+
+  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
+    return nullptr;
+
+  // Success: widen this call.
+  return new VPWidenCallRecipe(*CI);
+}
+
+VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) {
+  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
+      [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
+
   if (IsPredicated)
     return nullptr;
 
@@ -6899,7 +6936,6 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) {
     case Instruction::AShr:
     case Instruction::BitCast:
     case Instruction::Br:
-    case Instruction::Call:
     case Instruction::FAdd:
     case Instruction::FCmp:
     case Instruction::FDiv:
@@ -6941,29 +6977,10 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VFRange &Range) {
   if (!IsVectorizableOpcode(I->getOpcode()))
     return nullptr;
 
-  if (CallInst *CI = dyn_cast<CallInst>(I)) {
-    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-    if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
-               ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
-      return nullptr;
-  }
-
   auto willWiden = [&](unsigned VF) -> bool {
     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
                              CM.isProfitableToScalarize(I, VF)))
       return false;
-    if (CallInst *CI = dyn_cast<CallInst>(I)) {
-      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-      // The following case may be scalarized depending on the VF.
-      // The flag shows whether we use Intrinsic or a usual Call for vectorized
-      // version of the instruction.
-      // Is it beneficial to perform intrinsic call compared to lib call?
-      bool NeedToScalarize;
-      unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
-      bool UseVectorIntrinsic =
-          ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
-      return UseVectorIntrinsic || !NeedToScalarize;
-    }
     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
       assert(CM.getWideningDecision(I, VF) ==
              LoopVectorizationCostModel::CM_Scalarize &&
@@ -7052,9 +7069,10 @@ bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
   VPRecipeBase *Recipe = nullptr;
 
-  // First, check for specific widening recipes that deal with memory
+  // First, check for specific widening recipes that deal with calls, memory
   // operations, inductions and Phi nodes.
-  if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
+  if ((Recipe = tryToWidenCall(Instr, Range)) ||
+      (Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
       (Recipe = tryToBlend(Instr, Plan)) ||
       (isa<PHINode>(Instr) &&
@@ -7371,6 +7389,10 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
         << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
 }
 
+void VPWidenCallRecipe::execute(VPTransformState &State) {
+  State.ILV->widenCallInstruction(Ingredient);
+}
+
 void VPWidenRecipe::execute(VPTransformState &State) {
   State.ILV->widenInstruction(Ingredient);
 }
@@ -107,6 +107,11 @@ public:
   /// full if-conversion.
   VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan);
 
+  /// Handle call instruction. If \p I is a call that can be widened for \p
+  /// Range.Start, return a new VPWidenCallRecipe. Range.End may be decreased to
+  /// ensure same decision from \p Range.Start to \p Range.End.
+  VPWidenCallRecipe *tryToWidenCall(Instruction *I, VFRange &Range);
+
   /// Check if \p I can be widened within the given VF \p Range. If \p I can be
   /// widened for \p Range.Start, build a new VPWidenRecipe and return it.
   /// Range.End may be decreased to ensure same decision from \p Range.Start to
@@ -712,6 +712,12 @@ void VPlanPrinter::printAsIngredient(raw_ostream &O, Value *V) {
   O << DOT::EscapeString(IngredientString);
 }
 
+void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
+                              VPSlotTracker &SlotTracker) const {
+  O << " +\n"
+    << Indent << "\"WIDEN-CALL " << VPlanIngredient(&Ingredient) << "\\l\"";
+}
+
 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
   O << " +\n" << Indent << "\"WIDEN\\l\"";
@@ -609,6 +609,7 @@ public:
     VPInterleaveSC,
     VPPredInstPHISC,
     VPReplicateSC,
+    VPWidenCallSC,
     VPWidenGEPSC,
     VPWidenIntOrFpInductionSC,
     VPWidenMemoryInstructionSC,
@@ -783,6 +784,30 @@ public:
              VPSlotTracker &SlotTracker) const override;
 };
 
+/// A recipe for widening Call instructions.
+class VPWidenCallRecipe : public VPRecipeBase {
+private:
+  /// Hold the call to be widened.
+  CallInst &Ingredient;
+
+public:
+  VPWidenCallRecipe(CallInst &I) : VPRecipeBase(VPWidenCallSC), Ingredient(I) {}
+
+  ~VPWidenCallRecipe() override = default;
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPRecipeBase *V) {
+    return V->getVPRecipeID() == VPRecipeBase::VPWidenCallSC;
+  }
+
+  /// Produce a widened version of the call instruction.
+  void execute(VPTransformState &State) override;
+
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+};
+
 /// A recipe for handling GEP instructions.
 class VPWidenGEPRecipe : public VPRecipeBase {
 private: