Implement a basic cost model for vector and scalar instructions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166642 91177308-0d34-0410-b5e6-96231b3b80d8
2025-02-22 11:42:42 +00:00 · 2012-10-24 23:47:38 +00:00 · 2012-10-24 23:47:38 +00:00 · 2652c50f74
commit 2652c50f74
parent d95666c226
6 changed files with 168 additions and 23 deletions
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@ -1958,6 +1958,7 @@ private:

  ValueTypeActionImpl ValueTypeActions;

+public:
  LegalizeKind
  getTypeConversion(LLVMContext &Context, EVT VT) const {
    // If this is a simple type, use the ComputeRegisterProp mechanism.
@ -2074,6 +2075,7 @@ private:
    return LegalizeKind(TypeSplitVector, NVT);
  }

+private:
  std::vector<std::pair<EVT, const TargetRegisterClass*> > AvailableRegClasses;

  /// TargetDAGCombineArray - Targets can specify ISD nodes that they would
--- a/include/llvm/Target/TargetTransformImpl.h
+++ b/include/llvm/Target/TargetTransformImpl.h
@ -16,6 +16,7 @@
 #define LLVM_TARGET_TARGET_TRANSFORMATION_IMPL_H

 #include "llvm/TargetTransformInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"

 namespace llvm {

@ -51,6 +52,10 @@ class VectorTargetTransformImpl : public VectorTargetTransformInfo {
 private:
  const TargetLowering *TLI;

+  /// Estimate the cost of type-legalization and the legalized type.
+  std::pair<unsigned, EVT>
+  getTypeLegalizationCost(LLVMContext &C, EVT Ty) const;
+
 public:
  explicit VectorTargetTransformImpl(const TargetLowering *TL) : TLI(TL) {}
  
--- a/lib/Target/TargetTransformImpl.cpp
+++ b/lib/Target/TargetTransformImpl.cpp
@ -9,6 +9,7 @@

 #include "llvm/Target/TargetTransformImpl.h"
 #include "llvm/Target/TargetLowering.h"
+#include <utility>

 using namespace llvm;

@ -53,11 +54,131 @@ unsigned ScalarTargetTransformImpl::getJumpBufSize() const {
 // Calls used by the vectorizers.
 //
 //===----------------------------------------------------------------------===//
+int InstructionOpcodeToISD(unsigned Opcode) {
+  static const int OpToISDTbl[] = {
+    /*Instruction::Ret           */ 0, // Opcode numbering start at #1.
+    /*Instruction::Br            */ 0,
+    /*Instruction::Switch        */ 0,
+    /*Instruction::IndirectBr    */ 0,
+    /*Instruction::Invoke        */ 0,
+    /*Instruction::Resume        */ 0,
+    /*Instruction::Unreachable   */ 0,
+    /*Instruction::Add           */ ISD::ADD,
+    /*Instruction::FAdd          */ ISD::FADD,
+    /*Instruction::Sub           */ ISD::SUB,
+    /*Instruction::FSub          */ ISD::FSUB,
+    /*Instruction::Mul           */ ISD::MUL,
+    /*Instruction::FMul          */ ISD::FMUL,
+    /*Instruction::UDiv          */ ISD::UDIV,
+    /*Instruction::SDiv          */ ISD::UDIV,
+    /*Instruction::FDiv          */ ISD::FDIV,
+    /*Instruction::URem          */ ISD::UREM,
+    /*Instruction::SRem          */ ISD::SREM,
+    /*Instruction::FRem          */ ISD::FREM,
+    /*Instruction::Shl           */ ISD::SHL,
+    /*Instruction::LShr          */ ISD::SRL,
+    /*Instruction::AShr          */ ISD::SRA,
+    /*Instruction::And           */ ISD::AND,
+    /*Instruction::Or            */ ISD::OR,
+    /*Instruction::Xor           */ ISD::XOR,
+    /*Instruction::Alloca        */ 0,
+    /*Instruction::Load          */ ISD::LOAD,
+    /*Instruction::Store         */ ISD::STORE,
+    /*Instruction::GetElementPtr */ 0,
+    /*Instruction::Fence         */ 0,
+    /*Instruction::AtomicCmpXchg */ 0,
+    /*Instruction::AtomicRMW     */ 0,
+    /*Instruction::Trunc         */ ISD::TRUNCATE,
+    /*Instruction::ZExt          */ ISD::ZERO_EXTEND,
+    /*Instruction::SExt          */ ISD::SEXTLOAD,
+    /*Instruction::FPToUI        */ ISD::FP_TO_UINT,
+    /*Instruction::FPToSI        */ ISD::FP_TO_SINT,
+    /*Instruction::UIToFP        */ ISD::UINT_TO_FP,
+    /*Instruction::SIToFP        */ ISD::SINT_TO_FP,
+    /*Instruction::FPTrunc       */ ISD::FP_ROUND,
+    /*Instruction::FPExt         */ ISD::FP_EXTEND,
+    /*Instruction::PtrToInt      */ ISD::BITCAST,
+    /*Instruction::IntToPtr      */ ISD::BITCAST,
+    /*Instruction::BitCast       */ ISD::BITCAST,
+    /*Instruction::ICmp          */ ISD::SETCC,
+    /*Instruction::FCmp          */ ISD::SETCC,
+    /*Instruction::PHI           */ 0,
+    /*Instruction::Call          */ 0,
+    /*Instruction::Select        */ ISD::SELECT,
+    /*Instruction::UserOp1       */ 0,
+    /*Instruction::UserOp2       */ 0,
+    /*Instruction::VAArg         */ 0,
+    /*Instruction::ExtractElement*/ ISD::EXTRACT_VECTOR_ELT,
+    /*Instruction::InsertElement */ ISD::INSERT_VECTOR_ELT,
+    /*Instruction::ShuffleVector */ ISD::VECTOR_SHUFFLE,
+    /*Instruction::ExtractValue  */ ISD::MERGE_VALUES,
+    /*Instruction::InsertValue   */ ISD::MERGE_VALUES,
+    /*Instruction::LandingPad    */ 0};
+
+  assert((Instruction::Ret == 1) && (Instruction::LandingPad == 58) &&
+         "Instruction order had changed");
+
+  // Opcode numbering starts at #1 but the table starts at #0, so we subtract
+  // one from the opcode number.
+  return OpToISDTbl[Opcode - 1];
+}
+
+std::pair<unsigned, EVT> 
+VectorTargetTransformImpl::getTypeLegalizationCost(LLVMContext &C,
+                                                         EVT Ty) const {
+  unsigned Cost = 1;
+  // We keep legalizing the type until we find a legal kind. We assume that
+  // the only operation that costs anything is the split. After splitting
+  // we need to handle two types.
+  while (true) {
+    TargetLowering::LegalizeKind LK = TLI->getTypeConversion(C, Ty);
+
+    if (LK.first == TargetLowering::TypeLegal)
+      return std::make_pair(Cost, LK.second);
+
+    if (LK.first == TargetLowering::TypeSplitVector)
+      Cost *= 2;
+
+    // Keep legalizing the type.
+    Ty = LK.second;
+  }
+}

 unsigned
 VectorTargetTransformImpl::getInstrCost(unsigned Opcode, Type *Ty1,
                                        Type *Ty2) const {
-  return 1;
+  // Check if any of the operands are vector operands.
+  int ISD = InstructionOpcodeToISD(Opcode);
+
+  // Selects on vectors are actually vector selects.
+  if (ISD == ISD::SELECT) {
+    assert(Ty2 && "Ty2 must hold the select type");
+    if (Ty2->isVectorTy())
+    ISD = ISD::VSELECT;
+  }
+
+  // If we don't have any information about this instruction assume it costs 1.
+  if (ISD == 0)
+    return 1;
+
+  assert(Ty1 && "We need to have at least one type");
+
+  // From this stage we look at the legalized type.
+  std::pair<unsigned, EVT>  LT =
+  getTypeLegalizationCost(Ty1->getContext(), TLI->getValueType(Ty1));
+
+  if (TLI->isOperationLegalOrCustom(ISD, LT.second)) {
+    // The operation is legal. Assume it costs 1. Multiply
+    // by the type-legalization overhead.
+    return LT.first * 1;
+  }
+
+  unsigned NumElem =
+  (LT.second.isVector() ? LT.second.getVectorNumElements() : 1);
+
+  // We will probably scalarize this instruction. Assume that the cost is the
+  // number of the vector elements.
+  return LT.first * NumElem * 1;
 }

 unsigned
@ -69,5 +190,9 @@ unsigned
 VectorTargetTransformImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                           unsigned Alignment,
                                           unsigned AddressSpace) const {
-  return 1;
+  // From this stage we look at the legalized type.
+  std::pair<unsigned, EVT>  LT =
+  getTypeLegalizationCost(Src->getContext(), TLI->getValueType(Src));
+  // Assume that all loads of legal types cost 1.
+  return LT.first;
 }
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@ -185,7 +185,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
  MPM.add(createLoopIdiomPass());             // Recognize idioms like memset.
  MPM.add(createLoopDeletionPass());          // Delete dead loops

-  if (Vectorize) {
+  if (Vectorize || true) {
    MPM.add(createLoopVectorizePass());
    MPM.add(createLICMPass());
  }
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -300,10 +300,10 @@ private:
 class LoopVectorizationCostModel {
 public:
  /// C'tor.
-  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl,
+  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
                             LoopVectorizationLegality *Leg,
                             const VectorTargetTransformInfo *Vtti):
-  TheLoop(Lp), SE(Se), DL(Dl), Legal(Leg), VTTI(Vtti) { }
+  TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }

  /// Returns the most profitable vectorization factor for the loop that is
  /// smaller or equal to the VF argument. This method checks every power
@ -325,8 +325,7 @@ private:
  Loop *TheLoop;
  /// Scev analysis.
  ScalarEvolution *SE;
-  /// DataLayout analysis.
-  DataLayout *DL;
+
  /// Vectorization legality.
  LoopVectorizationLegality *Legal;
  /// Vector target information.
@ -372,7 +371,7 @@ struct LoopVectorize : public LoopPass {
      if (TTI)
        VTTI = TTI->getVectorTargetTransformInfo();
      // Use the cost model.
-      LoopVectorizationCostModel CM(L, SE, DL, &LVL, VTTI);
+      LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
      VF = CM.findBestVectorizationFactor();

      if (VF == 1) {
@ -1432,11 +1431,12 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
  // For each instruction in the old loop.
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
    Instruction *Inst = it;
-    Cost += getInstructionCost(Inst, VF);
+    unsigned C = getInstructionCost(Inst, VF);
+    Cost += C;
+    DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF <<
+          " For instruction: "<< *Inst << "\n");
  }

-  // Return the cost divided by VF, because we will be executing
-  // less iterations of the vector form.
  return Cost;
 }

@ -1444,11 +1444,13 @@ unsigned
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
  assert(VTTI && "Invalid vector target transformation info");
  switch (I->getOpcode()) {
+    case Instruction::GetElementPtr:
+      return 0;
    case Instruction::Br: {
      return VTTI->getInstrCost(I->getOpcode());
    }
    case Instruction::PHI:
-      // PHIs are handled the same as the binary instructions below.
+      return 0;
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
@ -1493,11 +1495,17 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
      // Scalarized stores.
      if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
        unsigned Cost = 0;
-        unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy);
-        // The cost of extracting from the vector value.
-        Cost += VF * ExtCost;
+        if (VF != 1) {
+          unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+                                                VTy);
+          // The cost of extracting from the value vector and pointer vector.
+          Cost += VF * (ExtCost * 2);
+        }
        // The cost of the scalar stores.
-        Cost += VF * VTTI->getInstrCost(I->getOpcode(), VTy->getScalarType());
+        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
+                                           VTy->getScalarType(),
+                                           SI->getAlignment(),
+                                           SI->getPointerAddressSpace());
        return Cost;
      }

@ -1512,11 +1520,18 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
      // Scalarized loads.
      if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
        unsigned Cost = 0;
-        unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
-        // The cost of inserting the loaded value into the result vector.
-        Cost += VF * InCost;
+        if (VF != 1) {
+          unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
+          unsigned ExCost = VTTI->getInstrCost(Instruction::ExtractValue, VTy);
+
+          // The cost of inserting the loaded value into the result vector, and
+          // extracting from a vector of pointers.
+          Cost += VF * (InCost + ExCost);
+        }
        // The cost of the scalar stores.
-        Cost += VF * VTTI->getInstrCost(I->getOpcode(), VTy->getScalarType());
+        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), VTy->getScalarType(),
+                                           LI->getAlignment(),
+                                           LI->getPointerAddressSpace());
        return Cost;
      }

--- a/test/Transforms/LoopVectorize/cost-model.ll
+++ b/test/Transforms/LoopVectorize/cost-model.ll
@ -8,10 +8,8 @@ target triple = "x86_64-apple-macosx10.8.0"
@d = common global [2048 x i32] zeroinitializer, align 16
@a = common global [2048 x i32] zeroinitializer, align 16

-; At this point the cost model is pretty bad and we are vectorizing the code below.
-; TODO: This code should not be vectorized on x86.
 ;CHECK: cost_model_1
-;CHECK: <4 x i32>
+;CHECK-NOT: <4 x i32>
 ;CHECK: ret void
 define void @cost_model_1() nounwind uwtable noinline ssp {
 entry: