diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 19eb941635e..9d0aeaa3560 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -411,6 +411,13 @@ public: getOperationAction(Op, VT) == Custom); } + /// isOperationExpand - Return true if the specified operation is illegal on + /// this target or unlikely to be made legal with custom lowering. This is + /// used to help guide high-level lowering decisions. + bool isOperationExpand(unsigned Op, EVT VT) const { + return (!isTypeLegal(VT) || getOperationAction(Op, VT) == Expand); + } + /// isOperationLegal - Return true if the specified operation is legal on this /// target. bool isOperationLegal(unsigned Op, EVT VT) const { diff --git a/include/llvm/Target/TargetTransformImpl.h b/include/llvm/Target/TargetTransformImpl.h index 133be87194d..fd4b737afd9 100644 --- a/include/llvm/Target/TargetTransformImpl.h +++ b/include/llvm/Target/TargetTransformImpl.h @@ -56,15 +56,32 @@ private: std::pair getTypeLegalizationCost(LLVMContext &C, EVT Ty) const; + /// Estimate the overhead of scalarizing an instruction. Insert and Extract + /// are set if the result needs to be inserted and/or extracted from vectors. 
+ unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; + public: explicit VectorTargetTransformImpl(const TargetLowering *TL) : TLI(TL) {} - + virtual ~VectorTargetTransformImpl() {} virtual unsigned getInstrCost(unsigned Opcode, Type *Ty1, Type *Ty2) const; + virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const; + virtual unsigned getBroadcastCost(Type *Tp) const; + virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst, + Type *Src) const; + + virtual unsigned getCFInstrCost(unsigned Opcode) const; + + virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy) const; + + virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index) const; + virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const; diff --git a/include/llvm/TargetTransformInfo.h b/include/llvm/TargetTransformInfo.h index 71c78ec52eb..96761594fb0 100644 --- a/include/llvm/TargetTransformInfo.h +++ b/include/llvm/TargetTransformInfo.h @@ -143,13 +143,43 @@ public: return 1; } + /// Returns the expected cost of arithmetic ops, such as mul, xor, fsub, etc. + virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const { + return 1; + } + /// Returns the cost of a vector broadcast of a scalar at place zero to a /// vector of type 'Tp'. virtual unsigned getBroadcastCost(Type *Tp) const { return 1; } - /// Returns the cost of Load and Store instructions. + /// Returns the expected cost of cast instructions, such as bitcast, trunc, + /// zext, etc. + virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst, + Type *Src) const { + return 1; + } + + /// Returns the expected cost of control-flow related instructions such as + /// Phi, Ret, Br. + virtual unsigned getCFInstrCost(unsigned Opcode) const { + return 1; + } + + /// Returns the expected cost of compare and select instructions. 
+ virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, + Type *CondTy = 0) const { + return 1; + } + + /// Returns the expected cost of vector Insert and Extract. + virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val, + unsigned Index = 0) const { + return 1; + } + + /// Returns the cost of Load and Store instructions. virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const { diff --git a/lib/Target/TargetTransformImpl.cpp b/lib/Target/TargetTransformImpl.cpp index 40184ed78d3..d3ab1059882 100644 --- a/lib/Target/TargetTransformImpl.cpp +++ b/lib/Target/TargetTransformImpl.cpp @@ -126,7 +126,7 @@ static int InstructionOpcodeToISD(unsigned Opcode) { std::pair VectorTargetTransformImpl::getTypeLegalizationCost(LLVMContext &C, - EVT Ty) const { + EVT Ty) const { unsigned Cost = 1; // We keep legalizing the type until we find a legal kind. We assume that // the only operation that costs anything is the split. After splitting @@ -135,7 +135,7 @@ VectorTargetTransformImpl::getTypeLegalizationCost(LLVMContext &C, TargetLowering::LegalizeKind LK = TLI->getTypeConversion(C, Ty); if (LK.first == TargetLowering::TypeLegal) - return std::make_pair(Cost, LK.second); + return std::make_pair(Cost, Ty); if (LK.first == TargetLowering::TypeSplitVector) Cost *= 2; @@ -146,44 +146,144 @@ VectorTargetTransformImpl::getTypeLegalizationCost(LLVMContext &C, } unsigned -VectorTargetTransformImpl::getInstrCost(unsigned Opcode, Type *Ty1, - Type *Ty2) const { - // Check if any of the operands are vector operands. - int ISD = InstructionOpcodeToISD(Opcode); +VectorTargetTransformImpl::getScalarizationOverhead(Type *Ty, + bool Insert, + bool Extract) const { + assert (Ty->isVectorTy() && "Can only scalarize vectors"); + unsigned Cost = 0; - // If we don't have any information about this instruction assume it costs 1. - if (ISD == 0) - return 1; - - // Selects on vectors are actually vector selects. 
- if (ISD == ISD::SELECT) { - assert(Ty2 && "Ty2 must hold the condition type"); - if (Ty2->isVectorTy()) - ISD = ISD::VSELECT; + for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) { + if (Insert) + Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i); + if (Extract) + Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i); } - assert(Ty1 && "We need to have at least one type"); + return Cost; +} - // From this stage we look at the legalized type. - std::pair LT = - getTypeLegalizationCost(Ty1->getContext(), TLI->getValueType(Ty1)); +unsigned VectorTargetTransformImpl::getArithmeticInstrCost(unsigned Opcode, + Type *Ty) const { + // Check if any of the operands are vector operands. + int ISD = InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); - if (TLI->isOperationLegalOrCustom(ISD, LT.second)) { + std::pair LT = + getTypeLegalizationCost(Ty->getContext(), TLI->getValueType(Ty)); + + if (!TLI->isOperationExpand(ISD, LT.second)) { // The operation is legal. Assume it costs 1. Multiply // by the type-legalization overhead. return LT.first * 1; } - unsigned NumElem = - (LT.second.isVector() ? LT.second.getVectorNumElements() : 1); + // Else, assume that we need to scalarize this op. + if (Ty->isVectorTy()) { + unsigned Num = Ty->getVectorNumElements(); + unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType()); + // return the cost of multiple scalar invocation plus the cost of inserting + // and extracting the values. + return getScalarizationOverhead(Ty, true, true) + Num * Cost; + } - // We will probably scalarize this instruction. Assume that the cost is the - // number of the vector elements. - return LT.first * NumElem * 1; + // We don't know anything about this scalar instruction. 
+ return 1; +} + +unsigned VectorTargetTransformImpl::getBroadcastCost(Type *Tp) const { + return 1; +} + +unsigned VectorTargetTransformImpl::getCastInstrCost(unsigned Opcode, Type *Dst, + Type *Src) const { + assert(Src->isVectorTy() == Dst->isVectorTy() && "Invalid input types"); + int ISD = InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + std::pair SrcLT = + getTypeLegalizationCost(Src->getContext(), TLI->getValueType(Src)); + + std::pair DstLT = + getTypeLegalizationCost(Dst->getContext(), TLI->getValueType(Dst)); + + // If the cast is between same-sized registers, then the check is simple. + if (SrcLT.first == DstLT.first && + SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) { + // Just check the op cost: + if (!TLI->isOperationExpand(ISD, DstLT.second)) { + // The operation is legal. Assume it costs 1. Multiply + // by the type-legalization overhead. + return SrcLT.first * 1; + } + } + + // Otherwise, assume that the cast is scalarized. + if (Dst->isVectorTy()) { + unsigned Num = Dst->getVectorNumElements(); + unsigned Cost = getCastInstrCost(Opcode, Dst->getScalarType(), + Src->getScalarType()); + // return the cost of multiple scalar invocation plus the cost of inserting + // and extracting the values. + return getScalarizationOverhead(Dst, true, true) + Num * Cost; + } + + // Unknown scalar opcode. + return 1; +} + +unsigned VectorTargetTransformImpl::getCFInstrCost(unsigned Opcode) const { + return 1; +} + +unsigned VectorTargetTransformImpl::getCmpSelInstrCost(unsigned Opcode, + Type *ValTy, + Type *CondTy) const { + int ISD = InstructionOpcodeToISD(Opcode); + assert(ISD && "Invalid opcode"); + + // Selects on vectors are actually vector selects. 
+ if (ISD == ISD::SELECT) { + assert(CondTy && "CondTy must exist"); + if (CondTy->isVectorTy()) + ISD = ISD::VSELECT; + } + + std::pair LT = + getTypeLegalizationCost(ValTy->getContext(), TLI->getValueType(ValTy)); + + if (!TLI->isOperationExpand(ISD, LT.second)) { + // The operation is legal. Assume it costs 1. Multiply + // by the type-legalization overhead. + return LT.first * 1; + } + + // Otherwise, assume that the compare/select is scalarized. + if (ValTy->isVectorTy()) { + unsigned Num = ValTy->getVectorNumElements(); + if (CondTy) + CondTy = CondTy->getScalarType(); + unsigned Cost = getCmpSelInstrCost(Opcode, ValTy->getScalarType(), + CondTy); + + // return the cost of multiple scalar invocation plus the cost of inserting + // the values (extraction is not counted here). + return getScalarizationOverhead(ValTy, true, false) + Num * Cost; + } + + // Unknown scalar opcode. + return 1; +} + +/// Returns the expected cost of Vector Insert and Extract. +unsigned VectorTargetTransformImpl::getVectorInstrCost(unsigned Opcode, + Type *Val, + unsigned Index) const { + return 1; } unsigned -VectorTargetTransformImpl::getBroadcastCost(Type *Tp) const { +VectorTargetTransformImpl::getInstrCost(unsigned Opcode, Type *Ty1, + Type *Ty2) const { return 1; } @@ -191,17 +291,15 @@ unsigned VectorTargetTransformImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment, unsigned AddressSpace) const { - // From this stage we look at the legalized type. - std::pair LT = + std::pair LT = getTypeLegalizationCost(Src->getContext(), TLI->getValueType(Src)); + // Assume that all loads of legal types cost 1. 
return LT.first; } unsigned VectorTargetTransformImpl::getNumberOfParts(Type *Tp) const { - std::pair LT = - getTypeLegalizationCost(Tp->getContext(), TLI->getValueType(Tp)); - return LT.first; + return TLI->getNumRegisters(Tp->getContext(), TLI->getValueType(Tp)); } diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index e47baf89083..1773812da24 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -108,7 +108,7 @@ public: createEmptyLoop(Legal); /// Widen each instruction in the old loop to a new one in the new loop. /// Use the Legality module to find the induction and reduction variables. - vectorizeLoop(Legal); + vectorizeLoop(Legal); // register the new loop. cleanup(); } @@ -254,6 +254,9 @@ public: /// This check allows us to vectorize A[idx] into a wide load/store. bool isConsecutiveGep(Value *Ptr); + /// Returns true if this instruction will remain scalar after vectorization. + bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);} + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -291,6 +294,9 @@ private: /// Allowed outside users. This holds the reduction /// vars which can be accessed from outside the loop. SmallPtrSet AllowedExit; + /// This set holds the variables which are known to be uniform after + /// vectorization. + SmallPtrSet Uniforms; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1177,9 +1183,40 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { return false; } - // If the memory dependencies do not prevent us from - // vectorizing, then vectorize. - return canVectorizeMemory(BB); + // Don't vectorize if the memory dependencies do not allow vectorization. + if (!canVectorizeMemory(BB)) + return false; + + // We now know that the loop is vectorizable! 
+ // Collect variables that will remain uniform after vectorization. + std::vector Worklist; + + // Start with the conditional branch and walk up the block. + Worklist.push_back(BB.getTerminator()->getOperand(0)); + + while (Worklist.size()) { + Instruction *I = dyn_cast(Worklist.back()); + Worklist.pop_back(); + // Look at instructions inside this block. + if (!I) continue; + if (I->getParent() != &BB) continue; + + // Stop when reaching PHI nodes. + if (isa(I)) { + assert(I == Induction && "Found a uniform PHI that is not the induction"); + break; + } + + // This is a known uniform. + Uniforms.insert(I); + + // Insert all operands. + for (int i=0, Op = I->getNumOperands(); i < Op; ++i) { + Worklist.push_back(I->getOperand(i)); + } + } + + return true; } bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { @@ -1484,9 +1521,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { assert(VTTI && "Invalid vector target transformation info"); + // If we know that this instruction will remain uniform, check the cost of + // the scalar version. + if (Legal->isUniformAfterVectorization(I)) + VF = 1; + Type *RetTy = I->getType(); Type *VectorTy = ToVectorTy(RetTy, VF); + // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { case Instruction::GetElementPtr: @@ -1495,7 +1538,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { // generate vector geps. 
return 0; case Instruction::Br: { - return VTTI->getInstrCost(I->getOpcode()); + return VTTI->getCFInstrCost(I->getOpcode()); } case Instruction::PHI: return 0; @@ -1517,7 +1560,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - return VTTI->getInstrCost(I->getOpcode(), VectorTy); + return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); } case Instruction::Select: { SelectInst *SI = cast(I); @@ -1527,13 +1570,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { if (ScalarCond) CondTy = VectorType::get(CondTy, VF); - return VTTI->getInstrCost(I->getOpcode(), VectorTy, CondTy); + return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); } case Instruction::ICmp: case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); VectorTy = ToVectorTy(ValTy, VF); - return VTTI->getInstrCost(I->getOpcode(), VectorTy); + return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy); } case Instruction::Store: { StoreInst *SI = cast(I); @@ -1602,7 +1645,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); - return VTTI->getInstrCost(I->getOpcode(), VectorTy, SrcVecTy); + return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); } default: { // We are scalarizing the instruction. 
Return the cost of the scalar diff --git a/test/Transforms/LoopVectorize/X86/cost-model.ll b/test/Transforms/LoopVectorize/X86/cost-model.ll index 628f9912c8c..40e660855b1 100644 --- a/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -9,7 +9,7 @@ target triple = "x86_64-apple-macosx10.8.0" @a = common global [2048 x i32] zeroinitializer, align 16 ;CHECK: cost_model_1 -;CHECK: <4 x i32> +;CHECK-NOT: <4 x i32> ;CHECK: ret void define void @cost_model_1() nounwind uwtable noinline ssp { entry: