Nadav Rotem 96f8f45bd5 Add support for bottom-up SLP vectorization infrastructure.
This commit adds the infrastructure for performing bottom-up SLP vectorization (and other optimizations) on parallel computations.
The infrastructure has three potential users:

  1. The loop vectorizer needs to be able to vectorize AOS data structures such as (sum += A[i] + A[i+1]).

  2. The BB-vectorizer needs this infrastructure for bottom-up SLP vectorization, because bottom-up vectorization is faster to compute.

  3. A loop-roller needs to be able to analyze consecutive chains and roll them into a loop, in order to reduce code size. A loop roller does not need to create vector instructions, and this infrastructure separates the chain analysis from the vectorization.

This patch also includes a simple (100 LOC) bottom up SLP vectorizer that uses the infrastructure, and can vectorize this code:

void SAXPY(int *x, int *y, int a, int i) {
  x[i]   = a * x[i]   + y[i];
  x[i+1] = a * x[i+1] + y[i+1];
  x[i+2] = a * x[i+2] + y[i+2];
  x[i+3] = a * x[i+3] + y[i+3];
}

llvm-svn: 179117
2013-04-09 19:44:35 +00:00

440 lines
15 KiB
C++

//===- VecUtils.h --- Vectorization Utilities -----------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "VecUtils"
#include "VecUtils.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <map>
using namespace llvm;
namespace llvm {
BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl,
TargetTransformInfo *Tti, AliasAnalysis *Aa) :
BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa) {
numberInstructions();
}
void BoUpSLP::numberInstructions() {
int Loc = 0;
InstrIdx.clear();
InstrVec.clear();
// Number the instructions in the block.
for (BasicBlock::iterator it=BB->begin(), e=BB->end(); it != e; ++it) {
InstrIdx[it] = Loc++;
InstrVec.push_back(it);
assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
}
}
Value *BoUpSLP::getPointerOperand(Value *I) {
if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand();
if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand();
return 0;
}
unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace();
if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace();
return -1;
}
bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
Value *PtrA = getPointerOperand(A);
Value *PtrB = getPointerOperand(B);
unsigned ASA = getAddressSpaceOperand(A);
unsigned ASB = getAddressSpaceOperand(B);
// Check that the address spaces match and that the pointers are valid.
if (!PtrA || !PtrB || (ASA != ASB)) return false;
// Check that A and B are of the same type.
if (PtrA->getType() != PtrB->getType()) return false;
// Calculate the distance.
const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB);
const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV);
// Non constant distance.
if (!ConstOffSCEV) return false;
unsigned Offset = ConstOffSCEV->getValue()->getSExtValue();
Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
// The Instructions are connsecutive if the size of the first load/store is
// the same as the offset.
unsigned Sz = (DL ? DL->getTypeStoreSize(Ty) : Ty->getScalarSizeInBits()/8);
return ((-Offset) == Sz);
}
bool BoUpSLP::vectorizeStores(StoreList &Stores, int costThreshold) {
ValueSet Heads, Tails;
SmallDenseMap<Value*, Value*> ConsecutiveChain;
bool Changed = false;
// Do a quadratic search on all of the given stores and find
// all of the pairs of loads that follow each other.
for (unsigned i = 0, e = Stores.size(); i < e; ++i)
for (unsigned j = 0; j < e; ++j) {
if (i == j) continue;
if (isConsecutiveAccess(Stores[i], Stores[j])) {
Tails.insert(Stores[j]);
Heads.insert(Stores[i]);
ConsecutiveChain[Stores[i]] = Stores[j];
}
}
// For stores that start but don't end a link in the chain:
for (ValueSet::iterator it = Heads.begin(), e = Heads.end();it != e; ++it) {
if (Tails.count(*it)) continue;
// We found a store instr that starts a chain. Now follow the chain and try
// to vectorize it.
ValueList Operands;
Value *I = *it;
int MinCost = 0, MinVF = 0;
while (Tails.count(I) || Heads.count(I)) {
Operands.push_back(I);
unsigned VF = Operands.size();
if (isPowerOf2_32(VF) && VF > 1) {
int cost = getTreeRollCost(Operands, 0);
DEBUG(dbgs() << "Found cost=" << cost << " for VF=" << VF << "\n");
if (cost < MinCost) { MinCost = cost; MinVF = VF; }
}
// Move to the next value in the chain.
I = ConsecutiveChain[I];
}
if (MinCost <= costThreshold && MinVF > 1) {
DEBUG(dbgs() << "Decided to vectorize cost=" << MinCost << "\n");
vectorizeTree(Operands, MinVF);
Stores.clear();
// The current numbering is invalid because we added and removed instrs.
numberInstructions();
Changed = true;
}
}
return Changed;
}
int BoUpSLP::getScalarizationCost(Type *Ty) {
int Cost = 0;
for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
return Cost;
}
AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) {
if (StoreInst *SI = dyn_cast<StoreInst>(I)) return AA->getLocation(SI);
if (LoadInst *LI = dyn_cast<LoadInst>(I)) return AA->getLocation(LI);
return AliasAnalysis::Location();
}
Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) {
assert(Src->getParent() == Dst->getParent() && "Not the same BB");
BasicBlock::iterator I = Src, E = Dst;
/// Scan all of the instruction from SRC to DST and check if
/// the source may alias.
for (++I; I != E; ++I) {
// Ignore store instructions that are marked as 'ignore'.
if (MemBarrierIgnoreList.count(I)) continue;
if (Src->mayWriteToMemory()) /* Write */ {
if (!I->mayReadOrWriteMemory()) continue;
} else /* Read */ {
if (!I->mayWriteToMemory()) continue;
}
AliasAnalysis::Location A = getLocation(&*I);
AliasAnalysis::Location B = getLocation(Src);
if (!A.Ptr || !B.Ptr || AA->alias(A, B))
return I;
}
return 0;
}
int BoUpSLP::getTreeRollCost(ValueList &VL, unsigned Depth) {
if (Depth == 6) return max_cost;
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
/// Don't mess with vectors.
if (ScalarTy->isVectorTy()) return max_cost;
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
// Check if all of the operands are constants.
bool AllConst = true;
bool AllSameScalar = true;
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
AllConst &= isa<Constant>(VL[i]);
AllSameScalar &= (VL[0] == VL[i]);
// Must have a single use.
Instruction *I = dyn_cast<Instruction>(VL[i]);
// Need to scalarize instructions with multiple users or from other BBs.
if (I && ((I->getNumUses() > 1) || (I->getParent() != BB)))
return getScalarizationCost(VecTy);
}
// Is this a simple vector constant.
if (AllConst) return 0;
// If all of the operands are identical we can broadcast them.
if (AllSameScalar)
return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
// Scalarize unknown structures.
Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
if (!VL0) return getScalarizationCost(VecTy);
assert(VL0->getParent() == BB && "Wrong BB");
unsigned Opcode = VL0->getOpcode();
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
// If not all of the instructions are identical then we have to scalarize.
if (!I || Opcode != I->getOpcode()) return getScalarizationCost(VecTy);
}
// Check if it is safe to sink the loads or the stores.
if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
int MaxIdx = InstrIdx[VL0];
for (unsigned i = 1, e = VL.size(); i < e; ++i )
MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
Instruction *Last = InstrVec[MaxIdx];
for (unsigned i = 0, e = VL.size(); i < e; ++i ) {
if (VL[i] == Last) continue;
Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last);
if (Barrier) {
DEBUG(dbgs() << "LR: Can't sink " << *VL[i] << "\n down to " <<
*Last << "\n because of " << *Barrier << "\n");
return max_cost;
}
}
}
switch (Opcode) {
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
ValueList Operands;
int Cost = 0;
// Calculate the cost of all of the operands.
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
// Prepare the operand vector.
for (unsigned j = 0; j < VL.size(); ++j)
Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
Cost += getTreeRollCost(Operands, Depth+1);
Operands.clear();
}
// Calculate the cost of this instruction.
int ScalarCost = VecTy->getNumElements() *
TTI->getArithmeticInstrCost(Opcode, ScalarTy);
int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
Cost += (VecCost - ScalarCost);
return Cost;
}
case Instruction::Load: {
// If we are scalarize the loads, add the cost of forming the vector.
for (unsigned i = 0, e = VL.size()-1; i < e; ++i)
if (!isConsecutiveAccess(VL[i], VL[i+1]))
return getScalarizationCost(VecTy);
// Cost of wide load - cost of scalar loads.
int ScalarLdCost = VecTy->getNumElements() *
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
return VecLdCost - ScalarLdCost;
}
case Instruction::Store: {
// We know that we can merge the stores. Calculate the cost.
int ScalarStCost = VecTy->getNumElements() *
TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1,0);
int StoreCost = VecStCost - ScalarStCost;
ValueList Operands;
for (unsigned j = 0; j < VL.size(); ++j) {
Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
MemBarrierIgnoreList.insert(VL[j]);
}
int TotalCost = StoreCost + getTreeRollCost(Operands, Depth + 1);
MemBarrierIgnoreList.clear();
return TotalCost;
}
default:
// Unable to vectorize unknown instructions.
return getScalarizationCost(VecTy);
}
}
Instruction *BoUpSLP::GetLastInstr(ValueList &VL, unsigned VF) {
int MaxIdx = InstrIdx[BB->getFirstNonPHI()];
for (unsigned i = 0; i < VF; ++i )
MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
return InstrVec[MaxIdx + 1];
}
Value *BoUpSLP::Scalarize(ValueList &VL, VectorType *Ty) {
IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements()));
Value *Vec = UndefValue::get(Ty);
for (unsigned i=0; i < Ty->getNumElements(); ++i)
Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
return Vec;
}
Value *BoUpSLP::vectorizeTree(ValueList &VL, int VF) {
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VF);
// Check if all of the operands are constants or identical.
bool AllConst = true;
bool AllSameScalar = true;
for (unsigned i = 0, e = VF; i < e; ++i) {
AllConst &= !!dyn_cast<Constant>(VL[i]);
AllSameScalar &= (VL[0] == VL[i]);
// Must have a single use.
Instruction *I = dyn_cast<Instruction>(VL[i]);
if (I && (I->getNumUses() > 1 || I->getParent() != BB))
return Scalarize(VL, VecTy);
}
// Is this a simple vector constant.
if (AllConst || AllSameScalar) return Scalarize(VL, VecTy);
// Scalarize unknown structures.
Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
if (!VL0) return Scalarize(VL, VecTy);
unsigned Opcode = VL0->getOpcode();
for (unsigned i = 0, e = VF; i < e; ++i) {
Instruction *I = dyn_cast<Instruction>(VL[i]);
// If not all of the instructions are identical then we have to scalarize.
if (!I || Opcode != I->getOpcode()) return Scalarize(VL, VecTy);
}
switch (Opcode) {
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
case Instruction::FSub:
case Instruction::Mul:
case Instruction::FMul:
case Instruction::UDiv:
case Instruction::SDiv:
case Instruction::FDiv:
case Instruction::URem:
case Instruction::SRem:
case Instruction::FRem:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
ValueList LHSVL, RHSVL;
for (int i = 0; i < VF; ++i) {
RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
}
Value *RHS = vectorizeTree(RHSVL, VF);
Value *LHS = vectorizeTree(LHSVL, VF);
IRBuilder<> Builder(GetLastInstr(VL, VF));
BinaryOperator *BinOp = dyn_cast<BinaryOperator>(VL0);
return Builder.CreateBinOp(BinOp->getOpcode(), RHS,LHS);
}
case Instruction::Load: {
LoadInst *LI = dyn_cast<LoadInst>(VL0);
unsigned Alignment = LI->getAlignment();
// Check if all of the loads are consecutive.
for (unsigned i = 1, e = VF; i < e; ++i)
if (!isConsecutiveAccess(VL[i-1], VL[i]))
return Scalarize(VL, VecTy);
IRBuilder<> Builder(GetLastInstr(VL, VF));
Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
VecTy->getPointerTo());
LI = Builder.CreateLoad(VecPtr);
LI->setAlignment(Alignment);
return LI;
}
case Instruction::Store: {
StoreInst *SI = dyn_cast<StoreInst>(VL0);
unsigned Alignment = SI->getAlignment();
ValueList ValueOp;
for (int i = 0; i < VF; ++i)
ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
Value *VecValue = vectorizeTree(ValueOp, VF);
IRBuilder<> Builder(GetLastInstr(VL, VF));
Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
VecTy->getPointerTo());
Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
for (int i = 0; i < VF; ++i)
cast<Instruction>(VL[i])->eraseFromParent();
return 0;
}
default:
return Scalarize(VL, VecTy);
}
}
} // end of namespace