Chandler Carruth ae65e281f3 Update the file headers across all of the LLVM projects in the monorepo
to reflect the new license.

We understand that people may be surprised that we're moving the header
entirely to discuss the new license. We checked this carefully with the
Foundation's lawyer and we believe this is the correct approach.

Essentially, all code in the project is now made available by the LLVM
project under our new license, so you will see that the license headers
include that license only. Some of our contributors have contributed
code under our old license, and accordingly, we have retained a copy of
our old license notice in the top-level files in each project and
repository.

llvm-svn: 351636
2019-01-19 08:50:56 +00:00

468 lines
15 KiB
C++

//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// This file implements SLP analysis based on VPlan. The analysis is based on
/// the ideas described in
///
/// Look-ahead SLP: auto-vectorization in the presence of commutative
/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
/// Luís F. W. Góes
///
//===----------------------------------------------------------------------===//
#include "VPlan.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <cassert>
#include <iterator>
#include <string>
#include <vector>
using namespace llvm;
#define DEBUG_TYPE "vplan-slp"
// Upper bound on the look-ahead depth used when re-ordering multi node
// operands. getBest() tries depths 1, 2, ... while the depth is strictly
// smaller than this value. It is a compile-time constant and never modified.
static const unsigned LookaheadMaxDepth = 5;
/// Records that SLP construction failed and yields a null instruction.
///
/// Clears CompletelySLP and returns nullptr, which lets callers write
/// `return markFailed();` in one statement.
VPInstruction *VPlanSlp::markFailed() {
  VPInstruction *const NoInstruction = nullptr;
  // FIXME: Right now this flag is the only way to report that we ran into
  // instructions that cannot be trivially SLP'ized.
  CompletelySLP = false;
  return NoInstruction;
}
/// Registers \p New as the combined instruction for the bundle \p Operands.
///
/// When every bundle member wraps an underlying IR instruction, the summed
/// scalar bit width of the bundle is folded into WidestBundleBits. The bundle
/// must not have been registered before (checked by an assertion).
void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
  auto HasUnderlyingInstr = [](VPValue *V) {
    return cast<VPInstruction>(V)->getUnderlyingInstr() != nullptr;
  };

  if (all_of(Operands, HasUnderlyingInstr)) {
    // Accumulate the scalar width of each member to get the bundle width.
    unsigned TotalBits = 0;
    for (VPValue *Member : Operands) {
      Type *MemberTy =
          cast<VPInstruction>(Member)->getUnderlyingInstr()->getType();
      assert(!MemberTy->isVectorTy() && "Only scalar types supported for now");
      TotalBits += MemberTy->getScalarSizeInBits();
    }
    if (WidestBundleBits < TotalBits)
      WidestBundleBits = TotalBits;
  }

  bool Inserted =
      BundleToCombined.try_emplace(to_vector<4>(Operands), New).second;
  assert(Inserted &&
         "Already created a combined instruction for the operand bundle");
  (void)Inserted;
}
/// Returns true if the bundle \p Operands may be combined into one SLP node.
///
/// The checks run in this order, and the first one that fails decides:
///  - every member is a non-null VPInstruction with an underlying instruction,
///  - all members agree with the first one on opcode and primitive bit width,
///  - all members live in this->BB,
///  - no member has more than one unique user,
///  - loads: no memory-writing instruction between the first and last bundle
///    load in the parent block, and all loads are simple,
///  - stores: all stores are simple.
bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
  // Shorthand for the IR instruction wrapped by a bundle member.
  auto UnderlyingOf = [](VPValue *V) {
    return cast<VPInstruction>(V)->getUnderlyingInstr();
  };

  // Currently we only support VPInstructions.
  auto IsUnsupported = [](VPValue *Op) {
    return !Op || !isa<VPInstruction>(Op) ||
           !cast<VPInstruction>(Op)->getUnderlyingInstr();
  };
  if (any_of(Operands, IsUnsupported)) {
    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
    return false;
  }

  // Opcode and type width of the first member are the reference for the rest.
  // FIXME: Differing widths/opcodes can be handled by inserting additional
  //        instructions.
  // FIXME: Deal with non-primitive types.
  const Instruction *Leader = UnderlyingOf(Operands[0]);
  const unsigned Opcode = Leader->getOpcode();
  const unsigned Width = Leader->getType()->getPrimitiveSizeInBits();
  for (VPValue *Op : Operands) {
    const Instruction *I = UnderlyingOf(Op);
    if (I->getOpcode() == Opcode &&
        I->getType()->getPrimitiveSizeInBits() == Width)
      continue;
    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
    return false;
  }

  // For now, all operands must be defined in the same BB.
  for (VPValue *Op : Operands) {
    if (cast<VPInstruction>(Op)->getParent() == &this->BB)
      continue;
    LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
    return false;
  }

  for (VPValue *Op : Operands) {
    if (!Op->hasMoreThanOneUniqueUser())
      continue;
    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
    return false;
  }

  if (Opcode == Instruction::Load) {
    // Walk the parent block; once the first bundle load has been seen and
    // until the last one is reached, no instruction may write to memory.
    // TODO: we only have to forbid instructions writing to memory that could
    //       interfere with any of the loads in the bundle
    unsigned LoadsSeen = 0;
    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
    for (auto &Recipe : *Parent) {
      auto *VPI = cast<VPInstruction>(&Recipe);
      const bool IsBundleLoad =
          VPI->getOpcode() == Instruction::Load &&
          std::find(Operands.begin(), Operands.end(), VPI) != Operands.end();
      if (IsBundleLoad)
        LoadsSeen++;

      if (LoadsSeen == Operands.size())
        break;
      if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
        LLVM_DEBUG(
            dbgs() << "VPSLP: instruction modifying memory between loads\n");
        return false;
      }
    }

    for (VPValue *Op : Operands) {
      if (cast<LoadInst>(UnderlyingOf(Op))->isSimple())
        continue;
      LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
      return false;
    }
  } else if (Opcode == Instruction::Store) {
    for (VPValue *Op : Operands) {
      if (cast<StoreInst>(UnderlyingOf(Op))->isSimple())
        continue;
      LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
      return false;
    }
  }

  return true;
}
/// Collects, for every value in \p Values (each must be a VPUser), its
/// operand number \p OperandIndex. The result has one entry per value, in
/// the same order as \p Values.
static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
                                             unsigned OperandIndex) {
  SmallVector<VPValue *, 4> Picked;
  for (auto It = Values.begin(), End = Values.end(); It != End; ++It)
    Picked.push_back(cast<VPUser>(*It)->getOperand(OperandIndex));
  return Picked;
}
/// Returns true if the opcode of the first value in \p Values is commutative.
/// Only the first value is inspected.
static bool areCommutative(ArrayRef<VPValue *> Values) {
  const unsigned LeaderOpcode = cast<VPInstruction>(Values[0])->getOpcode();
  return Instruction::isCommutative(LeaderOpcode);
}
/// Returns the operand bundles of \p Values: entry I holds operand I of every
/// value. The opcode of the first value decides how many bundles there are:
/// stores contribute only operand 0, all other opcodes contribute every
/// operand of the first value. Must not be called for loads.
static SmallVector<SmallVector<VPValue *, 4>, 4>
getOperands(ArrayRef<VPValue *> Values) {
  auto *Leader = cast<VPInstruction>(Values[0]);
  const unsigned LeaderOpcode = Leader->getOpcode();
  if (LeaderOpcode == Instruction::Load)
    llvm_unreachable("Loads terminate a tree, no need to get operands");

  const unsigned NumBundles =
      LeaderOpcode == Instruction::Store ? 1 : Leader->getNumOperands();

  SmallVector<SmallVector<VPValue *, 4>, 4> PerOperand;
  for (unsigned Idx = 0; Idx < NumBundles; ++Idx)
    PerOperand.push_back(getOperands(Values, Idx));
  return PerOperand;
}
/// Returns the opcode shared by all of \p Values, or None if at least one
/// value has a different opcode than the first one.
static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
  const unsigned Expected = cast<VPInstruction>(Values[0])->getOpcode();
  for (VPValue *V : Values)
    if (cast<VPInstruction>(V)->getOpcode() != Expected)
      return None;
  return Expected;
}
/// Decides whether \p B is a suitable successor of \p A in a bundle.
///
/// Instructions with different opcodes never match. Instructions with the
/// same opcode other than load/store always match. Loads and stores match
/// only if both belong to the same interleave group in \p IAI and the index
/// of \p B in that group is the index of \p A plus one.
static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
                                  VPInterleavedAccessInfo &IAI) {
  const unsigned OpcodeA = A->getOpcode();
  if (OpcodeA != B->getOpcode())
    return false;

  const bool IsMemoryAccess =
      OpcodeA == Instruction::Load || OpcodeA == Instruction::Store;
  if (!IsMemoryAccess)
    return true;

  auto *GroupA = IAI.getInterleaveGroup(A);
  auto *GroupB = IAI.getInterleaveGroup(B);
  // Both must be in a group, and it must be the same group.
  if (!GroupA || GroupA != GroupB)
    return false;
  return GroupA->getIndex(A) + 1 == GroupB->getIndex(B);
}
/// Look-ahead score of the pair (\p V1, \p V2); implements getLAScore from
/// Listing 7 in the paper.
///
/// Returns 0 unless both values are VPInstructions. At \p MaxLevel 0 the score
/// is 1 if areConsecutiveOrMatch() holds for the pair and 0 otherwise. For a
/// larger \p MaxLevel the score is the sum of the scores of every pairing of
/// an operand of \p V1 with an operand of \p V2 at level \p MaxLevel - 1.
static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
                           VPInterleavedAccessInfo &IAI) {
  if (!(isa<VPInstruction>(V1) && isa<VPInstruction>(V2)))
    return 0;

  if (MaxLevel == 0)
    return areConsecutiveOrMatch(cast<VPInstruction>(V1),
                                 cast<VPInstruction>(V2), IAI)
               ? 1u
               : 0u;

  auto *User1 = cast<VPUser>(V1);
  auto *User2 = cast<VPUser>(V2);
  const unsigned NumOps1 = User1->getNumOperands();
  const unsigned NumOps2 = User2->getNumOperands();

  unsigned Total = 0;
  for (unsigned I = 0; I < NumOps1; ++I)
    for (unsigned J = 0; J < NumOps2; ++J)
      Total += getLAScore(User1->getOperand(I), User2->getOperand(J),
                          MaxLevel - 1, IAI);
  return Total;
}
/// Selects from \p Candidates the value that best follows \p Last in a lane.
///
/// Only candidates for which areConsecutiveOrMatch(Last, Candidate) holds are
/// considered. With no such candidate the result is {OpMode::Failed, nullptr};
/// with exactly one it is returned directly (and left in \p Candidates).
/// Otherwise candidates are ranked with getLAScore() at increasing depth
/// (starting at 1, below LookaheadMaxDepth) until a depth tells them apart;
/// the winner is erased from \p Candidates and returned with \p Mode.
///
/// \returns the (mode, value) pair. The value is nullptr if nothing could be
/// selected, which includes the case that every look-ahead score was 0.
std::pair<VPlanSlp::OpMode, VPValue *>
VPlanSlp::getBest(OpMode Mode, VPValue *Last,
                  SmallPtrSetImpl<VPValue *> &Candidates,
                  VPInterleavedAccessInfo &IAI) {
  assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
         "Currently we only handle load and commutative opcodes");
  LLVM_DEBUG(dbgs() << " getBest\n");

  // Without a previous value there is nothing to match against. Report a
  // failure instead of dereferencing a null pointer below.
  if (!Last)
    return {OpMode::Failed, nullptr};

  auto *LastI = cast<VPInstruction>(Last);
  SmallVector<VPValue *, 4> BestCandidates;
  LLVM_DEBUG(dbgs() << " Candidates for " << *LastI->getUnderlyingInstr()
                    << " ");
  for (auto *Candidate : Candidates) {
    auto *CandidateI = cast<VPInstruction>(Candidate);
    if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
      LLVM_DEBUG(dbgs() << *CandidateI->getUnderlyingInstr() << " ");
      BestCandidates.push_back(Candidate);
    }
  }
  LLVM_DEBUG(dbgs() << "\n");

  if (BestCandidates.empty())
    return {OpMode::Failed, nullptr};

  if (BestCandidates.size() == 1)
    return {Mode, BestCandidates[0]};

  VPValue *Best = nullptr;
  unsigned BestScore = 0;
  for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
    // ~0u marks "no score seen yet at this depth".
    unsigned PrevScore = ~0u;
    bool AllSame = true;

    // FIXME: Avoid visiting the same operands multiple times.
    for (auto *Candidate : BestCandidates) {
      unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
      if (PrevScore == ~0u)
        PrevScore = Score;
      if (PrevScore != Score)
        AllSame = false;
      PrevScore = Score;

      if (Score > BestScore) {
        BestScore = Score;
        Best = Candidate;
      }
    }
    // Stop looking deeper as soon as the scores distinguish the candidates.
    if (!AllSame)
      break;
  }

  // Best stays null when no candidate ever scored above 0. The result is the
  // same as before ({Mode, nullptr}), but the debug output must not
  // dereference the null pointer.
  if (!Best) {
    LLVM_DEBUG(dbgs() << "Found no best candidate\n");
    return {Mode, nullptr};
  }

  LLVM_DEBUG(dbgs() << "Found best "
                    << *cast<VPInstruction>(Best)->getUnderlyingInstr()
                    << "\n");
  Candidates.erase(Best);

  return {Mode, Best};
}
/// Re-orders the lanes of the operand bundles recorded in MultiNodeOps.
///
/// Lane 0 of every bundle is kept as is. For each following lane, the values
/// of all bundles at that lane form the candidate set, and getBest() chooses
/// for each bundle the candidate that best follows the value picked for the
/// previous lane. If no candidate can be chosen for a bundle, nullptr (from
/// markFailed()) is appended and the bundle is not extended any further.
///
/// \returns one entry per element of MultiNodeOps, pairing the original
/// placeholder instruction with its re-ordered bundle. Empty if MultiNodeOps
/// is empty.
SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
  SmallVector<MultiNodeOpTy, 4> FinalOrder;
  SmallVector<OpMode, 4> Mode;
  FinalOrder.reserve(MultiNodeOps.size());
  Mode.reserve(MultiNodeOps.size());

  LLVM_DEBUG(dbgs() << "Reordering multinode\n");

  // Nothing was recorded, so there is nothing to re-order; in particular
  // MultiNodeOps[0] below must not be accessed.
  if (MultiNodeOps.empty())
    return FinalOrder;

  for (auto &Operands : MultiNodeOps) {
    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
        Instruction::Load)
      Mode.push_back(OpMode::Load);
    else
      Mode.push_back(OpMode::Opcode);
  }

  for (unsigned Lane = 1, NumLanes = MultiNodeOps[0].second.size();
       Lane < NumLanes; ++Lane) {
    LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
    SmallPtrSet<VPValue *, 4> Candidates;
    LLVM_DEBUG(dbgs() << " Candidates ");
    // Iterate by reference: each element owns a SmallVector.
    for (const auto &Ops : MultiNodeOps) {
      LLVM_DEBUG(
          dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
                 << " ");
      Candidates.insert(Ops.second[Lane]);
    }
    LLVM_DEBUG(dbgs() << "\n");

    for (unsigned Op = 0, NumOps = MultiNodeOps.size(); Op < NumOps; ++Op) {
      LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
      if (Mode[Op] == OpMode::Failed)
        continue;

      VPValue *Last = FinalOrder[Op].second[Lane - 1];
      std::pair<OpMode, VPValue *> Res =
          getBest(Mode[Op], Last, Candidates, IAI);
      if (Res.second) {
        FinalOrder[Op].second.push_back(Res.second);
        continue;
      }

      // TODO: handle this case
      // Remember the failure so that later lanes do not try to extend this
      // bundle from the null value appended here.
      Mode[Op] = OpMode::Failed;
      FinalOrder[Op].second.push_back(markFailed());
    }
  }

  return FinalOrder;
}
/// Prints the bundle \p Values to dbgs() on a single line.
///
/// For each value the underlying IR instruction is printed. Null values and
/// VPInstructions without an underlying instruction are printed as
/// " nullptr". Non-null values must be VPInstructions.
void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
  dbgs() << " Ops: ";
  for (VPValue *Op : Values) {
    // cast_or_null yields null for a null value; check before dereferencing.
    auto *VPI = cast_or_null<VPInstruction>(Op);
    auto *Instr = VPI ? VPI->getUnderlyingInstr() : nullptr;
    if (Instr)
      dbgs() << *Instr << " | ";
    else
      dbgs() << " nullptr | ";
  }
  dbgs() << "\n";
}
/// Recursively builds the combined (SLP) instruction for the bundle \p Values.
///
/// If a combined instruction was already created for exactly this bundle it is
/// returned again. Otherwise the bundle is checked with areVectorizable(), the
/// combined operands are built (recursively for operand bundles), and a new
/// VPInstruction is created and registered through addCombined().
///
/// \param Values the bundle to combine; must not be empty.
/// \returns the combined instruction, or nullptr (via markFailed()) if the
/// bundle is not vectorizable or CompletelySLP is false once the operands have
/// been processed.
VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
assert(!Values.empty() && "Need some operands!");
// If we already visited this instruction bundle, re-use the existing node
auto I = BundleToCombined.find(to_vector<4>(Values));
if (I != BundleToCombined.end()) {
#ifndef NDEBUG
// Check that the resulting graph is a tree. If we re-use a node, this means
// its values have multiple users. We only allow this, if all users of each
// value are the same instruction.
for (auto *V : Values) {
auto UI = V->user_begin();
auto *FirstUser = *UI++;
while (UI != V->user_end()) {
assert(*UI == FirstUser && "Currently we only support SLP trees.");
UI++;
}
}
#endif
return I->second;
}
// Dump inputs
LLVM_DEBUG({
dbgs() << "buildGraph: ";
dumpBundle(Values);
});
if (!areVectorizable(Values))
return markFailed();
assert(getOpcode(Values) && "Opcodes for all values must match");
unsigned ValuesOpcode = getOpcode(Values).getValue();
// Operands of the combined instruction, one per operand bundle (for loads:
// one per bundle member, see below).
SmallVector<VPValue *, 4> CombinedOperands;
if (areCommutative(Values)) {
// This call is the multi-node root if no multi-node was active on entry.
// Nested commutative bundles see MultiNodeActive == true and only record
// their operands in MultiNodeOps.
bool MultiNodeRoot = !MultiNodeActive;
MultiNodeActive = true;
for (auto &Operands : getOperands(Values)) {
LLVM_DEBUG({
dbgs() << " Visiting Commutative";
dumpBundle(Operands);
});
auto OperandsOpcode = getOpcode(Operands);
if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
// The operand bundle has the same opcode as Values: recurse into it.
LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
CombinedOperands.push_back(buildGraph(Operands));
} else {
LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
// Create a dummy VPInstruction, which we will replace later by the
// re-ordered operand.
VPInstruction *Op = new VPInstruction(0, {});
CombinedOperands.push_back(Op);
MultiNodeOps.emplace_back(Op, Operands);
}
}
if (MultiNodeRoot) {
LLVM_DEBUG(dbgs() << "Reorder \n");
MultiNodeActive = false;
// Re-order the recorded operand bundles, then build the graph for each
// re-ordered bundle and substitute it for its dummy instruction.
auto FinalOrder = reorderMultiNodeOps();
MultiNodeOps.clear();
for (auto &Ops : FinalOrder) {
VPInstruction *NewOp = buildGraph(Ops.second);
Ops.first->replaceAllUsesWith(NewOp);
// Also patch the operand list of this call, which still holds the dummy.
for (unsigned i = 0; i < CombinedOperands.size(); i++)
if (CombinedOperands[i] == Ops.first)
CombinedOperands[i] = NewOp;
// The dummy was allocated with new above and is no longer needed.
delete Ops.first;
Ops.first = NewOp;
}
LLVM_DEBUG(dbgs() << "Found final order\n");
}
} else {
LLVM_DEBUG(dbgs() << " NonCommuntative\n");
// Loads are not recursed into: operand 0 of each load in the bundle becomes
// an operand of the combined instruction. All other opcodes recurse into
// each operand bundle.
if (ValuesOpcode == Instruction::Load)
for (VPValue *V : Values)
CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
else
for (auto &Operands : getOperands(Values))
CombinedOperands.push_back(buildGraph(Operands));
}
// Loads and stores map to the dedicated SLP opcodes; every other opcode is
// kept unchanged.
unsigned Opcode;
switch (ValuesOpcode) {
case Instruction::Load:
Opcode = VPInstruction::SLPLoad;
break;
case Instruction::Store:
Opcode = VPInstruction::SLPStore;
break;
default:
Opcode = ValuesOpcode;
break;
}
// markFailed() clears CompletelySLP; if that happened, do not create a
// combined instruction.
if (!CompletelySLP)
return markFailed();
assert(CombinedOperands.size() > 0 && "Need more some operands");
auto *VPI = new VPInstruction(Opcode, CombinedOperands);
// The combined instruction takes the underlying instruction of the first
// bundle member.
VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs());
cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n");
addCombined(Values, VPI);
return VPI;
}