mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-04-14 13:40:10 +00:00

to reflect the new license. We understand that people may be surprised that we're moving the header entirely to discuss the new license. We checked this carefully with the Foundation's lawyer and we believe this is the correct approach. Essentially, all code in the project is now made available by the LLVM project under our new license, so you will see that the license headers include that license only. Some of our contributors have contributed code under our old license, and accordingly, we have retained a copy of our old license notice in the top-level files in each project and repository. llvm-svn: 351636
468 lines
15 KiB
C++
468 lines
15 KiB
C++
//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
/// This file implements SLP analysis based on VPlan. The analysis is based on
|
|
/// the ideas described in
|
|
///
|
|
/// Look-ahead SLP: auto-vectorization in the presence of commutative
|
|
/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
|
|
/// Luís F. W. Góes
|
|
///
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "VPlan.h"
|
|
#include "llvm/ADT/DepthFirstIterator.h"
|
|
#include "llvm/ADT/PostOrderIterator.h"
|
|
#include "llvm/ADT/SmallVector.h"
|
|
#include "llvm/ADT/Twine.h"
|
|
#include "llvm/Analysis/LoopInfo.h"
|
|
#include "llvm/Analysis/VectorUtils.h"
|
|
#include "llvm/IR/BasicBlock.h"
|
|
#include "llvm/IR/CFG.h"
|
|
#include "llvm/IR/Dominators.h"
|
|
#include "llvm/IR/InstrTypes.h"
|
|
#include "llvm/IR/Instruction.h"
|
|
#include "llvm/IR/Instructions.h"
|
|
#include "llvm/IR/Type.h"
|
|
#include "llvm/IR/Value.h"
|
|
#include "llvm/Support/Casting.h"
|
|
#include "llvm/Support/Debug.h"
|
|
#include "llvm/Support/ErrorHandling.h"
|
|
#include "llvm/Support/GraphWriter.h"
|
|
#include "llvm/Support/raw_ostream.h"
|
|
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
|
|
#include <cassert>
|
|
#include <iterator>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
using namespace llvm;
|
|
|
|
#define DEBUG_TYPE "vplan-slp"
|
|
|
|
// Number of levels to look ahead when re-ordering multi node operands.
// The value is never modified in this file, so it is a compile-time constant.
static constexpr unsigned LookaheadMaxDepth = 5;
|
|
|
|
/// Clears the CompletelySLP flag and returns a null instruction, so callers
/// can write `return markFailed();` when they give up on a bundle.
VPInstruction *VPlanSlp::markFailed() {
  // FIXME: Currently this is used to signal we hit instructions we cannot
  //        trivially SLP'ize.
  this->CompletelySLP = false;
  return nullptr;
}
|
|
|
|
/// Registers \p New as the combined instruction for the bundle \p Operands.
///
/// If every operand has an underlying IR instruction, the scalar bit widths of
/// those instructions are summed and WidestBundleBits is raised to that sum
/// when it is larger. The bundle must not have been registered before.
void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
  const auto HasUnderlyingInstr = [](VPValue *V) {
    return cast<VPInstruction>(V)->getUnderlyingInstr() != nullptr;
  };

  if (all_of(Operands, HasUnderlyingInstr)) {
    unsigned TotalBits = 0;
    for (VPValue *Operand : Operands) {
      Type *ScalarTy =
          cast<VPInstruction>(Operand)->getUnderlyingInstr()->getType();
      assert(!ScalarTy->isVectorTy() && "Only scalar types supported for now");
      TotalBits += ScalarTy->getScalarSizeInBits();
    }
    // Track the widest bundle seen so far.
    if (WidestBundleBits < TotalBits)
      WidestBundleBits = TotalBits;
  }

  auto Inserted = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
  assert(Inserted.second &&
         "Already created a combined instruction for the operand bundle");
  (void)Inserted;
}
|
|
|
|
/// Returns true if the bundle \p Operands passes every check required to
/// combine it into a single SLP node. The checks run in this order:
///  - every operand is a non-null VPInstruction with an underlying IR
///    instruction,
///  - all underlying instructions share one opcode and primitive type width,
///  - all operands live in the basic block this analysis runs on,
///  - no operand has more than one unique user,
///  - loads: no instruction that may write to memory sits between the first
///    and the last load of the bundle, and all loads are simple,
///  - stores: all stores are simple.
bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
  // Currently we only support VPInstructions.
  const auto IsBackedVPInstruction = [](VPValue *Op) {
    return Op && isa<VPInstruction>(Op) &&
           cast<VPInstruction>(Op)->getUnderlyingInstr();
  };
  if (!all_of(Operands, IsBackedVPInstruction)) {
    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
    return false;
  }

  // Check if opcodes and type width agree for all instructions in the bundle.
  // FIXME: Differing widths/opcodes can be handled by inserting additional
  //        instructions.
  // FIXME: Deal with non-primitive types.
  const Instruction *OriginalInstr =
      cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
  unsigned Opcode = OriginalInstr->getOpcode();
  unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
  const auto MatchesFirst = [Opcode, Width](VPValue *Op) {
    const Instruction *Underlying =
        cast<VPInstruction>(Op)->getUnderlyingInstr();
    if (Underlying->getOpcode() != Opcode)
      return false;
    return Underlying->getType()->getPrimitiveSizeInBits() == Width;
  };
  if (!all_of(Operands, MatchesFirst)) {
    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
    return false;
  }

  // For now, all operands must be defined in the same BB.
  const auto IsOutsideBB = [this](VPValue *Op) {
    return cast<VPInstruction>(Op)->getParent() != &this->BB;
  };
  if (any_of(Operands, IsOutsideBB)) {
    LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
    return false;
  }

  const auto HasSeveralUsers = [](VPValue *Op) {
    return Op->hasMoreThanOneUniqueUser();
  };
  if (any_of(Operands, HasSeveralUsers)) {
    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
    return false;
  }

  // For loads, check that there are no instructions writing to memory in
  // between them.
  // TODO: we only have to forbid instructions writing to memory that could
  //       interfere with any of the loads in the bundle
  if (Opcode == Instruction::Load) {
    unsigned BundleLoadsSeen = 0;
    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
    for (auto &Recipe : *Parent) {
      auto *VPI = cast<VPInstruction>(&Recipe);
      const bool IsBundleLoad = VPI->getOpcode() == Instruction::Load &&
                                is_contained(Operands, VPI);
      if (IsBundleLoad)
        ++BundleLoadsSeen;

      // Stop once the last load of the bundle has been reached.
      if (BundleLoadsSeen == Operands.size())
        break;
      if (BundleLoadsSeen > 0 && VPI->mayWriteToMemory()) {
        LLVM_DEBUG(
            dbgs() << "VPSLP: instruction modifying memory between loads\n");
        return false;
      }
    }

    const auto IsSimpleLoad = [](VPValue *Op) {
      auto *Load =
          cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr());
      return Load->isSimple();
    };
    if (!all_of(Operands, IsSimpleLoad)) {
      LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
      return false;
    }
  }

  const auto IsSimpleStore = [](VPValue *Op) {
    auto *Store =
        cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr());
    return Store->isSimple();
  };
  if (Opcode == Instruction::Store && !all_of(Operands, IsSimpleStore)) {
    LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
    return false;
  }

  return true;
}
|
|
|
|
/// Collects operand number \p OperandIndex of every value in \p Values, in
/// the order of \p Values. Each value must be a VPUser.
static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
                                             unsigned OperandIndex) {
  SmallVector<VPValue *, 4> Collected;
  for (VPValue *Value : Values)
    Collected.push_back(cast<VPUser>(Value)->getOperand(OperandIndex));
  return Collected;
}
|
|
|
|
/// Returns true if the opcode of the first value in \p Values is commutative.
/// Only the first value is inspected.
static bool areCommutative(ArrayRef<VPValue *> Values) {
  const unsigned FirstOpcode = cast<VPInstruction>(Values[0])->getOpcode();
  return Instruction::isCommutative(FirstOpcode);
}
|
|
|
|
/// Returns one operand bundle per operand position of the values in
/// \p Values. The opcode and operand count are taken from the first value.
/// For stores only operand 0 is returned; loads must not be passed in.
static SmallVector<SmallVector<VPValue *, 4>, 4>
getOperands(ArrayRef<VPValue *> Values) {
  SmallVector<SmallVector<VPValue *, 4>, 4> Bundles;
  auto *First = cast<VPInstruction>(Values[0]);
  const unsigned FirstOpcode = First->getOpcode();

  if (FirstOpcode == Instruction::Load)
    llvm_unreachable("Loads terminate a tree, no need to get operands");

  if (FirstOpcode == Instruction::Store) {
    Bundles.push_back(getOperands(Values, 0));
    return Bundles;
  }

  const unsigned NumOps = First->getNumOperands();
  for (unsigned Idx = 0; Idx != NumOps; ++Idx)
    Bundles.push_back(getOperands(Values, Idx));
  return Bundles;
}
|
|
|
|
/// Returns the opcode of Values or ~0 if they do not all agree.
|
|
static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
|
|
unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
|
|
if (any_of(Values, [Opcode](VPValue *V) {
|
|
return cast<VPInstruction>(V)->getOpcode() != Opcode;
|
|
}))
|
|
return None;
|
|
return {Opcode};
|
|
}
|
|
|
|
/// Returns true if A and B access sequential memory if they are loads or
|
|
/// stores or if they have identical opcodes otherwise.
|
|
static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
|
|
VPInterleavedAccessInfo &IAI) {
|
|
if (A->getOpcode() != B->getOpcode())
|
|
return false;
|
|
|
|
if (A->getOpcode() != Instruction::Load &&
|
|
A->getOpcode() != Instruction::Store)
|
|
return true;
|
|
auto *GA = IAI.getInterleaveGroup(A);
|
|
auto *GB = IAI.getInterleaveGroup(B);
|
|
|
|
return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
|
|
}
|
|
|
|
/// Implements getLAScore from Listing 7 in the paper.
|
|
/// Traverses and compares operands of V1 and V2 to MaxLevel.
|
|
static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
|
|
VPInterleavedAccessInfo &IAI) {
|
|
if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2))
|
|
return 0;
|
|
|
|
if (MaxLevel == 0)
|
|
return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1),
|
|
cast<VPInstruction>(V2), IAI);
|
|
|
|
unsigned Score = 0;
|
|
for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I)
|
|
for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J)
|
|
Score += getLAScore(cast<VPUser>(V1)->getOperand(I),
|
|
cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI);
|
|
return Score;
|
|
}
|
|
|
|
std::pair<VPlanSlp::OpMode, VPValue *>
|
|
VPlanSlp::getBest(OpMode Mode, VPValue *Last,
|
|
SmallPtrSetImpl<VPValue *> &Candidates,
|
|
VPInterleavedAccessInfo &IAI) {
|
|
assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
|
|
"Currently we only handle load and commutative opcodes");
|
|
LLVM_DEBUG(dbgs() << " getBest\n");
|
|
|
|
SmallVector<VPValue *, 4> BestCandidates;
|
|
LLVM_DEBUG(dbgs() << " Candidates for "
|
|
<< *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
|
|
for (auto *Candidate : Candidates) {
|
|
auto *LastI = cast<VPInstruction>(Last);
|
|
auto *CandidateI = cast<VPInstruction>(Candidate);
|
|
if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
|
|
LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
|
|
<< " ");
|
|
BestCandidates.push_back(Candidate);
|
|
}
|
|
}
|
|
LLVM_DEBUG(dbgs() << "\n");
|
|
|
|
if (BestCandidates.empty())
|
|
return {OpMode::Failed, nullptr};
|
|
|
|
if (BestCandidates.size() == 1)
|
|
return {Mode, BestCandidates[0]};
|
|
|
|
VPValue *Best = nullptr;
|
|
unsigned BestScore = 0;
|
|
for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
|
|
unsigned PrevScore = ~0u;
|
|
bool AllSame = true;
|
|
|
|
// FIXME: Avoid visiting the same operands multiple times.
|
|
for (auto *Candidate : BestCandidates) {
|
|
unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
|
|
if (PrevScore == ~0u)
|
|
PrevScore = Score;
|
|
if (PrevScore != Score)
|
|
AllSame = false;
|
|
PrevScore = Score;
|
|
|
|
if (Score > BestScore) {
|
|
BestScore = Score;
|
|
Best = Candidate;
|
|
}
|
|
}
|
|
if (!AllSame)
|
|
break;
|
|
}
|
|
LLVM_DEBUG(dbgs() << "Found best "
|
|
<< *cast<VPInstruction>(Best)->getUnderlyingInstr()
|
|
<< "\n");
|
|
Candidates.erase(Best);
|
|
|
|
return {Mode, Best};
|
|
}
|
|
|
|
/// Re-orders the lanes of the operand bundles collected in MultiNodeOps.
///
/// Lane 0 of every bundle is kept as is. For each following lane, the values
/// of that lane across all bundles form a candidate set, and getBest picks
/// for each bundle the candidate that best follows the value chosen for the
/// previous lane. When no candidate can be chosen for a bundle, a nullptr is
/// appended (via markFailed()), the bundle is marked OpMode::Failed and it is
/// skipped for all remaining lanes, so its result may be shorter than the
/// others.
///
/// Returns one entry per element of MultiNodeOps, in the same order; the
/// result is empty if MultiNodeOps is empty.
SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
  SmallVector<MultiNodeOpTy, 4> FinalOrder;
  SmallVector<OpMode, 4> Mode;
  FinalOrder.reserve(MultiNodeOps.size());
  Mode.reserve(MultiNodeOps.size());

  LLVM_DEBUG(dbgs() << "Reordering multinode\n");

  // Nothing to reorder. This also protects the MultiNodeOps[0] access below.
  if (MultiNodeOps.empty())
    return FinalOrder;

  for (auto &Operands : MultiNodeOps) {
    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
        Instruction::Load)
      Mode.push_back(OpMode::Load);
    else
      Mode.push_back(OpMode::Opcode);
  }

  for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
    LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
    SmallPtrSet<VPValue *, 4> Candidates;
    LLVM_DEBUG(dbgs() << " Candidates ");
    // Iterate by reference: each element owns a SmallVector.
    for (const auto &Ops : MultiNodeOps) {
      LLVM_DEBUG(
          dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
                 << " ");
      Candidates.insert(Ops.second[Lane]);
    }
    LLVM_DEBUG(dbgs() << "\n");

    for (unsigned Op = 0, NumOps = MultiNodeOps.size(); Op < NumOps; ++Op) {
      LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
      if (Mode[Op] == OpMode::Failed)
        continue;

      VPValue *Last = FinalOrder[Op].second[Lane - 1];
      std::pair<OpMode, VPValue *> Res =
          getBest(Mode[Op], Last, Candidates, IAI);
      if (Res.second) {
        FinalOrder[Op].second.push_back(Res.second);
      } else {
        // TODO: handle this case
        FinalOrder[Op].second.push_back(markFailed());
        // Remember the failure. Otherwise the nullptr just stored would be
        // passed to getBest as Last for the next lane and dereferenced there.
        Mode[Op] = OpMode::Failed;
      }
    }
  }

  return FinalOrder;
}
|
|
|
|
/// Prints the bundle \p Values to dbgs() on one line. Each entry is printed
/// as its underlying IR instruction, or as " nullptr" when the entry is null
/// or has no underlying instruction. Null entries are tolerated.
void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
  dbgs() << " Ops: ";
  for (auto *Op : Values) {
    // cast_or_null may return null; check before asking for the underlying
    // instruction.
    if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
      if (auto *Instr = VPInstr->getUnderlyingInstr()) {
        dbgs() << *Instr << " | ";
        continue;
      }
    dbgs() << " nullptr | ";
  }
  dbgs() << "\n";
}
|
|
|
|
/// Builds the combined VPInstruction for the bundle \p Values, building the
/// nodes for its operand bundles first.
///
/// A bundle that was combined before yields the previously created node. If
/// the bundle fails areVectorizable, or CompletelySLP has been cleared by the
/// time the operands are processed, the result is markFailed() (nullptr).
///
/// For commutative opcodes, operand bundles with the same opcode as
/// \p Values are built directly. Other operand bundles are recorded in
/// MultiNodeOps behind a placeholder instruction; the outermost commutative
/// call then re-orders them with reorderMultiNodeOps(), builds them and swaps
/// the placeholders for the resulting nodes.
VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
  assert(!Values.empty() && "Need some operands!");

  // If we already visited this instruction bundle, re-use the existing node
  auto Existing = BundleToCombined.find(to_vector<4>(Values));
  if (Existing != BundleToCombined.end()) {
#ifndef NDEBUG
    // Check that the resulting graph is a tree. If we re-use a node, this means
    // its values have multiple users. We only allow this, if all users of each
    // value are the same instruction.
    for (auto *V : Values) {
      auto UserIt = V->user_begin();
      auto *FirstUser = *UserIt++;
      for (; UserIt != V->user_end(); UserIt++)
        assert(*UserIt == FirstUser && "Currently we only support SLP trees.");
    }
#endif
    return Existing->second;
  }

  // Dump inputs
  LLVM_DEBUG({
    dbgs() << "buildGraph: ";
    dumpBundle(Values);
  });

  if (!areVectorizable(Values))
    return markFailed();

  assert(getOpcode(Values) && "Opcodes for all values must match");
  const unsigned ValuesOpcode = getOpcode(Values).getValue();

  SmallVector<VPValue *, 4> CombinedOperands;
  if (!areCommutative(Values)) {
    LLVM_DEBUG(dbgs() << " NonCommuntative\n");
    if (ValuesOpcode == Instruction::Load) {
      // Loads end the tree: take their first operand as is.
      for (VPValue *V : Values)
        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
    } else {
      for (auto &OperandBundle : getOperands(Values))
        CombinedOperands.push_back(buildGraph(OperandBundle));
    }
  } else {
    // Only the outermost commutative node triggers the re-ordering.
    const bool IsMultiNodeRoot = !MultiNodeActive;
    MultiNodeActive = true;
    for (auto &OperandBundle : getOperands(Values)) {
      LLVM_DEBUG({
        dbgs() << " Visiting Commutative";
        dumpBundle(OperandBundle);
      });

      auto BundleOpcode = getOpcode(OperandBundle);
      if (BundleOpcode && BundleOpcode == getOpcode(Values)) {
        LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
        CombinedOperands.push_back(buildGraph(OperandBundle));
        continue;
      }

      LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
      // Create dummy VPInstruction, which will we replace later by the
      // re-ordered operand.
      VPInstruction *Placeholder = new VPInstruction(0, {});
      CombinedOperands.push_back(Placeholder);
      MultiNodeOps.emplace_back(Placeholder, OperandBundle);
    }

    if (IsMultiNodeRoot) {
      LLVM_DEBUG(dbgs() << "Reorder \n");
      MultiNodeActive = false;

      auto FinalOrder = reorderMultiNodeOps();

      MultiNodeOps.clear();
      for (auto &Reordered : FinalOrder) {
        VPInstruction *Replacement = buildGraph(Reordered.second);
        Reordered.first->replaceAllUsesWith(Replacement);
        // Patch the local operand list, then drop the placeholder.
        for (VPValue *&Slot : CombinedOperands)
          if (Slot == Reordered.first)
            Slot = Replacement;
        delete Reordered.first;
        Reordered.first = Replacement;
      }
      LLVM_DEBUG(dbgs() << "Found final order\n");
    }
  }

  // Loads and stores map to the dedicated SLP opcodes; everything else keeps
  // its opcode.
  unsigned Opcode = ValuesOpcode;
  if (ValuesOpcode == Instruction::Load)
    Opcode = VPInstruction::SLPLoad;
  else if (ValuesOpcode == Instruction::Store)
    Opcode = VPInstruction::SLPStore;

  if (!CompletelySLP)
    return markFailed();

  assert(CombinedOperands.size() > 0 && "Need more some operands");
  auto *Combined = new VPInstruction(Opcode, CombinedOperands);
  Combined->setUnderlyingInstr(
      cast<VPInstruction>(Values[0])->getUnderlyingInstr());

  LLVM_DEBUG({
    dbgs() << "Create VPInstruction ";
    Combined->print(dbgs());
    cast<VPInstruction>(Values[0])->print(dbgs());
    dbgs() << "\n";
  });
  addCombined(Values, Combined);
  return Combined;
}
|