//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//
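
// Illustrative example (not taken from the code below; value names are made
// up): given four consecutive scalar stores such as
//   store float %a0, float* %p0
//   store float %a1, float* %p1   ; %p1 == %p0 + 1
//   store float %a2, float* %p2   ; %p2 == %p0 + 2
//   store float %a3, float* %p3   ; %p3 == %p0 + 3
// the pass builds a tree over the stored values' use-def chains and, if the
// cost model says it is profitable, replaces the group with a single
// <4 x float> store.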

#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <map>
#include <memory>

using namespace llvm;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");

static cl::opt<int>
    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
                     cl::desc("Only vectorize if you gain more than this "
                              "number "));

static cl::opt<bool>
ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
                   cl::desc("Attempt to vectorize horizontal reductions"));

static cl::opt<bool> ShouldStartVectorizeHorAtStore(
    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
    cl::desc(
        "Attempt to vectorize horizontal reductions feeding into a store"));

static cl::opt<int>
MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));

/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
static cl::opt<int>
ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
    cl::desc("Limit the size of the SLP scheduling region per block"));

static cl::opt<int> MinVectorRegSizeOption(
    "slp-min-reg-size", cl::init(128), cl::Hidden,
    cl::desc("Attempt to vectorize for this register size in bits"));
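
// Usage sketch (illustrative, not part of the pass): these flags can be set
// when running the pass through 'opt', for example
//   opt -slp-vectorizer -slp-threshold=-10 -slp-max-reg-size=256 input.ll
// A more negative threshold makes vectorization more aggressive, and the
// register-size flags bound the vector widths the cost model will consider.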

namespace {

// FIXME: Set this via cl::opt to allow overriding.
static const unsigned RecursionMaxDepth = 12;

// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.
static const unsigned AliasedCheckLimit = 10;

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.
static const unsigned MaxMemDepDistance = 160;

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.
static const int MinScheduleRegionSize = 16;

/// \brief Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}

/// \returns the parent basic block if all of the instructions in \p VL
/// are in the same block or null otherwise.
static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return nullptr;
  BasicBlock *BB = I0->getParent();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I)
      return nullptr;

    if (BB != I->getParent())
      return nullptr;
  }
  return BB;
}

/// \returns True if all of the values in \p VL are constants.
static bool allConstant(ArrayRef<Value *> VL) {
  for (unsigned i = 0, e = VL.size(); i < e; ++i)
    if (!isa<Constant>(VL[i]))
      return false;
  return true;
}

/// \returns True if all of the values in \p VL are identical.
static bool isSplat(ArrayRef<Value *> VL) {
  for (unsigned i = 1, e = VL.size(); i < e; ++i)
    if (VL[i] != VL[0])
      return false;
  return true;
}

/// \returns the opcode that can be clubbed with \p Op to create an alternate
/// sequence which can later be merged as a ShuffleVector instruction.
static unsigned getAltOpcode(unsigned Op) {
  switch (Op) {
  case Instruction::FAdd:
    return Instruction::FSub;
  case Instruction::FSub:
    return Instruction::FAdd;
  case Instruction::Add:
    return Instruction::Sub;
  case Instruction::Sub:
    return Instruction::Add;
  default:
    return 0;
  }
}

/// \returns true if Opcode \p Op can be part of an alternate sequence which
/// can later be merged as a ShuffleVector instruction.
static bool canCombineAsAltInst(unsigned Op) {
  return Op == Instruction::FAdd || Op == Instruction::FSub ||
         Op == Instruction::Sub || Op == Instruction::Add;
}

/// \returns ShuffleVector if the instructions in \p VL form an alternating
/// fadd,fsub / fsub,fadd / add,sub / sub,add sequence (i.e. opcodes of the
/// form fadd,fsub,fadd,fsub,...), or 0 otherwise.
static unsigned isAltInst(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  unsigned Opcode = I0->getOpcode();
  unsigned AltOpcode = getAltOpcode(Opcode);
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode))
      return 0;
  }
  return Instruction::ShuffleVector;
}
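
// Illustrative example (value names are made up): a bundle such as
//   %x0 = fadd float %a0, %b0
//   %x1 = fsub float %a1, %b1
//   %x2 = fadd float %a2, %b2
//   %x3 = fsub float %a3, %b3
// alternates fadd/fsub, so isAltInst() reports ShuffleVector: the bundle can
// be emitted as one vector fadd plus one vector fsub whose lanes are then
// blended together with a shufflevector.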

/// \returns The opcode if all of the Instructions in \p VL have the same
/// opcode, or zero.
static unsigned getSameOpcode(ArrayRef<Value *> VL) {
  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
  if (!I0)
    return 0;
  unsigned Opcode = I0->getOpcode();
  for (int i = 1, e = VL.size(); i < e; i++) {
    Instruction *I = dyn_cast<Instruction>(VL[i]);
    if (!I || Opcode != I->getOpcode()) {
      if (canCombineAsAltInst(Opcode) && i == 1)
        return isAltInst(VL);
      return 0;
    }
  }
  return Opcode;
}

/// Get the intersection (logical and) of all of the potential IR flags
/// of each scalar operation (VL) that will be converted into a vector (I).
/// Flag set: NSW, NUW, exact, and all of fast-math.
static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
  if (auto *VecOp = dyn_cast<BinaryOperator>(I)) {
    if (auto *Intersection = dyn_cast<BinaryOperator>(VL[0])) {
      // Intersection is initialized to the 0th scalar,
      // so start counting from index '1'.
      for (int i = 1, e = VL.size(); i < e; ++i) {
        if (auto *Scalar = dyn_cast<BinaryOperator>(VL[i]))
          Intersection->andIRFlags(Scalar);
      }
      VecOp->copyIRFlags(Intersection);
    }
  }
}
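
// For instance (illustrative): if the scalar operations in the bundle are
//   %s0 = add nuw nsw i32 %a0, %b0
//   %s1 = add nsw i32 %a1, %b1
// the intersection keeps only 'nsw', so the resulting vector add is tagged
// 'nsw' but not 'nuw'. Fast-math flags on FP operations are combined in the
// same conservative way.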

/// \returns \p I after propagating metadata from \p VL.
static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
  Instruction *I0 = cast<Instruction>(VL[0]);
  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
  I0->getAllMetadataOtherThanDebugLoc(Metadata);

  for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
    unsigned Kind = Metadata[i].first;
    MDNode *MD = Metadata[i].second;

    for (int i = 1, e = VL.size(); MD && i != e; i++) {
      Instruction *I = cast<Instruction>(VL[i]);
      MDNode *IMD = I->getMetadata(Kind);

      switch (Kind) {
      default:
        MD = nullptr; // Remove unknown metadata
        break;
      case LLVMContext::MD_tbaa:
        MD = MDNode::getMostGenericTBAA(MD, IMD);
        break;
      case LLVMContext::MD_alias_scope:
        MD = MDNode::getMostGenericAliasScope(MD, IMD);
        break;
      case LLVMContext::MD_noalias:
        MD = MDNode::intersect(MD, IMD);
        break;
      case LLVMContext::MD_fpmath:
        MD = MDNode::getMostGenericFPMath(MD, IMD);
        break;
      case LLVMContext::MD_nontemporal:
        MD = MDNode::intersect(MD, IMD);
        break;
      }
    }
    I->setMetadata(Kind, MD);
  }
  return I;
}
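
// Rough example: if one scalar load in the bundle carries a !tbaa tag for
// 'float' and another carries a different !tbaa tag, the vector load gets
// their most generic common TBAA node (possibly none). Metadata kinds not
// listed in the switch above are dropped, because we cannot prove they hold
// for every lane of the new vector instruction.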

/// \returns The type that all of the values in \p VL have or null if there
/// are different types.
static Type* getSameType(ArrayRef<Value *> VL) {
  Type *Ty = VL[0]->getType();
  for (int i = 1, e = VL.size(); i < e; i++)
    if (VL[i]->getType() != Ty)
      return nullptr;

  return Ty;
}

/// \returns True if the ExtractElement instructions in VL can be vectorized
/// to use the original vector.
static bool CanReuseExtract(ArrayRef<Value *> VL) {
  assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *VL0 = VL[0];
  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
  Value *Vec = E0->getOperand(0);

  // We have to extract from the same vector type.
  unsigned NElts = Vec->getType()->getVectorNumElements();

  if (NElts != VL.size())
    return false;

  // Check that all of the indices extract from the correct offset.
  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
  if (!CI || CI->getZExtValue())
    return false;

  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
    ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));

    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
      return false;
  }

  return true;
}
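
// Illustrative example: the bundle
//   %e0 = extractelement <4 x float> %v, i32 0
//   %e1 = extractelement <4 x float> %v, i32 1
//   %e2 = extractelement <4 x float> %v, i32 2
//   %e3 = extractelement <4 x float> %v, i32 3
// extracts every lane of %v in order, so CanReuseExtract() returns true and
// %v itself can be reused instead of gathering a new vector.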

/// \returns True if in-tree use also needs extract. This refers to
/// possible scalar operand in vectorized instruction.
static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
                                    TargetLibraryInfo *TLI) {

  unsigned Opcode = UserInst->getOpcode();
  switch (Opcode) {
  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);
    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
    if (hasVectorInstrinsicScalarOpd(ID, 1)) {
      return (CI->getArgOperand(1) == Scalar);
    }
  }
  default:
    return false;
  }
}

/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I, AliasAnalysis *AA) {
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return MemoryLocation::get(SI);
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return MemoryLocation::get(LI);
  return MemoryLocation();
}

/// \returns True if the instruction is not a volatile or atomic load/store.
static bool isSimple(Instruction *I) {
  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
  return true;
}

/// Bottom Up SLP Vectorizer.
class BoUpSLP {
public:
  typedef SmallVector<Value *, 8> ValueList;
  typedef SmallVector<Instruction *, 16> InstrList;
  typedef SmallPtrSet<Value *, 16> ValueSet;
  typedef SmallVector<StoreInst *, 8> StoreList;

  BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
          TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
          DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
          const DataLayout *DL)
      : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
        SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
        DL(DL), Builder(Se->getContext()) {
    CodeMetrics::collectEphemeralValues(F, AC, EphValues);
  }

  /// \brief Vectorize the tree that starts with the elements in \p VL.
  /// Returns the vectorized root.
  Value *vectorizeTree();

  /// \returns the cost incurred by unwanted spills and fills, caused by
  /// holding live values over call sites.
  int getSpillCost();

  /// \returns the vectorization cost of the subtree that starts at \p VL.
  /// A negative number means that this is profitable.
  int getTreeCost();

  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
  /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
  void buildTree(ArrayRef<Value *> Roots,
                 ArrayRef<Value *> UserIgnoreLst = None);

  /// Clear the internal data structures that are created by 'buildTree'.
  void deleteTree() {
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MustGather.clear();
    ExternalUses.clear();
    NumLoadsWantToKeepOrder = 0;
    NumLoadsWantToChangeOrder = 0;
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
      BS->clear();
    }
    MinBWs.clear();
  }

  /// \brief Perform LICM and CSE on the newly generated gather sequences.
  void optimizeGatherSequence();

  /// \returns true if it is beneficial to reverse the vector order.
  bool shouldReorder() const {
    return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
  }

  /// \return The vector element size in bits to use when vectorizing the
  /// expression tree ending at \p V. If V is a store, the size is the width of
  /// the stored value. Otherwise, the size is the width of the largest loaded
  /// value reaching V. This method is used by the vectorizer to calculate
  /// vectorization factors.
  unsigned getVectorElementSize(Value *V);

  /// Compute the minimum type sizes required to represent the entries in a
  /// vectorizable tree.
  void computeMinimumValueSizes();
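
  // Rough usage sketch (the real driver lives further down in this file; the
  // local names here are illustrative only):
  //
  //   BoUpSLP R(F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
  //   R.buildTree(Bundle);              // Bundle: e.g. a group of stores.
  //   R.computeMinimumValueSizes();
  //   int Cost = R.getTreeCost();
  //   if (Cost < -SLPCostThreshold)
  //     R.vectorizeTree();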

private:
  struct TreeEntry;

  /// \returns the cost of the vectorizable entry.
  int getEntryCost(TreeEntry *E);

  /// This is the recursive part of buildTree.
  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);

  /// Vectorize a single entry in the tree.
  Value *vectorizeTree(TreeEntry *E);

  /// Vectorize a single entry in the tree, starting in \p VL.
  Value *vectorizeTree(ArrayRef<Value *> VL);

  /// \returns the pointer to the vectorized value if \p VL is already
  /// vectorized, or NULL. This may happen in cycles.
  Value *alreadyVectorized(ArrayRef<Value *> VL) const;

  /// \returns the scalarization cost for this type. Scalarization in this
  /// context means the creation of vectors from a group of scalars.
  int getGatherCost(Type *Ty);

  /// \returns the scalarization cost for this list of values. Assuming that
  /// this subtree gets vectorized, we may need to extract the values from the
  /// roots. This method calculates the cost of extracting the values.
  int getGatherCost(ArrayRef<Value *> VL);

  /// \brief Set the Builder insert point to one after the last instruction in
  /// the bundle.
  void setInsertPointAfterBundle(ArrayRef<Value *> VL);

  /// \returns a vector from a collection of scalars in \p VL.
  Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);

  /// \returns whether the VectorizableTree is fully vectorizable and will
  /// be beneficial even if the tree height is tiny.
  bool isFullyVectorizableTinyTree();

  /// \brief Reorder commutative operands in an alt shuffle if doing so
  /// results in vectorized code.
  void reorderAltShuffleOperands(ArrayRef<Value *> VL,
                                 SmallVectorImpl<Value *> &Left,
                                 SmallVectorImpl<Value *> &Right);

  /// \brief Reorder commutative operands to get a better probability of
  /// generating vectorized code.
  void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
                                      SmallVectorImpl<Value *> &Left,
                                      SmallVectorImpl<Value *> &Right);
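
  // Illustrative example: for the two scalar adds
  //   %x = add i32 %a, %b
  //   %y = add i32 %c, %a
  // swapping the operands of %y to (%a, %c) lets both lanes see %a in the
  // same operand position, which turns one vector operand into a splat of %a
  // and makes the bundle cheaper to build.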

  struct TreeEntry {
    TreeEntry() : Scalars(), VectorizedValue(nullptr),
                  NeedToGather(0) {}

    /// \returns true if the scalars in VL are equal to this entry.
    bool isSame(ArrayRef<Value *> VL) const {
      assert(VL.size() == Scalars.size() && "Invalid size");
      return std::equal(VL.begin(), VL.end(), Scalars.begin());
    }

    /// A vector of scalars.
    ValueList Scalars;

    /// The Scalars are vectorized into this value. It is initialized to Null.
    Value *VectorizedValue;

    /// Do we need to gather this sequence?
    bool NeedToGather;
  };

  /// Create a new VectorizableTree entry.
  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
    VectorizableTree.emplace_back();
    int idx = VectorizableTree.size() - 1;
    TreeEntry *Last = &VectorizableTree[idx];
    Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
    Last->NeedToGather = !Vectorized;
    if (Vectorized) {
      for (int i = 0, e = VL.size(); i != e; ++i) {
        assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
        ScalarToTreeEntry[VL[i]] = idx;
      }
    } else {
      MustGather.insert(VL.begin(), VL.end());
    }
    return Last;
  }

  /// -- Vectorization State --

  /// Holds all of the tree entries.
  std::vector<TreeEntry> VectorizableTree;

  /// Maps a specific scalar to its tree entry.
  SmallDenseMap<Value*, int> ScalarToTreeEntry;

  /// A list of scalars that we found that we need to keep as scalars.
  ValueSet MustGather;

  /// This POD struct describes one external user in the vectorized tree.
  struct ExternalUser {
    ExternalUser(Value *S, llvm::User *U, int L)
        : Scalar(S), User(U), Lane(L) {}
    // The scalar in our function.
    Value *Scalar;
    // The user that uses the scalar.
    llvm::User *User;
    // Which lane does the scalar belong to.
    int Lane;
  };
  typedef SmallVector<ExternalUser, 16> UserList;

  /// Checks if two instructions may access the same memory.
  ///
  /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
  /// is invariant in the calling loop.
  bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
                 Instruction *Inst2) {

    // First check if the result is already in the cache.
    AliasCacheKey key = std::make_pair(Inst1, Inst2);
    Optional<bool> &result = AliasCache[key];
    if (result.hasValue()) {
      return result.getValue();
    }
    MemoryLocation Loc2 = getLocation(Inst2, AA);
    bool aliased = true;
    if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
      // Do the alias check.
      aliased = AA->alias(Loc1, Loc2);
    }
    // Store the result in the cache.
    result = aliased;
    return aliased;
  }

  typedef std::pair<Instruction *, Instruction *> AliasCacheKey;

  /// Cache for alias results.
  /// TODO: consider moving this to the AliasAnalysis itself.
  DenseMap<AliasCacheKey, Optional<bool>> AliasCache;

  /// Removes an instruction from its block and eventually deletes it.
  /// It's like Instruction::eraseFromParent() except that the actual deletion
  /// is delayed until BoUpSLP is destructed.
  /// This is required to ensure that there are no incorrect collisions in the
  /// AliasCache, which can happen if a new instruction is allocated at the
  /// same address as a previously deleted instruction.
  void eraseInstruction(Instruction *I) {
    I->removeFromParent();
    I->dropAllReferences();
    DeletedInstructions.push_back(std::unique_ptr<Instruction>(I));
  }

  /// Temporary store for deleted instructions. Instructions will be deleted
  /// eventually when the BoUpSLP is destructed.
  SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;

  /// A list of values that need to be extracted out of the tree.
  /// This list holds pairs of (Internal Scalar : External User).
  UserList ExternalUses;

  /// Values used only by @llvm.assume calls.
  SmallPtrSet<const Value *, 32> EphValues;

  /// Holds all of the instructions that we gathered.
  SetVector<Instruction *> GatherSeq;

  /// A list of blocks that we are going to CSE.
  SetVector<BasicBlock *> CSEBlocks;

  /// Contains all scheduling relevant data for an instruction.
  /// A ScheduleData either represents a single instruction or a member of an
  /// instruction bundle (= a group of instructions which is combined into a
  /// vector instruction).
  struct ScheduleData {

    // The initial value for the dependency counters. It means that the
    // dependencies are not calculated yet.
    enum { InvalidDeps = -1 };

    ScheduleData()
        : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr),
          NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0),
          Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps),
          UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {}

    void init(int BlockSchedulingRegionID) {
      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      UnscheduledDepsInBundle = UnscheduledDeps;
      clearDependencies();
    }

    /// Returns true if the dependency information has been calculated.
    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    /// Returns true for single instructions and for bundle representatives
    /// (= the head of a bundle).
    bool isSchedulingEntity() const { return FirstInBundle == this; }

    /// Returns true if it represents an instruction bundle and not only a
    /// single instruction.
    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this;
    }

    /// Returns true if it is ready for scheduling, i.e. it has no more
    /// unscheduled depending instructions/bundles.
    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return UnscheduledDepsInBundle == 0 && !IsScheduled;
    }

    /// Modifies the number of unscheduled dependencies, also updating it for
    /// the whole bundle.
    int incrementUnscheduledDeps(int Incr) {
      UnscheduledDeps += Incr;
      return FirstInBundle->UnscheduledDepsInBundle += Incr;
    }

    /// Sets the number of unscheduled dependencies to the number of
    /// dependencies.
    void resetUnscheduledDeps() {
      incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
    }

    /// Clears all dependency information.
    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
    }

    void dump(raw_ostream &os) const {
      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        os << '[' << *Inst;
        ScheduleData *SD = NextInBundle;
        while (SD) {
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
        }
        os << ']';
      } else {
        os << *Inst;
      }
    }

    Instruction *Inst;

    /// Points to the head in an instruction bundle (and always to this for
    /// single instructions).
    ScheduleData *FirstInBundle;

    /// Single linked list of all instructions in a bundle. Null if it is a
    /// single instruction.
    ScheduleData *NextInBundle;

    /// Single linked list of all memory instructions (e.g. load, store, call)
    /// in the block - until the end of the scheduling region.
    ScheduleData *NextLoadStore;

    /// The dependent memory instructions.
    /// This list is derived on demand in calculateDependencies().
    SmallVector<ScheduleData *, 4> MemoryDependencies;

    /// This ScheduleData is in the current scheduling region if this matches
    /// the current SchedulingRegionID of BlockScheduling.
    int SchedulingRegionID;

    /// Used for getting a "good" final ordering of instructions.
    int SchedulingPriority;

    /// The number of dependencies. This is the number of users of the
    /// instruction plus the number of dependent memory instructions (if any).
    /// This value is calculated on demand.
    /// If InvalidDeps, the number of dependencies is not calculated yet.
    int Dependencies;

    /// The number of dependencies minus the number of dependencies of scheduled
    /// instructions. As soon as this is zero, the instruction/bundle gets ready
    /// for scheduling.
    /// Note that this is negative as long as Dependencies is not calculated.
    int UnscheduledDeps;

    /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
    /// single instructions.
    int UnscheduledDepsInBundle;

    /// True if this instruction is scheduled (or considered as scheduled in the
    /// dry-run).
    bool IsScheduled;
  };
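
  // Illustrative example: when two stores {store %a, store %b} are grouped
  // into one bundle, both ScheduleData nodes point to the same FirstInBundle
  // head; isReady() then fires for the head only once UnscheduledDepsInBundle
  // has dropped to zero, i.e. once no unscheduled dependency of either member
  // remains.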

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &os,
                                 const BoUpSLP::ScheduleData &SD);
#endif

  /// Contains all scheduling data for a basic block.
  ///
  struct BlockScheduling {

    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize),
          ScheduleStart(nullptr), ScheduleEnd(nullptr),
          FirstLoadStoreInRegion(nullptr), LastLoadStoreInRegion(nullptr),
          ScheduleRegionSize(0),
          ScheduleRegionSizeLimit(ScheduleRegionSizeBudget),
          // Make sure that the initial SchedulingRegionID is greater than the
          // initial SchedulingRegionID in ScheduleData (which is 0).
          SchedulingRegionID(1) {}

    void clear() {
      ReadyInsts.clear();
      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;

      // Reduce the maximum schedule region size by the size of the
      // previous scheduling run.
      ScheduleRegionSizeLimit -= ScheduleRegionSize;
      if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
        ScheduleRegionSizeLimit = MinScheduleRegionSize;
      ScheduleRegionSize = 0;

      // Make a new scheduling region, i.e. all existing ScheduleData is not
      // in the new region yet.
      ++SchedulingRegionID;
    }

    ScheduleData *getScheduleData(Value *V) {
      ScheduleData *SD = ScheduleDataMap[V];
      if (SD && SD->SchedulingRegionID == SchedulingRegionID)
        return SD;
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }

    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;
      DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");

      ScheduleData *BundleMember = SD;
      while (BundleMember) {
        // Handle the def-use chain dependencies.
        for (Use &U : BundleMember->Inst->operands()) {
          ScheduleData *OpDef = getScheduleData(U.get());
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n");
          }
        }
        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n");
          }
        }
        BundleMember = BundleMember->NextInBundle;
      }
    }

    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD->isSchedulingEntity() && SD->isReady()) {
          ReadyList.insert(SD);
          DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n");
        }
      }
    }

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP);

    /// Un-bundles a group of instructions.
    void cancelScheduling(ArrayRef<Value *> VL);

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions/
    /// bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();

    BasicBlock *BB;

    /// Simple memory allocation for ScheduleData.
    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to Instruction.
    /// Note that the mapping survives during all vectorization iterations, i.e.
    /// ScheduleData structures are recycled.
    DenseMap<Value *, ScheduleData *> ScheduleDataMap;

    struct ReadyList : SmallVector<ScheduleData *, 8> {
      void insert(ScheduleData *SD) { push_back(SD); }
    };

    /// The ready-list for scheduling (only used for the dry-run).
    ReadyList ReadyInsts;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion;

    /// The current size of the scheduling region.
    int ScheduleRegionSize;

    /// The maximum size allowed for the scheduling region.
    int ScheduleRegionSizeLimit;

    /// The ID of the scheduling region. For a new vectorization iteration this
    /// is incremented which "removes" all ScheduleData from the region.
    int SchedulingRegionID;
  };

  /// Attaches the BlockScheduling structures to basic blocks.
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need extracting.
  ArrayRef<Value *> UserIgnoreList;

  // Number of load-bundles, which contain consecutive loads.
  int NumLoadsWantToKeepOrder;

  // Number of load-bundles of size 2, which are consecutive loads if reversed.
  int NumLoadsWantToChangeOrder;

  // Analysis and block reference.
  Function *F;
  ScalarEvolution *SE;
  TargetTransformInfo *TTI;
  TargetLibraryInfo *TLI;
  AliasAnalysis *AA;
  LoopInfo *LI;
  DominatorTree *DT;
  AssumptionCache *AC;
  DemandedBits *DB;
  const DataLayout *DL;

  /// Instruction builder to construct the vectorized tree.
  IRBuilder<> Builder;

  /// A map of scalar integer values to the smallest bit width with which they
  /// can legally be represented.
  MapVector<Value *, uint64_t> MinBWs;
};

#ifndef NDEBUG
raw_ostream &operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD) {
  SD.dump(os);
  return os;
}
#endif

void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                        ArrayRef<Value *> UserIgnoreLst) {
  deleteTree();
  UserIgnoreList = UserIgnoreLst;
  if (!getSameType(Roots))
    return;
  buildTree_rec(Roots, 0);

  // Collect the values that we need to extract from the tree.
  for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
    TreeEntry *Entry = &VectorizableTree[EIdx];

    // For each lane:
    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
      Value *Scalar = Entry->Scalars[Lane];

      // No need to handle users of gathered values.
      if (Entry->NeedToGather)
        continue;

      for (User *U : Scalar->users()) {
        DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");

        Instruction *UserInst = dyn_cast<Instruction>(U);
        if (!UserInst)
          continue;

        // Skip in-tree scalars that become vectors
        if (ScalarToTreeEntry.count(U)) {
          int Idx = ScalarToTreeEntry[U];
          TreeEntry *UseEntry = &VectorizableTree[Idx];
          Value *UseScalar = UseEntry->Scalars[0];
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in Lane 0 will
          // be used.
          if (UseScalar != U ||
              !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
            DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
                         << ".\n");
            assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
            continue;
          }
        }

        // Ignore users in the user ignore list.
        if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) !=
            UserIgnoreList.end())
          continue;

        DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " <<
              Lane << " from " << *Scalar << ".\n");
        ExternalUses.push_back(ExternalUser(Scalar, U, Lane));
      }
    }
  }
}
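
// Example of an external use (illustrative): if %sum is lane 0 of a
// vectorized bundle but is also read by a scalar 'ret i32 %sum' outside the
// tree, buildTree() records ExternalUser(%sum, ret, 0) so that the vectorizer
// can later extract lane 0 from the vector for that scalar user.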
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
|
|
|
|
bool SameTy = getSameType(VL); (void)SameTy;
|
2014-06-20 04:32:48 +00:00
|
|
|
bool isAltShuffle = false;
|
2013-07-07 06:57:07 +00:00
|
|
|
assert(SameTy && "Invalid types!");
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
if (Depth == RecursionMaxDepth) {
|
|
|
|
DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
|
|
|
|
newTreeEntry(VL, false);
|
|
|
|
return;
|
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
// Don't handle vectors.
|
|
|
|
if (VL[0]->getType()->isVectorTy()) {
|
|
|
|
DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
|
|
|
|
newTreeEntry(VL, false);
|
|
|
|
return;
|
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
|
|
|
|
if (SI->getValueOperand()->getType()->isVectorTy()) {
|
|
|
|
DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
|
|
|
|
newTreeEntry(VL, false);
|
|
|
|
return;
|
|
|
|
}
|
2014-06-20 04:32:48 +00:00
|
|
|
unsigned Opcode = getSameOpcode(VL);
|
|
|
|
|
|
|
|
// Check that this shuffle vector refers to the alternate
|
|
|
|
// sequence of opcodes.
|
|
|
|
if (Opcode == Instruction::ShuffleVector) {
|
|
|
|
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
|
|
|
|
unsigned Op = I0->getOpcode();
|
|
|
|
if (Op != Instruction::ShuffleVector)
|
|
|
|
isAltShuffle = true;
|
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
// If all of the operands are identical or constant we have a simple solution.
|
2014-06-20 04:32:48 +00:00
|
|
|
if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) {
|
2013-07-07 06:57:07 +00:00
|
|
|
DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
|
|
|
|
newTreeEntry(VL, false);
|
|
|
|
return;
|
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
// We now know that this is a vector of instructions of the same type from
|
|
|
|
// the same block.
|
|
|
|
|
2014-10-15 17:35:01 +00:00
|
|
|
// Don't vectorize ephemeral values.
|
|
|
|
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
|
|
|
|
if (EphValues.count(VL[i])) {
|
|
|
|
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
|
|
|
|
") is ephemeral.\n");
|
|
|
|
newTreeEntry(VL, false);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
// Check if this is a duplicate of another entry.
|
|
|
|
if (ScalarToTreeEntry.count(VL[0])) {
|
|
|
|
int Idx = ScalarToTreeEntry[VL[0]];
|
|
|
|
TreeEntry *E = &VectorizableTree[Idx];
|
|
|
|
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
|
|
|
|
DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
|
|
|
|
if (E->Scalars[i] != VL[i]) {
|
|
|
|
DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
|
|
|
|
newTreeEntry(VL, false);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
|
|
|
|
return;
|
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
// Check that none of the instructions in the bundle are already in the tree.
|
|
|
|
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
|
|
|
|
if (ScalarToTreeEntry.count(VL[i])) {
|
|
|
|
DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
|
|
|
|
") is already in tree.\n");
|
|
|
|
newTreeEntry(VL, false);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2015-01-09 22:15:06 +00:00
|
|
|
// If any of the scalars is marked as a value that needs to stay scalar then
|
|
|
|
// we need to gather the scalars.
|
2013-07-07 06:57:07 +00:00
|
|
|
for (unsigned i = 0, e = VL.size(); i != e; ++i) {
|
2015-01-09 20:36:19 +00:00
|
|
|
if (MustGather.count(VL[i])) {
|
|
|
|
DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
|
2013-07-07 06:57:07 +00:00
|
|
|
newTreeEntry(VL, false);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
// Check that all of the users of the scalars that we want to vectorize are
|
|
|
|
// schedulable.
|
|
|
|
Instruction *VL0 = cast<Instruction>(VL[0]);
|
|
|
|
BasicBlock *BB = cast<Instruction>(VL0)->getParent();
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2014-08-22 01:18:39 +00:00
|
|
|
if (!DT->isReachableFromEntry(BB)) {
|
|
|
|
// Don't go into unreachable blocks. They may contain instructions with
|
|
|
|
// dependency cycles which confuse the final scheduling.
|
|
|
|
DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
|
|
|
|
newTreeEntry(VL, false);
|
|
|
|
return;
|
|
|
|
}
|
2016-01-13 07:03:42 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
// Check that every instruction appears only once in this bundle.
|
2013-06-22 21:34:10 +00:00
|
|
|
for (unsigned i = 0, e = VL.size(); i < e; ++i)
|
2013-07-07 06:57:07 +00:00
|
|
|
for (unsigned j = i+1; j < e; ++j)
|
|
|
|
if (VL[i] == VL[j]) {
|
|
|
|
DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
|
|
|
|
newTreeEntry(VL, false);
|
|
|
|
return;
|
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2014-08-01 09:20:42 +00:00
|
|
|
auto &BSRef = BlocksSchedules[BB];
|
|
|
|
if (!BSRef) {
|
|
|
|
BSRef = llvm::make_unique<BlockScheduling>(BB);
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
2014-08-01 09:20:42 +00:00
|
|
|
BlockScheduling &BS = *BSRef.get();
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2015-01-14 11:24:47 +00:00
|
|
|
if (!BS.tryScheduleBundle(VL, this)) {
|
2014-08-01 09:20:42 +00:00
|
|
|
DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
|
2015-09-30 17:00:44 +00:00
|
|
|
assert((!BS.getScheduleData(VL[0]) ||
|
|
|
|
!BS.getScheduleData(VL[0])->isPartOfBundle()) &&
|
|
|
|
"tryScheduleBundle should cancelScheduling on failure");
|
2014-08-01 09:20:42 +00:00
|
|
|
newTreeEntry(VL, false);
|
|
|
|
return;
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
2014-08-01 09:20:42 +00:00
|
|
|
DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
switch (Opcode) {
|
|
|
|
case Instruction::PHI: {
|
|
|
|
PHINode *PH = dyn_cast<PHINode>(VL0);
|
2013-09-17 17:03:29 +00:00
|
|
|
|
|
|
|
// Check for terminator values (e.g. invoke).
|
|
|
|
for (unsigned j = 0; j < VL.size(); ++j)
|
|
|
|
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
|
2014-02-17 03:06:16 +00:00
|
|
|
TerminatorInst *Term = dyn_cast<TerminatorInst>(
|
|
|
|
cast<PHINode>(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i)));
|
2013-09-17 17:03:29 +00:00
|
|
|
if (Term) {
|
|
|
|
DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
|
2014-08-01 09:20:42 +00:00
|
|
|
BS.cancelScheduling(VL);
|
2013-09-17 17:03:29 +00:00
|
|
|
newTreeEntry(VL, false);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
newTreeEntry(VL, true);
|
|
|
|
DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
|
|
|
|
|
|
|
|
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
|
|
|
|
ValueList Operands;
|
|
|
|
// Prepare the operand vector.
|
|
|
|
for (unsigned j = 0; j < VL.size(); ++j)
|
2014-02-17 03:06:16 +00:00
|
|
|
Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
|
|
|
|
PH->getIncomingBlock(i)));
|
2013-07-07 06:57:07 +00:00
|
|
|
|
|
|
|
buildTree_rec(Operands, Depth + 1);
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
case Instruction::ExtractElement: {
|
|
|
|
bool Reuse = CanReuseExtract(VL);
|
|
|
|
if (Reuse) {
|
|
|
|
DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
|
2014-08-01 09:20:42 +00:00
|
|
|
} else {
|
|
|
|
BS.cancelScheduling(VL);
|
2013-07-07 06:57:07 +00:00
|
|
|
}
|
|
|
|
newTreeEntry(VL, Reuse);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
case Instruction::Load: {
|
2015-09-30 21:05:43 +00:00
|
|
|
// Check that a vectorized load would load the same memory as a scalar
|
|
|
|
// load.
|
|
|
|
// For example, we don't want to vectorize loads that are smaller than 8 bits.
|
|
|
|
// Even though we have a packed struct {<i2, i2, i2, i2>} LLVM treats
|
|
|
|
// loading/storing it as an i8 struct. If we vectorize loads/stores from
|
|
|
|
// such a struct, we read/write packed bits, disagreeing with the
|
|
|
|
// unvectorized version.
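// Illustrative example: for i2, DL->getTypeSizeInBits returns 2 while
// DL->getTypeAllocSizeInBits returns 8, so a bundle of i2 loads from such a
// packed struct is gathered by the check below rather than widened into a
// vector load whose packed layout would differ from the scalar code.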
|
|
|
|
Type *ScalarTy = VL[0]->getType();
|
|
|
|
|
2016-03-16 19:48:42 +00:00
|
|
|
if (DL->getTypeSizeInBits(ScalarTy) !=
|
|
|
|
DL->getTypeAllocSizeInBits(ScalarTy)) {
|
2015-09-30 21:05:43 +00:00
|
|
|
BS.cancelScheduling(VL);
|
|
|
|
newTreeEntry(VL, false);
|
|
|
|
DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
|
|
|
|
return;
|
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
// Check if the loads are consecutive or if we need to swizzle them.
|
2013-10-16 17:52:40 +00:00
|
|
|
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
|
|
|
|
LoadInst *L = cast<LoadInst>(VL[i]);
|
2014-08-01 08:05:55 +00:00
|
|
|
if (!L->isSimple()) {
|
2014-08-01 09:20:42 +00:00
|
|
|
BS.cancelScheduling(VL);
|
2013-07-07 06:57:07 +00:00
|
|
|
newTreeEntry(VL, false);
|
2014-08-01 08:05:55 +00:00
|
|
|
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
|
|
|
|
return;
|
|
|
|
}
|
2015-09-30 21:05:43 +00:00
|
|
|
|
2016-03-16 19:48:42 +00:00
|
|
|
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
|
|
|
|
if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], *DL, *SE)) {
|
2014-08-01 08:05:55 +00:00
|
|
|
++NumLoadsWantToChangeOrder;
|
|
|
|
}
|
2014-08-01 09:20:42 +00:00
|
|
|
BS.cancelScheduling(VL);
|
2014-08-01 08:05:55 +00:00
|
|
|
newTreeEntry(VL, false);
|
|
|
|
DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
|
2013-07-07 06:57:07 +00:00
|
|
|
return;
|
|
|
|
}
|
2013-10-16 17:52:40 +00:00
|
|
|
}
|
2014-08-01 08:05:55 +00:00
|
|
|
++NumLoadsWantToKeepOrder;
|
2013-07-07 06:57:07 +00:00
|
|
|
newTreeEntry(VL, true);
|
|
|
|
DEBUG(dbgs() << "SLP: added a vector of loads.\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
case Instruction::ZExt:
|
|
|
|
case Instruction::SExt:
|
|
|
|
case Instruction::FPToUI:
|
|
|
|
case Instruction::FPToSI:
|
|
|
|
case Instruction::FPExt:
|
|
|
|
case Instruction::PtrToInt:
|
|
|
|
case Instruction::IntToPtr:
|
|
|
|
case Instruction::SIToFP:
|
|
|
|
case Instruction::UIToFP:
|
|
|
|
case Instruction::Trunc:
|
|
|
|
case Instruction::FPTrunc:
|
|
|
|
case Instruction::BitCast: {
|
|
|
|
Type *SrcTy = VL0->getOperand(0)->getType();
|
|
|
|
for (unsigned i = 0; i < VL.size(); ++i) {
|
|
|
|
Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
|
2015-02-12 02:30:56 +00:00
|
|
|
if (Ty != SrcTy || !isValidElementType(Ty)) {
|
2014-08-01 09:20:42 +00:00
|
|
|
BS.cancelScheduling(VL);
|
2013-07-07 06:57:07 +00:00
|
|
|
newTreeEntry(VL, false);
|
|
|
|
DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
newTreeEntry(VL, true);
|
|
|
|
DEBUG(dbgs() << "SLP: added a vector of casts.\n");
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
|
|
|
|
ValueList Operands;
|
|
|
|
// Prepare the operand vector.
|
|
|
|
for (unsigned j = 0; j < VL.size(); ++j)
|
|
|
|
Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
buildTree_rec(Operands, Depth+1);
|
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
return;
|
2013-07-07 06:57:07 +00:00
|
|
|
}
|
|
|
|
case Instruction::ICmp:
|
|
|
|
case Instruction::FCmp: {
|
|
|
|
// Check that all of the compares have the same predicate.
|
2015-04-10 11:24:51 +00:00
|
|
|
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
|
2013-07-15 22:52:48 +00:00
|
|
|
Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
|
2013-07-07 06:57:07 +00:00
|
|
|
for (unsigned i = 1, e = VL.size(); i < e; ++i) {
|
|
|
|
CmpInst *Cmp = cast<CmpInst>(VL[i]);
|
2013-07-15 22:52:48 +00:00
|
|
|
if (Cmp->getPredicate() != P0 ||
|
|
|
|
Cmp->getOperand(0)->getType() != ComparedTy) {
|
2014-08-01 09:20:42 +00:00
|
|
|
BS.cancelScheduling(VL);
|
2013-07-07 06:57:07 +00:00
|
|
|
newTreeEntry(VL, false);
|
|
|
|
DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
newTreeEntry(VL, true);
|
|
|
|
DEBUG(dbgs() << "SLP: added a vector of compares.\n");
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
|
|
|
|
ValueList Operands;
|
|
|
|
// Prepare the operand vector.
|
|
|
|
for (unsigned j = 0; j < VL.size(); ++j)
|
|
|
|
Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
buildTree_rec(Operands, Depth+1);
|
2013-06-25 23:04:09 +00:00
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
return;
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
case Instruction::Select:
|
|
|
|
case Instruction::Add:
|
|
|
|
case Instruction::FAdd:
|
|
|
|
case Instruction::Sub:
|
|
|
|
case Instruction::FSub:
|
|
|
|
case Instruction::Mul:
|
|
|
|
case Instruction::FMul:
|
|
|
|
case Instruction::UDiv:
|
|
|
|
case Instruction::SDiv:
|
|
|
|
case Instruction::FDiv:
|
|
|
|
case Instruction::URem:
|
|
|
|
case Instruction::SRem:
|
|
|
|
case Instruction::FRem:
|
|
|
|
case Instruction::Shl:
|
|
|
|
case Instruction::LShr:
|
|
|
|
case Instruction::AShr:
|
|
|
|
case Instruction::And:
|
|
|
|
case Instruction::Or:
|
|
|
|
case Instruction::Xor: {
|
|
|
|
newTreeEntry(VL, true);
|
|
|
|
DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
|
|
|
|
|
2013-10-04 20:39:16 +00:00
|
|
|
// Sort operands of the instructions so that each side is more likely to
|
|
|
|
// have the same opcode.
|
|
|
|
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
|
|
|
|
ValueList Left, Right;
|
|
|
|
reorderInputsAccordingToOpcode(VL, Left, Right);
|
2014-08-01 09:20:42 +00:00
|
|
|
buildTree_rec(Left, Depth + 1);
|
|
|
|
buildTree_rec(Right, Depth + 1);
|
2013-10-04 20:39:16 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
|
|
|
|
ValueList Operands;
|
|
|
|
// Prepare the operand vector.
|
|
|
|
for (unsigned j = 0; j < VL.size(); ++j)
|
|
|
|
Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
|
|
|
|
|
|
|
|
buildTree_rec(Operands, Depth+1);
|
|
|
|
}
|
|
|
|
return;
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
2014-08-27 15:01:18 +00:00
|
|
|
case Instruction::GetElementPtr: {
|
|
|
|
// We don't combine GEPs with complicated (nested) indexing.
|
|
|
|
for (unsigned j = 0; j < VL.size(); ++j) {
|
|
|
|
if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
|
|
|
|
DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
|
|
|
|
BS.cancelScheduling(VL);
|
|
|
|
newTreeEntry(VL, false);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// We can't combine several GEPs into one vector if they operate on
|
|
|
|
// different types.
|
|
|
|
Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
|
|
|
|
for (unsigned j = 0; j < VL.size(); ++j) {
|
|
|
|
Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
|
|
|
|
if (Ty0 != CurTy) {
|
|
|
|
DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
|
|
|
|
BS.cancelScheduling(VL);
|
|
|
|
newTreeEntry(VL, false);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// We don't combine GEPs with non-constant indexes.
|
|
|
|
for (unsigned j = 0; j < VL.size(); ++j) {
|
|
|
|
auto Op = cast<Instruction>(VL[j])->getOperand(1);
|
|
|
|
if (!isa<ConstantInt>(Op)) {
|
|
|
|
DEBUG(
|
|
|
|
dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
|
|
|
|
BS.cancelScheduling(VL);
|
|
|
|
newTreeEntry(VL, false);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
newTreeEntry(VL, true);
|
|
|
|
DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
|
|
|
|
for (unsigned i = 0, e = 2; i < e; ++i) {
|
|
|
|
ValueList Operands;
|
|
|
|
// Prepare the operand vector.
|
|
|
|
for (unsigned j = 0; j < VL.size(); ++j)
|
|
|
|
Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
|
|
|
|
|
|
|
|
buildTree_rec(Operands, Depth + 1);
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
case Instruction::Store: {
|
|
|
|
// Check if the stores are consecutive or if we need to swizzle them.
|
|
|
|
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
|
2016-03-16 19:48:42 +00:00
|
|
|
if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
|
2014-08-01 09:20:42 +00:00
|
|
|
BS.cancelScheduling(VL);
|
2013-07-07 06:57:07 +00:00
|
|
|
newTreeEntry(VL, false);
|
2013-12-05 05:44:44 +00:00
|
|
|
DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
|
2013-07-07 06:57:07 +00:00
|
|
|
return;
|
|
|
|
}
|
2013-06-25 23:04:09 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
newTreeEntry(VL, true);
|
|
|
|
DEBUG(dbgs() << "SLP: added a vector of stores.\n");
|
2013-06-25 23:04:09 +00:00
|
|
|
|
|
|
|
ValueList Operands;
|
|
|
|
for (unsigned j = 0; j < VL.size(); ++j)
|
2013-07-07 06:57:07 +00:00
|
|
|
Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
|
2013-06-25 23:04:09 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
buildTree_rec(Operands, Depth + 1);
|
2013-06-22 21:34:10 +00:00
|
|
|
return;
|
|
|
|
}
|
2014-03-12 20:21:50 +00:00
|
|
|
case Instruction::Call: {
|
|
|
|
// Check if the calls are all to the same vectorizable intrinsic.
|
2014-05-03 09:59:54 +00:00
|
|
|
CallInst *CI = cast<CallInst>(VL[0]);
|
|
|
|
// Check if this is an Intrinsic call or something that can be
|
|
|
|
// represented by an intrinsic call.
|
|
|
|
Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
|
2014-04-09 14:20:47 +00:00
|
|
|
if (!isTriviallyVectorizable(ID)) {
|
2014-08-01 09:20:42 +00:00
|
|
|
BS.cancelScheduling(VL);
|
2014-03-12 20:21:50 +00:00
|
|
|
newTreeEntry(VL, false);
|
|
|
|
DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
|
|
|
|
return;
|
|
|
|
}
|
2014-05-03 09:59:54 +00:00
|
|
|
Function *Int = CI->getCalledFunction();
|
2014-05-30 04:31:24 +00:00
|
|
|
Value *A1I = nullptr;
|
|
|
|
if (hasVectorInstrinsicScalarOpd(ID, 1))
|
|
|
|
A1I = CI->getArgOperand(1);
|
2014-03-12 20:21:50 +00:00
|
|
|
for (unsigned i = 1, e = VL.size(); i != e; ++i) {
|
2014-05-03 09:59:54 +00:00
|
|
|
CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
|
|
|
|
if (!CI2 || CI2->getCalledFunction() != Int ||
|
|
|
|
getIntrinsicIDForCall(CI2, TLI) != ID) {
|
2014-08-01 09:20:42 +00:00
|
|
|
BS.cancelScheduling(VL);
|
2014-03-12 20:21:50 +00:00
|
|
|
newTreeEntry(VL, false);
|
2014-05-03 09:59:54 +00:00
|
|
|
DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
|
2014-03-12 20:21:50 +00:00
|
|
|
<< "\n");
|
|
|
|
return;
|
|
|
|
}
|
2014-05-30 04:31:24 +00:00
|
|
|
// ctlz, cttz and powi are special intrinsics whose second argument
|
|
|
|
// must be the same in order for them to be vectorized.
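// For example (illustrative), four calls to @llvm.powi.f32 that all pass
// i32 2 as the exponent can stay in one bundle, while mixing i32 2 and
// i32 3 exponents forces the gather below.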
|
|
|
|
if (hasVectorInstrinsicScalarOpd(ID, 1)) {
|
|
|
|
Value *A1J = CI2->getArgOperand(1);
|
|
|
|
if (A1I != A1J) {
|
2014-08-01 09:20:42 +00:00
|
|
|
BS.cancelScheduling(VL);
|
2014-05-30 04:31:24 +00:00
|
|
|
newTreeEntry(VL, false);
|
|
|
|
DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
|
|
|
|
<< " argument "<< A1I<<"!=" << A1J
|
|
|
|
<< "\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2014-03-12 20:21:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
newTreeEntry(VL, true);
|
2014-05-03 09:59:54 +00:00
|
|
|
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
|
2014-03-12 20:21:50 +00:00
|
|
|
ValueList Operands;
|
|
|
|
// Prepare the operand vector.
|
|
|
|
for (unsigned j = 0; j < VL.size(); ++j) {
|
2014-05-03 09:59:54 +00:00
|
|
|
CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
|
|
|
|
Operands.push_back(CI2->getArgOperand(i));
|
2014-03-12 20:21:50 +00:00
|
|
|
}
|
|
|
|
buildTree_rec(Operands, Depth + 1);
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
2014-06-20 04:32:48 +00:00
|
|
|
case Instruction::ShuffleVector: {
|
|
|
|
// If this is not an alternate sequence of opcodes like add-sub
|
|
|
|
// then do not vectorize this instruction.
|
|
|
|
if (!isAltShuffle) {
|
2014-08-01 09:20:42 +00:00
|
|
|
BS.cancelScheduling(VL);
|
2014-06-20 04:32:48 +00:00
|
|
|
newTreeEntry(VL, false);
|
|
|
|
DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
newTreeEntry(VL, true);
|
|
|
|
DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
|
2015-01-20 06:11:00 +00:00
|
|
|
|
|
|
|
// Reorder operands if reordering would enable vectorization.
|
|
|
|
if (isa<BinaryOperator>(VL0)) {
|
|
|
|
ValueList Left, Right;
|
|
|
|
reorderAltShuffleOperands(VL, Left, Right);
|
|
|
|
buildTree_rec(Left, Depth + 1);
|
|
|
|
buildTree_rec(Right, Depth + 1);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2014-06-20 04:32:48 +00:00
|
|
|
for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
|
|
|
|
ValueList Operands;
|
|
|
|
// Prepare the operand vector.
|
|
|
|
for (unsigned j = 0; j < VL.size(); ++j)
|
|
|
|
Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
|
|
|
|
|
|
|
|
buildTree_rec(Operands, Depth + 1);
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
default:
|
2014-08-01 09:20:42 +00:00
|
|
|
BS.cancelScheduling(VL);
|
2013-07-07 06:57:07 +00:00
|
|
|
newTreeEntry(VL, false);
|
|
|
|
DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
|
|
|
|
return;
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
int BoUpSLP::getEntryCost(TreeEntry *E) {
|
|
|
|
ArrayRef<Value*> VL = E->Scalars;
|
2013-06-22 21:34:10 +00:00
|
|
|
|
|
|
|
Type *ScalarTy = VL[0]->getType();
|
|
|
|
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
|
|
|
|
ScalarTy = SI->getValueOperand()->getType();
|
2013-06-24 02:52:43 +00:00
|
|
|
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
|
|
|
|
|
2016-02-18 14:14:40 +00:00
|
|
|
// If we have computed a smaller type for the expression, update VecTy so
|
|
|
|
// that the costs will be accurate.
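// For example (illustrative widths), if the expression was demoted to i8
// and VL.size() is 4, the costs are computed on <4 x i8> rather than on the
// original <4 x i32> type.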
|
|
|
|
if (MinBWs.count(VL[0]))
|
|
|
|
VecTy = VectorType::get(IntegerType::get(F->getContext(), MinBWs[VL[0]]),
|
|
|
|
VL.size());
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
if (E->NeedToGather) {
|
|
|
|
if (allConstant(VL))
|
|
|
|
return 0;
|
|
|
|
if (isSplat(VL)) {
|
|
|
|
return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
return getGatherCost(E->Scalars);
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
2014-06-20 04:32:48 +00:00
|
|
|
unsigned Opcode = getSameOpcode(VL);
|
|
|
|
assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL");
|
2013-06-22 21:34:10 +00:00
|
|
|
Instruction *VL0 = cast<Instruction>(VL[0]);
|
|
|
|
switch (Opcode) {
|
2013-07-07 06:57:07 +00:00
|
|
|
case Instruction::PHI: {
|
2013-06-22 21:34:10 +00:00
|
|
|
return 0;
|
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
case Instruction::ExtractElement: {
|
2014-03-28 17:21:32 +00:00
|
|
|
if (CanReuseExtract(VL)) {
|
|
|
|
int DeadCost = 0;
|
|
|
|
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
|
|
|
|
ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
|
|
|
|
if (E->hasOneUse())
|
|
|
|
// Take credit for instruction that will become dead.
|
|
|
|
DeadCost +=
|
|
|
|
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
|
|
|
|
}
|
|
|
|
return -DeadCost;
|
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
return getGatherCost(VecTy);
|
2013-06-24 02:52:43 +00:00
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
case Instruction::ZExt:
|
|
|
|
case Instruction::SExt:
|
|
|
|
case Instruction::FPToUI:
|
|
|
|
case Instruction::FPToSI:
|
|
|
|
case Instruction::FPExt:
|
|
|
|
case Instruction::PtrToInt:
|
|
|
|
case Instruction::IntToPtr:
|
|
|
|
case Instruction::SIToFP:
|
|
|
|
case Instruction::UIToFP:
|
|
|
|
case Instruction::Trunc:
|
|
|
|
case Instruction::FPTrunc:
|
|
|
|
case Instruction::BitCast: {
|
|
|
|
Type *SrcTy = VL0->getOperand(0)->getType();
|
|
|
|
|
|
|
|
// Calculate the cost of this instruction.
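// Rough sketch (actual numbers are target dependent): for four scalars of
// the form 'sext i8 %x to i32' the code below compares
//   4 * getCastInstrCost(SExt, i32, i8)
// against
//   getCastInstrCost(SExt, <4 x i32>, <4 x i8>),
// and a negative result means the vectorized cast is expected to be cheaper.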
|
|
|
|
int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
|
|
|
|
VL0->getType(), SrcTy);
|
|
|
|
|
|
|
|
VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
|
|
|
|
int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
|
|
|
|
return VecCost - ScalarCost;
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
case Instruction::FCmp:
|
|
|
|
case Instruction::ICmp:
|
|
|
|
case Instruction::Select:
|
|
|
|
case Instruction::Add:
|
|
|
|
case Instruction::FAdd:
|
|
|
|
case Instruction::Sub:
|
|
|
|
case Instruction::FSub:
|
|
|
|
case Instruction::Mul:
|
|
|
|
case Instruction::FMul:
|
|
|
|
case Instruction::UDiv:
|
|
|
|
case Instruction::SDiv:
|
|
|
|
case Instruction::FDiv:
|
|
|
|
case Instruction::URem:
|
|
|
|
case Instruction::SRem:
|
|
|
|
case Instruction::FRem:
|
|
|
|
case Instruction::Shl:
|
|
|
|
case Instruction::LShr:
|
|
|
|
case Instruction::AShr:
|
|
|
|
case Instruction::And:
|
|
|
|
case Instruction::Or:
|
|
|
|
case Instruction::Xor: {
|
|
|
|
// Calculate the cost of this instruction.
|
|
|
|
int ScalarCost = 0;
|
|
|
|
int VecCost = 0;
|
|
|
|
if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
|
|
|
|
Opcode == Instruction::Select) {
|
|
|
|
VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
|
|
|
|
ScalarCost = VecTy->getNumElements() *
|
|
|
|
TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
|
|
|
|
VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
|
|
|
|
} else {
|
2013-10-29 01:33:53 +00:00
|
|
|
// Certain instructions can be cheaper to vectorize if they have a
|
|
|
|
// constant second vector operand.
|
|
|
|
TargetTransformInfo::OperandValueKind Op1VK =
|
|
|
|
TargetTransformInfo::OK_AnyValue;
|
|
|
|
TargetTransformInfo::OperandValueKind Op2VK =
|
|
|
|
TargetTransformInfo::OK_UniformConstantValue;
|
2014-08-25 04:56:54 +00:00
|
|
|
TargetTransformInfo::OperandValueProperties Op1VP =
|
|
|
|
TargetTransformInfo::OP_None;
|
|
|
|
TargetTransformInfo::OperandValueProperties Op2VP =
|
|
|
|
TargetTransformInfo::OP_None;
|
2013-10-29 01:33:53 +00:00
|
|
|
|
2014-02-12 23:43:47 +00:00
|
|
|
// If all operands are exactly the same ConstantInt then set the
|
|
|
|
// operand kind to OK_UniformConstantValue.
|
|
|
|
// If instead not all operands are constants, then set the operand kind
|
|
|
|
// to OK_AnyValue. If all operands are constants but not the same,
|
|
|
|
// then set the operand kind to OK_NonUniformConstantValue.
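// Illustrative classification: { x << 3, y << 3, z << 3, w << 3 } yields
// OK_UniformConstantValue, { x << 1, y << 2, z << 3, w << 4 } yields
// OK_NonUniformConstantValue, and { x << a, ... } falls back to OK_AnyValue.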
|
2014-04-25 05:29:35 +00:00
|
|
|
ConstantInt *CInt = nullptr;
|
2014-02-12 23:43:47 +00:00
|
|
|
for (unsigned i = 0; i < VL.size(); ++i) {
|
|
|
|
const Instruction *I = cast<Instruction>(VL[i]);
|
|
|
|
if (!isa<ConstantInt>(I->getOperand(1))) {
|
2013-10-29 01:33:53 +00:00
|
|
|
Op2VK = TargetTransformInfo::OK_AnyValue;
|
|
|
|
break;
|
|
|
|
}
|
2014-02-12 23:43:47 +00:00
|
|
|
if (i == 0) {
|
|
|
|
CInt = cast<ConstantInt>(I->getOperand(1));
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
|
|
|
|
CInt != cast<ConstantInt>(I->getOperand(1)))
|
|
|
|
Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
|
|
|
|
}
|
2016-03-15 13:27:58 +00:00
|
|
|
// FIXME: Currently the cost model modification for division by a power of
|
|
|
|
// 2 is handled for X86 and AArch64. Add support for other targets.
|
2014-08-25 04:56:54 +00:00
|
|
|
if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
|
|
|
|
CInt->getValue().isPowerOf2())
|
|
|
|
Op2VP = TargetTransformInfo::OP_PowerOf2;
|
2013-10-29 01:33:53 +00:00
|
|
|
|
2014-08-25 04:56:54 +00:00
|
|
|
ScalarCost = VecTy->getNumElements() *
|
|
|
|
TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK,
|
|
|
|
Op1VP, Op2VP);
|
|
|
|
VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK,
|
|
|
|
Op1VP, Op2VP);
|
2013-07-07 06:57:07 +00:00
|
|
|
}
|
|
|
|
return VecCost - ScalarCost;
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
2014-08-27 15:01:18 +00:00
|
|
|
case Instruction::GetElementPtr: {
|
|
|
|
TargetTransformInfo::OperandValueKind Op1VK =
|
|
|
|
TargetTransformInfo::OK_AnyValue;
|
|
|
|
TargetTransformInfo::OperandValueKind Op2VK =
|
|
|
|
TargetTransformInfo::OK_UniformConstantValue;
|
|
|
|
|
|
|
|
int ScalarCost =
|
|
|
|
VecTy->getNumElements() *
|
|
|
|
TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
|
|
|
|
int VecCost =
|
|
|
|
TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
|
|
|
|
|
|
|
|
return VecCost - ScalarCost;
|
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
case Instruction::Load: {
|
|
|
|
// Cost of wide load - cost of scalar loads.
|
|
|
|
int ScalarLdCost = VecTy->getNumElements() *
|
|
|
|
TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
|
2013-10-29 01:33:50 +00:00
|
|
|
int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
|
2013-07-07 06:57:07 +00:00
|
|
|
return VecLdCost - ScalarLdCost;
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
case Instruction::Store: {
|
|
|
|
// We know that we can merge the stores. Calculate the cost.
|
|
|
|
int ScalarStCost = VecTy->getNumElements() *
|
|
|
|
TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
|
2013-10-29 01:33:50 +00:00
|
|
|
int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
|
2013-07-07 06:57:07 +00:00
|
|
|
return VecStCost - ScalarStCost;
|
2013-06-24 02:52:43 +00:00
|
|
|
}
|
2014-03-12 20:21:50 +00:00
|
|
|
case Instruction::Call: {
|
|
|
|
CallInst *CI = cast<CallInst>(VL0);
|
2014-05-03 09:59:54 +00:00
|
|
|
Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
|
2014-03-12 20:21:50 +00:00
|
|
|
|
|
|
|
// Calculate the cost of the scalar and vector calls.
|
|
|
|
SmallVector<Type*, 4> ScalarTys, VecTys;
|
2014-05-03 09:59:54 +00:00
|
|
|
for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
|
2014-03-12 20:21:50 +00:00
|
|
|
ScalarTys.push_back(CI->getArgOperand(op)->getType());
|
|
|
|
VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
|
|
|
|
VecTy->getNumElements()));
|
|
|
|
}
|
|
|
|
|
|
|
|
int ScalarCallCost = VecTy->getNumElements() *
|
|
|
|
TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys);
|
|
|
|
|
|
|
|
int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys);
|
|
|
|
|
|
|
|
DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
|
|
|
|
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
|
2014-05-03 09:59:54 +00:00
|
|
|
<< " for " << *CI << "\n");
|
2014-03-12 20:21:50 +00:00
|
|
|
|
|
|
|
return VecCallCost - ScalarCallCost;
|
|
|
|
}
|
2014-06-20 04:32:48 +00:00
|
|
|
case Instruction::ShuffleVector: {
|
|
|
|
TargetTransformInfo::OperandValueKind Op1VK =
|
|
|
|
TargetTransformInfo::OK_AnyValue;
|
|
|
|
TargetTransformInfo::OperandValueKind Op2VK =
|
|
|
|
TargetTransformInfo::OK_AnyValue;
|
|
|
|
int ScalarCost = 0;
|
|
|
|
int VecCost = 0;
|
|
|
|
for (unsigned i = 0; i < VL.size(); ++i) {
|
|
|
|
Instruction *I = cast<Instruction>(VL[i]);
|
|
|
|
if (!I)
|
|
|
|
break;
|
|
|
|
ScalarCost +=
|
|
|
|
TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK);
|
|
|
|
}
|
|
|
|
// VecCost is equal to the sum of the cost of creating 2 vectors
|
|
|
|
// and the cost of creating the shuffle.
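// Rough sketch (target dependent): for { a+b, a-b, a+b, a-b } the vector
// cost is one vector add, one vector sub and one SK_Alternate shuffle that
// blends the even and odd lanes, compared against the four scalar
// operations summed above.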
|
|
|
|
Instruction *I0 = cast<Instruction>(VL[0]);
|
|
|
|
VecCost =
|
|
|
|
TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK);
|
|
|
|
Instruction *I1 = cast<Instruction>(VL[1]);
|
|
|
|
VecCost +=
|
|
|
|
TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK);
|
|
|
|
VecCost +=
|
|
|
|
TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0);
|
|
|
|
return VecCost - ScalarCost;
|
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
default:
|
|
|
|
llvm_unreachable("Unknown instruction");
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
}
|
2013-06-24 02:52:43 +00:00
|
|
|
|
2013-10-02 20:20:39 +00:00
|
|
|
bool BoUpSLP::isFullyVectorizableTinyTree() {
|
|
|
|
DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
|
|
|
|
VectorizableTree.size() << " is fully vectorizable.\n");
|
|
|
|
|
|
|
|
// We only handle trees of height 2.
|
|
|
|
if (VectorizableTree.size() != 2)
|
|
|
|
return false;
|
|
|
|
|
2015-06-19 17:40:15 +00:00
|
|
|
// Handle splat and all-constant stores.
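// For example (illustrative), a[0..3] = 0 or a[0..3] = x yields a store
// bundle whose value operands are all-constant or a splat; the check below
// accepts it even though the second tree entry is a gather.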
|
|
|
|
if (!VectorizableTree[0].NeedToGather &&
|
|
|
|
(allConstant(VectorizableTree[1].Scalars) ||
|
|
|
|
isSplat(VectorizableTree[1].Scalars)))
|
2014-02-24 19:52:29 +00:00
|
|
|
return true;
|
|
|
|
|
2013-10-02 20:20:39 +00:00
|
|
|
// Gathering cost would be too much for tiny trees.
|
2014-02-24 19:52:29 +00:00
|
|
|
if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather)
|
|
|
|
return false;
|
2013-10-02 20:20:39 +00:00
|
|
|
|
2014-02-24 19:52:29 +00:00
|
|
|
return true;
|
2013-10-02 20:20:39 +00:00
|
|
|
}
|
|
|
|
|
2014-08-05 12:30:34 +00:00
|
|
|
int BoUpSLP::getSpillCost() {
|
|
|
|
// Walk from the bottom of the tree to the top, tracking which values are
|
|
|
|
// live. When we see a call instruction that is not part of our tree,
|
|
|
|
// query TTI to see if there is a cost to keeping values live over it
|
|
|
|
// (for example, if spills and fills are required).
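// For instance (illustrative), if a tracked value has to stay live across a
// call that clobbers the vector registers, the target may report the cost
// of spilling and reloading the corresponding BundleWidth-wide vector; that
// cost is accumulated below.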
|
|
|
|
unsigned BundleWidth = VectorizableTree.front().Scalars.size();
|
|
|
|
int Cost = 0;
|
|
|
|
|
|
|
|
SmallPtrSet<Instruction*, 4> LiveValues;
|
2016-01-13 07:03:42 +00:00
|
|
|
Instruction *PrevInst = nullptr;
|
2014-08-05 12:30:34 +00:00
|
|
|
|
|
|
|
for (unsigned N = 0; N < VectorizableTree.size(); ++N) {
|
|
|
|
Instruction *Inst = dyn_cast<Instruction>(VectorizableTree[N].Scalars[0]);
|
|
|
|
if (!Inst)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (!PrevInst) {
|
|
|
|
PrevInst = Inst;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2016-02-08 14:11:39 +00:00
|
|
|
// Update LiveValues.
|
|
|
|
LiveValues.erase(PrevInst);
|
|
|
|
for (auto &J : PrevInst->operands()) {
|
|
|
|
if (isa<Instruction>(&*J) && ScalarToTreeEntry.count(&*J))
|
|
|
|
LiveValues.insert(cast<Instruction>(&*J));
|
|
|
|
}
|
|
|
|
|
2014-08-05 12:30:34 +00:00
|
|
|
DEBUG(
|
|
|
|
dbgs() << "SLP: #LV: " << LiveValues.size();
|
|
|
|
for (auto *X : LiveValues)
|
|
|
|
dbgs() << " " << X->getName();
|
|
|
|
dbgs() << ", Looking at ";
|
|
|
|
Inst->dump();
|
|
|
|
);
|
|
|
|
|
|
|
|
// Now find the sequence of instructions between PrevInst and Inst.
|
2015-10-19 22:06:09 +00:00
|
|
|
BasicBlock::reverse_iterator InstIt(Inst->getIterator()),
|
|
|
|
PrevInstIt(PrevInst->getIterator());
|
2014-08-05 12:30:34 +00:00
|
|
|
--PrevInstIt;
|
|
|
|
while (InstIt != PrevInstIt) {
|
|
|
|
if (PrevInstIt == PrevInst->getParent()->rend()) {
|
|
|
|
PrevInstIt = Inst->getParent()->rbegin();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (isa<CallInst>(&*PrevInstIt) && &*PrevInstIt != PrevInst) {
|
|
|
|
SmallVector<Type*, 4> V;
|
|
|
|
for (auto *II : LiveValues)
|
|
|
|
V.push_back(VectorType::get(II->getType(), BundleWidth));
|
|
|
|
Cost += TTI->getCostOfKeepingLiveOverCall(V);
|
|
|
|
}
|
|
|
|
|
|
|
|
++PrevInstIt;
|
|
|
|
}
|
|
|
|
|
|
|
|
PrevInst = Inst;
|
|
|
|
}
|
|
|
|
|
|
|
|
return Cost;
|
|
|
|
}
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
int BoUpSLP::getTreeCost() {
|
|
|
|
int Cost = 0;
|
|
|
|
DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
|
|
|
|
VectorizableTree.size() << ".\n");
|
|
|
|
|
2013-10-02 20:20:39 +00:00
|
|
|
// We only vectorize tiny trees if they are fully vectorizable.
|
|
|
|
if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
|
2015-01-15 11:41:30 +00:00
|
|
|
if (VectorizableTree.empty()) {
|
2013-07-26 23:07:55 +00:00
|
|
|
assert(!ExternalUses.size() && "We should not have any external users");
|
|
|
|
}
|
2013-09-24 17:26:43 +00:00
|
|
|
return INT_MAX;
|
2013-07-11 04:54:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
unsigned BundleWidth = VectorizableTree[0].Scalars.size();
|
|
|
|
|
2016-01-12 18:47:59 +00:00
|
|
|
for (TreeEntry &TE : VectorizableTree) {
|
|
|
|
int C = getEntryCost(&TE);
|
2013-07-07 06:57:07 +00:00
|
|
|
DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
|
2016-01-29 17:21:38 +00:00
|
|
|
<< *TE.Scalars[0] << ".\n");
|
2013-07-07 06:57:07 +00:00
|
|
|
Cost += C;
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
2013-07-11 04:54:05 +00:00
|
|
|
|
2013-11-22 15:47:17 +00:00
|
|
|
SmallSet<Value *, 16> ExtractCostCalculated;
|
2013-07-11 04:54:05 +00:00
|
|
|
int ExtractCost = 0;
|
2016-01-12 18:47:59 +00:00
|
|
|
for (ExternalUser &EU : ExternalUses) {
|
2013-11-22 15:47:17 +00:00
|
|
|
// We only add extract cost once for the same scalar.
|
2016-01-12 18:47:59 +00:00
|
|
|
if (!ExtractCostCalculated.insert(EU.Scalar).second)
|
2013-11-22 15:47:17 +00:00
|
|
|
continue;
|
2013-07-11 04:54:05 +00:00
|
|
|
|
2014-10-15 17:35:01 +00:00
|
|
|
// Uses by ephemeral values are free (because the ephemeral value will be
|
|
|
|
// removed prior to code generation, and so the extraction will be
|
|
|
|
// removed as well).
|
2016-01-12 18:47:59 +00:00
|
|
|
if (EphValues.count(EU.User))
|
2014-10-15 17:35:01 +00:00
|
|
|
continue;
|
|
|
|
|
2016-02-18 14:14:40 +00:00
|
|
|
// If we plan to rewrite the tree in a smaller type, we will need to sign
|
|
|
|
// extend the extracted value back to the original type. Here, we account
|
|
|
|
// for the extract and the added cost of the sign extend if needed.
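// For example (illustrative widths), if the tree was narrowed to i8 but an
// external user consumes the scalar as i32, the extract below is costed on
// a <BundleWidth x i8> vector plus one sext from i8 to i32.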
|
|
|
|
auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
|
|
|
|
auto *ScalarRoot = VectorizableTree[0].Scalars[0];
|
|
|
|
if (MinBWs.count(ScalarRoot)) {
|
|
|
|
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
|
|
|
|
VecTy = VectorType::get(MinTy, BundleWidth);
|
|
|
|
ExtractCost +=
|
|
|
|
TTI->getCastInstrCost(Instruction::SExt, EU.Scalar->getType(), MinTy);
|
|
|
|
}
|
|
|
|
ExtractCost +=
|
|
|
|
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
|
2013-07-11 04:54:05 +00:00
|
|
|
}
|
|
|
|
|
2016-02-11 23:06:40 +00:00
|
|
|
int SpillCost = getSpillCost();
|
|
|
|
Cost += SpillCost + ExtractCost;
|
2014-08-05 12:30:34 +00:00
|
|
|
|
2016-02-11 23:06:40 +00:00
|
|
|
DEBUG(dbgs() << "SLP: Spill Cost = " << SpillCost << ".\n"
|
|
|
|
<< "SLP: Extract Cost = " << ExtractCost << ".\n"
|
|
|
|
<< "SLP: Total Cost = " << Cost << ".\n");
|
|
|
|
return Cost;
|
2013-07-07 06:57:07 +00:00
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
int BoUpSLP::getGatherCost(Type *Ty) {
|
|
|
|
int Cost = 0;
|
|
|
|
for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
|
|
|
|
Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
|
|
|
|
return Cost;
|
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
|
|
|
|
// Find the type of the operands in VL.
|
|
|
|
Type *ScalarTy = VL[0]->getType();
|
|
|
|
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
|
|
|
|
ScalarTy = SI->getValueOperand()->getType();
|
|
|
|
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
|
|
|
|
// Find the cost of inserting/extracting values from the vector.
|
|
|
|
return getGatherCost(VecTy);
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
|
|
|
|
2015-01-20 06:11:00 +00:00
|
|
|
// Reorder commutative operations in alternate shuffle if the resulting vectors
|
|
|
|
// are consecutive loads. This would allow us to vectorize the tree.
|
|
|
|
// If we have something like:
|
|
|
|
// load a[0] - load b[0]
|
|
|
|
// load b[1] + load a[1]
|
|
|
|
// load a[2] - load b[2]
|
|
|
|
// load a[3] + load b[3]
|
|
|
|
// Reordering the second pair, load b[1] and load a[1], would allow us to
|
|
|
|
// vectorize this code.
|
|
|
|
void BoUpSLP::reorderAltShuffleOperands(ArrayRef<Value *> VL,
|
|
|
|
SmallVectorImpl<Value *> &Left,
|
|
|
|
SmallVectorImpl<Value *> &Right) {
|
|
|
|
// Push left and right operands of binary operation into Left and Right
|
|
|
|
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
|
|
|
|
Left.push_back(cast<Instruction>(VL[i])->getOperand(0));
|
|
|
|
Right.push_back(cast<Instruction>(VL[i])->getOperand(1));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Reorder if we have a commutative operation and consecutive accesses
|
|
|
|
// are on either side of the alternate instructions.
|
|
|
|
for (unsigned j = 0; j < VL.size() - 1; ++j) {
|
|
|
|
if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
|
|
|
|
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
|
|
|
|
Instruction *VL1 = cast<Instruction>(VL[j]);
|
|
|
|
Instruction *VL2 = cast<Instruction>(VL[j + 1]);
|
2016-03-16 19:48:42 +00:00
|
|
|
if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
|
2015-01-20 06:11:00 +00:00
|
|
|
std::swap(Left[j], Right[j]);
|
|
|
|
continue;
|
2016-03-16 19:48:42 +00:00
|
|
|
} else if (VL2->isCommutative() &&
|
|
|
|
isConsecutiveAccess(L, L1, *DL, *SE)) {
|
2015-01-20 06:11:00 +00:00
|
|
|
std::swap(Left[j + 1], Right[j + 1]);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
// else unchanged
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
|
|
|
|
if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
|
|
|
|
Instruction *VL1 = cast<Instruction>(VL[j]);
|
|
|
|
Instruction *VL2 = cast<Instruction>(VL[j + 1]);
|
2016-03-16 19:48:42 +00:00
|
|
|
if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) {
|
2015-01-20 06:11:00 +00:00
|
|
|
std::swap(Left[j], Right[j]);
|
|
|
|
continue;
|
2016-03-16 19:48:42 +00:00
|
|
|
} else if (VL2->isCommutative() &&
|
|
|
|
isConsecutiveAccess(L, L1, *DL, *SE)) {
|
2015-01-20 06:11:00 +00:00
|
|
|
std::swap(Left[j + 1], Right[j + 1]);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
// else unchanged
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-23 00:46:17 +00:00
|
|
|
// Return true if I should be commuted before adding its left and right
|
|
|
|
// operands to the arrays Left and Right.
|
|
|
|
//
|
|
|
|
// The vectorizer is trying either to have all the elements on one side be
|
|
|
|
// instructions with the same opcode, to enable further vectorization, or to
|
|
|
|
// have a splat, to lower the vectorization cost.
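// Illustrative example: if Right is currently { b, b, b } and the next
// instruction is 'add b, c' (commutative), returning true swaps its
// operands so that b stays on the right-hand side and SplatRight is
// preserved.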
|
|
|
|
static bool shouldReorderOperands(int i, Instruction &I,
|
|
|
|
SmallVectorImpl<Value *> &Left,
|
2015-11-06 20:17:51 +00:00
|
|
|
SmallVectorImpl<Value *> &Right,
|
|
|
|
bool AllSameOpcodeLeft,
|
|
|
|
bool AllSameOpcodeRight, bool SplatLeft,
|
|
|
|
bool SplatRight) {
|
2015-10-23 00:46:17 +00:00
|
|
|
Value *VLeft = I.getOperand(0);
|
|
|
|
Value *VRight = I.getOperand(1);
|
2015-11-06 20:17:51 +00:00
|
|
|
// If we have "SplatRight", try to see if commuting is needed to preserve it.
|
|
|
|
if (SplatRight) {
|
|
|
|
if (VRight == Right[i - 1])
|
|
|
|
// Preserve SplatRight
|
|
|
|
return false;
|
|
|
|
if (VLeft == Right[i - 1]) {
|
|
|
|
// Commuting would preserve SplatRight, but we don't want to break
|
|
|
|
// SplatLeft either, i.e. preserve the original order if possible.
|
|
|
|
// (FIXME: why do we care?)
|
|
|
|
if (SplatLeft && VLeft == Left[i - 1])
|
|
|
|
return false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Symmetrically handle the left side (SplatLeft).
|
|
|
|
if (SplatLeft) {
|
|
|
|
if (VLeft == Left[i - 1])
|
|
|
|
// Preserve SplatLeft
|
|
|
|
return false;
|
|
|
|
if (VRight == Left[i - 1])
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-10-23 00:46:17 +00:00
|
|
|
Instruction *ILeft = dyn_cast<Instruction>(VLeft);
|
|
|
|
Instruction *IRight = dyn_cast<Instruction>(VRight);
|
|
|
|
|
2015-11-06 20:17:51 +00:00
|
|
|
// If we have "AllSameOpcodeRight", try to see if the left operands preserves
|
|
|
|
// it and not the right; in that case we want to commute.
|
|
|
|
if (AllSameOpcodeRight) {
|
|
|
|
unsigned RightPrevOpcode = cast<Instruction>(Right[i - 1])->getOpcode();
|
|
|
|
if (IRight && RightPrevOpcode == IRight->getOpcode())
|
|
|
|
// Do not commute, a match on the right preserves AllSameOpcodeRight
|
|
|
|
return false;
|
|
|
|
if (ILeft && RightPrevOpcode == ILeft->getOpcode()) {
|
|
|
|
// We have a match and may want to commute, but first check if there is
|
|
|
|
// not also a match on the existing operands on the Left to preserve
|
|
|
|
// AllSameOpcodeLeft, i.e. preserve the original order if possible.
|
|
|
|
// (FIXME: why do we care?)
|
|
|
|
if (AllSameOpcodeLeft && ILeft &&
|
|
|
|
cast<Instruction>(Left[i - 1])->getOpcode() == ILeft->getOpcode())
|
|
|
|
return false;
|
2015-10-23 00:46:17 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
2015-11-06 20:17:51 +00:00
|
|
|
// Symmetrically handle Left side.
|
|
|
|
if (AllSameOpcodeLeft) {
|
|
|
|
unsigned LeftPrevOpcode = cast<Instruction>(Left[i - 1])->getOpcode();
|
|
|
|
if (ILeft && LeftPrevOpcode == ILeft->getOpcode())
|
|
|
|
return false;
|
|
|
|
if (IRight && LeftPrevOpcode == IRight->getOpcode())
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
2015-10-23 00:46:17 +00:00
|
|
|
}
|
|
|
|
|
2015-01-20 06:11:00 +00:00
|
|
|
void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
|
|
|
|
SmallVectorImpl<Value *> &Left,
|
|
|
|
SmallVectorImpl<Value *> &Right) {
|
|
|
|
|
2015-10-23 00:46:17 +00:00
|
|
|
if (VL.size()) {
|
|
|
|
// Peel the first iteration out of the loop since there's nothing
|
2015-11-06 20:17:51 +00:00
|
|
|
// interesting to do anyway and it simplifies the checks in the loop.
|
2015-10-23 00:46:17 +00:00
|
|
|
auto VLeft = cast<Instruction>(VL[0])->getOperand(0);
|
|
|
|
auto VRight = cast<Instruction>(VL[0])->getOperand(1);
|
|
|
|
if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
|
|
|
|
// Favor having an instruction to the right. FIXME: why?
|
|
|
|
std::swap(VLeft, VRight);
|
|
|
|
Left.push_back(VLeft);
|
|
|
|
Right.push_back(VRight);
|
|
|
|
}
|
|
|
|
|
2015-10-23 01:04:45 +00:00
|
|
|
// Keep track if we have instructions with all the same opcode on one side.
|
|
|
|
bool AllSameOpcodeLeft = isa<Instruction>(Left[0]);
|
|
|
|
bool AllSameOpcodeRight = isa<Instruction>(Right[0]);
|
2015-11-06 20:17:51 +00:00
|
|
|
// Keep track if we have one side with all the same value (broadcast).
|
|
|
|
bool SplatLeft = true;
|
|
|
|
bool SplatRight = true;
|
2015-10-23 01:04:45 +00:00
|
|
|
|
2015-10-23 00:46:17 +00:00
|
|
|
for (unsigned i = 1, e = VL.size(); i != e; ++i) {
|
2015-01-20 06:11:00 +00:00
|
|
|
Instruction *I = cast<Instruction>(VL[i]);
|
2015-11-06 20:17:51 +00:00
|
|
|
assert(I->isCommutative() && "Can only process commutative instruction");
|
2015-10-23 00:46:17 +00:00
|
|
|
// Commute to favor either a splat or grouping the same opcodes on
|
|
|
|
// one side.
|
2015-11-06 20:17:51 +00:00
|
|
|
if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft,
|
|
|
|
AllSameOpcodeRight, SplatLeft, SplatRight)) {
|
2015-10-23 00:46:17 +00:00
|
|
|
Left.push_back(I->getOperand(1));
|
|
|
|
Right.push_back(I->getOperand(0));
|
|
|
|
} else {
|
|
|
|
Left.push_back(I->getOperand(0));
|
|
|
|
Right.push_back(I->getOperand(1));
|
2015-01-20 06:11:00 +00:00
|
|
|
}
|
2015-11-06 20:17:51 +00:00
|
|
|
// Update Splat* and AllSameOpcode* after the insertion.
|
|
|
|
SplatRight = SplatRight && (Right[i - 1] == Right[i]);
|
|
|
|
SplatLeft = SplatLeft && (Left[i - 1] == Left[i]);
|
|
|
|
AllSameOpcodeLeft = AllSameOpcodeLeft && isa<Instruction>(Left[i]) &&
|
|
|
|
(cast<Instruction>(Left[i - 1])->getOpcode() ==
|
|
|
|
cast<Instruction>(Left[i])->getOpcode());
|
|
|
|
AllSameOpcodeRight = AllSameOpcodeRight && isa<Instruction>(Right[i]) &&
|
|
|
|
(cast<Instruction>(Right[i - 1])->getOpcode() ==
|
|
|
|
cast<Instruction>(Right[i])->getOpcode());
|
2015-01-20 06:11:00 +00:00
|
|
|
}
|
|
|
|
|
2015-11-06 20:17:51 +00:00
|
|
|
// If one operand ends up being a broadcast, return this operand order.
|
|
|
|
if (SplatRight || SplatLeft)
|
2015-01-20 06:11:00 +00:00
|
|
|
return;
|
|
|
|
|
|
|
|
// Finally check if we can get a longer vectorizable chain by reordering
|
|
|
|
// without breaking the good operand order detected above.
|
|
|
|
// E.g., if we have something like:
|
|
|
|
// load a[0] load b[0]
|
|
|
|
// load b[1] load a[1]
|
|
|
|
// load a[2] load b[2]
|
|
|
|
// load a[3] load b[3]
|
|
|
|
// Reordering the second pair, load b[1] and load a[1], would allow us to
|
|
|
|
// vectorize this code and still retain the AllSameOpcode property.
|
|
|
|
// FIXME: This load reordering might break AllSameOpcode in some rare cases
|
|
|
|
// such as:
|
|
|
|
// add a[0],c[0] load b[0]
|
|
|
|
// add a[1],c[2] load b[1]
|
|
|
|
// b[2] load b[2]
|
|
|
|
// add a[3],c[3] load b[3]
|
|
|
|
for (unsigned j = 0; j < VL.size() - 1; ++j) {
|
|
|
|
if (LoadInst *L = dyn_cast<LoadInst>(Left[j])) {
|
|
|
|
if (LoadInst *L1 = dyn_cast<LoadInst>(Right[j + 1])) {
|
2016-03-16 19:48:42 +00:00
|
|
|
if (isConsecutiveAccess(L, L1, *DL, *SE)) {
|
2015-01-20 06:11:00 +00:00
|
|
|
std::swap(Left[j + 1], Right[j + 1]);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (LoadInst *L = dyn_cast<LoadInst>(Right[j])) {
|
|
|
|
if (LoadInst *L1 = dyn_cast<LoadInst>(Left[j + 1])) {
|
2016-03-16 19:48:42 +00:00
|
|
|
if (isConsecutiveAccess(L, L1, *DL, *SE)) {
|
2015-01-20 06:11:00 +00:00
|
|
|
std::swap(Left[j + 1], Right[j + 1]);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// else unchanged
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-08-26 23:08:37 +00:00
|
|
|
void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
|
|
|
|
Instruction *VL0 = cast<Instruction>(VL[0]);
|
2015-10-19 22:06:09 +00:00
|
|
|
BasicBlock::iterator NextInst(VL0);
|
2013-08-26 23:08:37 +00:00
|
|
|
++NextInst;
|
|
|
|
Builder.SetInsertPoint(VL0->getParent(), NextInst);
|
|
|
|
Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
|
|
|
|
}
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
|
2013-06-22 21:34:10 +00:00
|
|
|
Value *Vec = UndefValue::get(Ty);
|
|
|
|
// Generate the 'InsertElement' instruction.
|
|
|
|
for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
|
|
|
|
Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
|
2013-07-11 04:54:05 +00:00
|
|
|
if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
|
|
|
|
GatherSeq.insert(Insrt);
|
2013-11-26 22:24:25 +00:00
|
|
|
CSEBlocks.insert(Insrt->getParent());
|
2013-07-11 04:54:05 +00:00
|
|
|
|
|
|
|
// Add to our 'need-to-extract' list.
|
|
|
|
if (ScalarToTreeEntry.count(VL[i])) {
|
|
|
|
int Idx = ScalarToTreeEntry[VL[i]];
|
|
|
|
TreeEntry *E = &VectorizableTree[Idx];
|
|
|
|
// Find which lane we need to extract.
|
|
|
|
int FoundLane = -1;
|
|
|
|
for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
|
|
|
|
// Is this the lane of the scalar that we are looking for?
|
|
|
|
if (E->Scalars[Lane] == VL[i]) {
|
|
|
|
FoundLane = Lane;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
assert(FoundLane >= 0 && "Could not find the correct lane");
|
|
|
|
ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
|
|
|
|
}
|
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return Vec;
|
|
|
|
}
|
|
|
|
|
2013-08-26 17:56:38 +00:00
|
|
|
Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
|
|
|
|
SmallDenseMap<Value*, int>::const_iterator Entry
|
|
|
|
= ScalarToTreeEntry.find(VL[0]);
|
|
|
|
if (Entry != ScalarToTreeEntry.end()) {
|
|
|
|
int Idx = Entry->second;
|
|
|
|
const TreeEntry *En = &VectorizableTree[Idx];
|
2013-07-22 22:18:07 +00:00
|
|
|
if (En->isSame(VL) && En->VectorizedValue)
|
|
|
|
return En->VectorizedValue;
|
|
|
|
}
|
2014-04-25 05:29:35 +00:00
|
|
|
return nullptr;
|
2013-07-22 22:18:07 +00:00
|
|
|
}
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
|
|
|
|
if (ScalarToTreeEntry.count(VL[0])) {
|
|
|
|
int Idx = ScalarToTreeEntry[VL[0]];
|
|
|
|
TreeEntry *E = &VectorizableTree[Idx];
|
|
|
|
if (E->isSame(VL))
|
|
|
|
return vectorizeTree(E);
|
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
|
|
|
|
Type *ScalarTy = VL[0]->getType();
|
|
|
|
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
|
|
|
|
ScalarTy = SI->getValueOperand()->getType();
|
|
|
|
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
return Gather(VL, VecTy);
|
|
|
|
}
|
|
|
|
|
|
|
|
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
|
2013-09-30 15:39:48 +00:00
|
|
|
IRBuilder<>::InsertPointGuard Guard(Builder);
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
if (E->VectorizedValue) {
|
|
|
|
DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
|
|
|
|
return E->VectorizedValue;
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
|
|
|
|
2013-08-26 17:56:35 +00:00
|
|
|
Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
|
|
|
|
Type *ScalarTy = VL0->getType();
|
|
|
|
if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
|
2013-07-07 06:57:07 +00:00
|
|
|
ScalarTy = SI->getValueOperand()->getType();
|
|
|
|
VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
if (E->NeedToGather) {
|
2013-08-26 23:08:37 +00:00
|
|
|
setInsertPointAfterBundle(E->Scalars);
|
2013-07-07 06:57:07 +00:00
|
|
|
return Gather(E->Scalars, VecTy);
|
|
|
|
}
|
2014-08-01 09:20:42 +00:00
|
|
|
|
2014-06-20 04:32:48 +00:00
|
|
|
unsigned Opcode = getSameOpcode(E->Scalars);
|
2013-06-25 23:04:09 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
switch (Opcode) {
|
|
|
|
case Instruction::PHI: {
|
|
|
|
PHINode *PH = dyn_cast<PHINode>(VL0);
|
2013-09-27 15:30:25 +00:00
|
|
|
Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
|
2013-07-29 18:18:46 +00:00
|
|
|
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
|
2013-07-07 06:57:07 +00:00
|
|
|
PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
|
|
|
|
E->VectorizedValue = NewPhi;
|
|
|
|
|
2013-08-12 17:46:44 +00:00
|
|
|
// PHINodes may have multiple entries from the same block. We want to
|
|
|
|
// visit every block once.
|
|
|
|
SmallSet<BasicBlock*, 4> VisitedBBs;
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
|
|
|
|
ValueList Operands;
|
|
|
|
BasicBlock *IBB = PH->getIncomingBlock(i);
|
|
|
|
|
2014-11-19 07:49:26 +00:00
|
|
|
if (!VisitedBBs.insert(IBB).second) {
|
2013-08-12 17:46:44 +00:00
|
|
|
NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
// Prepare the operand vector.
|
2015-07-05 20:15:21 +00:00
|
|
|
for (Value *V : E->Scalars)
|
|
|
|
Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(IBB));
|
2013-07-07 06:57:07 +00:00
|
|
|
|
|
|
|
Builder.SetInsertPoint(IBB->getTerminator());
|
2013-07-29 18:18:46 +00:00
|
|
|
Builder.SetCurrentDebugLocation(PH->getDebugLoc());
|
2013-07-07 06:57:07 +00:00
|
|
|
Value *Vec = vectorizeTree(Operands);
|
|
|
|
NewPhi->addIncoming(Vec, IBB);
|
|
|
|
}
|
2013-06-25 23:04:09 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
|
|
|
|
"Invalid number of incoming values");
|
|
|
|
return NewPhi;
|
2013-06-25 23:04:09 +00:00
|
|
|
}
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
case Instruction::ExtractElement: {
|
|
|
|
if (CanReuseExtract(E->Scalars)) {
|
|
|
|
Value *V = VL0->getOperand(0);
|
|
|
|
E->VectorizedValue = V;
|
|
|
|
return V;
|
|
|
|
}
|
|
|
|
return Gather(E->Scalars, VecTy);
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
case Instruction::ZExt:
|
|
|
|
case Instruction::SExt:
|
|
|
|
case Instruction::FPToUI:
|
|
|
|
case Instruction::FPToSI:
|
|
|
|
case Instruction::FPExt:
|
|
|
|
case Instruction::PtrToInt:
|
|
|
|
case Instruction::IntToPtr:
|
|
|
|
case Instruction::SIToFP:
|
|
|
|
case Instruction::UIToFP:
|
|
|
|
case Instruction::Trunc:
|
|
|
|
case Instruction::FPTrunc:
|
|
|
|
case Instruction::BitCast: {
|
|
|
|
ValueList INVL;
|
2015-07-04 19:38:52 +00:00
|
|
|
for (Value *V : E->Scalars)
|
|
|
|
INVL.push_back(cast<Instruction>(V)->getOperand(0));
|
2013-07-07 06:57:07 +00:00
|
|
|
|
2013-08-26 23:08:37 +00:00
|
|
|
setInsertPointAfterBundle(E->Scalars);
|
2013-07-29 18:18:46 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
Value *InVec = vectorizeTree(INVL);
|
2013-07-22 22:18:07 +00:00
|
|
|
|
|
|
|
if (Value *V = alreadyVectorized(E->Scalars))
|
|
|
|
return V;
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
CastInst *CI = dyn_cast<CastInst>(VL0);
|
|
|
|
Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
|
|
|
|
E->VectorizedValue = V;
|
2014-08-01 08:14:28 +00:00
|
|
|
++NumVectorInstructions;
|
2013-07-07 06:57:07 +00:00
|
|
|
return V;
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
case Instruction::FCmp:
|
|
|
|
case Instruction::ICmp: {
|
|
|
|
ValueList LHSV, RHSV;
|
2015-07-04 19:38:52 +00:00
|
|
|
for (Value *V : E->Scalars) {
|
|
|
|
LHSV.push_back(cast<Instruction>(V)->getOperand(0));
|
|
|
|
RHSV.push_back(cast<Instruction>(V)->getOperand(1));
|
2013-07-07 06:57:07 +00:00
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-08-26 23:08:37 +00:00
|
|
|
setInsertPointAfterBundle(E->Scalars);
|
2013-07-29 18:18:46 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
Value *L = vectorizeTree(LHSV);
|
|
|
|
Value *R = vectorizeTree(RHSV);
|
2013-07-22 22:18:07 +00:00
|
|
|
|
|
|
|
if (Value *V = alreadyVectorized(E->Scalars))
|
|
|
|
return V;
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2015-04-10 11:24:51 +00:00
|
|
|
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
|
2013-07-22 22:18:07 +00:00
|
|
|
Value *V;
|
2013-07-07 06:57:07 +00:00
|
|
|
if (Opcode == Instruction::FCmp)
|
|
|
|
V = Builder.CreateFCmp(P0, L, R);
|
|
|
|
else
|
|
|
|
V = Builder.CreateICmp(P0, L, R);
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
E->VectorizedValue = V;
|
2014-08-01 08:14:28 +00:00
|
|
|
++NumVectorInstructions;
|
2013-07-07 06:57:07 +00:00
|
|
|
return V;
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
case Instruction::Select: {
|
|
|
|
ValueList TrueVec, FalseVec, CondVec;
|
2015-07-04 19:38:52 +00:00
|
|
|
for (Value *V : E->Scalars) {
|
|
|
|
CondVec.push_back(cast<Instruction>(V)->getOperand(0));
|
|
|
|
TrueVec.push_back(cast<Instruction>(V)->getOperand(1));
|
|
|
|
FalseVec.push_back(cast<Instruction>(V)->getOperand(2));
|
2013-07-07 06:57:07 +00:00
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-08-26 23:08:37 +00:00
|
|
|
setInsertPointAfterBundle(E->Scalars);
|
2013-07-29 18:18:46 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
Value *Cond = vectorizeTree(CondVec);
|
|
|
|
Value *True = vectorizeTree(TrueVec);
|
|
|
|
Value *False = vectorizeTree(FalseVec);
|
2013-07-22 22:18:07 +00:00
|
|
|
|
|
|
|
if (Value *V = alreadyVectorized(E->Scalars))
|
|
|
|
return V;
|
2013-08-26 18:38:29 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
Value *V = Builder.CreateSelect(Cond, True, False);
|
|
|
|
E->VectorizedValue = V;
|
2014-08-01 08:14:28 +00:00
|
|
|
++NumVectorInstructions;
|
2013-07-07 06:57:07 +00:00
|
|
|
return V;
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
case Instruction::Add:
|
|
|
|
case Instruction::FAdd:
|
|
|
|
case Instruction::Sub:
|
|
|
|
case Instruction::FSub:
|
|
|
|
case Instruction::Mul:
|
|
|
|
case Instruction::FMul:
|
|
|
|
case Instruction::UDiv:
|
|
|
|
case Instruction::SDiv:
|
|
|
|
case Instruction::FDiv:
|
|
|
|
case Instruction::URem:
|
|
|
|
case Instruction::SRem:
|
|
|
|
case Instruction::FRem:
|
|
|
|
case Instruction::Shl:
|
|
|
|
case Instruction::LShr:
|
|
|
|
case Instruction::AShr:
|
|
|
|
case Instruction::And:
|
|
|
|
case Instruction::Or:
|
|
|
|
case Instruction::Xor: {
|
|
|
|
ValueList LHSVL, RHSVL;
|
2014-12-17 10:34:27 +00:00
|
|
|
if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
|
2013-10-04 20:39:16 +00:00
|
|
|
reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
|
2014-12-17 10:34:27 +00:00
|
|
|
else
|
2015-07-04 19:38:52 +00:00
|
|
|
for (Value *V : E->Scalars) {
|
|
|
|
LHSVL.push_back(cast<Instruction>(V)->getOperand(0));
|
|
|
|
RHSVL.push_back(cast<Instruction>(V)->getOperand(1));
|
2013-10-04 20:39:16 +00:00
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-08-26 23:08:37 +00:00
|
|
|
setInsertPointAfterBundle(E->Scalars);
|
2013-07-29 18:18:46 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
Value *LHS = vectorizeTree(LHSVL);
|
|
|
|
Value *RHS = vectorizeTree(RHSVL);
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
if (LHS == RHS && isa<Instruction>(LHS)) {
|
|
|
|
assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
|
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-22 22:18:07 +00:00
|
|
|
if (Value *V = alreadyVectorized(E->Scalars))
|
|
|
|
return V;
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
|
|
|
|
Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
|
|
|
|
E->VectorizedValue = V;
|
2014-09-03 17:40:30 +00:00
|
|
|
propagateIRFlags(E->VectorizedValue, E->Scalars);
|
2014-08-01 08:14:28 +00:00
|
|
|
++NumVectorInstructions;
|
2013-11-23 00:48:34 +00:00
|
|
|
|
|
|
|
if (Instruction *I = dyn_cast<Instruction>(V))
|
|
|
|
return propagateMetadata(I, E->Scalars);
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
return V;
|
|
|
|
}
|
|
|
|
case Instruction::Load: {
|
|
|
|
// Loads are inserted at the head of the tree because we don't want to
|
|
|
|
// sink them all the way down past store instructions.
|
2013-08-26 23:08:37 +00:00
|
|
|
setInsertPointAfterBundle(E->Scalars);
|
2013-07-29 18:18:46 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
LoadInst *LI = cast<LoadInst>(VL0);
|
2014-08-07 22:47:27 +00:00
|
|
|
Type *ScalarLoadTy = LI->getType();
|
2013-09-27 21:24:57 +00:00
|
|
|
unsigned AS = LI->getPointerAddressSpace();
|
|
|
|
|
|
|
|
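// Bitcast the scalar pointer to a pointer to the vector type, keeping the
// original address space.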
Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
|
|
|
|
VecTy->getPointerTo(AS));
|
2014-09-02 21:00:39 +00:00
|
|
|
|
|
|
|
// The pointer operand uses an in-tree scalar so we add the new BitCast to
|
|
|
|
// the ExternalUses list to make sure that an extract will be generated in the
|
|
|
|
// future.
|
|
|
|
if (ScalarToTreeEntry.count(LI->getPointerOperand()))
|
|
|
|
ExternalUses.push_back(
|
|
|
|
ExternalUser(LI->getPointerOperand(), cast<User>(VecPtr), 0));
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
unsigned Alignment = LI->getAlignment();
|
|
|
|
LI = Builder.CreateLoad(VecPtr);
|
2015-03-10 02:37:25 +00:00
|
|
|
if (!Alignment) {
|
2016-03-16 19:48:42 +00:00
|
|
|
Alignment = DL->getABITypeAlignment(ScalarLoadTy);
|
2015-03-10 02:37:25 +00:00
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
LI->setAlignment(Alignment);
|
|
|
|
E->VectorizedValue = LI;
|
2014-08-01 08:14:28 +00:00
|
|
|
++NumVectorInstructions;
|
2013-11-23 00:48:34 +00:00
|
|
|
return propagateMetadata(LI, E->Scalars);
|
2013-07-07 06:57:07 +00:00
|
|
|
}
|
|
|
|
case Instruction::Store: {
|
|
|
|
StoreInst *SI = cast<StoreInst>(VL0);
|
|
|
|
unsigned Alignment = SI->getAlignment();
|
2013-09-27 21:24:57 +00:00
|
|
|
unsigned AS = SI->getPointerAddressSpace();
|
2013-07-07 06:57:07 +00:00
|
|
|
|
|
|
|
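// Collect the scalar values being stored so they can be vectorized as a
// single operand.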
ValueList ValueOp;
|
2015-07-04 19:38:52 +00:00
|
|
|
for (Value *V : E->Scalars)
|
|
|
|
ValueOp.push_back(cast<StoreInst>(V)->getValueOperand());
|
2013-07-07 06:57:07 +00:00
|
|
|
|
2013-08-26 23:08:37 +00:00
|
|
|
setInsertPointAfterBundle(E->Scalars);
|
2013-07-29 18:18:46 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
Value *VecValue = vectorizeTree(ValueOp);
|
2013-09-27 21:24:57 +00:00
|
|
|
Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
|
|
|
|
VecTy->getPointerTo(AS));
|
2013-07-07 06:57:07 +00:00
|
|
|
StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
|
2014-09-02 21:00:39 +00:00
|
|
|
|
|
|
|
// The pointer operand uses an in-tree scalar so we add the new BitCast to
|
|
|
|
// the ExternalUses list to make sure that an extract will be generated in the
|
|
|
|
// future.
|
|
|
|
if (ScalarToTreeEntry.count(SI->getPointerOperand()))
|
|
|
|
ExternalUses.push_back(
|
|
|
|
ExternalUser(SI->getPointerOperand(), cast<User>(VecPtr), 0));
|
|
|
|
|
2015-03-10 02:37:25 +00:00
|
|
|
if (!Alignment) {
|
2016-03-16 19:48:42 +00:00
|
|
|
Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
|
2015-03-10 02:37:25 +00:00
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
S->setAlignment(Alignment);
|
|
|
|
E->VectorizedValue = S;
|
2014-08-01 08:14:28 +00:00
|
|
|
++NumVectorInstructions;
|
2013-11-23 00:48:34 +00:00
|
|
|
return propagateMetadata(S, E->Scalars);
|
2013-07-07 06:57:07 +00:00
|
|
|
}
|
2014-08-27 15:01:18 +00:00
|
|
|
case Instruction::GetElementPtr: {
|
|
|
|
setInsertPointAfterBundle(E->Scalars);
|
|
|
|
|
|
|
|
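// Vectorize the GEP base pointers (operand 0) first; the index operands are
// handled one position at a time below.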
ValueList Op0VL;
|
2015-07-04 19:38:52 +00:00
|
|
|
for (Value *V : E->Scalars)
|
|
|
|
Op0VL.push_back(cast<GetElementPtrInst>(V)->getOperand(0));
|
2014-08-27 15:01:18 +00:00
|
|
|
|
|
|
|
Value *Op0 = vectorizeTree(Op0VL);
|
|
|
|
|
|
|
|
std::vector<Value *> OpVecs;
|
|
|
|
for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
|
|
|
|
++j) {
|
|
|
|
ValueList OpVL;
|
2015-07-04 19:38:52 +00:00
|
|
|
for (Value *V : E->Scalars)
|
|
|
|
OpVL.push_back(cast<GetElementPtrInst>(V)->getOperand(j));
|
2014-08-27 15:01:18 +00:00
|
|
|
|
|
|
|
Value *OpVec = vectorizeTree(OpVL);
|
|
|
|
OpVecs.push_back(OpVec);
|
|
|
|
}
|
|
|
|
|
2015-03-24 22:38:16 +00:00
|
|
|
Value *V = Builder.CreateGEP(
|
|
|
|
cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
|
2014-08-27 15:01:18 +00:00
|
|
|
E->VectorizedValue = V;
|
|
|
|
++NumVectorInstructions;
|
|
|
|
|
|
|
|
if (Instruction *I = dyn_cast<Instruction>(V))
|
|
|
|
return propagateMetadata(I, E->Scalars);
|
|
|
|
|
|
|
|
return V;
|
|
|
|
}
|
2014-03-12 20:21:50 +00:00
|
|
|
case Instruction::Call: {
|
|
|
|
CallInst *CI = cast<CallInst>(VL0);
|
|
|
|
setInsertPointAfterBundle(E->Scalars);
|
2014-05-30 04:31:24 +00:00
|
|
|
Function *FI;
|
|
|
|
Intrinsic::ID IID = Intrinsic::not_intrinsic;
|
2014-09-02 21:00:39 +00:00
|
|
|
Value *ScalarArg = nullptr;
|
2014-05-30 04:31:24 +00:00
|
|
|
if (CI && (FI = CI->getCalledFunction())) {
|
2015-05-20 17:16:39 +00:00
|
|
|
IID = FI->getIntrinsicID();
|
2014-05-30 04:31:24 +00:00
|
|
|
}
|
2014-03-12 20:21:50 +00:00
|
|
|
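// Build the vectorized argument list, one operand position at a time.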
std::vector<Value *> OpVecs;
|
|
|
|
for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
|
|
|
|
ValueList OpVL;
|
2014-05-30 04:31:24 +00:00
|
|
|
// ctlz, cttz and powi are special intrinsics whose second argument is
|
|
|
|
// a scalar. This argument should not be vectorized.
|
|
|
|
if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) {
|
|
|
|
CallInst *CEI = cast<CallInst>(E->Scalars[0]);
|
2014-09-02 21:00:39 +00:00
|
|
|
ScalarArg = CEI->getArgOperand(j);
|
2014-05-30 04:31:24 +00:00
|
|
|
OpVecs.push_back(CEI->getArgOperand(j));
|
|
|
|
continue;
|
|
|
|
}
|
2015-07-04 19:38:52 +00:00
|
|
|
for (Value *V : E->Scalars) {
|
|
|
|
CallInst *CEI = cast<CallInst>(V);
|
2014-03-12 20:21:50 +00:00
|
|
|
OpVL.push_back(CEI->getArgOperand(j));
|
|
|
|
}
|
|
|
|
|
|
|
|
Value *OpVec = vectorizeTree(OpVL);
|
|
|
|
DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
|
|
|
|
OpVecs.push_back(OpVec);
|
|
|
|
}
|
|
|
|
|
|
|
|
Module *M = F->getParent();
|
2014-05-03 09:59:54 +00:00
|
|
|
Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
|
2014-03-12 20:21:50 +00:00
|
|
|
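// Get the vector overload of the intrinsic; the overloaded type here is the
// widened return type.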
Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
|
|
|
|
Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
|
|
|
|
Value *V = Builder.CreateCall(CF, OpVecs);
|
2014-09-02 21:00:39 +00:00
|
|
|
|
|
|
|
// The scalar argument uses an in-tree scalar so we add the new vectorized
|
|
|
|
// call to the ExternalUses list to make sure that an extract will be
|
|
|
|
// generated in the future.
|
|
|
|
if (ScalarArg && ScalarToTreeEntry.count(ScalarArg))
|
|
|
|
ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
|
|
|
|
|
2014-03-12 20:21:50 +00:00
|
|
|
E->VectorizedValue = V;
|
2014-08-01 08:14:28 +00:00
|
|
|
++NumVectorInstructions;
|
2014-03-12 20:21:50 +00:00
|
|
|
return V;
|
|
|
|
}
|
2014-06-20 04:32:48 +00:00
|
|
|
case Instruction::ShuffleVector: {
|
|
|
|
ValueList LHSVL, RHSVL;
|
2015-01-20 06:11:00 +00:00
|
|
|
assert(isa<BinaryOperator>(VL0) && "Invalid Shuffle Vector Operand");
|
|
|
|
reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL);
|
2014-06-20 04:32:48 +00:00
|
|
|
setInsertPointAfterBundle(E->Scalars);
|
|
|
|
|
|
|
|
Value *LHS = vectorizeTree(LHSVL);
|
|
|
|
Value *RHS = vectorizeTree(RHSVL);
|
|
|
|
|
|
|
|
if (Value *V = alreadyVectorized(E->Scalars))
|
|
|
|
return V;
|
|
|
|
|
|
|
|
// Create a vector of LHS op1 RHS
|
|
|
|
BinaryOperator *BinOp0 = cast<BinaryOperator>(VL0);
|
|
|
|
Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS);
|
|
|
|
|
|
|
|
// Create a vector of LHS op2 RHS
|
|
|
|
Instruction *VL1 = cast<Instruction>(E->Scalars[1]);
|
|
|
|
BinaryOperator *BinOp1 = cast<BinaryOperator>(VL1);
|
|
|
|
Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS);
|
|
|
|
|
2014-09-03 17:40:30 +00:00
|
|
|
// Create a shuffle to take the alternate operations from the two vectors.
|
|
|
|
// Also, gather up odd and even scalar ops to propagate IR flags to
|
|
|
|
// each vector operation.
|
|
|
|
ValueList OddScalars, EvenScalars;
|
2014-06-20 04:32:48 +00:00
|
|
|
unsigned e = E->Scalars.size();
|
2014-09-03 17:40:30 +00:00
|
|
|
SmallVector<Constant *, 8> Mask(e);
|
2014-06-20 04:32:48 +00:00
|
|
|
for (unsigned i = 0; i < e; ++i) {
|
2014-09-03 17:40:30 +00:00
|
|
|
if (i & 1) {
|
2014-06-20 04:32:48 +00:00
|
|
|
Mask[i] = Builder.getInt32(e + i);
|
2014-09-03 17:40:30 +00:00
|
|
|
OddScalars.push_back(E->Scalars[i]);
|
|
|
|
} else {
|
2014-06-20 04:32:48 +00:00
|
|
|
Mask[i] = Builder.getInt32(i);
|
2014-09-03 17:40:30 +00:00
|
|
|
EvenScalars.push_back(E->Scalars[i]);
|
|
|
|
}
|
2014-06-20 04:32:48 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
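// For example, with e == 4 the mask is <0, 5, 2, 7>: even lanes come from
// V0 and odd lanes come from V1.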
Value *ShuffleMask = ConstantVector::get(Mask);
|
2014-09-03 17:40:30 +00:00
|
|
|
propagateIRFlags(V0, EvenScalars);
|
|
|
|
propagateIRFlags(V1, OddScalars);
|
2014-06-20 04:32:48 +00:00
|
|
|
|
|
|
|
Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
|
|
|
|
E->VectorizedValue = V;
|
2014-08-01 08:14:28 +00:00
|
|
|
++NumVectorInstructions;
|
2014-06-20 04:32:48 +00:00
|
|
|
if (Instruction *I = dyn_cast<Instruction>(V))
|
|
|
|
return propagateMetadata(I, E->Scalars);
|
|
|
|
|
|
|
|
return V;
|
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
default:
|
|
|
|
llvm_unreachable("unknown inst");
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
2014-04-25 05:29:35 +00:00
|
|
|
return nullptr;
|
2013-07-07 06:57:07 +00:00
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-09-21 01:06:00 +00:00
|
|
|
Value *BoUpSLP::vectorizeTree() {
|
2016-01-13 07:03:42 +00:00
|
|
|
|
2014-08-02 19:39:42 +00:00
|
|
|
// All blocks must be scheduled before any instructions are inserted.
|
|
|
|
for (auto &BSIter : BlocksSchedules) {
|
|
|
|
scheduleBlock(BSIter.second.get());
|
|
|
|
}
|
|
|
|
|
2015-10-19 22:06:09 +00:00
|
|
|
Builder.SetInsertPoint(&F->getEntryBlock().front());
|
2016-02-18 14:14:40 +00:00
|
|
|
auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
|
|
|
|
|
|
|
|
// If the vectorized tree can be rewritten in a smaller type, we truncate the
|
|
|
|
// vectorized root. InstCombine will then rewrite the entire expression. We
|
|
|
|
// sign extend the extracted values below.
|
|
|
|
auto *ScalarRoot = VectorizableTree[0].Scalars[0];
|
|
|
|
if (MinBWs.count(ScalarRoot)) {
|
|
|
|
if (auto *I = dyn_cast<Instruction>(VectorRoot))
|
|
|
|
Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
|
|
|
|
auto BundleWidth = VectorizableTree[0].Scalars.size();
|
|
|
|
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot]);
|
|
|
|
auto *VecTy = VectorType::get(MinTy, BundleWidth);
|
|
|
|
auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
|
|
|
|
VectorizableTree[0].VectorizedValue = Trunc;
|
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-11 04:54:05 +00:00
|
|
|
DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
|
|
|
|
|
|
|
|
// Extract all of the elements with the external uses.
|
|
|
|
for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
|
|
|
|
it != e; ++it) {
|
|
|
|
Value *Scalar = it->Scalar;
|
|
|
|
llvm::User *User = it->User;
|
2013-07-12 06:09:24 +00:00
|
|
|
|
|
|
|
// Skip users that we have already RAUWed. This happens when one instruction
|
|
|
|
// has multiple uses of the same value.
|
2014-03-09 03:16:01 +00:00
|
|
|
if (std::find(Scalar->user_begin(), Scalar->user_end(), User) ==
|
|
|
|
Scalar->user_end())
|
2013-07-11 04:54:05 +00:00
|
|
|
continue;
|
|
|
|
assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
|
|
|
|
|
|
|
|
int Idx = ScalarToTreeEntry[Scalar];
|
|
|
|
TreeEntry *E = &VectorizableTree[Idx];
|
|
|
|
assert(!E->NeedToGather && "Extracting from a gather list");
|
|
|
|
|
|
|
|
Value *Vec = E->VectorizedValue;
|
|
|
|
assert(Vec && "Can't find vectorizable value");
|
|
|
|
|
2013-08-02 18:40:24 +00:00
|
|
|
Value *Lane = Builder.getInt32(it->Lane);
|
2013-07-11 04:54:05 +00:00
|
|
|
// Generate extracts for out-of-tree users.
|
|
|
|
// Find the insertion point for the extractelement lane.
|
2016-04-01 17:28:15 +00:00
|
|
|
if (auto *VecI = dyn_cast<Instruction>(Vec)) {
|
2013-07-12 06:09:24 +00:00
|
|
|
if (PHINode *PH = dyn_cast<PHINode>(User)) {
|
|
|
|
for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
|
|
|
|
if (PH->getIncomingValue(i) == Scalar) {
|
2016-04-01 17:28:15 +00:00
|
|
|
TerminatorInst *IncomingTerminator =
|
|
|
|
PH->getIncomingBlock(i)->getTerminator();
|
|
|
|
if (isa<CatchSwitchInst>(IncomingTerminator)) {
|
|
|
|
Builder.SetInsertPoint(VecI->getParent(),
|
|
|
|
std::next(VecI->getIterator()));
|
|
|
|
} else {
|
|
|
|
Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
|
|
|
|
}
|
2013-08-02 18:40:24 +00:00
|
|
|
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
|
2016-02-18 14:14:40 +00:00
|
|
|
if (MinBWs.count(ScalarRoot))
|
|
|
|
Ex = Builder.CreateSExt(Ex, Scalar->getType());
|
2013-11-26 22:24:25 +00:00
|
|
|
CSEBlocks.insert(PH->getIncomingBlock(i));
|
2013-08-02 18:40:24 +00:00
|
|
|
PH->setOperand(i, Ex);
|
2013-07-12 06:09:24 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
2013-08-02 18:40:24 +00:00
|
|
|
Builder.SetInsertPoint(cast<Instruction>(User));
|
|
|
|
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
|
2016-02-18 14:14:40 +00:00
|
|
|
if (MinBWs.count(ScalarRoot))
|
|
|
|
Ex = Builder.CreateSExt(Ex, Scalar->getType());
|
2013-11-26 22:24:25 +00:00
|
|
|
CSEBlocks.insert(cast<Instruction>(User)->getParent());
|
2013-08-02 18:40:24 +00:00
|
|
|
User->replaceUsesOfWith(Scalar, Ex);
|
2013-07-12 06:09:24 +00:00
|
|
|
}
|
2013-07-11 04:54:05 +00:00
|
|
|
} else {
|
2015-10-19 22:06:09 +00:00
|
|
|
Builder.SetInsertPoint(&F->getEntryBlock().front());
|
2013-08-02 18:40:24 +00:00
|
|
|
Value *Ex = Builder.CreateExtractElement(Vec, Lane);
|
2016-02-18 14:14:40 +00:00
|
|
|
if (MinBWs.count(ScalarRoot))
|
|
|
|
Ex = Builder.CreateSExt(Ex, Scalar->getType());
|
2013-11-26 22:24:25 +00:00
|
|
|
CSEBlocks.insert(&F->getEntryBlock());
|
2013-08-02 18:40:24 +00:00
|
|
|
User->replaceUsesOfWith(Scalar, Ex);
|
2013-07-11 04:54:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
|
|
|
|
}
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
// For each vectorized value:
|
|
|
|
for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
|
|
|
|
TreeEntry *Entry = &VectorizableTree[EIdx];
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
// For each lane:
|
|
|
|
for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
|
|
|
|
Value *Scalar = Entry->Scalars[Lane];
|
|
|
|
// No need to handle users of gathered values.
|
|
|
|
if (Entry->NeedToGather)
|
|
|
|
continue;
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-07-11 05:39:02 +00:00
|
|
|
assert(Entry->VectorizedValue && "Can't find vectorizable value");
|
2013-06-28 22:07:09 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
Type *Ty = Scalar->getType();
|
|
|
|
if (!Ty->isVoidTy()) {
|
2014-03-09 03:50:36 +00:00
|
|
|
#ifndef NDEBUG
|
2014-03-09 03:16:01 +00:00
|
|
|
for (User *U : Scalar->users()) {
|
|
|
|
DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
|
2013-09-21 01:06:00 +00:00
|
|
|
|
2014-03-09 03:16:01 +00:00
|
|
|
assert((ScalarToTreeEntry.count(U) ||
|
2014-05-04 17:10:15 +00:00
|
|
|
// It is legal to replace users in the ignore list with undef.
|
|
|
|
(std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) !=
|
|
|
|
UserIgnoreList.end())) &&
|
2013-07-07 06:57:07 +00:00
|
|
|
"Replacing out-of-tree value with undef");
|
|
|
|
}
|
2014-03-09 03:50:36 +00:00
|
|
|
#endif
|
2013-07-07 06:57:07 +00:00
|
|
|
Value *Undef = UndefValue::get(Ty);
|
|
|
|
Scalar->replaceAllUsesWith(Undef);
|
|
|
|
}
|
|
|
|
DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
|
2015-01-14 11:24:47 +00:00
|
|
|
eraseInstruction(cast<Instruction>(Scalar));
|
2013-06-28 22:07:09 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-07-07 14:57:18 +00:00
|
|
|
Builder.ClearInsertionPoint();
|
2013-09-21 01:06:00 +00:00
|
|
|
|
|
|
|
return VectorizableTree[0].VectorizedValue;
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
void BoUpSLP::optimizeGatherSequence() {
|
|
|
|
DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
|
|
|
|
<< " gather sequences instructions.\n");
|
2013-06-23 06:15:46 +00:00
|
|
|
// LICM InsertElementInst sequences.
|
2013-06-22 21:34:10 +00:00
|
|
|
for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
|
2013-06-23 06:15:46 +00:00
|
|
|
e = GatherSeq.end(); it != e; ++it) {
|
|
|
|
InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
|
2013-06-22 21:34:10 +00:00
|
|
|
|
|
|
|
if (!Insert)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// Check if this block is inside a loop.
|
2013-06-23 06:15:46 +00:00
|
|
|
Loop *L = LI->getLoopFor(Insert->getParent());
|
2013-06-22 21:34:10 +00:00
|
|
|
if (!L)
|
2013-06-23 06:15:46 +00:00
|
|
|
continue;
|
2013-06-22 21:34:10 +00:00
|
|
|
|
|
|
|
// Check if it has a preheader.
|
|
|
|
BasicBlock *PreHeader = L->getLoopPreheader();
|
|
|
|
if (!PreHeader)
|
2013-06-26 16:54:53 +00:00
|
|
|
continue;
|
2013-06-22 21:34:10 +00:00
|
|
|
|
|
|
|
// If the vector or the element that we insert into it is an
|
|
|
|
// instruction that is defined in this basic block, then we can't
|
|
|
|
// hoist this instruction.
|
|
|
|
Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
|
|
|
|
Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
|
|
|
|
if (CurrVec && L->contains(CurrVec))
|
|
|
|
continue;
|
|
|
|
if (NewElem && L->contains(NewElem))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// We can hoist this instruction. Move it to the pre-header.
|
2013-06-23 06:15:46 +00:00
|
|
|
Insert->moveBefore(PreHeader->getTerminator());
|
|
|
|
}
|
|
|
|
|
2014-05-11 10:28:58 +00:00
|
|
|
// Make a list of all reachable blocks in our CSE queue.
|
|
|
|
SmallVector<const DomTreeNode *, 8> CSEWorkList;
|
|
|
|
CSEWorkList.reserve(CSEBlocks.size());
|
|
|
|
for (BasicBlock *BB : CSEBlocks)
|
|
|
|
if (DomTreeNode *N = DT->getNode(BB)) {
|
|
|
|
assert(DT->isReachableFromEntry(N));
|
|
|
|
CSEWorkList.push_back(N);
|
|
|
|
}
|
|
|
|
|
2013-11-03 12:27:52 +00:00
|
|
|
// Sort blocks by domination. This ensures we visit a block after all blocks
|
|
|
|
// dominating it are visited.
|
2014-03-01 11:47:00 +00:00
|
|
|
std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(),
|
2014-05-11 10:28:58 +00:00
|
|
|
[this](const DomTreeNode *A, const DomTreeNode *B) {
|
2014-03-01 11:47:00 +00:00
|
|
|
return DT->properlyDominates(A, B);
|
|
|
|
});
|
2013-11-03 12:27:52 +00:00
|
|
|
|
2013-06-23 06:15:46 +00:00
|
|
|
// Perform O(N^2) search over the gather sequences and merge identical
|
|
|
|
// instructions. TODO: We can further optimize this scan if we split the
|
|
|
|
// instructions into different buckets based on the insert lane.
|
2013-11-03 12:27:52 +00:00
|
|
|
SmallVector<Instruction *, 16> Visited;
|
2014-05-11 10:28:58 +00:00
|
|
|
for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
|
|
|
|
assert((I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
|
2013-11-03 12:27:52 +00:00
|
|
|
"Worklist not sorted properly!");
|
2014-05-11 10:28:58 +00:00
|
|
|
BasicBlock *BB = (*I)->getBlock();
|
2013-11-03 12:27:52 +00:00
|
|
|
// For all instructions in blocks containing gather sequences:
|
|
|
|
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
|
2015-10-19 22:06:09 +00:00
|
|
|
Instruction *In = &*it++;
|
2013-11-26 22:24:25 +00:00
|
|
|
if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
|
2013-06-23 06:15:46 +00:00
|
|
|
continue;
|
|
|
|
|
2013-06-26 16:54:53 +00:00
|
|
|
// Check if we can replace this instruction with any of the
|
|
|
|
// visited instructions.
|
2013-11-03 12:27:52 +00:00
|
|
|
for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
|
|
|
|
ve = Visited.end();
|
|
|
|
v != ve; ++v) {
|
2013-07-12 06:09:24 +00:00
|
|
|
if (In->isIdenticalTo(*v) &&
|
|
|
|
DT->dominates((*v)->getParent(), In->getParent())) {
|
|
|
|
In->replaceAllUsesWith(*v);
|
2015-01-14 11:24:47 +00:00
|
|
|
eraseInstruction(In);
|
2014-04-25 05:29:35 +00:00
|
|
|
In = nullptr;
|
2013-06-23 06:15:46 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2013-11-03 12:27:52 +00:00
|
|
|
if (In) {
|
|
|
|
assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
|
|
|
|
Visited.push_back(In);
|
|
|
|
}
|
2013-06-23 06:15:46 +00:00
|
|
|
}
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
2013-11-26 22:24:25 +00:00
|
|
|
CSEBlocks.clear();
|
|
|
|
GatherSeq.clear();
|
2013-06-22 21:34:10 +00:00
|
|
|
}
|
|
|
|
|
2014-08-01 09:20:42 +00:00
|
|
|
// Groups the instructions into a bundle (which is then a single scheduling entity)
|
|
|
|
// and schedules instructions until the bundle gets ready.
|
|
|
|
bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
|
2015-01-14 11:24:47 +00:00
|
|
|
BoUpSLP *SLP) {
|
2014-08-01 09:20:42 +00:00
|
|
|
if (isa<PHINode>(VL[0]))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// Initialize the instruction bundle.
|
|
|
|
Instruction *OldScheduleEnd = ScheduleEnd;
|
|
|
|
ScheduleData *PrevInBundle = nullptr;
|
|
|
|
ScheduleData *Bundle = nullptr;
|
|
|
|
bool ReSchedule = false;
|
|
|
|
DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n");
|
2015-09-30 17:00:44 +00:00
|
|
|
|
|
|
|
// Make sure that the scheduling region contains all
|
|
|
|
// instructions of the bundle.
|
|
|
|
for (Value *V : VL) {
|
|
|
|
if (!extendSchedulingRegion(V))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2014-08-01 09:20:42 +00:00
|
|
|
for (Value *V : VL) {
|
|
|
|
ScheduleData *BundleMember = getScheduleData(V);
|
|
|
|
assert(BundleMember &&
|
|
|
|
"no ScheduleData for bundle member (maybe not in same basic block)");
|
|
|
|
if (BundleMember->IsScheduled) {
|
|
|
|
// A bundle member was scheduled as a single instruction before and now
|
|
|
|
// needs to be scheduled as part of the bundle. We just get rid of the
|
|
|
|
// existing schedule.
|
|
|
|
DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
|
|
|
|
<< " was already scheduled\n");
|
|
|
|
ReSchedule = true;
|
|
|
|
}
|
|
|
|
assert(BundleMember->isSchedulingEntity() &&
|
|
|
|
"bundle member already part of other bundle");
|
|
|
|
if (PrevInBundle) {
|
|
|
|
PrevInBundle->NextInBundle = BundleMember;
|
|
|
|
} else {
|
|
|
|
Bundle = BundleMember;
|
|
|
|
}
|
|
|
|
BundleMember->UnscheduledDepsInBundle = 0;
|
|
|
|
Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
|
|
|
|
|
|
|
|
// Group the instructions into a bundle.
|
|
|
|
BundleMember->FirstInBundle = Bundle;
|
|
|
|
PrevInBundle = BundleMember;
|
|
|
|
}
|
|
|
|
if (ScheduleEnd != OldScheduleEnd) {
|
|
|
|
// The scheduling region got new instructions at the lower end (or it is a
|
|
|
|
// new region for the first bundle). This makes it necessary to
|
|
|
|
// recalculate all dependencies.
|
|
|
|
// It is seldom that this needs to be done a second time after adding the
|
|
|
|
// initial bundle to the region.
|
|
|
|
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
|
|
|
|
ScheduleData *SD = getScheduleData(I);
|
|
|
|
SD->clearDependencies();
|
|
|
|
}
|
|
|
|
ReSchedule = true;
|
|
|
|
}
|
|
|
|
if (ReSchedule) {
|
|
|
|
resetSchedule();
|
|
|
|
initialFillReadyList(ReadyInsts);
|
|
|
|
}
|
|
|
|
|
|
|
|
DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
|
|
|
|
<< BB->getName() << "\n");
|
|
|
|
|
2015-01-14 11:24:47 +00:00
|
|
|
calculateDependencies(Bundle, true, SLP);
|
2014-08-01 09:20:42 +00:00
|
|
|
|
|
|
|
// Now try to schedule the new bundle. As soon as the bundle is "ready" it
|
|
|
|
// means that there are no cyclic dependencies and we can schedule it.
|
|
|
|
// Note that it's important that we don't "schedule" the bundle yet (see
|
|
|
|
// cancelScheduling).
|
|
|
|
while (!Bundle->isReady() && !ReadyInsts.empty()) {
|
|
|
|
|
|
|
|
ScheduleData *pickedSD = ReadyInsts.back();
|
|
|
|
ReadyInsts.pop_back();
|
|
|
|
|
|
|
|
if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
|
|
|
|
schedule(pickedSD, ReadyInsts);
|
|
|
|
}
|
|
|
|
}
|
2015-09-30 17:00:44 +00:00
|
|
|
if (!Bundle->isReady()) {
|
|
|
|
cancelScheduling(VL);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
2014-08-01 09:20:42 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL) {
|
|
|
|
if (isa<PHINode>(VL[0]))
|
|
|
|
return;
|
|
|
|
|
|
|
|
ScheduleData *Bundle = getScheduleData(VL[0]);
|
|
|
|
DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
|
|
|
|
assert(!Bundle->IsScheduled &&
|
|
|
|
"Can't cancel bundle which is already scheduled");
|
|
|
|
assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
|
|
|
|
"tried to unbundle something which is not a bundle");
|
|
|
|
|
|
|
|
// Un-bundle: make single instructions out of the bundle.
|
|
|
|
ScheduleData *BundleMember = Bundle;
|
|
|
|
while (BundleMember) {
|
|
|
|
assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
|
|
|
|
BundleMember->FirstInBundle = BundleMember;
|
|
|
|
ScheduleData *Next = BundleMember->NextInBundle;
|
|
|
|
BundleMember->NextInBundle = nullptr;
|
|
|
|
BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
|
|
|
|
if (BundleMember->UnscheduledDepsInBundle == 0) {
|
|
|
|
ReadyInsts.insert(BundleMember);
|
|
|
|
}
|
|
|
|
BundleMember = Next;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-09-30 17:00:44 +00:00
|
|
|
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) {
|
2014-08-01 09:20:42 +00:00
|
|
|
if (getScheduleData(V))
|
2015-09-30 17:00:44 +00:00
|
|
|
return true;
|
2014-08-01 09:20:42 +00:00
|
|
|
Instruction *I = dyn_cast<Instruction>(V);
|
|
|
|
assert(I && "bundle member must be an instruction");
|
|
|
|
assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
|
|
|
|
if (!ScheduleStart) {
|
|
|
|
// It's the first instruction in the new region.
|
|
|
|
initScheduleData(I, I->getNextNode(), nullptr, nullptr);
|
|
|
|
ScheduleStart = I;
|
|
|
|
ScheduleEnd = I->getNextNode();
|
|
|
|
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
|
|
|
|
DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
|
2015-09-30 17:00:44 +00:00
|
|
|
return true;
|
2014-08-01 09:20:42 +00:00
|
|
|
}
|
|
|
|
// Search up and down at the same time, because we don't know if the new
|
|
|
|
// instruction is above or below the existing scheduling region.
|
2015-10-19 22:06:09 +00:00
|
|
|
BasicBlock::reverse_iterator UpIter(ScheduleStart->getIterator());
|
2014-08-01 09:20:42 +00:00
|
|
|
BasicBlock::reverse_iterator UpperEnd = BB->rend();
|
|
|
|
BasicBlock::iterator DownIter(ScheduleEnd);
|
|
|
|
BasicBlock::iterator LowerEnd = BB->end();
|
|
|
|
for (;;) {
|
2015-09-30 17:00:44 +00:00
|
|
|
if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
|
|
|
|
DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2014-08-01 09:20:42 +00:00
|
|
|
if (UpIter != UpperEnd) {
|
|
|
|
if (&*UpIter == I) {
|
|
|
|
initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
|
|
|
|
ScheduleStart = I;
|
|
|
|
DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n");
|
2015-09-30 17:00:44 +00:00
|
|
|
return true;
|
2014-08-01 09:20:42 +00:00
|
|
|
}
|
|
|
|
UpIter++;
|
|
|
|
}
|
|
|
|
if (DownIter != LowerEnd) {
|
|
|
|
if (&*DownIter == I) {
|
|
|
|
initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
|
|
|
|
nullptr);
|
|
|
|
ScheduleEnd = I->getNextNode();
|
|
|
|
assert(ScheduleEnd && "tried to vectorize a TerminatorInst?");
|
|
|
|
DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
|
2015-09-30 17:00:44 +00:00
|
|
|
return true;
|
2014-08-01 09:20:42 +00:00
|
|
|
}
|
|
|
|
DownIter++;
|
|
|
|
}
|
|
|
|
assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
|
|
|
|
"instruction not found in block");
|
|
|
|
}
|
2015-09-30 17:00:44 +00:00
|
|
|
return true;
|
2014-08-01 09:20:42 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
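// Creates (or reuses) ScheduleData for every instruction in [FromI, ToI),
// marks it as part of the current scheduling region, and links memory
// accessing instructions into the region's load/store chain.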
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
|
|
|
|
Instruction *ToI,
|
|
|
|
ScheduleData *PrevLoadStore,
|
|
|
|
ScheduleData *NextLoadStore) {
|
|
|
|
ScheduleData *CurrentLoadStore = PrevLoadStore;
|
|
|
|
for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
|
|
|
|
ScheduleData *SD = ScheduleDataMap[I];
|
|
|
|
if (!SD) {
|
|
|
|
// Allocate a new ScheduleData for the instruction.
|
|
|
|
if (ChunkPos >= ChunkSize) {
|
|
|
|
ScheduleDataChunks.push_back(
|
|
|
|
llvm::make_unique<ScheduleData[]>(ChunkSize));
|
|
|
|
ChunkPos = 0;
|
|
|
|
}
|
|
|
|
SD = &(ScheduleDataChunks.back()[ChunkPos++]);
|
|
|
|
ScheduleDataMap[I] = SD;
|
|
|
|
SD->Inst = I;
|
|
|
|
}
|
|
|
|
assert(!isInSchedulingRegion(SD) &&
|
|
|
|
"new ScheduleData already in scheduling region");
|
|
|
|
SD->init(SchedulingRegionID);
|
|
|
|
|
|
|
|
if (I->mayReadOrWriteMemory()) {
|
|
|
|
// Update the linked list of memory accessing instructions.
|
|
|
|
if (CurrentLoadStore) {
|
|
|
|
CurrentLoadStore->NextLoadStore = SD;
|
|
|
|
} else {
|
|
|
|
FirstLoadStoreInRegion = SD;
|
|
|
|
}
|
|
|
|
CurrentLoadStore = SD;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (NextLoadStore) {
|
|
|
|
if (CurrentLoadStore)
|
|
|
|
CurrentLoadStore->NextLoadStore = NextLoadStore;
|
|
|
|
} else {
|
|
|
|
LastLoadStoreInRegion = CurrentLoadStore;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
|
|
|
|
bool InsertInReadyList,
|
2015-01-14 11:24:47 +00:00
|
|
|
BoUpSLP *SLP) {
|
2014-08-01 09:20:42 +00:00
|
|
|
assert(SD->isSchedulingEntity());
|
|
|
|
|
|
|
|
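// Work list of bundles whose dependencies still need to be computed. It is
// seeded with SD and grows as we discover dependent bundles that don't have
// valid dependencies yet.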
SmallVector<ScheduleData *, 10> WorkList;
|
|
|
|
WorkList.push_back(SD);
|
|
|
|
|
|
|
|
while (!WorkList.empty()) {
|
|
|
|
ScheduleData *SD = WorkList.back();
|
|
|
|
WorkList.pop_back();
|
|
|
|
|
|
|
|
ScheduleData *BundleMember = SD;
|
|
|
|
while (BundleMember) {
|
|
|
|
assert(isInSchedulingRegion(BundleMember));
|
|
|
|
if (!BundleMember->hasValidDependencies()) {
|
|
|
|
|
|
|
|
DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n");
|
|
|
|
BundleMember->Dependencies = 0;
|
|
|
|
BundleMember->resetUnscheduledDeps();
|
|
|
|
|
|
|
|
// Handle def-use chain dependencies.
|
|
|
|
for (User *U : BundleMember->Inst->users()) {
|
|
|
|
if (isa<Instruction>(U)) {
|
|
|
|
ScheduleData *UseSD = getScheduleData(U);
|
|
|
|
if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
|
|
|
|
BundleMember->Dependencies++;
|
|
|
|
ScheduleData *DestBundle = UseSD->FirstInBundle;
|
|
|
|
if (!DestBundle->IsScheduled) {
|
|
|
|
BundleMember->incrementUnscheduledDeps(1);
|
|
|
|
}
|
|
|
|
if (!DestBundle->hasValidDependencies()) {
|
|
|
|
WorkList.push_back(DestBundle);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// I'm not sure if this can ever happen. But we need to be safe.
|
2015-08-08 18:27:36 +00:00
|
|
|
// This keeps the instruction/bundle from ever being scheduled and
|
|
|
|
// eventually disables vectorization.
|
2014-08-01 09:20:42 +00:00
|
|
|
BundleMember->Dependencies++;
|
|
|
|
BundleMember->incrementUnscheduledDeps(1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Handle the memory dependencies.
|
|
|
|
ScheduleData *DepDest = BundleMember->NextLoadStore;
|
|
|
|
if (DepDest) {
|
2015-01-14 11:24:47 +00:00
|
|
|
Instruction *SrcInst = BundleMember->Inst;
|
2015-06-17 07:18:54 +00:00
|
|
|
MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
|
2014-08-01 09:20:42 +00:00
|
|
|
bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
|
2015-01-19 09:33:38 +00:00
|
|
|
unsigned numAliased = 0;
|
2015-01-22 08:20:51 +00:00
|
|
|
unsigned DistToSrc = 1;
|
2014-08-01 09:20:42 +00:00
|
|
|
|
|
|
|
while (DepDest) {
|
|
|
|
assert(isInSchedulingRegion(DepDest));
|
2015-01-22 08:20:51 +00:00
|
|
|
|
|
|
|
// We have two limits to reduce the complexity:
|
|
|
|
// 1) AliasedCheckLimit: It's a small limit to reduce calls to
|
|
|
|
// SLP->isAliased (which is the expensive part in this loop).
|
|
|
|
// 2) MaxMemDepDistance: It's for very large blocks and it aborts
|
|
|
|
// the whole loop (even if the loop is fast, it's quadratic).
|
|
|
|
// It's important for the loop break condition (see below) to
|
|
|
|
// check this limit even between two read-only instructions.
|
|
|
|
if (DistToSrc >= MaxMemDepDistance ||
|
|
|
|
((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
|
|
|
|
(numAliased >= AliasedCheckLimit ||
|
|
|
|
SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
|
|
|
|
|
|
|
|
// We increment the counter only if the locations are aliased
|
|
|
|
// (instead of counting all alias checks). This gives a better
|
|
|
|
// balance between reduced runtime and accurate dependencies.
|
|
|
|
numAliased++;
|
|
|
|
|
|
|
|
DepDest->MemoryDependencies.push_back(BundleMember);
|
|
|
|
BundleMember->Dependencies++;
|
|
|
|
ScheduleData *DestBundle = DepDest->FirstInBundle;
|
|
|
|
if (!DestBundle->IsScheduled) {
|
|
|
|
BundleMember->incrementUnscheduledDeps(1);
|
|
|
|
}
|
|
|
|
if (!DestBundle->hasValidDependencies()) {
|
|
|
|
WorkList.push_back(DestBundle);
|
2014-08-01 09:20:42 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
DepDest = DepDest->NextLoadStore;
|
2015-01-22 08:20:51 +00:00
|
|
|
|
|
|
|
// Example, explaining the loop break condition: Let's assume our
|
|
|
|
// starting instruction is i0 and MaxMemDepDistance = 3.
|
|
|
|
//
|
|
|
|
// +--------v--v--v
|
|
|
|
// i0,i1,i2,i3,i4,i5,i6,i7,i8
|
|
|
|
// +--------^--^--^
|
|
|
|
//
|
|
|
|
// MaxMemDepDistance lets us stop alias-checking at i3 and we add
|
|
|
|
// dependencies from i0 to i3,i4,.. (even if they are not aliased).
|
|
|
|
// Previously we already added dependencies from i3 to i6,i7,i8
|
|
|
|
// (because of MaxMemDepDistance). As we added a dependency from
|
|
|
|
// i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
|
|
|
|
// and we can abort this loop at i6.
|
|
|
|
if (DistToSrc >= 2 * MaxMemDepDistance)
|
|
|
|
break;
|
|
|
|
DistToSrc++;
|
2014-08-01 09:20:42 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
BundleMember = BundleMember->NextInBundle;
|
|
|
|
}
|
|
|
|
if (InsertInReadyList && SD->isReady()) {
|
|
|
|
ReadyInsts.push_back(SD);
|
|
|
|
DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst << "\n");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
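// Marks every instruction in the scheduling region as not scheduled and
// clears the ready list, so the region can be rescheduled from scratch.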
void BoUpSLP::BlockScheduling::resetSchedule() {
|
|
|
|
assert(ScheduleStart &&
|
|
|
|
"tried to reset schedule on block which has not been scheduled");
|
|
|
|
for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
|
|
|
|
ScheduleData *SD = getScheduleData(I);
|
|
|
|
assert(isInSchedulingRegion(SD));
|
|
|
|
SD->IsScheduled = false;
|
|
|
|
SD->resetUnscheduledDeps();
|
|
|
|
}
|
|
|
|
ReadyInsts.clear();
|
|
|
|
}
|
|
|
|
|
2014-08-02 19:39:42 +00:00
|
|
|
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
|
2016-01-13 07:03:42 +00:00
|
|
|
|
2014-08-02 19:39:42 +00:00
|
|
|
if (!BS->ScheduleStart)
|
2014-08-01 09:20:42 +00:00
|
|
|
return;
|
2016-01-13 07:03:42 +00:00
|
|
|
|
2014-08-02 19:39:42 +00:00
|
|
|
DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
|
2014-08-01 09:20:42 +00:00
|
|
|
|
|
|
|
BS->resetSchedule();
|
|
|
|
|
|
|
|
// For the real scheduling we use a more sophisticated ready-list: it is
|
|
|
|
// sorted by the original instruction location. This lets the final schedule
|
|
|
|
// be as close as possible to the original instruction order.
|
|
|
|
struct ScheduleDataCompare {
|
|
|
|
bool operator()(ScheduleData *SD1, ScheduleData *SD2) {
|
|
|
|
return SD2->SchedulingPriority < SD1->SchedulingPriority;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
|
|
|
|
|
2015-08-08 18:27:36 +00:00
|
|
|
// Ensure that all dependency data is updated and fill the ready-list with
|
2014-08-01 09:20:42 +00:00
|
|
|
// initial instructions.
|
|
|
|
int Idx = 0;
|
|
|
|
int NumToSchedule = 0;
|
|
|
|
for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
|
|
|
|
I = I->getNextNode()) {
|
|
|
|
ScheduleData *SD = BS->getScheduleData(I);
|
|
|
|
assert(
|
|
|
|
SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) &&
|
|
|
|
"scheduler and vectorizer have different opinion on what is a bundle");
|
|
|
|
SD->FirstInBundle->SchedulingPriority = Idx++;
|
|
|
|
if (SD->isSchedulingEntity()) {
|
2015-01-14 11:24:47 +00:00
|
|
|
BS->calculateDependencies(SD, false, this);
|
2014-08-01 09:20:42 +00:00
|
|
|
NumToSchedule++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
BS->initialFillReadyList(ReadyInsts);
|
|
|
|
|
|
|
|
Instruction *LastScheduledInst = BS->ScheduleEnd;
|
|
|
|
|
|
|
|
// Do the "real" scheduling.
|
|
|
|
while (!ReadyInsts.empty()) {
|
|
|
|
ScheduleData *picked = *ReadyInsts.begin();
|
|
|
|
ReadyInsts.erase(ReadyInsts.begin());
|
|
|
|
|
|
|
|
// Move the scheduled instruction(s) to their dedicated places, if not
|
|
|
|
// there yet.
|
|
|
|
ScheduleData *BundleMember = picked;
|
|
|
|
while (BundleMember) {
|
|
|
|
Instruction *pickedInst = BundleMember->Inst;
|
|
|
|
if (LastScheduledInst->getNextNode() != pickedInst) {
|
2014-08-02 19:39:42 +00:00
|
|
|
BS->BB->getInstList().remove(pickedInst);
|
2015-10-19 22:06:09 +00:00
|
|
|
BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
|
|
|
|
pickedInst);
|
2014-08-01 09:20:42 +00:00
|
|
|
}
|
|
|
|
LastScheduledInst = pickedInst;
|
|
|
|
BundleMember = BundleMember->NextInBundle;
|
|
|
|
}
|
|
|
|
|
|
|
|
BS->schedule(picked, ReadyInsts);
|
|
|
|
NumToSchedule--;
|
|
|
|
}
|
|
|
|
assert(NumToSchedule == 0 && "could not schedule all instructions");
|
|
|
|
|
|
|
|
// Avoid duplicate scheduling of the block.
|
|
|
|
BS->ScheduleStart = nullptr;
|
|
|
|
}
|
|
|
|
|
2016-01-15 18:51:51 +00:00
|
|
|
unsigned BoUpSLP::getVectorElementSize(Value *V) {
|
|
|
|
// If V is a store, just return the width of the stored value without
|
|
|
|
// traversing the expression tree. This is the common case.
|
|
|
|
if (auto *Store = dyn_cast<StoreInst>(V))
|
2016-03-16 19:48:42 +00:00
|
|
|
return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
|
2016-01-15 18:51:51 +00:00
|
|
|
|
|
|
|
// If V is not a store, we can traverse the expression tree to find loads
|
|
|
|
// that feed it. The type of the loaded value may indicate a more suitable
|
|
|
|
// width than V's type. We want to base the vector element size on the width
|
|
|
|
// of memory operations where possible.
|
|
|
|
SmallVector<Instruction *, 16> Worklist;
|
|
|
|
SmallPtrSet<Instruction *, 16> Visited;
|
|
|
|
if (auto *I = dyn_cast<Instruction>(V))
|
|
|
|
Worklist.push_back(I);
|
|
|
|
|
|
|
|
// Traverse the expression tree in bottom-up order looking for loads. If we
|
|
|
|
// encounter an instruction we don't yet handle, we give up.
|
|
|
|
auto MaxWidth = 0u;
|
|
|
|
auto FoundUnknownInst = false;
|
|
|
|
while (!Worklist.empty() && !FoundUnknownInst) {
|
|
|
|
auto *I = Worklist.pop_back_val();
|
|
|
|
Visited.insert(I);
|
|
|
|
|
|
|
|
// We should only be looking at scalar instructions here. If the current
|
|
|
|
// instruction has a vector type, give up.
|
|
|
|
auto *Ty = I->getType();
|
|
|
|
if (isa<VectorType>(Ty))
|
|
|
|
FoundUnknownInst = true;
|
|
|
|
|
|
|
|
// If the current instruction is a load, update MaxWidth to reflect the
|
|
|
|
// width of the loaded value.
|
|
|
|
else if (isa<LoadInst>(I))
|
2016-03-16 19:48:42 +00:00
|
|
|
MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
|
2016-01-15 18:51:51 +00:00
|
|
|
|
|
|
|
// Otherwise, we need to visit the operands of the instruction. We only
|
|
|
|
// handle the interesting cases from buildTree here. If an operand is an
|
|
|
|
// instruction we haven't yet visited, we add it to the worklist.
|
|
|
|
else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
|
|
|
|
isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
|
|
|
|
for (Use &U : I->operands())
|
|
|
|
if (auto *J = dyn_cast<Instruction>(U.get()))
|
|
|
|
if (!Visited.count(J))
|
|
|
|
Worklist.push_back(J);
|
|
|
|
}
|
|
|
|
|
|
|
|
// If we don't yet handle the instruction, give up.
|
|
|
|
else
|
|
|
|
FoundUnknownInst = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If we didn't encounter a memory access in the expression tree, or if we
|
|
|
|
// gave up for some reason, just return the width of V.
|
|
|
|
if (!MaxWidth || FoundUnknownInst)
|
2016-03-16 19:48:42 +00:00
|
|
|
return DL->getTypeSizeInBits(V->getType());
|
2016-01-15 18:51:51 +00:00
|
|
|
|
|
|
|
// Otherwise, return the maximum width we found.
|
|
|
|
return MaxWidth;
|
|
|
|
}
|
|
|
|
|
2016-02-18 14:14:40 +00:00
|
|
|
// Determine if a value V in a vectorizable expression Expr can be demoted to a
|
|
|
|
// smaller type with a truncation. We collect the values that will be demoted
|
|
|
|
// in ToDemote and additional roots that require further investigation in Roots.
|
|
|
|
static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
|
|
|
|
SmallVectorImpl<Value *> &ToDemote,
|
|
|
|
SmallVectorImpl<Value *> &Roots) {
|
|
|
|
|
|
|
|
// We can always demote constants.
|
|
|
|
if (isa<Constant>(V)) {
|
|
|
|
ToDemote.push_back(V);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the value is not an instruction in the expression with only one use, it
|
|
|
|
// cannot be demoted.
|
|
|
|
auto *I = dyn_cast<Instruction>(V);
|
|
|
|
if (!I || !I->hasOneUse() || !Expr.count(I))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
switch (I->getOpcode()) {
|
|
|
|
|
|
|
|
// We can always demote truncations and extensions. Since truncations can
|
|
|
|
// seed additional demotion, we save the operand of the truncation as a new root.
|
|
|
|
case Instruction::Trunc:
|
|
|
|
Roots.push_back(I->getOperand(0));
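// FALLTHROUGH -- the truncation itself is recorded in ToDemote below.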
|
|
|
|
case Instruction::ZExt:
|
|
|
|
case Instruction::SExt:
|
|
|
|
break;
|
|
|
|
|
|
|
|
// We can demote certain binary operations if we can demote both of their
|
|
|
|
// operands.
|
|
|
|
case Instruction::Add:
|
|
|
|
case Instruction::Sub:
|
|
|
|
case Instruction::Mul:
|
|
|
|
case Instruction::And:
|
|
|
|
case Instruction::Or:
|
|
|
|
case Instruction::Xor:
|
|
|
|
if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
|
|
|
|
!collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
|
|
|
|
return false;
|
|
|
|
break;
|
|
|
|
|
|
|
|
// We can demote selects if we can demote their true and false values.
|
|
|
|
case Instruction::Select: {
|
|
|
|
SelectInst *SI = cast<SelectInst>(I);
|
|
|
|
if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
|
|
|
|
!collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
|
|
|
|
return false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// We can demote phis if we can demote all their incoming operands. Note that
|
|
|
|
// we don't need to worry about cycles since we ensure single use above.
|
|
|
|
case Instruction::PHI: {
|
|
|
|
PHINode *PN = cast<PHINode>(I);
|
|
|
|
for (Value *IncValue : PN->incoming_values())
|
|
|
|
if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
|
|
|
|
return false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Otherwise, conservatively give up.
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Record the value that we can demote.
|
|
|
|
ToDemote.push_back(V);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
void BoUpSLP::computeMinimumValueSizes() {
|
|
|
|
// If there are no external uses, the expression tree must be rooted by a
|
|
|
|
// store. We can't demote in-memory values, so there is nothing to do here.
|
|
|
|
if (ExternalUses.empty())
|
|
|
|
return;
|
|
|
|
|
|
|
|
// We only attempt to truncate integer expressions.
|
|
|
|
auto &TreeRoot = VectorizableTree[0].Scalars;
|
|
|
|
auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
|
|
|
|
if (!TreeRootIT)
|
|
|
|
return;
|
|
|
|
|
|
|
|
// If the expression is not rooted by a store, these roots should have
|
|
|
|
// external uses. We will rely on InstCombine to rewrite the expression in
|
|
|
|
// the narrower type. However, InstCombine only rewrites single-use values.
|
|
|
|
// This means that if a tree entry other than a root is used externally, it
|
|
|
|
// must have multiple uses and InstCombine will not rewrite it. The code
|
|
|
|
// below ensures that only the roots are used externally.
|
|
|
|
SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
|
|
|
|
for (auto &EU : ExternalUses)
|
|
|
|
if (!Expr.erase(EU.Scalar))
|
|
|
|
return;
|
|
|
|
if (!Expr.empty())
|
|
|
|
return;
|
|
|
|
|
|
|
|
// Collect the scalar values of the vectorizable expression. We will use this
|
|
|
|
// context to determine which values can be demoted. If we see a truncation,
|
|
|
|
// we mark it as seeding another demotion.
|
|
|
|
for (auto &Entry : VectorizableTree)
|
|
|
|
Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end());
|
|
|
|
|
|
|
|
// Ensure the roots of the vectorizable tree don't form a cycle. They must
|
|
|
|
// have a single external user that is not in the vectorizable tree.
|
|
|
|
for (auto *Root : TreeRoot)
|
|
|
|
if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
|
|
|
|
return;
|
|
|
|
|
|
|
|
// Conservatively determine if we can actually truncate the roots of the
|
|
|
|
// expression. Collect the values that can be demoted in ToDemote and
|
|
|
|
// additional roots that require investigating in Roots.
|
|
|
|
SmallVector<Value *, 32> ToDemote;
|
|
|
|
SmallVector<Value *, 4> Roots;
|
|
|
|
for (auto *Root : TreeRoot)
|
|
|
|
if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
|
|
|
|
return;
|
|
|
|
|
|
|
|
// The maximum bit width required to represent all the values that can be
|
|
|
|
// demoted without loss of precision. It would be safe to truncate the roots
|
|
|
|
// of the expression to this width.
|
|
|
|
auto MaxBitWidth = 8u;
|
|
|
|
|
|
|
|
// We first check if all the bits of the roots are demanded. If they're not,
|
|
|
|
// we can truncate the roots to this narrower type.
|
|
|
|
for (auto *Root : TreeRoot) {
|
|
|
|
auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
|
|
|
|
MaxBitWidth = std::max<unsigned>(
|
|
|
|
Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
|
|
|
|
}
|
|
|
|
|
|
|
|
// If all the bits of the roots are demanded, we can try a little harder to
|
|
|
|
// compute a narrower type. This can happen, for example, if the roots are
|
|
|
|
// getelementptr indices. InstCombine promotes these indices to the pointer
|
|
|
|
// width. Thus, all their bits are technically demanded even though the
|
|
|
|
// address computation might be vectorized in a smaller type.
|
|
|
|
//
|
|
|
|
// We start by looking at each entry that can be demoted. We compute the
|
|
|
|
// maximum bit width required to store the scalar by using ValueTracking to
|
|
|
|
// compute the number of high-order bits we can truncate.
|
2016-03-16 19:48:42 +00:00
|
|
|
if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType())) {
|
2016-02-18 14:14:40 +00:00
|
|
|
MaxBitWidth = 8u;
|
|
|
|
for (auto *Scalar : ToDemote) {
|
2016-03-16 19:48:42 +00:00
|
|
|
auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, 0, DT);
|
|
|
|
auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
|
2016-02-18 14:14:40 +00:00
|
|
|
MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Round MaxBitWidth up to the next power-of-two.
|
|
|
|
if (!isPowerOf2_64(MaxBitWidth))
|
|
|
|
MaxBitWidth = NextPowerOf2(MaxBitWidth);
|
|
|
|
|
|
|
|
// If the maximum bit width we compute is less than the width of the roots'
|
|
|
|
// type, we can proceed with the narrowing. Otherwise, do nothing.
|
|
|
|
if (MaxBitWidth >= TreeRootIT->getBitWidth())
|
|
|
|
return;
|
|
|
|
|
|
|
|
// If we can truncate the root, we must collect additional values that might
|
|
|
|
// be demoted as a result. That is, those seeded by truncations we will
|
|
|
|
// modify.
|
|
|
|
while (!Roots.empty())
|
|
|
|
collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
|
|
|
|
|
|
|
|
// Finally, map the values we can demote to the maximum bit width we computed.
|
|
|
|
for (auto *Scalar : ToDemote)
|
|
|
|
MinBWs[Scalar] = MaxBitWidth;
|
|
|
|
}
|
|
|
|
|
2013-04-09 19:44:35 +00:00
|
|
|
/// The SLPVectorizer Pass.
|
2013-04-15 22:00:26 +00:00
|
|
|
struct SLPVectorizer : public FunctionPass {
|
2013-06-22 21:34:10 +00:00
|
|
|
typedef SmallVector<StoreInst *, 8> StoreList;
|
|
|
|
typedef MapVector<Value *, StoreList> StoreListMap;
|
2016-01-15 18:51:51 +00:00
|
|
|
typedef SmallVector<WeakVH, 8> WeakVHList;
|
|
|
|
typedef MapVector<Value *, WeakVHList> WeakVHListMap;
|
2013-04-09 19:44:35 +00:00
|
|
|
|
|
|
|
/// Pass identification, replacement for typeid
|
|
|
|
static char ID;
|
|
|
|
|
2013-04-15 22:00:26 +00:00
|
|
|
explicit SLPVectorizer() : FunctionPass(ID) {
|
2013-04-09 19:44:35 +00:00
|
|
|
initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
|
|
|
|
}
|
|
|
|
|
|
|
|
ScalarEvolution *SE;
|
|
|
|
TargetTransformInfo *TTI;
|
2014-05-03 09:59:54 +00:00
|
|
|
TargetLibraryInfo *TLI;
|
2013-04-09 19:44:35 +00:00
|
|
|
AliasAnalysis *AA;
|
2013-04-15 22:00:26 +00:00
|
|
|
LoopInfo *LI;
|
2013-06-23 21:57:27 +00:00
|
|
|
DominatorTree *DT;
|
2015-01-04 12:03:27 +00:00
|
|
|
AssumptionCache *AC;
|
2016-02-18 14:14:40 +00:00
|
|
|
DemandedBits *DB;
|
2016-03-16 19:48:42 +00:00
|
|
|
const DataLayout *DL;
|
|
|
|
|
|
|
|
bool doInitialization(Module &M) override {
|
|
|
|
DL = &M.getDataLayout();
|
|
|
|
return false;
|
|
|
|
}
|
2013-04-15 22:00:26 +00:00
|
|
|
|
2014-03-05 09:10:37 +00:00
|
|
|
bool runOnFunction(Function &F) override {
|
2014-02-06 00:07:05 +00:00
|
|
|
if (skipOptnoneFunction(F))
|
|
|
|
return false;
|
|
|
|
|
2015-08-17 02:08:17 +00:00
|
|
|
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
|
2015-02-01 12:01:35 +00:00
|
|
|
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
|
2015-01-15 10:41:28 +00:00
|
|
|
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
|
|
|
|
TLI = TLIP ? &TLIP->getTLI() : nullptr;
|
2015-09-09 17:55:00 +00:00
|
|
|
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
|
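A minimal sketch of the consumption and preservation pattern described in the commit message above, for a hypothetical legacy FunctionPass (the pass name and body are illustrative and not taken from this file; the wrapper-pass names are the ones the new AA stack actually uses):

#include "llvm/Analysis/AliasAnalysis.h"   // AAResults, AAResultsWrapperPass
#include "llvm/Analysis/GlobalsModRef.h"   // GlobalsAAWrapperPass
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"

namespace {
// Illustrative client pass: queries the aggregated AA results and preserves
// the non-immutable AA passes under the new scheme.
struct ExampleAAClientPass : public llvm::FunctionPass {
  static char ID;
  ExampleAAClientPass() : llvm::FunctionPass(ID) {}

  bool runOnFunction(llvm::Function &F) override {
    // One aggregation object walks each query across every available AA.
    llvm::AAResults &AA =
        getAnalysis<llvm::AAResultsWrapperPass>().getAAResults();
    (void)AA; // Issue AA.alias(...) / AA.getModRefInfo(...) queries here.
    return false;
  }

  void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {
    AU.addRequired<llvm::AAResultsWrapperPass>();
    // Preserve the non-immutable AA passes directly. BasicAA is CFG-only,
    // so setPreservesCFG() covers it; GlobalsAA must be named explicitly.
    AU.addPreserved<llvm::GlobalsAAWrapperPass>();
    AU.setPreservesCFG();
  }
};
char ExampleAAClientPass::ID = 0;
} // end anonymous namespace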
2015-01-17 14:16:18 +00:00
|
|
|
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
|
2014-01-13 13:07:17 +00:00
|
|
|
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
2015-01-04 12:03:27 +00:00
|
|
|
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
|
2016-02-18 14:14:40 +00:00
|
|
|
DB = &getAnalysis<DemandedBits>();
|
2013-04-15 22:00:26 +00:00
|
|
|
|
2016-01-15 18:51:51 +00:00
|
|
|
Stores.clear();
|
|
|
|
GEPs.clear();
|
2013-04-15 22:00:26 +00:00
|
|
|
bool Changed = false;
|
|
|
|
|
2013-09-18 12:43:35 +00:00
|
|
|
// If the target claims to have no vector registers, don't attempt
|
|
|
|
// vectorization.
|
|
|
|
if (!TTI->getNumberOfRegisters(true))
|
|
|
|
return false;
|
|
|
|
|
2015-07-08 23:40:55 +00:00
|
|
|
// Use the vector register size specified by the target unless overridden
|
|
|
|
// by a command-line option.
|
|
|
|
// TODO: It would be better to limit the vectorization factor based on
|
|
|
|
// data type rather than just register size. For example, x86 AVX has
|
|
|
|
// 256-bit registers, but it does not support integer operations
|
|
|
|
// at that width (that requires AVX2).
|
|
|
|
if (MaxVectorRegSizeOption.getNumOccurrences())
|
|
|
|
MaxVecRegSize = MaxVectorRegSizeOption;
|
|
|
|
else
|
|
|
|
MaxVecRegSize = TTI->getRegisterBitWidth(true);
|
|
|
|
|
2016-03-10 02:49:47 +00:00
|
|
|
MinVecRegSize = MinVectorRegSizeOption;
|
|
|
|
|
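For reference, MaxVectorRegSizeOption and MinVectorRegSizeOption used above are cl::opt objects declared near the top of this file. A rough sketch of what such declarations typically look like (the flag strings and descriptions below are illustrative assumptions, not copied from this file):

#include "llvm/Support/CommandLine.h"

// Sketch only: flag names and descriptions are assumed for illustration.
static llvm::cl::opt<unsigned> MaxVectorRegSizeOption(
    "slp-max-reg-size", llvm::cl::init(128), llvm::cl::Hidden,
    llvm::cl::desc("Override the maximum vector register size in bits"));

static llvm::cl::opt<unsigned> MinVectorRegSizeOption(
    "slp-min-reg-size", llvm::cl::init(128), llvm::cl::Hidden,
    llvm::cl::desc("Lower limit of the vector register size, in bits"));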
2013-07-29 05:13:00 +00:00
|
|
|
// Don't vectorize when the attribute NoImplicitFloat is used.
|
2013-08-21 18:54:50 +00:00
|
|
|
if (F.hasFnAttribute(Attribute::NoImplicitFloat))
|
2013-07-29 05:13:00 +00:00
|
|
|
return false;
|
|
|
|
|
2013-06-20 17:54:36 +00:00
|
|
|
DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
|
2013-05-10 22:56:18 +00:00
|
|
|
|
2014-02-16 10:43:25 +00:00
|
|
|
// Use the bottom up slp vectorizer to construct chains that start with
|
2014-05-20 17:11:11 +00:00
|
|
|
// store instructions.
|
2016-03-16 19:48:42 +00:00
|
|
|
BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2015-01-14 11:24:47 +00:00
|
|
|
// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
|
|
|
|
// delete instructions.
|
|
|
|
|
2013-06-26 23:44:45 +00:00
|
|
|
// Scan the blocks in the function in post order.
|
2015-04-15 17:41:42 +00:00
|
|
|
for (auto BB : post_order(&F.getEntryBlock())) {
|
2016-01-15 18:51:51 +00:00
|
|
|
collectSeedInstructions(BB);
|
|
|
|
|
2013-04-15 22:00:26 +00:00
|
|
|
// Vectorize trees that end at stores.
|
2016-03-21 19:47:44 +00:00
|
|
|
if (!Stores.empty()) {
|
|
|
|
DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
|
|
|
|
<< " underlying objects.\n");
|
2013-06-22 21:34:10 +00:00
|
|
|
Changed |= vectorizeStoreChains(R);
|
2013-04-15 22:00:26 +00:00
|
|
|
}
|
2013-07-14 06:15:46 +00:00
|
|
|
|
|
|
|
// Vectorize trees that end at reductions.
|
|
|
|
Changed |= vectorizeChainsInBlock(BB, R);
|
2016-01-15 18:51:51 +00:00
|
|
|
|
|
|
|
// Vectorize the index computations of getelementptr instructions. This
|
|
|
|
// is primarily intended to catch gather-like idioms ending at
|
|
|
|
// non-consecutive loads.
|
2016-03-21 19:47:44 +00:00
|
|
|
if (!GEPs.empty()) {
|
|
|
|
DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
|
|
|
|
<< " underlying objects.\n");
|
2016-01-15 18:51:51 +00:00
|
|
|
Changed |= vectorizeGEPIndices(BB, R);
|
|
|
|
}
|
2013-04-15 22:00:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (Changed) {
|
2013-06-23 06:15:46 +00:00
|
|
|
R.optimizeGatherSequence();
|
2013-06-20 17:54:36 +00:00
|
|
|
DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
|
2013-04-15 22:00:26 +00:00
|
|
|
DEBUG(verifyFunction(F));
|
|
|
|
}
|
|
|
|
return Changed;
|
|
|
|
}
|
|
|
|
|
2014-03-05 09:10:37 +00:00
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
2013-04-15 22:00:26 +00:00
|
|
|
FunctionPass::getAnalysisUsage(AU);
|
2015-01-04 12:03:27 +00:00
|
|
|
AU.addRequired<AssumptionCacheTracker>();
|
[PM] Port ScalarEvolution to the new pass manager.
This change makes ScalarEvolution a stand-alone object and just produces
one from a pass as needed. Making this work well requires making the
object movable, using references instead of overwritten pointers in
a number of places, and other refactorings.
I've also wired it up to the new pass manager and added a RUN line to
a test to exercise it under the new pass manager. This includes basic
printing support much like with other analyses.
But there is a big and somewhat scary change here. Prior to this patch
ScalarEvolution was never *actually* invalidated!!! Re-running the pass
just re-wired up the various other analyses and didn't remove any of the
existing entries in the SCEV caches or clear out anything at all. This
might seem OK, as everything in SCEV that can do so uses ValueHandles to
track updates to the values that serve as SCEV keys. However, this still means
that as we ran SCEV over each function in the module, we kept
accumulating more and more SCEVs into the cache. At the end, we would
have a SCEV cache with every value that we ever needed a SCEV for in the
entire module!!! Yowzers. The releaseMemory routine would dump all of
this, but that isn't really called during normal runs of the pipeline as
far as I can see.
To make matters worse, there *is* actually a key that we don't update
with value handles -- there is a map keyed off of Loop*s. Because
LoopInfo *does* release its memory from run to run, it is entirely
possible to run SCEV over one function, then over another function, and
then lookup a Loop* from the second function but find an entry inserted
for the first function! Ouch.
To make matters still worse, there are plenty of updates that *don't*
trip a value handle. It seems incredibly unlikely that today GVN or
another pass that invalidates SCEV can update values in *just* such
a way that a subsequent run of SCEV will incorrectly find lookups in
a cache, but it is theoretically possible and would be a nightmare to
debug.
With this refactoring, I've fixed all this by actually destroying and
recreating the ScalarEvolution object from run to run. Technically, this
could increase the amount of malloc traffic we see, but then again it is
also technically correct. ;] I don't actually think we're suffering from
tons of malloc traffic from SCEV because if we were, the fact that we
never clear the memory would seem more likely to have come up as an
actual problem before now. So, I've made the simple fix here. If in fact
there are serious issues with too much allocation and deallocation,
I can work on a clever fix that preserves the allocations (while
clearing the data) between each run, but I'd prefer to do that kind of
optimization with a test case / benchmark that shows why we need such
cleverness (and that can test that we actually make it faster). It's
possible that this will make some things faster by making the SCEV
caches have higher locality (due to being significantly smaller) so
until there is a clear benchmark, I think the simple change is best.
Differential Revision: http://reviews.llvm.org/D12063
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@245193 91177308-0d34-0410-b5e6-96231b3b80d8
2015-08-17 02:08:17 +00:00
|
|
|
AU.addRequired<ScalarEvolutionWrapperPass>();
|
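Following the same shape as the AA client sketch earlier, a client of the reworked ScalarEvolution requires the single wrapper pass and pulls the per-run analysis object out of it. An illustrative fragment (the enclosing pass class and the SomePointer value are assumed):

// Inside the client's getAnalysisUsage():
//   AU.addRequired<llvm::ScalarEvolutionWrapperPass>();
// Inside runOnFunction(Function &F):
llvm::ScalarEvolution &SE =
    getAnalysis<llvm::ScalarEvolutionWrapperPass>().getSE();
// The object is rebuilt for each run, so the stale cross-function cache
// entries described above (including the Loop*-keyed map) cannot survive.
const llvm::SCEV *PtrSCEV = SE.getSCEV(SomePointer); // SomePointer: assumed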
2015-09-09 17:55:00 +00:00
|
|
|
AU.addRequired<AAResultsWrapperPass>();
|
[PM] Change the core design of the TTI analysis to use a polymorphic
type erased interface and a single analysis pass rather than an
extremely complex analysis group.
The end result is that the TTI analysis can contain a type erased
implementation that supports the polymorphic TTI interface. We can build
one from a target-specific implementation or from a dummy one in the IR.
I've also factored all of the code into "mix-in"-able base classes,
including CRTP base classes to facilitate calling back up to the most
specialized form when delegating horizontally across the surface. These
aren't as clean as I would like and I'm planning to work on cleaning
some of this up, but I wanted to start by putting it into the right form.
There are a number of reasons for this change, and this particular
design. The first and foremost reason is that an analysis group is
complete overkill, and the chaining delegation strategy was so opaque,
confusing, and high overhead that TTI was suffering greatly for it.
Several of the TTI functions had failed to be implemented in all places
because of the chaining-based delegation making there be no checking of
this. A few other functions were implemented with incorrect delegation.
The message to me was very clear working on this -- the delegation and
analysis group structure was too confusing to be useful here.
The other reason, of course, is that this is a *much* more natural fit for
the new pass manager. This will lay the ground work for a type-erased
per-function info object that can look up the correct subtarget and even
cache it.
Yet another benefit is that this will significantly simplify the
interaction of the pass managers and the TargetMachine. See the future
work below.
The downside of this change is that it is very, very verbose. I'm going
to work to improve that, but it is somewhat an implementation necessity
in C++ to do type erasure. =/ I discussed this design really extensively
with Eric and Hal prior to going down this path, and afterward showed
them the result. No one was really thrilled with it, but there doesn't
seem to be a substantially better alternative. Using a base class and
virtual method dispatch would make the code much shorter, but as
discussed in the update to the programmer's manual and elsewhere,
a polymorphic interface feels like the more principled approach even if
this is perhaps the least compelling example of it. ;]
Ultimately, there is still a lot more to be done here, but this was the
huge chunk that I couldn't really split things out of because this was
the interface change to TTI. I've tried to minimize all the other parts
of this. The follow up work should include at least:
1) Improving the TargetMachine interface by having it directly return
a TTI object. Because we have a non-pass object with value semantics
and an internal type erasure mechanism, we can narrow the interface
of the TargetMachine to *just* do what we need: build and return
a TTI object that we can then insert into the pass pipeline.
2) Make the TTI object be fully specialized for a particular function.
This will include splitting off a minimal form of it which is
sufficient for the inliner and the old pass manager.
3) Add a new pass manager analysis which produces TTI objects from the
target machine for each function. This may actually be done as part
of #2 in order to use the new analysis to implement #2.
4) Work on narrowing the API between TTI and the targets so that it is
easier to understand and less verbose to type erase.
5) Work on narrowing the API between TTI and its clients so that it is
easier to understand and less verbose to forward.
6) Try to improve the CRTP-based delegation. I feel like this code is
just a bit messy and exacerbating the complexity of implementing
the TTI in each target.
Many thanks to Eric and Hal for their help here. I ended up blocked on
this somewhat more abruptly than I expected, and so I appreciate getting
it sorted out very quickly.
Differential Revision: http://reviews.llvm.org/D7293
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@227669 91177308-0d34-0410-b5e6-96231b3b80d8
2015-01-31 03:43:40 +00:00
|
|
|
AU.addRequired<TargetTransformInfoWrapperPass>();
|
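Likewise for the TTI redesign described above: clients now ask one wrapper pass for a function-specific TargetTransformInfo instead of going through an analysis group. An illustrative fragment (same assumed pass skeleton as the AA sketch earlier):

// Inside the client's getAnalysisUsage():
//   AU.addRequired<llvm::TargetTransformInfoWrapperPass>();
// Inside runOnFunction(Function &F):
llvm::TargetTransformInfo &TTI =
    getAnalysis<llvm::TargetTransformInfoWrapperPass>().getTTI(F);
if (!TTI.getNumberOfRegisters(/*Vector=*/true))
  return false; // Same bail-out the SLP pass itself performs above.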
2015-01-17 14:16:18 +00:00
|
|
|
AU.addRequired<LoopInfoWrapperPass>();
|
2014-01-13 13:07:17 +00:00
|
|
|
AU.addRequired<DominatorTreeWrapperPass>();
|
2016-02-18 14:14:40 +00:00
|
|
|
AU.addRequired<DemandedBits>();
|
2015-01-17 14:16:18 +00:00
|
|
|
AU.addPreserved<LoopInfoWrapperPass>();
|
2014-01-13 13:07:17 +00:00
|
|
|
AU.addPreserved<DominatorTreeWrapperPass>();
|
2015-12-11 17:46:01 +00:00
|
|
|
AU.addPreserved<AAResultsWrapperPass>();
|
|
|
|
AU.addPreserved<GlobalsAAWrapperPass>();
|
2013-06-29 05:38:15 +00:00
|
|
|
AU.setPreservesCFG();
|
2013-04-15 22:00:26 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
2016-01-15 18:51:51 +00:00
|
|
|
/// \brief Collect store and getelementptr instructions and organize them
|
|
|
|
/// according to the underlying object of their pointer operands. We sort the
|
|
|
|
/// instructions by their underlying objects to reduce the cost of
|
|
|
|
/// consecutive access queries.
|
|
|
|
///
|
|
|
|
/// TODO: We can further reduce this cost if we flush the chain creation
|
|
|
|
/// every time we run into a memory barrier.
|
|
|
|
void collectSeedInstructions(BasicBlock *BB);
|
2013-04-09 19:44:35 +00:00
|
|
|
|
2013-04-15 22:00:26 +00:00
|
|
|
/// \brief Try to vectorize a chain that starts at two arithmetic instrs.
|
2014-08-01 08:05:55 +00:00
|
|
|
bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
|
2013-04-09 19:44:35 +00:00
|
|
|
|
2013-07-12 00:04:18 +00:00
|
|
|
/// \brief Try to vectorize a list of operands.
|
2014-05-04 17:10:15 +00:00
|
|
|
/// \param BuildVector A list of users to ignore for the purpose of
|
|
|
|
/// scheduling and that don't need extracting.
|
2013-06-20 17:41:45 +00:00
|
|
|
/// \returns true if a value was vectorized.
|
2014-05-04 17:10:15 +00:00
|
|
|
bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
|
2014-08-01 08:05:55 +00:00
|
|
|
ArrayRef<Value *> BuildVector = None,
|
|
|
|
bool allowReorder = false);
|
2013-04-20 07:22:58 +00:00
|
|
|
|
2013-04-15 22:00:26 +00:00
|
|
|
/// \brief Try to vectorize a chain that may start at the operands of \p V.
|
2013-07-07 06:57:07 +00:00
|
|
|
bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
|
2013-04-09 19:44:35 +00:00
|
|
|
|
2016-01-15 18:51:51 +00:00
|
|
|
/// \brief Vectorize the store instructions collected in Stores.
|
2013-07-07 06:57:07 +00:00
|
|
|
bool vectorizeStoreChains(BoUpSLP &R);
|
2013-06-20 17:41:45 +00:00
|
|
|
|
2016-01-15 18:51:51 +00:00
|
|
|
/// \brief Vectorize the index computations of the getelementptr instructions
|
|
|
|
/// collected in GEPs.
|
|
|
|
bool vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R);
|
|
|
|
|
2013-06-18 15:58:05 +00:00
|
|
|
/// \brief Scan the basic block and look for patterns that are likely to start
|
|
|
|
/// a vectorization chain.
|
2013-07-07 06:57:07 +00:00
|
|
|
bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
|
|
|
|
|
|
|
|
bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
|
2015-07-08 23:40:55 +00:00
|
|
|
BoUpSLP &R, unsigned VecRegSize);
|
2013-04-15 22:00:26 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
|
|
|
|
BoUpSLP &R);
|
2016-01-15 18:51:51 +00:00
|
|
|
|
|
|
|
/// The store instructions in a basic block organized by base pointer.
|
|
|
|
StoreListMap Stores;
|
|
|
|
|
|
|
|
/// The getelementptr instructions in a basic block organized by base pointer.
|
|
|
|
WeakVHListMap GEPs;
|
|
|
|
|
2015-07-08 23:40:55 +00:00
|
|
|
unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
|
2016-03-10 02:49:47 +00:00
|
|
|
unsigned MinVecRegSize; // Set by cl::opt (default: 128).
|
2013-04-15 22:00:26 +00:00
|
|
|
};
|
|
|
|
|
2014-01-24 17:20:08 +00:00
|
|
|
/// \brief Check that the values in the slice of the VL array still exist in
|
2013-11-19 22:20:20 +00:00
|
|
|
/// the WeakVH array.
|
|
|
|
/// Vectorization of part of the VL array may cause later values in the VL array
|
|
|
|
/// to become invalid. We track when this has happened in the WeakVH array.
|
2015-03-02 15:24:36 +00:00
|
|
|
static bool hasValueBeenRAUWed(ArrayRef<Value *> VL, ArrayRef<WeakVH> VH,
|
|
|
|
unsigned SliceBegin, unsigned SliceSize) {
|
|
|
|
VL = VL.slice(SliceBegin, SliceSize);
|
|
|
|
VH = VH.slice(SliceBegin, SliceSize);
|
|
|
|
return !std::equal(VL.begin(), VL.end(), VH.begin());
|
2013-11-19 22:20:20 +00:00
|
|
|
}
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
|
2015-07-08 23:40:55 +00:00
|
|
|
int CostThreshold, BoUpSLP &R,
|
|
|
|
unsigned VecRegSize) {
|
2013-07-07 06:57:07 +00:00
|
|
|
unsigned ChainLen = Chain.size();
|
|
|
|
DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
|
|
|
|
<< "\n");
|
2016-01-15 18:51:51 +00:00
|
|
|
unsigned Sz = R.getVectorElementSize(Chain[0]);
|
2015-07-08 23:40:55 +00:00
|
|
|
unsigned VF = VecRegSize / Sz;
|
2013-07-07 06:57:07 +00:00
|
|
|
|
|
|
|
if (!isPowerOf2_32(Sz) || VF < 2)
|
|
|
|
return false;
|
|
|
|
|
2014-03-28 17:21:27 +00:00
|
|
|
// Keep track of values that were deleted by vectorizing in the loop below.
|
2013-11-19 22:20:20 +00:00
|
|
|
SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
bool Changed = false;
|
|
|
|
// Look for profitable vectorizable trees at all offsets, starting at zero.
|
|
|
|
for (unsigned i = 0, e = ChainLen; i < e; ++i) {
|
|
|
|
if (i + VF > e)
|
|
|
|
break;
|
2013-11-19 22:20:20 +00:00
|
|
|
|
|
|
|
// Check that a previous iteration of this loop did not delete the Value.
|
|
|
|
if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
|
|
|
|
continue;
|
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
|
|
|
|
<< "\n");
|
|
|
|
ArrayRef<Value *> Operands = Chain.slice(i, VF);
|
|
|
|
|
|
|
|
R.buildTree(Operands);
|
2016-02-18 14:14:40 +00:00
|
|
|
R.computeMinimumValueSizes();
|
2013-07-07 06:57:07 +00:00
|
|
|
|
|
|
|
int Cost = R.getTreeCost();
|
|
|
|
|
|
|
|
DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
|
|
|
|
if (Cost < CostThreshold) {
|
|
|
|
DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
|
|
|
|
R.vectorizeTree();
|
|
|
|
|
|
|
|
// Move to the next bundle.
|
|
|
|
i += VF - 1;
|
|
|
|
Changed = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-11-19 22:20:18 +00:00
|
|
|
return Changed;
|
2013-07-07 06:57:07 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
|
2013-07-16 15:25:17 +00:00
|
|
|
int costThreshold, BoUpSLP &R) {
|
2015-03-10 02:37:25 +00:00
|
|
|
SetVector<StoreInst *> Heads, Tails;
|
|
|
|
SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
|
2013-07-07 06:57:07 +00:00
|
|
|
|
|
|
|
// We may run into multiple chains that merge into a single chain. We mark the
|
|
|
|
// stores that we vectorized so that we don't visit the same store twice.
|
|
|
|
BoUpSLP::ValueSet VectorizedStores;
|
|
|
|
bool Changed = false;
|
|
|
|
|
|
|
|
// Do a quadratic search on all of the given stores and find
|
2013-07-14 06:15:46 +00:00
|
|
|
// all of the pairs of stores that follow each other.
|
2015-07-30 17:40:39 +00:00
|
|
|
SmallVector<unsigned, 16> IndexQueue;
|
2013-07-16 15:25:17 +00:00
|
|
|
for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
|
2015-07-30 17:40:39 +00:00
|
|
|
IndexQueue.clear();
|
|
|
|
// If a store has multiple consecutive store candidates, search Stores
|
|
|
|
// array according to the sequence: from i+1 to e, then from i-1 to 0.
|
|
|
|
// This is because pairing with the immediately succeeding or preceding
|
|
|
|
// candidate usually creates the best chance to find an SLP vectorization opportunity.
|
|
|
|
unsigned j = 0;
|
|
|
|
for (j = i + 1; j < e; ++j)
|
|
|
|
IndexQueue.push_back(j);
|
|
|
|
for (j = i; j > 0; --j)
|
|
|
|
IndexQueue.push_back(j - 1);
|
|
|
|
|
|
|
|
for (auto &k : IndexQueue) {
|
2016-03-16 19:48:42 +00:00
|
|
|
if (isConsecutiveAccess(Stores[i], Stores[k], *DL, *SE)) {
|
2015-07-30 17:40:39 +00:00
|
|
|
Tails.insert(Stores[k]);
|
2013-07-07 06:57:07 +00:00
|
|
|
Heads.insert(Stores[i]);
|
2015-07-30 17:40:39 +00:00
|
|
|
ConsecutiveChain[Stores[i]] = Stores[k];
|
|
|
|
break;
|
2013-07-07 06:57:07 +00:00
|
|
|
}
|
|
|
|
}
|
2013-07-16 15:25:17 +00:00
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
|
|
|
|
// For stores that start but don't end a link in the chain:
|
2015-03-10 02:37:25 +00:00
|
|
|
for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
|
2013-07-07 06:57:07 +00:00
|
|
|
it != e; ++it) {
|
|
|
|
if (Tails.count(*it))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// We found a store instr that starts a chain. Now follow the chain and try
|
|
|
|
// to vectorize it.
|
|
|
|
BoUpSLP::ValueList Operands;
|
2015-03-10 02:37:25 +00:00
|
|
|
StoreInst *I = *it;
|
2013-07-07 06:57:07 +00:00
|
|
|
// Collect the chain into a list.
|
|
|
|
while (Tails.count(I) || Heads.count(I)) {
|
|
|
|
if (VectorizedStores.count(I))
|
|
|
|
break;
|
|
|
|
Operands.push_back(I);
|
|
|
|
// Move to the next value in the chain.
|
|
|
|
I = ConsecutiveChain[I];
|
|
|
|
}
|
|
|
|
|
2015-07-08 23:40:55 +00:00
|
|
|
// FIXME: Is division-by-2 the correct step? Should we assert that the
|
|
|
|
// register size is a power-of-2?
|
|
|
|
for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
|
|
|
|
if (vectorizeStoreChain(Operands, costThreshold, R, Size)) {
|
|
|
|
// Mark the vectorized stores so that we don't vectorize them again.
|
|
|
|
VectorizedStores.insert(Operands.begin(), Operands.end());
|
|
|
|
Changed = true;
|
|
|
|
break;
|
|
|
|
}
|
2015-07-05 21:21:47 +00:00
|
|
|
}
|
2013-07-07 06:57:07 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
return Changed;
|
|
|
|
}
|
|
|
|
|
2016-01-15 18:51:51 +00:00
|
|
|
void SLPVectorizer::collectSeedInstructions(BasicBlock *BB) {
|
2013-07-07 06:57:07 +00:00
|
|
|
|
2016-01-15 18:51:51 +00:00
|
|
|
// Initialize the collections. We will make a single pass over the block.
|
|
|
|
Stores.clear();
|
|
|
|
GEPs.clear();
|
2013-04-15 22:00:26 +00:00
|
|
|
|
2016-01-15 18:51:51 +00:00
|
|
|
// Visit the store and getelementptr instructions in BB and organize them in
|
|
|
|
// Stores and GEPs according to the underlying objects of their pointer
|
|
|
|
// operands.
|
|
|
|
for (Instruction &I : *BB) {
|
2016-01-15 13:10:46 +00:00
|
|
|
|
2016-01-15 18:51:51 +00:00
|
|
|
// Ignore store instructions that are volatile or have a pointer operand
|
|
|
|
// that doesn't point to a scalar type.
|
|
|
|
if (auto *SI = dyn_cast<StoreInst>(&I)) {
|
|
|
|
if (!SI->isSimple())
|
|
|
|
continue;
|
|
|
|
if (!isValidElementType(SI->getValueOperand()->getType()))
|
|
|
|
continue;
|
2016-03-16 19:48:42 +00:00
|
|
|
Stores[GetUnderlyingObject(SI->getPointerOperand(), *DL)].push_back(SI);
|
2016-01-15 18:51:51 +00:00
|
|
|
}
|
2016-01-15 13:10:46 +00:00
|
|
|
|
2016-01-15 18:51:51 +00:00
|
|
|
// Ignore getelementptr instructions that have more than one index, a
|
|
|
|
// constant index, or a pointer operand that doesn't point to a scalar
|
|
|
|
// type.
|
|
|
|
else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
|
|
|
|
auto Idx = GEP->idx_begin()->get();
|
|
|
|
if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
|
|
|
|
continue;
|
|
|
|
if (!isValidElementType(Idx->getType()))
|
|
|
|
continue;
|
2016-03-16 19:48:42 +00:00
|
|
|
GEPs[GetUnderlyingObject(GEP->getPointerOperand(), *DL)].push_back(GEP);
|
2016-01-15 18:51:51 +00:00
|
|
|
}
|
2013-04-09 19:44:35 +00:00
|
|
|
}
|
2013-04-15 22:00:26 +00:00
|
|
|
}
|
|
|
|
|
2014-08-01 08:05:55 +00:00
|
|
|
bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
|
2013-06-20 17:54:36 +00:00
|
|
|
if (!A || !B)
|
|
|
|
return false;
|
2013-04-20 09:49:10 +00:00
|
|
|
Value *VL[] = { A, B };
|
2014-08-01 08:05:55 +00:00
|
|
|
return tryToVectorizeList(VL, R, None, true);
|
2013-04-20 07:22:58 +00:00
|
|
|
}
|
|
|
|
|
2014-05-04 17:10:15 +00:00
|
|
|
bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
|
2014-08-01 08:05:55 +00:00
|
|
|
ArrayRef<Value *> BuildVector,
|
|
|
|
bool allowReorder) {
|
2013-06-18 15:58:05 +00:00
|
|
|
if (VL.size() < 2)
|
|
|
|
return false;
|
|
|
|
|
2013-06-20 17:54:36 +00:00
|
|
|
DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
|
2013-04-20 22:29:43 +00:00
|
|
|
|
2013-06-18 15:58:05 +00:00
|
|
|
// Check that all of the parts are scalar instructions of the same type.
|
|
|
|
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
|
2013-06-20 17:54:36 +00:00
|
|
|
if (!I0)
|
2013-09-03 17:26:04 +00:00
|
|
|
return false;
|
2013-06-18 15:58:05 +00:00
|
|
|
|
|
|
|
unsigned Opcode0 = I0->getOpcode();
|
2013-11-19 22:20:18 +00:00
|
|
|
|
2015-07-08 23:40:55 +00:00
|
|
|
// FIXME: Register size should be a parameter to this function, so we can
|
|
|
|
// try different vectorization factors.
|
2016-01-15 18:51:51 +00:00
|
|
|
unsigned Sz = R.getVectorElementSize(I0);
|
2013-09-03 17:26:04 +00:00
|
|
|
unsigned VF = MinVecRegSize / Sz;
|
2013-06-18 15:58:05 +00:00
|
|
|
|
2015-07-05 20:15:21 +00:00
|
|
|
for (Value *V : VL) {
|
|
|
|
Type *Ty = V->getType();
|
2015-02-12 02:30:56 +00:00
|
|
|
if (!isValidElementType(Ty))
|
2013-09-03 17:26:04 +00:00
|
|
|
return false;
|
2015-07-05 20:15:21 +00:00
|
|
|
Instruction *Inst = dyn_cast<Instruction>(V);
|
2013-06-18 15:58:05 +00:00
|
|
|
if (!Inst || Inst->getOpcode() != Opcode0)
|
2013-09-03 17:26:04 +00:00
|
|
|
return false;
|
2013-04-20 22:29:43 +00:00
|
|
|
}
|
|
|
|
|
2013-09-03 17:26:04 +00:00
|
|
|
bool Changed = false;
|
2013-11-19 22:20:18 +00:00
|
|
|
|
2014-04-05 20:30:31 +00:00
|
|
|
// Keep track of values that were deleted by vectorizing in the loop below.
|
2013-11-19 22:20:20 +00:00
|
|
|
SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
|
|
|
|
|
2013-09-03 17:26:04 +00:00
|
|
|
for (unsigned i = 0, e = VL.size(); i < e; ++i) {
|
|
|
|
unsigned OpsWidth = 0;
|
2013-11-19 22:20:18 +00:00
|
|
|
|
|
|
|
if (i + VF > e)
|
2013-09-03 17:26:04 +00:00
|
|
|
OpsWidth = e - i;
|
|
|
|
else
|
|
|
|
OpsWidth = VF;
|
|
|
|
|
|
|
|
if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
|
|
|
|
break;
|
2013-07-11 20:56:13 +00:00
|
|
|
|
2013-11-19 22:20:20 +00:00
|
|
|
// Check that a previous iteration of this loop did not delete the Value.
|
|
|
|
if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
|
|
|
|
continue;
|
|
|
|
|
2013-11-19 22:20:18 +00:00
|
|
|
DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
|
|
|
|
<< "\n");
|
2013-09-03 17:26:04 +00:00
|
|
|
ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
|
2013-11-19 22:20:18 +00:00
|
|
|
|
2014-05-04 17:10:15 +00:00
|
|
|
ArrayRef<Value *> BuildVectorSlice;
|
|
|
|
if (!BuildVector.empty())
|
|
|
|
BuildVectorSlice = BuildVector.slice(i, OpsWidth);
|
|
|
|
|
|
|
|
R.buildTree(Ops, BuildVectorSlice);
|
2014-08-01 08:05:55 +00:00
|
|
|
// TODO: check if we can allow reordering also for other cases than
|
|
|
|
// tryToVectorizePair()
|
|
|
|
if (allowReorder && R.shouldReorder()) {
|
|
|
|
assert(Ops.size() == 2);
|
|
|
|
assert(BuildVectorSlice.empty());
|
|
|
|
Value *ReorderedOps[] = { Ops[1], Ops[0] };
|
|
|
|
R.buildTree(ReorderedOps, None);
|
|
|
|
}
|
2016-02-18 14:14:40 +00:00
|
|
|
R.computeMinimumValueSizes();
|
2013-09-03 17:26:04 +00:00
|
|
|
int Cost = R.getTreeCost();
|
2013-11-19 22:20:18 +00:00
|
|
|
|
2013-09-03 17:26:04 +00:00
|
|
|
if (Cost < -SLPCostThreshold) {
|
2014-03-28 17:21:27 +00:00
|
|
|
DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
|
2014-05-04 17:10:15 +00:00
|
|
|
Value *VectorizedRoot = R.vectorizeTree();
|
|
|
|
|
|
|
|
// Reconstruct the build vector by extracting the vectorized root. This
|
|
|
|
// way we handle the case where some elements of the vector are undefined.
|
|
|
|
// (return (insertelt <4 x i32> (insertelt undef (opd0) 0) (opd1) 2))
|
|
|
|
if (!BuildVectorSlice.empty()) {
|
|
|
|
// The insert point is the last build vector instruction. The vectorized
|
|
|
|
// root will precede it. This guarantees that we get an instruction. The
|
|
|
|
// vectorized tree could have been constant folded.
|
|
|
|
Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back());
|
|
|
|
unsigned VecIdx = 0;
|
|
|
|
for (auto &V : BuildVectorSlice) {
|
2016-03-13 21:05:13 +00:00
|
|
|
IRBuilder<NoFolder> Builder(InsertAfter->getParent(),
|
|
|
|
++BasicBlock::iterator(InsertAfter));
|
2014-05-04 17:10:15 +00:00
|
|
|
InsertElementInst *IE = cast<InsertElementInst>(V);
|
|
|
|
Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement(
|
|
|
|
VectorizedRoot, Builder.getInt32(VecIdx++)));
|
|
|
|
IE->setOperand(1, Extract);
|
|
|
|
IE->removeFromParent();
|
|
|
|
IE->insertAfter(Extract);
|
|
|
|
InsertAfter = IE;
|
|
|
|
}
|
|
|
|
}
|
2013-09-03 17:26:04 +00:00
|
|
|
// Move to the next bundle.
|
|
|
|
i += VF - 1;
|
|
|
|
Changed = true;
|
|
|
|
}
|
|
|
|
}
|
2013-11-19 22:20:18 +00:00
|
|
|
|
|
|
|
return Changed;
|
2013-04-15 22:00:26 +00:00
|
|
|
}
|
2013-04-09 19:44:35 +00:00
|
|
|
|
2013-07-07 06:57:07 +00:00
|
|
|
bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
|
2013-06-20 17:54:36 +00:00
|
|
|
if (!V)
|
|
|
|
return false;
|
2013-06-22 21:34:10 +00:00
|
|
|
|
2013-04-15 22:00:26 +00:00
|
|
|
// Try to vectorize V.
|
2014-08-01 08:05:55 +00:00
|
|
|
if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
|
2013-04-14 03:22:20 +00:00
|
|
|
return true;
|
|
|
|
|
2013-04-15 22:00:26 +00:00
|
|
|
BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0));
|
|
|
|
BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1));
|
|
|
|
// Try to skip B.
|
|
|
|
if (B && B->hasOneUse()) {
|
|
|
|
BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
|
|
|
|
BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
|
|
|
|
if (tryToVectorizePair(A, B0, R)) {
|
2013-04-14 05:15:53 +00:00
|
|
|
return true;
|
2013-04-14 03:22:20 +00:00
|
|
|
}
|
2013-04-15 22:00:26 +00:00
|
|
|
if (tryToVectorizePair(A, B1, R)) {
|
|
|
|
return true;
|
2013-04-14 03:22:20 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-04-30 21:04:51 +00:00
|
|
|
// Try to skip A.
|
2013-04-15 22:00:26 +00:00
|
|
|
if (A && A->hasOneUse()) {
|
|
|
|
BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
|
|
|
|
BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
|
|
|
|
if (tryToVectorizePair(A0, B, R)) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
if (tryToVectorizePair(A1, B, R)) {
|
|
|
|
return true;
|
2013-04-14 03:22:20 +00:00
|
|
|
}
|
|
|
|
}
|
2013-04-15 22:00:26 +00:00
|
|
|
return false;
|
|
|
|
}
|
2013-04-14 03:22:20 +00:00
|
|
|
|
2013-09-21 01:06:00 +00:00
|
|
|
/// \brief Generate a shuffle mask to be used in a reduction tree.
|
|
|
|
///
|
|
|
|
/// \param VecLen The length of the vector to be reduced.
|
|
|
|
/// \param NumEltsToRdx The number of elements that should be reduced in the
|
|
|
|
/// vector.
|
|
|
|
/// \param IsPairwise Whether the reduction is a pairwise or splitting
|
2016-01-13 07:03:42 +00:00
|
|
|
/// reduction. A pairwise reduction will generate a mask of
|
2013-09-21 01:06:00 +00:00
|
|
|
/// <0,2,...> or <1,3,..> while a splitting reduction will generate
|
|
|
|
/// <2,3, undef,undef> for a vector of 4 and NumEltsToRdx = 2.
|
|
|
|
/// \param IsLeft If true, generate a mask of even elements; otherwise, of odd elements.
|
|
|
|
static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
|
|
|
|
bool IsPairwise, bool IsLeft,
|
|
|
|
IRBuilder<> &Builder) {
|
|
|
|
assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
|
|
|
|
|
|
|
|
SmallVector<Constant *, 32> ShuffleMask(
|
|
|
|
VecLen, UndefValue::get(Builder.getInt32Ty()));
|
|
|
|
|
|
|
|
if (IsPairwise)
|
|
|
|
// Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
|
|
|
|
for (unsigned i = 0; i != NumEltsToRdx; ++i)
|
|
|
|
ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
|
|
|
|
else
|
|
|
|
// Move the upper half of the vector to the lower half.
|
|
|
|
for (unsigned i = 0; i != NumEltsToRdx; ++i)
|
|
|
|
ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
|
|
|
|
|
|
|
|
return ConstantVector::get(ShuffleMask);
|
|
|
|
}
|
|
|
|
|
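A brief worked illustration of the masks the helper above produces, derived directly from its two loop bodies, for VecLen = 4 and NumEltsToRdx = 2 (Builder is any IRBuilder<> in scope):

// createRdxShuffleMask(4, 2, /*IsPairwise=*/true,  /*IsLeft=*/true,  Builder)
//   -> <i32 0, i32 2, i32 undef, i32 undef>   // pairwise "left" operands
// createRdxShuffleMask(4, 2, /*IsPairwise=*/true,  /*IsLeft=*/false, Builder)
//   -> <i32 1, i32 3, i32 undef, i32 undef>   // pairwise "right" operands
// createRdxShuffleMask(4, 2, /*IsPairwise=*/false, /*IsLeft=*/false, Builder)
//   -> <i32 2, i32 3, i32 undef, i32 undef>   // move the upper half down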
|
|
|
|
|
|
|
/// Model horizontal reductions.
|
|
|
|
///
|
|
|
|
/// A horizontal reduction is a tree of reduction operations (currently add and
|
|
|
|
/// fadd) whose leaves are operations that can be put into a vector.
|
|
|
|
/// For example, this tree:
|
|
|
|
///
|
|
|
|
/// mul mul mul mul
|
|
|
|
/// \ / \ /
|
|
|
|
/// + +
|
|
|
|
/// \ /
|
|
|
|
/// +
|
|
|
|
/// This tree has "mul" as its reduced values and "+" as its reduction
|
|
|
|
/// operations. A reduction might be feeding into a store or a binary operation
|
|
|
|
/// feeding a phi.
|
|
|
|
/// ...
|
|
|
|
/// \ /
|
|
|
|
/// +
|
2013-09-21 05:37:30 +00:00
|
|
|
/// |
|
2013-09-21 01:06:00 +00:00
|
|
|
/// phi +=
|
|
|
|
///
|
|
|
|
/// Or:
|
|
|
|
/// ...
|
|
|
|
/// \ /
|
|
|
|
/// +
|
2013-09-21 05:37:30 +00:00
|
|
|
/// |
|
2013-09-21 01:06:00 +00:00
|
|
|
/// *p =
|
|
|
|
///
|
|
|
|
class HorizontalReduction {
|
2014-05-04 17:10:15 +00:00
|
|
|
SmallVector<Value *, 16> ReductionOps;
|
2013-09-21 01:06:00 +00:00
|
|
|
SmallVector<Value *, 32> ReducedVals;
|
|
|
|
|
|
|
|
BinaryOperator *ReductionRoot;
|
|
|
|
PHINode *ReductionPHI;
|
|
|
|
|
|
|
|
/// The opcode of the reduction.
|
|
|
|
unsigned ReductionOpcode;
|
|
|
|
/// The opcode of the values we perform a reduction on.
|
|
|
|
unsigned ReducedValueOpcode;
|
|
|
|
/// Should we model this reduction as a pairwise reduction tree or a tree that
|
|
|
|
/// splits the vector in halves and adds those halves.
|
|
|
|
bool IsPairwiseReduction;
|
|
|
|
|
|
|
|
public:
|
[SLP] Be more aggressive about reduction width selection.
Summary:
This change could be way off-piste, I'm looking for any feedback on whether it's an acceptable approach.
It never seems to be a problem to gobble up as many reduction values as can be found, and then to attempt to reduce the resulting tree. Some of the workloads I'm looking at have been aggressively unrolled by hand, and by selecting reduction widths that are not constrained by a vector register size, it becomes possible to profitably vectorize. My test case shows such an unrolling, which SLP was not vectorizing (on either ARM or X86) before this patch, but does vectorize with it.
I measure no significant compile time impact of this change when combined with D13949 and D14063. There are also no significant performance regressions on ARM/AArch64 in SPEC or LNT.
The more principled approach I thought of was to generate several candidate trees and use the cost model to pick the cheapest one. That seemed like quite a big design change (the algorithms seem very much one-shot), and would likely be a costly thing for compile time. This seemed to do the job at very little cost, but I'm worried I've misunderstood something!
Reviewers: nadav, jmolloy
Subscribers: mssimpso, llvm-commits, aemerson
Differential Revision: http://reviews.llvm.org/D14116
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@251428 91177308-0d34-0410-b5e6-96231b3b80d8
2015-10-27 17:59:03 +00:00
|
|
|
/// The width of one full horizontal reduction operation.
|
|
|
|
unsigned ReduxWidth;
|
|
|
|
|
2016-03-10 02:49:47 +00:00
|
|
|
/// Minimal width of available vector registers. It's used to determine
|
|
|
|
/// ReduxWidth.
|
|
|
|
unsigned MinVecRegSize;
|
|
|
|
|
|
|
|
HorizontalReduction(unsigned MinVecRegSize)
|
|
|
|
: ReductionRoot(nullptr), ReductionPHI(nullptr), ReductionOpcode(0),
|
|
|
|
ReducedValueOpcode(0), IsPairwiseReduction(false), ReduxWidth(0),
|
|
|
|
MinVecRegSize(MinVecRegSize) {}
|
2013-09-21 01:06:00 +00:00
|
|
|
|
|
|
|
/// \brief Try to find a reduction tree.
|
2015-03-10 02:37:25 +00:00
|
|
|
bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
|
2013-09-21 01:06:00 +00:00
|
|
|
assert((!Phi ||
|
|
|
|
std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
|
|
|
|
"Thi phi needs to use the binary operator");
|
|
|
|
|
|
|
|
// We could have an initial reduction that is not an add.
|
|
|
|
// r *= v1 + v2 + v3 + v4
|
|
|
|
// In such a case start looking for a tree rooted in the first '+'.
|
|
|
|
if (Phi) {
|
|
|
|
if (B->getOperand(0) == Phi) {
|
2014-04-25 05:29:35 +00:00
|
|
|
Phi = nullptr;
|
2013-09-21 01:06:00 +00:00
|
|
|
B = dyn_cast<BinaryOperator>(B->getOperand(1));
|
|
|
|
} else if (B->getOperand(1) == Phi) {
|
2014-04-25 05:29:35 +00:00
|
|
|
Phi = nullptr;
|
2013-09-21 01:06:00 +00:00
|
|
|
B = dyn_cast<BinaryOperator>(B->getOperand(0));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!B)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
Type *Ty = B->getType();
|
2015-02-12 02:30:56 +00:00
|
|
|
if (!isValidElementType(Ty))
|
2013-09-21 01:06:00 +00:00
|
|
|
return false;
|
|
|
|
|
2015-03-10 02:37:25 +00:00
|
|
|
const DataLayout &DL = B->getModule()->getDataLayout();
|
2013-09-21 01:06:00 +00:00
|
|
|
ReductionOpcode = B->getOpcode();
|
|
|
|
ReducedValueOpcode = 0;
|
2015-07-08 23:40:55 +00:00
|
|
|
// FIXME: Register size should be a parameter to this function, so we can
|
|
|
|
// try different vectorization factors.
|
2015-03-10 02:37:25 +00:00
|
|
|
ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
|
2013-09-21 01:06:00 +00:00
|
|
|
ReductionRoot = B;
|
|
|
|
ReductionPHI = Phi;
|
|
|
|
|
|
|
|
if (ReduxWidth < 4)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// We currently only support adds.
|
|
|
|
if (ReductionOpcode != Instruction::Add &&
|
|
|
|
ReductionOpcode != Instruction::FAdd)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Post order traverse the reduction tree starting at B. We only handle true
|
2015-10-27 17:49:11 +00:00
|
|
|
// trees containing only binary operators or selects.
|
|
|
|
SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
|
2013-09-21 01:06:00 +00:00
|
|
|
Stack.push_back(std::make_pair(B, 0));
|
|
|
|
while (!Stack.empty()) {
|
2015-10-27 17:49:11 +00:00
|
|
|
Instruction *TreeN = Stack.back().first;
|
2013-09-21 01:06:00 +00:00
|
|
|
unsigned EdgeToVisit = Stack.back().second++;
|
|
|
|
bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
|
|
|
|
|
|
|
|
// Only handle trees in the current basic block.
|
|
|
|
if (TreeN->getParent() != B->getParent())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Each tree node needs to have one user except for the ultimate
|
|
|
|
// reduction.
|
|
|
|
if (!TreeN->hasOneUse() && TreeN != B)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Post-order visit.
|
|
|
|
if (EdgeToVisit == 2 || IsReducedValue) {
|
|
|
|
if (IsReducedValue) {
|
|
|
|
// Make sure that the opcodes of the operations that we are going to
|
|
|
|
// reduce match.
|
|
|
|
if (!ReducedValueOpcode)
|
|
|
|
ReducedValueOpcode = TreeN->getOpcode();
|
|
|
|
else if (ReducedValueOpcode != TreeN->getOpcode())
|
|
|
|
return false;
|
|
|
|
ReducedVals.push_back(TreeN);
|
|
|
|
} else {
|
|
|
|
// We need to be able to reassociate the adds.
|
|
|
|
if (!TreeN->isAssociative())
|
|
|
|
return false;
|
2014-05-04 17:10:15 +00:00
|
|
|
ReductionOps.push_back(TreeN);
|
2013-09-21 01:06:00 +00:00
|
|
|
}
|
|
|
|
// Retract.
|
|
|
|
Stack.pop_back();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Visit left or right.
|
|
|
|
Value *NextV = TreeN->getOperand(EdgeToVisit);
|
2015-10-27 17:49:11 +00:00
|
|
|
// We currently only allow BinaryOperators and SelectInsts as reduction
|
|
|
|
// values in our tree.
|
|
|
|
if (isa<BinaryOperator>(NextV) || isa<SelectInst>(NextV))
|
|
|
|
Stack.push_back(std::make_pair(cast<Instruction>(NextV), 0));
|
2013-09-21 01:06:00 +00:00
|
|
|
else if (NextV != Phi)
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// \brief Attempt to vectorize the tree found by
|
|
|
|
/// matchAssociativeReduction.
|
|
|
|
bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
|
|
|
|
if (ReducedVals.empty())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
unsigned NumReducedVals = ReducedVals.size();
|
|
|
|
if (NumReducedVals < ReduxWidth)
|
|
|
|
return false;
|
|
|
|
|
2014-04-25 05:29:35 +00:00
|
|
|
Value *VectorizedTree = nullptr;
|
2013-09-21 01:06:00 +00:00
|
|
|
IRBuilder<> Builder(ReductionRoot);
|
|
|
|
FastMathFlags Unsafe;
|
|
|
|
Unsafe.setUnsafeAlgebra();
|
2016-01-12 18:03:37 +00:00
|
|
|
Builder.setFastMathFlags(Unsafe);
|
2013-09-21 01:06:00 +00:00
|
|
|
unsigned i = 0;
|
|
|
|
|
|
|
|
for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
|
2014-08-27 05:25:25 +00:00
|
|
|
V.buildTree(makeArrayRef(&ReducedVals[i], ReduxWidth), ReductionOps);
|
2016-02-18 14:14:40 +00:00
|
|
|
V.computeMinimumValueSizes();
|
2013-09-21 01:06:00 +00:00
|
|
|
|
|
|
|
// Estimate cost.
|
|
|
|
int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
|
|
|
|
if (Cost >= -SLPCostThreshold)
|
|
|
|
break;
|
|
|
|
|
|
|
|
DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
|
|
|
|
<< ". (HorRdx)\n");
|
|
|
|
|
|
|
|
// Vectorize a tree.
|
|
|
|
DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
|
|
|
|
Value *VectorizedRoot = V.vectorizeTree();
|
|
|
|
|
|
|
|
// Emit a reduction.
|
|
|
|
Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
|
|
|
|
if (VectorizedTree) {
|
|
|
|
Builder.SetCurrentDebugLocation(Loc);
|
|
|
|
VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
|
|
|
|
ReducedSubTree, "bin.rdx");
|
|
|
|
} else
|
|
|
|
VectorizedTree = ReducedSubTree;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (VectorizedTree) {
|
|
|
|
// Finish the reduction.
|
|
|
|
for (; i < NumReducedVals; ++i) {
|
|
|
|
Builder.SetCurrentDebugLocation(
|
|
|
|
cast<Instruction>(ReducedVals[i])->getDebugLoc());
|
|
|
|
VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
|
|
|
|
ReducedVals[i]);
|
|
|
|
}
|
|
|
|
// Update users.
|
|
|
|
if (ReductionPHI) {
|
2014-04-28 04:05:08 +00:00
|
|
|
assert(ReductionRoot && "Need a reduction operation");
|
2013-09-21 01:06:00 +00:00
|
|
|
ReductionRoot->setOperand(0, VectorizedTree);
|
|
|
|
ReductionRoot->setOperand(1, ReductionPHI);
|
|
|
|
} else
|
|
|
|
ReductionRoot->replaceAllUsesWith(VectorizedTree);
|
|
|
|
}
|
2014-04-25 05:29:35 +00:00
|
|
|
return VectorizedTree != nullptr;
|
2013-09-21 01:06:00 +00:00
|
|
|
}
|
|
|
|
|
2015-10-27 17:59:03 +00:00
|
|
|
unsigned numReductionValues() const {
|
|
|
|
return ReducedVals.size();
|
|
|
|
}
|
2013-09-21 01:06:00 +00:00
|
|
|
|
2015-10-27 17:59:03 +00:00
|
|
|
private:
|
2015-08-08 18:27:36 +00:00
|
|
|
/// \brief Calculate the cost of a reduction.
|
2013-09-21 01:06:00 +00:00
|
|
|
int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
|
|
|
|
Type *ScalarTy = FirstReducedVal->getType();
|
|
|
|
Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
|
|
|
|
|
|
|
|
int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
|
|
|
|
int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
|
|
|
|
|
|
|
|
IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
|
|
|
|
int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
|
|
|
|
|
|
|
|
int ScalarReduxCost =
|
|
|
|
ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
|
|
|
|
|
|
|
|
DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
|
|
|
|
<< " for reduction that starts with " << *FirstReducedVal
|
|
|
|
<< " (It is a "
|
|
|
|
<< (IsPairwiseReduction ? "pairwise" : "splitting")
|
|
|
|
<< " reduction)\n");
|
|
|
|
|
|
|
|
return VecReduxCost - ScalarReduxCost;
|
|
|
|
}
|
|
|
|
|
|
|
|
static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
|
|
|
|
Value *R, const Twine &Name = "") {
|
|
|
|
if (Opcode == Instruction::FAdd)
|
|
|
|
return Builder.CreateFAdd(L, R, Name);
|
|
|
|
return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
|
|
|
|
}
|
|
|
|
|
|
|
|
/// \brief Emit a horizontal reduction of the vectorized value.
|
|
|
|
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
|
|
|
|
assert(VectorizedValue && "Need to have a vectorized tree node");
|
|
|
|
assert(isPowerOf2_32(ReduxWidth) &&
|
|
|
|
"We only handle power-of-two reductions for now");
|
|
|
|
|
2015-01-09 10:23:48 +00:00
|
|
|
Value *TmpVec = VectorizedValue;
|
2013-09-21 01:06:00 +00:00
|
|
|
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
|
|
|
|
if (IsPairwiseReduction) {
|
|
|
|
Value *LeftMask =
|
|
|
|
createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
|
|
|
|
Value *RightMask =
|
|
|
|
createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
|
|
|
|
|
|
|
|
Value *LeftShuf = Builder.CreateShuffleVector(
|
|
|
|
TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
|
|
|
|
Value *RightShuf = Builder.CreateShuffleVector(
|
|
|
|
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
|
|
|
|
"rdx.shuf.r");
|
|
|
|
TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
|
|
|
|
"bin.rdx");
|
|
|
|
} else {
|
|
|
|
Value *UpperHalf =
|
|
|
|
createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
|
|
|
|
Value *Shuf = Builder.CreateShuffleVector(
|
|
|
|
TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
|
|
|
|
TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// The result is in the first element of the vector.
|
|
|
|
return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
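A minimal sketch of how a caller typically drives this class. The names of the surrounding objects are assumed from context (P is a candidate PHINode*, BI the BinaryOperator* feeding it, R the BoUpSLP instance, and TTI and MinVecRegSize come from the enclosing pass); this is not copied from this file:

HorizontalReduction HorRdx(MinVecRegSize);
if (HorRdx.matchAssociativeReduction(P, BI) && HorRdx.tryToReduce(R, TTI)) {
  // matchAssociativeReduction() gathered the reduced values and reduction
  // ops; tryToReduce() vectorized them and emitted the shuffle-based
  // reduction via emitReduction().
  Changed = true;
}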
2013-08-26 17:56:35 +00:00
|
|
|
/// \brief Recognize construction of vectors like
|
|
|
|
/// %ra = insertelement <4 x float> undef, float %s0, i32 0
|
|
|
|
/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
|
|
|
|
/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
|
|
|
|
/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
|
|
|
|
///
|
|
|
|
/// Returns true if it matches
|
|
|
|
///
|
2014-05-04 17:10:15 +00:00
|
|
|
static bool findBuildVector(InsertElementInst *FirstInsertElem,
|
|
|
|
SmallVectorImpl<Value *> &BuildVector,
|
|
|
|
SmallVectorImpl<Value *> &BuildVectorOpds) {
|
|
|
|
if (!isa<UndefValue>(FirstInsertElem->getOperand(0)))
|
2013-08-26 17:56:35 +00:00
|
|
|
return false;
|
|
|
|
|
2014-05-04 17:10:15 +00:00
|
|
|
InsertElementInst *IE = FirstInsertElem;
|
2013-08-26 17:56:35 +00:00
|
|
|
while (true) {
|
2014-05-04 17:10:15 +00:00
|
|
|
BuildVector.push_back(IE);
|
|
|
|
BuildVectorOpds.push_back(IE->getOperand(1));
|
2013-08-26 17:56:35 +00:00
|
|
|
|
|
|
|
if (IE->use_empty())
|
|
|
|
return false;
|
|
|
|
|
2014-03-09 03:16:01 +00:00
|
|
|
InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->user_back());
|
2013-08-26 17:56:35 +00:00
|
|
|
if (!NextUse)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// If this isn't the final use, make sure the next insertelement is the only
|
|
|
|
// use. It's OK if the final constructed vector is used multiple times.
|
|
|
|
if (!IE->hasOneUse())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
IE = NextUse;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2013-10-12 18:56:27 +00:00
|
|
|
static bool PhiTypeSorterFunc(Value *V, Value *V2) {
|
|
|
|
return V->getType() < V2->getType();
|
|
|
|
}
|
|
|
|
|
[SLP] Try a bit harder to find reduction PHIs
Summary:
Currently, when the SLP vectorizer considers whether a phi is part of a reduction, it dismisses phis whose incoming blocks are not the same as the block containing the phi. For the patterns I'm looking at, extending this rule to allow phis whose incoming block is a containing loop latch allows me to vectorize certain workloads.
There is no significant compile-time impact, and combined with D13949, no performance improvement measured in ARM/AArch64 in any of SPEC2000, SPEC2006 or LNT.
Reviewers: jmolloy, mcrosier, nadav
Subscribers: mssimpso, nadav, aemerson, llvm-commits
Differential Revision: http://reviews.llvm.org/D14063
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@251425 91177308-0d34-0410-b5e6-96231b3b80d8
2015-10-27 17:54:16 +00:00
|
|
|
/// \brief Try to get a reduction value from a phi node.
///
/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
/// if they come from either \p ParentBB or a containing loop latch.
///
/// \returns A candidate reduction value if possible, or \code nullptr \endcode
/// if not possible.
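///
/// A hypothetical IR shape this handles (illustrative only):
///   header:
///     %sum = phi i32 [ 0, %entry ], [ %sum.next, %latch ]
///     ...
///   latch:
///     %sum.next = add i32 %sum, %x
///     br label %header
/// Here neither incoming block is the phi's own block, but %latch is the
/// loop latch, so %sum.next is returned as the candidate reduction value.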
static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
                                BasicBlock *ParentBB, LoopInfo *LI) {
  // There are situations where the reduction value is not dominated by the
  // reduction phi. Vectorizing such cases has been reported to cause
  // miscompiles. See PR25787.
  auto DominatedReduxValue = [&](Value *R) {
    return (
        dyn_cast<Instruction>(R) &&
        DT->dominates(P->getParent(), dyn_cast<Instruction>(R)->getParent()));
  };

  Value *Rdx = nullptr;

  // Return the incoming value if it comes from the same BB as the phi node.
  if (P->getIncomingBlock(0) == ParentBB) {
    Rdx = P->getIncomingValue(0);
  } else if (P->getIncomingBlock(1) == ParentBB) {
    Rdx = P->getIncomingValue(1);
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  // Otherwise, check whether we have a loop latch to look at.
  Loop *BBL = LI->getLoopFor(ParentBB);
  if (!BBL)
    return nullptr;
  BasicBlock *BBLatch = BBL->getLoopLatch();
  if (!BBLatch)
    return nullptr;

  // There is a loop latch; return the incoming value if it comes from that
  // block. This reduction pattern occasionally turns up.
  if (P->getIncomingBlock(0) == BBLatch) {
    Rdx = P->getIncomingValue(0);
  } else if (P->getIncomingBlock(1) == BBLatch) {
    Rdx = P->getIncomingValue(1);
  }

  if (Rdx && DominatedReduxValue(Rdx))
    return Rdx;

  return nullptr;
}

/// \brief Attempt to reduce a horizontal reduction.
/// If it is legal to match a horizontal reduction feeding the phi node \p P
/// with the reduction operator \p BI, check whether the reduction can be
/// performed and, if so, vectorize it.
/// \returns true if a horizontal reduction was matched and reduced,
/// false otherwise.
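///
/// A hypothetical shape that can match (illustrative only):
///   %s1 = add i32 %a0, %a1
///   %s2 = add i32 %s1, %a2
///   %s3 = add i32 %s2, %a3
/// where %s3 is the value feeding the reduction phi \p P (or, for the
/// store-fed case, the value being stored).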
static bool canMatchHorizontalReduction(PHINode *P, BinaryOperator *BI,
                                        BoUpSLP &R, TargetTransformInfo *TTI,
                                        unsigned MinRegSize) {
  if (!ShouldVectorizeHor)
    return false;

  HorizontalReduction HorRdx(MinRegSize);
  if (!HorRdx.matchAssociativeReduction(P, BI))
    return false;

  // If there is a sufficient number of reduction values, reduce to a nearby
  // power-of-2. We can safely generate oversized vectors and rely on the
  // backend to split them to legal sizes.
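  // For example (purely illustrative): 7 reduction values round down to a
  // width of 4, and 9 round down to 8; the result is never below the minimum
  // width of 4.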
  HorRdx.ReduxWidth =
      std::max((uint64_t)4, PowerOf2Floor(HorRdx.numReductionValues()));

  return HorRdx.tryToReduce(R, TTI);
}

bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  SmallVector<Value *, 4> Incoming;
  SmallSet<Value *, 16> VisitedInstrs;

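  // First, repeatedly bundle the PHI nodes at the top of the block: collect
  // the not-yet-visited PHIs, group them by type, and try to vectorize each
  // group as a list. Restart whenever a group is vectorized, since that can
  // expose new opportunities.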
  bool HaveVectorizedPhiNodes = true;
  while (HaveVectorizedPhiNodes) {
    HaveVectorizedPhiNodes = false;

    // Collect the incoming values from the PHIs.
    Incoming.clear();
    for (Instruction &I : *BB) {
      PHINode *P = dyn_cast<PHINode>(&I);
      if (!P)
        break;

      if (!VisitedInstrs.count(P))
        Incoming.push_back(P);
    }

    // Sort by type.
    std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);

    // Try to vectorize elements based on their type.
    for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
                                           E = Incoming.end();
         IncIt != E;) {

      // Look for the next elements with the same type.
      SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
      while (SameTypeIt != E &&
             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
        VisitedInstrs.insert(*SameTypeIt);
        ++SameTypeIt;
      }

      // Try to vectorize them.
      unsigned NumElts = (SameTypeIt - IncIt);
      DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts
                   << ")\n");
      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
        // Success; start over because instructions might have been changed.
        HaveVectorizedPhiNodes = true;
        Changed = true;
        break;
      }

      // Start over at the next instruction of a different type (or the end).
      IncIt = SameTypeIt;
    }
  }

  VisitedInstrs.clear();

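  // Next, scan the remaining instructions for vectorization seeds: reduction
  // PHIs, binary operators feeding stores or returns, compares, and
  // insertelement chains. The scan restarts from the top of the block
  // whenever vectorization deletes instructions, since the iterator may have
  // been invalidated.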
  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
    // We may go through BB multiple times, so skip the ones we have already
    // checked.
    if (!VisitedInstrs.insert(&*it).second)
      continue;

    if (isa<DbgInfoIntrinsic>(it))
      continue;

    // Try to vectorize reductions that use PHINodes.
    if (PHINode *P = dyn_cast<PHINode>(it)) {
      // Check that the PHI is a reduction PHI.
      if (P->getNumIncomingValues() != 2)
        return Changed;

      Value *Rdx = getReductionValue(DT, P, BB, LI);

      // Check if this is a Binary Operator.
      BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
      if (!BI)
        continue;

      // Try to match and vectorize a horizontal reduction.
      if (canMatchHorizontalReduction(P, BI, R, TTI, MinVecRegSize)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

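      // No horizontal reduction was matched; fall back to trying to vectorize
      // the operand of the reduction binary operator that is not the phi
      // itself.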
      Value *Inst = BI->getOperand(0);
      if (Inst == P)
        Inst = BI->getOperand(1);

      if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        Changed = true;
        it = BB->begin();
        e = BB->end();
        continue;
      }

      continue;
    }

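    // Try to vectorize a horizontal reduction whose result feeds a store
    // directly, e.g. (illustrative) A[i] = x0 + x1 + x2 + x3.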
    if (ShouldStartVectorizeHorAtStore)
      if (StoreInst *SI = dyn_cast<StoreInst>(it))
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(SI->getValueOperand())) {
          if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
                                          MinVecRegSize) ||
              tryToVectorize(BinOp, R)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            continue;
          }
        }

    // Try to vectorize horizontal reductions feeding into a return.
    if (ReturnInst *RI = dyn_cast<ReturnInst>(it))
      if (RI->getNumOperands() != 0)
        if (BinaryOperator *BinOp =
                dyn_cast<BinaryOperator>(RI->getOperand(0))) {
          DEBUG(dbgs() << "SLP: Found a return to vectorize.\n");
          if (tryToVectorizePair(BinOp->getOperand(0),
                                 BinOp->getOperand(1), R)) {
            Changed = true;
            it = BB->begin();
            e = BB->end();
            continue;
          }
        }

    // Try to vectorize trees that start at compare instructions.
    if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
      if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
        Changed = true;
        // We would like to start over since some instructions are deleted
        // and the iterator may become invalid.
        it = BB->begin();
        e = BB->end();
        continue;
      }

      for (int i = 0; i < 2; ++i) {
        if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
          if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
            Changed = true;
            // We would like to start over since some instructions are deleted
            // and the iterator may become invalid.
            it = BB->begin();
            e = BB->end();
            break;
          }
        }
      }
      continue;
    }

    // Try to vectorize trees that start at insertelement instructions.
    if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) {
      SmallVector<Value *, 16> BuildVector;
      SmallVector<Value *, 16> BuildVectorOpds;
      if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds))
        continue;

      // Vectorize starting with the build vector operands ignoring the
      // BuildVector instructions for the purpose of scheduling and user
      // extraction.
      if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
      }

      continue;
    }
  }

  return Changed;
}

bool SLPVectorizer::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  auto Changed = false;
  for (auto &Entry : GEPs) {

    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (Entry.second.size() < 2)
      continue;

    DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                 << Entry.second.size() << ".\n");

    // We process the getelementptr list in chunks of 16 (like we do for
    // stores) to minimize compile-time.
    for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += 16) {
      auto Len = std::min<unsigned>(BE - BI, 16);
      auto GEPList = makeArrayRef(&Entry.second[BI], Len);

      // Initialize a set of candidate getelementptrs. Note that we use a
      // SetVector here to preserve program order. If the index computations
      // are vectorizable and begin with loads, we want to minimize the chance
      // of having to reorder them later.
      SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

      // Some of the candidates may have already been vectorized after we
      // initially collected them. If so, the WeakVHs will have nullified the
      // values, so remove them from the set of candidates.
      Candidates.remove(nullptr);

      // Remove from the set of candidates all pairs of getelementptrs with
      // constant differences. Such getelementptrs are likely not good
      // candidates for vectorization in a bottom-up phase since one can be
      // computed from the other. We also ensure all candidate getelementptr
      // indices are unique.
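      // For instance (illustrative), two getelementptrs into the same array
      // whose indices differ by a constant have SCEVs with a constant
      // difference, so both are dropped here; a duplicated index keeps only
      // the first occurrence.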
      for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
        auto *GEPI = cast<GetElementPtrInst>(GEPList[I]);
        if (!Candidates.count(GEPI))
          continue;
        auto *SCEVI = SE->getSCEV(GEPList[I]);
        for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
          auto *GEPJ = cast<GetElementPtrInst>(GEPList[J]);
          auto *SCEVJ = SE->getSCEV(GEPList[J]);
          if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
            Candidates.remove(GEPList[I]);
            Candidates.remove(GEPList[J]);
          } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
            Candidates.remove(GEPList[J]);
          }
        }
      }

      // We break out of the above computation as soon as we know there are
      // fewer than two candidates remaining.
      if (Candidates.size() < 2)
        continue;

      // Add the single, non-constant index of each candidate to the bundle. We
      // ensured the indices met these constraints when we originally collected
      // the getelementptrs.
      SmallVector<Value *, 16> Bundle(Candidates.size());
      auto BundleIndex = 0u;
      for (auto *V : Candidates) {
        auto *GEP = cast<GetElementPtrInst>(V);
        auto *GEPIdx = GEP->idx_begin()->get();
        assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
        Bundle[BundleIndex++] = GEPIdx;
      }

      // Try to vectorize the indices. We are currently only interested in
      // gather-like cases of the form:
      //
      //   ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
      //
      // where the loads of "a", the loads of "b", and the subtractions can be
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
      Changed |= tryToVectorizeList(Bundle, R);
    }
  }
  return Changed;
}

bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
  bool Changed = false;
  // Attempt to sort and vectorize each of the store-groups.
  for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
       ++it) {
    if (it->second.size() < 2)
      continue;

    DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
                 << it->second.size() << ".\n");

    // Process the stores in chunks of 16.
    // TODO: The limit of 16 inhibits greater vectorization factors.
    //       For example, AVX2 supports v32i8. Increasing this limit, however,
    //       may cause a significant compile-time increase.
    for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI += 16) {
      unsigned Len = std::min<unsigned>(CE - CI, 16);
      Changed |= vectorizeStores(makeArrayRef(&it->second[CI], Len),
                                 -SLPCostThreshold, R);
    }
  }
  return Changed;
}

} // end anonymous namespace

char SLPVectorizer::ID = 0;
static const char lv_name[] = "SLP Vectorizer";
INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DemandedBits)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)

namespace llvm {
Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
}