mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2024-10-10 04:44:53 +00:00
[ARM][NFC] ParallelDSP reorganisation
In preparing to allow ARMParallelDSP pass to parallelise more than smlads, I've restructured some elements: - The ParallelMAC struct has been renamed to BinOpChain. - The BinOpChain struct holds two value lists: LHS and RHS, as well as inheriting from the OpChain base class. - The OpChain struct holds all the values of the represented chain and has had the memory locations functionality inserted into it. - ParallelMACList becomes OpChainList and it now holds pointers instead of objects. Differential Revision: https://reviews.llvm.org/D49020 llvm-svn: 337701
This commit is contained in:
parent
59c94bec0d
commit
89a3799a69
@ -43,38 +43,56 @@ using namespace PatternMatch;
|
|||||||
STATISTIC(NumSMLAD , "Number of smlad instructions generated");
|
STATISTIC(NumSMLAD , "Number of smlad instructions generated");
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
struct ParallelMAC;
|
struct OpChain;
|
||||||
|
struct BinOpChain;
|
||||||
struct Reduction;
|
struct Reduction;
|
||||||
|
|
||||||
using ParallelMACList = SmallVector<ParallelMAC, 8>;
|
using OpChainList = SmallVector<OpChain*, 8>;
|
||||||
using ReductionList = SmallVector<Reduction, 8>;
|
using ReductionList = SmallVector<Reduction, 8>;
|
||||||
using ValueList = SmallVector<Value*, 8>;
|
using ValueList = SmallVector<Value*, 8>;
|
||||||
using MemInstList = SmallVector<Instruction*, 8>;
|
using MemInstList = SmallVector<Instruction*, 8>;
|
||||||
using PMACPair = std::pair<ParallelMAC*,ParallelMAC*>;
|
using PMACPair = std::pair<BinOpChain*,BinOpChain*>;
|
||||||
using PMACPairList = SmallVector<PMACPair, 8>;
|
using PMACPairList = SmallVector<PMACPair, 8>;
|
||||||
using Instructions = SmallVector<Instruction*,16>;
|
using Instructions = SmallVector<Instruction*,16>;
|
||||||
using MemLocList = SmallVector<MemoryLocation, 4>;
|
using MemLocList = SmallVector<MemoryLocation, 4>;
|
||||||
|
|
||||||
// 'ParallelMAC' and 'Reduction' are just some bookkeeping data structures.
|
struct OpChain {
|
||||||
|
Instruction *Root;
|
||||||
|
ValueList AllValues;
|
||||||
|
MemInstList VecLd; // List of all load instructions.
|
||||||
|
MemLocList MemLocs; // All memory locations read by this tree.
|
||||||
|
bool ReadOnly = true;
|
||||||
|
|
||||||
|
OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
|
||||||
|
|
||||||
|
void SetMemoryLocations() {
|
||||||
|
const auto Size = MemoryLocation::UnknownSize;
|
||||||
|
for (auto *V : AllValues) {
|
||||||
|
if (auto *I = dyn_cast<Instruction>(V)) {
|
||||||
|
if (I->mayWriteToMemory())
|
||||||
|
ReadOnly = false;
|
||||||
|
if (auto *Ld = dyn_cast<LoadInst>(V))
|
||||||
|
MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned size() const { return AllValues.size(); }
|
||||||
|
};
|
||||||
|
|
||||||
|
// 'BinOpChain' and 'Reduction' are just some bookkeeping data structures.
|
||||||
// 'Reduction' contains the phi-node and accumulator statement from where we
|
// 'Reduction' contains the phi-node and accumulator statement from where we
|
||||||
// start pattern matching, and 'ParallelMAC' the multiplication
|
// start pattern matching, and 'BinOpChain' the multiplication
|
||||||
// instructions that are candidates for parallel execution.
|
// instructions that are candidates for parallel execution.
|
||||||
struct ParallelMAC {
|
struct BinOpChain : public OpChain {
|
||||||
Instruction *Mul;
|
ValueList LHS; // List of all (narrow) left hand operands.
|
||||||
ValueList VL; // List of all (narrow) operands of this Mul
|
ValueList RHS; // List of all (narrow) right hand operands.
|
||||||
MemInstList VecLd; // List of all load instructions of this Mul
|
|
||||||
MemLocList MemLocs; // All memory locations read by this Mul
|
|
||||||
|
|
||||||
// The MAC-chains we currently recognise are simple chains that accumulate
|
BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) :
|
||||||
// their results with a reducing integer add statement, and consist of
|
OpChain(I, lhs), LHS(lhs), RHS(rhs) {
|
||||||
// a chain of adds and muls, which have only sext and load instructions as
|
for (auto *V : RHS)
|
||||||
// operands. Thus, these chains don't write memory. We check that this is
|
AllValues.push_back(V);
|
||||||
// true when we collect the operands, and use this in alias analysis checks
|
}
|
||||||
// that different parallel MACs don't interfere with each other.
|
|
||||||
bool ReadOnly;
|
|
||||||
|
|
||||||
ParallelMAC(Instruction *I, ValueList &V, bool RdOnly)
|
|
||||||
: Mul(I), VL(V), ReadOnly(RdOnly) {};
|
|
||||||
};
|
};
|
||||||
|
|
||||||
struct Reduction {
|
struct Reduction {
|
||||||
@ -83,7 +101,7 @@ namespace {
|
|||||||
Instruction *AccIntAdd; // The accumulating integer add statement,
|
Instruction *AccIntAdd; // The accumulating integer add statement,
|
||||||
// i.e, the reduction statement.
|
// i.e, the reduction statement.
|
||||||
|
|
||||||
ParallelMACList MACCandidates; // The MAC candidates associated with
|
OpChainList MACCandidates; // The MAC candidates associated with
|
||||||
// this reduction statement.
|
// this reduction statement.
|
||||||
Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
|
Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
|
||||||
};
|
};
|
||||||
@ -100,7 +118,7 @@ namespace {
|
|||||||
|
|
||||||
bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs);
|
bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs);
|
||||||
bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
|
bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
|
||||||
PMACPairList CreateParallelMACPairs(ParallelMACList &Candidates);
|
PMACPairList CreateParallelMACPairs(OpChainList &Candidates);
|
||||||
Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
|
Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
|
||||||
Instruction *Acc, Instruction *InsertAfter);
|
Instruction *Acc, Instruction *InsertAfter);
|
||||||
|
|
||||||
@ -303,7 +321,7 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
|
|||||||
}
|
}
|
||||||
|
|
||||||
PMACPairList
|
PMACPairList
|
||||||
ARMParallelDSP::CreateParallelMACPairs(ParallelMACList &Candidates) {
|
ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) {
|
||||||
const unsigned Elems = Candidates.size();
|
const unsigned Elems = Candidates.size();
|
||||||
PMACPairList PMACPairs;
|
PMACPairList PMACPairs;
|
||||||
|
|
||||||
@ -314,10 +332,10 @@ ARMParallelDSP::CreateParallelMACPairs(ParallelMACList &Candidates) {
|
|||||||
// We can compare all elements, but then we need to compare and evaluate
|
// We can compare all elements, but then we need to compare and evaluate
|
||||||
// different solutions.
|
// different solutions.
|
||||||
for(unsigned i=0; i<Elems-1; i+=2) {
|
for(unsigned i=0; i<Elems-1; i+=2) {
|
||||||
ParallelMAC &PMul0 = Candidates[i];
|
BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i]);
|
||||||
ParallelMAC &PMul1 = Candidates[i+1];
|
BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[i+1]);
|
||||||
const Instruction *Mul0 = PMul0.Mul;
|
const Instruction *Mul0 = PMul0->Root;
|
||||||
const Instruction *Mul1 = PMul1.Mul;
|
const Instruction *Mul1 = PMul1->Root;
|
||||||
|
|
||||||
if (Mul0 == Mul1)
|
if (Mul0 == Mul1)
|
||||||
continue;
|
continue;
|
||||||
@ -326,10 +344,13 @@ ARMParallelDSP::CreateParallelMACPairs(ParallelMACList &Candidates) {
|
|||||||
dbgs() << "- "; Mul0->dump();
|
dbgs() << "- "; Mul0->dump();
|
||||||
dbgs() << "- "; Mul1->dump());
|
dbgs() << "- "; Mul1->dump());
|
||||||
|
|
||||||
const ValueList &VL0 = PMul0.VL;
|
const ValueList &Mul0_LHS = PMul0->LHS;
|
||||||
const ValueList &VL1 = PMul1.VL;
|
const ValueList &Mul0_RHS = PMul0->RHS;
|
||||||
|
const ValueList &Mul1_LHS = PMul1->LHS;
|
||||||
|
const ValueList &Mul1_RHS = PMul1->RHS;
|
||||||
|
|
||||||
if (!AreSymmetrical(VL0, VL1))
|
if (!AreSymmetrical(Mul0_LHS, Mul1_LHS) ||
|
||||||
|
!AreSymmetrical(Mul0_RHS, Mul1_RHS))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
|
LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
|
||||||
@ -337,23 +358,23 @@ ARMParallelDSP::CreateParallelMACPairs(ParallelMACList &Candidates) {
|
|||||||
// that its two pairs of consecutive loads, then these can be transformed
|
// that its two pairs of consecutive loads, then these can be transformed
|
||||||
// into two wider loads and the users can be replaced with DSP
|
// into two wider loads and the users can be replaced with DSP
|
||||||
// intrinsics.
|
// intrinsics.
|
||||||
for (unsigned x = 0; x < VL0.size(); x += 4) {
|
for (unsigned x = 0; x < Mul0_LHS.size(); x += 2) {
|
||||||
auto *Ld0 = dyn_cast<LoadInst>(VL0[x]);
|
auto *Ld0 = dyn_cast<LoadInst>(Mul0_LHS[x]);
|
||||||
auto *Ld1 = dyn_cast<LoadInst>(VL1[x]);
|
auto *Ld1 = dyn_cast<LoadInst>(Mul1_LHS[x]);
|
||||||
auto *Ld2 = dyn_cast<LoadInst>(VL0[x+2]);
|
auto *Ld2 = dyn_cast<LoadInst>(Mul0_RHS[x]);
|
||||||
auto *Ld3 = dyn_cast<LoadInst>(VL1[x+2]);
|
auto *Ld3 = dyn_cast<LoadInst>(Mul1_RHS[x]);
|
||||||
|
|
||||||
LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n";
|
LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n";
|
||||||
dbgs() << "\t mul1: "; VL0[x]->dump();
|
dbgs() << "\t mul1: "; Mul0_LHS[x]->dump();
|
||||||
dbgs() << "\t mul2: "; VL1[x]->dump();
|
dbgs() << "\t mul2: "; Mul1_LHS[x]->dump();
|
||||||
dbgs() << "and operands " << x + 2 << ":\n";
|
dbgs() << "and operands " << x + 2 << ":\n";
|
||||||
dbgs() << "\t mul1: "; VL0[x+2]->dump();
|
dbgs() << "\t mul1: "; Mul0_RHS[x]->dump();
|
||||||
dbgs() << "\t mul2: "; VL1[x+2]->dump());
|
dbgs() << "\t mul2: "; Mul1_RHS[x]->dump());
|
||||||
|
|
||||||
if (AreSequentialLoads(Ld0, Ld1, Candidates[i].VecLd) &&
|
if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd) &&
|
||||||
AreSequentialLoads(Ld2, Ld3, Candidates[i+1].VecLd)) {
|
AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
|
||||||
LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
|
LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
|
||||||
PMACPairs.push_back(std::make_pair(&PMul0, &PMul1));
|
PMACPairs.push_back(std::make_pair(PMul0, PMul1));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -367,8 +388,8 @@ bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction,
|
|||||||
|
|
||||||
for (auto &Pair : PMACPairs) {
|
for (auto &Pair : PMACPairs) {
|
||||||
LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
|
LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
|
||||||
dbgs() << "- "; Pair.first->Mul->dump();
|
dbgs() << "- "; Pair.first->Root->dump();
|
||||||
dbgs() << "- "; Pair.second->Mul->dump());
|
dbgs() << "- "; Pair.second->Root->dump());
|
||||||
auto *VecLd0 = cast<LoadInst>(Pair.first->VecLd[0]);
|
auto *VecLd0 = cast<LoadInst>(Pair.first->VecLd[0]);
|
||||||
auto *VecLd1 = cast<LoadInst>(Pair.second->VecLd[0]);
|
auto *VecLd1 = cast<LoadInst>(Pair.second->VecLd[0]);
|
||||||
Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, InsertAfter);
|
Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, InsertAfter);
|
||||||
@ -383,9 +404,8 @@ bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
static ReductionList MatchReductions(Function &F, Loop *TheLoop,
|
static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
|
||||||
BasicBlock *Header) {
|
ReductionList &Reductions) {
|
||||||
ReductionList Reductions;
|
|
||||||
RecurrenceDescriptor RecDesc;
|
RecurrenceDescriptor RecDesc;
|
||||||
const bool HasFnNoNaNAttr =
|
const bool HasFnNoNaNAttr =
|
||||||
F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
|
F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
|
||||||
@ -394,7 +414,7 @@ static ReductionList MatchReductions(Function &F, Loop *TheLoop,
|
|||||||
// We need a preheader as getIncomingValueForBlock assumes there is one.
|
// We need a preheader as getIncomingValueForBlock assumes there is one.
|
||||||
if (!TheLoop->getLoopPreheader()) {
|
if (!TheLoop->getLoopPreheader()) {
|
||||||
LLVM_DEBUG(dbgs() << "No preheader found, bailing out\n");
|
LLVM_DEBUG(dbgs() << "No preheader found, bailing out\n");
|
||||||
return Reductions;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (PHINode &Phi : Header->phis()) {
|
for (PHINode &Phi : Header->phis()) {
|
||||||
@ -418,36 +438,29 @@ static ReductionList MatchReductions(Function &F, Loop *TheLoop,
|
|||||||
|
|
||||||
LLVM_DEBUG(
|
LLVM_DEBUG(
|
||||||
dbgs() << "\nAccumulating integer additions (reductions) found:\n";
|
dbgs() << "\nAccumulating integer additions (reductions) found:\n";
|
||||||
for (auto R : Reductions) {
|
for (auto &R : Reductions) {
|
||||||
dbgs() << "- "; R.Phi->dump();
|
dbgs() << "- "; R.Phi->dump();
|
||||||
dbgs() << "-> "; R.AccIntAdd->dump();
|
dbgs() << "-> "; R.AccIntAdd->dump();
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
return Reductions;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void AddMACCandidate(ParallelMACList &Candidates, const Instruction *Acc,
|
static void AddMACCandidate(OpChainList &Candidates,
|
||||||
|
const Instruction *Acc,
|
||||||
Value *MulOp0, Value *MulOp1, int MulOpNum) {
|
Value *MulOp0, Value *MulOp1, int MulOpNum) {
|
||||||
Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum));
|
Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum));
|
||||||
LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
|
LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
|
||||||
ValueList VL;
|
ValueList LHS;
|
||||||
if (IsNarrowSequence<16>(MulOp0, VL) &&
|
ValueList RHS;
|
||||||
IsNarrowSequence<16>(MulOp1, VL)) {
|
if (IsNarrowSequence<16>(MulOp0, LHS) &&
|
||||||
|
IsNarrowSequence<16>(MulOp1, RHS)) {
|
||||||
LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
|
LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
|
||||||
|
Candidates.push_back(new BinOpChain(Mul, LHS, RHS));
|
||||||
bool MayWriteMem = false;
|
|
||||||
for (auto &V : VL) {
|
|
||||||
if (dyn_cast<Instruction>(V)->mayWriteToMemory()) {
|
|
||||||
MayWriteMem = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Candidates.push_back(ParallelMAC(Mul, VL, !MayWriteMem));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static ParallelMACList MatchParallelMACs(Reduction &R) {
|
static void MatchParallelMACSequences(Reduction &R,
|
||||||
ParallelMACList Candidates;
|
OpChainList &Candidates) {
|
||||||
const Instruction *Acc = R.AccIntAdd;
|
const Instruction *Acc = R.AccIntAdd;
|
||||||
Value *A, *MulOp0, *MulOp1;
|
Value *A, *MulOp0, *MulOp1;
|
||||||
LLVM_DEBUG(dbgs() << "\n- Analysing:\t"; Acc->dump());
|
LLVM_DEBUG(dbgs() << "\n- Analysing:\t"; Acc->dump());
|
||||||
@ -473,7 +486,6 @@ static ParallelMACList MatchParallelMACs(Reduction &R) {
|
|||||||
// Because we start at the bottom of the chain, and we work our way up,
|
// Because we start at the bottom of the chain, and we work our way up,
|
||||||
// the muls are added in reverse program order to the list.
|
// the muls are added in reverse program order to the list.
|
||||||
std::reverse(Candidates.begin(), Candidates.end());
|
std::reverse(Candidates.begin(), Candidates.end());
|
||||||
return Candidates;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Collects all instructions that are not part of the MAC chains, which is the
|
// Collects all instructions that are not part of the MAC chains, which is the
|
||||||
@ -492,23 +504,23 @@ static void AliasCandidates(BasicBlock *Header, Instructions &Reads,
|
|||||||
// the memory locations accessed by the MAC-chains.
|
// the memory locations accessed by the MAC-chains.
|
||||||
// TODO: we need the read statements when we accept more complicated chains.
|
// TODO: we need the read statements when we accept more complicated chains.
|
||||||
static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
|
static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
|
||||||
Instructions &Writes, ParallelMACList &MACCandidates) {
|
Instructions &Writes, OpChainList &MACCandidates) {
|
||||||
LLVM_DEBUG(dbgs() << "Alias checks:\n");
|
LLVM_DEBUG(dbgs() << "Alias checks:\n");
|
||||||
for (auto &MAC : MACCandidates) {
|
for (auto *MAC : MACCandidates) {
|
||||||
LLVM_DEBUG(dbgs() << "mul: "; MAC.Mul->dump());
|
LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
|
||||||
|
|
||||||
// At the moment, we allow only simple chains that only consist of reads,
|
// At the moment, we allow only simple chains that only consist of reads,
|
||||||
// accumulate their result with an integer add, and thus that don't write
|
// accumulate their result with an integer add, and thus that don't write
|
||||||
// memory, and simply bail if they do.
|
// memory, and simply bail if they do.
|
||||||
if (!MAC.ReadOnly)
|
if (!MAC->ReadOnly)
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
// Now for all writes in the basic block, check that they don't alias with
|
// Now for all writes in the basic block, check that they don't alias with
|
||||||
// the memory locations accessed by our MAC-chain:
|
// the memory locations accessed by our MAC-chain:
|
||||||
for (auto *I : Writes) {
|
for (auto *I : Writes) {
|
||||||
LLVM_DEBUG(dbgs() << "- "; I->dump());
|
LLVM_DEBUG(dbgs() << "- "; I->dump());
|
||||||
assert(MAC.MemLocs.size() >= 2 && "expecting at least 2 memlocs");
|
assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
|
||||||
for (auto &MemLoc : MAC.MemLocs) {
|
for (auto &MemLoc : MAC->MemLocs) {
|
||||||
if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
|
if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
|
||||||
ModRefInfo::ModRef))) {
|
ModRefInfo::ModRef))) {
|
||||||
LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
|
LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
|
||||||
@ -522,24 +534,22 @@ static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool SetMemoryLocations(ParallelMACList &Candidates) {
|
static bool CheckMACMemory(OpChainList &Candidates) {
|
||||||
const auto Size = MemoryLocation::UnknownSize;
|
for (auto *C : Candidates) {
|
||||||
for (auto &C : Candidates) {
|
|
||||||
// A mul has 2 operands, and a narrow op consist of sext and a load; thus
|
// A mul has 2 operands, and a narrow op consist of sext and a load; thus
|
||||||
// we expect at least 4 items in this operand value list.
|
// we expect at least 4 items in this operand value list.
|
||||||
if (C.VL.size() < 4) {
|
if (C->size() < 4) {
|
||||||
LLVM_DEBUG(dbgs() << "Operand list too short.\n");
|
LLVM_DEBUG(dbgs() << "Operand list too short.\n");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
C->SetMemoryLocations();
|
||||||
|
ValueList &LHS = static_cast<BinOpChain*>(C)->LHS;
|
||||||
|
ValueList &RHS = static_cast<BinOpChain*>(C)->RHS;
|
||||||
|
|
||||||
for (unsigned i = 0; i < C.VL.size(); i += 4) {
|
// Use +=2 to skip over the expected extend instructions.
|
||||||
auto *LdOp0 = dyn_cast<LoadInst>(C.VL[i]);
|
for (unsigned i = 0, e = LHS.size(); i < e; i += 2) {
|
||||||
auto *LdOp1 = dyn_cast<LoadInst>(C.VL[i+2]);
|
if (!isa<LoadInst>(LHS[i]) || !isa<LoadInst>(RHS[i]))
|
||||||
if (!LdOp0 || !LdOp1)
|
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
C.MemLocs.push_back(MemoryLocation(LdOp0->getPointerOperand(), Size));
|
|
||||||
C.MemLocs.push_back(MemoryLocation(LdOp1->getPointerOperand(), Size));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
@ -584,17 +594,20 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
|
|||||||
dbgs() << "Loop info:\n\n"; L->dump());
|
dbgs() << "Loop info:\n\n"; L->dump());
|
||||||
|
|
||||||
bool Changed = false;
|
bool Changed = false;
|
||||||
ReductionList Reductions = MatchReductions(F, L, Header);
|
ReductionList Reductions;
|
||||||
|
MatchReductions(F, L, Header, Reductions);
|
||||||
|
|
||||||
for (auto &R : Reductions) {
|
for (auto &R : Reductions) {
|
||||||
ParallelMACList MACCandidates = MatchParallelMACs(R);
|
OpChainList MACCandidates;
|
||||||
if (!SetMemoryLocations(MACCandidates))
|
MatchParallelMACSequences(R, MACCandidates);
|
||||||
|
if (!CheckMACMemory(MACCandidates))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
R.MACCandidates = MACCandidates;
|
R.MACCandidates = MACCandidates;
|
||||||
|
|
||||||
LLVM_DEBUG(dbgs() << "MAC candidates:\n";
|
LLVM_DEBUG(dbgs() << "MAC candidates:\n";
|
||||||
for (auto &M : R.MACCandidates)
|
for (auto &M : R.MACCandidates)
|
||||||
M.Mul->dump();
|
M->Root->dump();
|
||||||
dbgs() << "\n";);
|
dbgs() << "\n";);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -609,6 +622,8 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
|
|||||||
return false;
|
return false;
|
||||||
PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates);
|
PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates);
|
||||||
Changed |= InsertParallelMACs(R, PMACPairs);
|
Changed |= InsertParallelMACs(R, PMACPairs);
|
||||||
|
for (auto *C : R.MACCandidates)
|
||||||
|
delete C;
|
||||||
}
|
}
|
||||||
|
|
||||||
LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
|
LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
|
||||||
|
Loading…
Reference in New Issue
Block a user