[ARM][NFC] ParallelDSP reorganisation

In preparation for allowing the ARMParallelDSP pass to parallelise
more than smlads, I've restructured some elements:

- The ParallelMAC struct has been renamed to BinOpChain.
- The BinOpChain struct holds two value lists, LHS and RHS, and
  inherits from the OpChain base class.
- The OpChain struct holds all the values of the represented chain
  and now contains the memory-location functionality.
- ParallelMACList becomes OpChainList, and it now holds pointers
  instead of objects.
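
In outline, the reorganised structures look like this (condensed from
the diff below; unchanged members elided):

    struct OpChain {
      Instruction *Root;     // Root instruction of the represented chain.
      ValueList AllValues;   // All values in the chain.
      MemInstList VecLd;     // List of all load instructions.
      MemLocList MemLocs;    // All memory locations read by this tree.
      bool ReadOnly = true;  // Cleared if any value may write to memory.

      void SetMemoryLocations();
      unsigned size() const { return AllValues.size(); }
    };

    struct BinOpChain : public OpChain {
      ValueList LHS;  // Narrow left hand operands.
      ValueList RHS;  // Narrow right hand operands.
    };

    using OpChainList = SmallVector<OpChain*, 8>;  // pointers, not objects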

Differential Revision: https://reviews.llvm.org/D49020

llvm-svn: 337701
Sam Parker 2018-07-23 15:25:59 +00:00
parent 59c94bec0d
commit 89a3799a69

@@ -43,38 +43,56 @@ using namespace PatternMatch;
 STATISTIC(NumSMLAD , "Number of smlad instructions generated");
 
 namespace {
-  struct ParallelMAC;
+  struct OpChain;
+  struct BinOpChain;
   struct Reduction;
 
-  using ParallelMACList = SmallVector<ParallelMAC, 8>;
+  using OpChainList = SmallVector<OpChain*, 8>;
   using ReductionList = SmallVector<Reduction, 8>;
   using ValueList = SmallVector<Value*, 8>;
   using MemInstList = SmallVector<Instruction*, 8>;
-  using PMACPair = std::pair<ParallelMAC*,ParallelMAC*>;
+  using PMACPair = std::pair<BinOpChain*,BinOpChain*>;
   using PMACPairList = SmallVector<PMACPair, 8>;
   using Instructions = SmallVector<Instruction*,16>;
   using MemLocList = SmallVector<MemoryLocation, 4>;
 
-  // 'ParallelMAC' and 'Reduction' are just some bookkeeping data structures.
+  struct OpChain {
+    Instruction *Root;
+    ValueList AllValues;
+    MemInstList VecLd;   // List of all load instructions.
+    MemLocList MemLocs;  // All memory locations read by this tree.
+    bool ReadOnly = true;
+
+    OpChain(Instruction *I, ValueList &vl) : Root(I), AllValues(vl) { }
+
+    void SetMemoryLocations() {
+      const auto Size = MemoryLocation::UnknownSize;
+      for (auto *V : AllValues) {
+        if (auto *I = dyn_cast<Instruction>(V)) {
+          if (I->mayWriteToMemory())
+            ReadOnly = false;
+          if (auto *Ld = dyn_cast<LoadInst>(V))
+            MemLocs.push_back(MemoryLocation(Ld->getPointerOperand(), Size));
+        }
+      }
+    }
+
+    unsigned size() const { return AllValues.size(); }
+  };
+
+  // 'BinOpChain' and 'Reduction' are just some bookkeeping data structures.
   // 'Reduction' contains the phi-node and accumulator statement from where we
-  // start pattern matching, and 'ParallelMAC' the multiplication
+  // start pattern matching, and 'BinOpChain' the multiplication
   // instructions that are candidates for parallel execution.
-  struct ParallelMAC {
-    Instruction *Mul;
-    ValueList VL;        // List of all (narrow) operands of this Mul
-    MemInstList VecLd;   // List of all load instructions of this Mul
-    MemLocList MemLocs;  // All memory locations read by this Mul
-
-    // The MAC-chains we currently recognise are simple chains that accumulate
-    // their results with a reducing integer add statement, and consist of
-    // a chain of adds and muls, which have only sext and load instructions as
-    // operands. Thus, these chains don't write memory. We check that this is
-    // true when we collect the operands, and use this in alias analysis checks
-    // that different parallel MACs don't interfere with each other.
-    bool ReadOnly;
-
-    ParallelMAC(Instruction *I, ValueList &V, bool RdOnly)
-      : Mul(I), VL(V), ReadOnly(RdOnly) {};
+  struct BinOpChain : public OpChain {
+    ValueList LHS;  // List of all (narrow) left hand operands.
+    ValueList RHS;  // List of all (narrow) right hand operands.
+
+    BinOpChain(Instruction *I, ValueList &lhs, ValueList &rhs) :
+      OpChain(I, lhs), LHS(lhs), RHS(rhs) {
+        for (auto *V : RHS)
+          AllValues.push_back(V);
+      }
   };
 
   struct Reduction {
@@ -83,7 +101,7 @@ namespace {
     Instruction *AccIntAdd;        // The accumulating integer add statement,
                                    // i.e, the reduction statement.
-    ParallelMACList MACCandidates; // The MAC candidates associated with
+    OpChainList MACCandidates;     // The MAC candidates associated with
                                    // this reduction statement.
     Reduction (PHINode *P, Instruction *Acc) : Phi(P), AccIntAdd(Acc) { };
   };
@@ -100,7 +118,7 @@ namespace {
     bool InsertParallelMACs(Reduction &Reduction, PMACPairList &PMACPairs);
     bool AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1, MemInstList &VecMem);
-    PMACPairList CreateParallelMACPairs(ParallelMACList &Candidates);
+    PMACPairList CreateParallelMACPairs(OpChainList &Candidates);
     Instruction *CreateSMLADCall(LoadInst *VecLd0, LoadInst *VecLd1,
                                  Instruction *Acc, Instruction *InsertAfter);
@@ -303,7 +321,7 @@ bool ARMParallelDSP::AreSequentialLoads(LoadInst *Ld0, LoadInst *Ld1,
 }
 
 PMACPairList
-ARMParallelDSP::CreateParallelMACPairs(ParallelMACList &Candidates) {
+ARMParallelDSP::CreateParallelMACPairs(OpChainList &Candidates) {
   const unsigned Elems = Candidates.size();
   PMACPairList PMACPairs;
@@ -314,10 +332,10 @@ ARMParallelDSP::CreateParallelMACPairs(ParallelMACList &Candidates) {
   // We can compare all elements, but then we need to compare and evaluate
   // different solutions.
   for(unsigned i=0; i<Elems-1; i+=2) {
-    ParallelMAC &PMul0 = Candidates[i];
-    ParallelMAC &PMul1 = Candidates[i+1];
-    const Instruction *Mul0 = PMul0.Mul;
-    const Instruction *Mul1 = PMul1.Mul;
+    BinOpChain *PMul0 = static_cast<BinOpChain*>(Candidates[i]);
+    BinOpChain *PMul1 = static_cast<BinOpChain*>(Candidates[i+1]);
+    const Instruction *Mul0 = PMul0->Root;
+    const Instruction *Mul1 = PMul1->Root;
 
     if (Mul0 == Mul1)
       continue;
@@ -326,10 +344,13 @@ ARMParallelDSP::CreateParallelMACPairs(ParallelMACList &Candidates) {
                dbgs() << "- "; Mul0->dump();
                dbgs() << "- "; Mul1->dump());
 
-    const ValueList &VL0 = PMul0.VL;
-    const ValueList &VL1 = PMul1.VL;
+    const ValueList &Mul0_LHS = PMul0->LHS;
+    const ValueList &Mul0_RHS = PMul0->RHS;
+    const ValueList &Mul1_LHS = PMul1->LHS;
+    const ValueList &Mul1_RHS = PMul1->RHS;
 
-    if (!AreSymmetrical(VL0, VL1))
+    if (!AreSymmetrical(Mul0_LHS, Mul1_LHS) ||
+        !AreSymmetrical(Mul0_RHS, Mul1_RHS))
       continue;
 
     LLVM_DEBUG(dbgs() << "OK: mul operands list match:\n");
@@ -337,23 +358,23 @@ ARMParallelDSP::CreateParallelMACPairs(ParallelMACList &Candidates) {
     // that its two pairs of consecutive loads, then these can be transformed
     // into two wider loads and the users can be replaced with DSP
     // intrinsics.
-    for (unsigned x = 0; x < VL0.size(); x += 4) {
-      auto *Ld0 = dyn_cast<LoadInst>(VL0[x]);
-      auto *Ld1 = dyn_cast<LoadInst>(VL1[x]);
-      auto *Ld2 = dyn_cast<LoadInst>(VL0[x+2]);
-      auto *Ld3 = dyn_cast<LoadInst>(VL1[x+2]);
+    for (unsigned x = 0; x < Mul0_LHS.size(); x += 2) {
+      auto *Ld0 = dyn_cast<LoadInst>(Mul0_LHS[x]);
+      auto *Ld1 = dyn_cast<LoadInst>(Mul1_LHS[x]);
+      auto *Ld2 = dyn_cast<LoadInst>(Mul0_RHS[x]);
+      auto *Ld3 = dyn_cast<LoadInst>(Mul1_RHS[x]);
 
       LLVM_DEBUG(dbgs() << "Looking at operands " << x << ":\n";
-                 dbgs() << "\t mul1: "; VL0[x]->dump();
-                 dbgs() << "\t mul2: "; VL1[x]->dump();
+                 dbgs() << "\t mul1: "; Mul0_LHS[x]->dump();
+                 dbgs() << "\t mul2: "; Mul1_LHS[x]->dump();
                  dbgs() << "and operands " << x + 2 << ":\n";
-                 dbgs() << "\t mul1: "; VL0[x+2]->dump();
-                 dbgs() << "\t mul2: "; VL1[x+2]->dump());
+                 dbgs() << "\t mul1: "; Mul0_RHS[x]->dump();
+                 dbgs() << "\t mul2: "; Mul1_RHS[x]->dump());
 
-      if (AreSequentialLoads(Ld0, Ld1, Candidates[i].VecLd) &&
-          AreSequentialLoads(Ld2, Ld3, Candidates[i+1].VecLd)) {
+      if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd) &&
+          AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
         LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
-        PMACPairs.push_back(std::make_pair(&PMul0, &PMul1));
+        PMACPairs.push_back(std::make_pair(PMul0, PMul1));
       }
     }
   }
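
For context, the pairing above is what ultimately enables an smlad: the
instruction computes acc + (lo16(x) * lo16(y)) + (hi16(x) * hi16(y)) in
one step, so two candidate muls can be fused only when their narrow
operands come from sequential i16 loads that can be widened into a
single i32 load each. An illustrative source loop (not taken from the
patch) that exhibits the pattern:

    // Illustrative only: the classic 16-bit dot product that
    // ARMParallelDSP can lower to a widened-load + SMLAD loop when
    // a[i]/a[i+1] and b[i]/b[i+1] are sequential in memory.
    #include <cstdint>

    int32_t dot(const int16_t *a, const int16_t *b, unsigned n) {
      int32_t acc = 0;
      for (unsigned i = 0; i < n; ++i)
        acc += int32_t(a[i]) * int32_t(b[i]);  // sext(load) * sext(load)
      return acc;
    }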
@@ -367,8 +388,8 @@ bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction,
   for (auto &Pair : PMACPairs) {
     LLVM_DEBUG(dbgs() << "Found parallel MACs!!\n";
-               dbgs() << "- "; Pair.first->Mul->dump();
-               dbgs() << "- "; Pair.second->Mul->dump());
+               dbgs() << "- "; Pair.first->Root->dump();
+               dbgs() << "- "; Pair.second->Root->dump());
     auto *VecLd0 = cast<LoadInst>(Pair.first->VecLd[0]);
     auto *VecLd1 = cast<LoadInst>(Pair.second->VecLd[0]);
     Acc = CreateSMLADCall(VecLd0, VecLd1, Acc, InsertAfter);
@@ -383,9 +404,8 @@ bool ARMParallelDSP::InsertParallelMACs(Reduction &Reduction,
   return false;
 }
 
-static ReductionList MatchReductions(Function &F, Loop *TheLoop,
-                                     BasicBlock *Header) {
-  ReductionList Reductions;
+static void MatchReductions(Function &F, Loop *TheLoop, BasicBlock *Header,
+                            ReductionList &Reductions) {
   RecurrenceDescriptor RecDesc;
   const bool HasFnNoNaNAttr =
     F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
@@ -394,7 +414,7 @@ static ReductionList MatchReductions(Function &F, Loop *TheLoop,
   // We need a preheader as getIncomingValueForBlock assumes there is one.
   if (!TheLoop->getLoopPreheader()) {
     LLVM_DEBUG(dbgs() << "No preheader found, bailing out\n");
-    return Reductions;
+    return;
   }
 
   for (PHINode &Phi : Header->phis()) {
@@ -418,36 +438,29 @@ static ReductionList MatchReductions(Function &F, Loop *TheLoop,
   LLVM_DEBUG(
     dbgs() << "\nAccumulating integer additions (reductions) found:\n";
-    for (auto R : Reductions) {
+    for (auto &R : Reductions) {
       dbgs() << "- "; R.Phi->dump();
       dbgs() << "-> "; R.AccIntAdd->dump();
     }
   );
-  return Reductions;
 }
 
-static void AddMACCandidate(ParallelMACList &Candidates, const Instruction *Acc,
+static void AddMACCandidate(OpChainList &Candidates,
+                            const Instruction *Acc,
                             Value *MulOp0, Value *MulOp1, int MulOpNum) {
   Instruction *Mul = dyn_cast<Instruction>(Acc->getOperand(MulOpNum));
   LLVM_DEBUG(dbgs() << "OK, found acc mul:\t"; Mul->dump());
-  ValueList VL;
-  if (IsNarrowSequence<16>(MulOp0, VL) &&
-      IsNarrowSequence<16>(MulOp1, VL)) {
+  ValueList LHS;
+  ValueList RHS;
+  if (IsNarrowSequence<16>(MulOp0, LHS) &&
+      IsNarrowSequence<16>(MulOp1, RHS)) {
     LLVM_DEBUG(dbgs() << "OK, found narrow mul: "; Mul->dump());
-
-    bool MayWriteMem = false;
-    for (auto &V : VL) {
-      if (dyn_cast<Instruction>(V)->mayWriteToMemory()) {
-        MayWriteMem = true;
-        break;
-      }
-    }
-    Candidates.push_back(ParallelMAC(Mul, VL, !MayWriteMem));
+    Candidates.push_back(new BinOpChain(Mul, LHS, RHS));
   }
 }
 
-static ParallelMACList MatchParallelMACs(Reduction &R) {
-  ParallelMACList Candidates;
+static void MatchParallelMACSequences(Reduction &R,
+                                      OpChainList &Candidates) {
   const Instruction *Acc = R.AccIntAdd;
   Value *A, *MulOp0, *MulOp1;
   LLVM_DEBUG(dbgs() << "\n- Analysing:\t"; Acc->dump());
@@ -473,7 +486,6 @@ static ParallelMACList MatchParallelMACs(Reduction &R) {
   // Because we start at the bottom of the chain, and we work our way up,
   // the muls are added in reverse program order to the list.
   std::reverse(Candidates.begin(), Candidates.end());
-  return Candidates;
 }
 
 // Collects all instructions that are not part of the MAC chains, which is the
@@ -492,23 +504,23 @@ static void AliasCandidates(BasicBlock *Header, Instructions &Reads,
 // the memory locations accessed by the MAC-chains.
 // TODO: we need the read statements when we accept more complicated chains.
 static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
-                       Instructions &Writes, ParallelMACList &MACCandidates) {
+                       Instructions &Writes, OpChainList &MACCandidates) {
   LLVM_DEBUG(dbgs() << "Alias checks:\n");
-  for (auto &MAC : MACCandidates) {
-    LLVM_DEBUG(dbgs() << "mul: "; MAC.Mul->dump());
+  for (auto *MAC : MACCandidates) {
+    LLVM_DEBUG(dbgs() << "mul: "; MAC->Root->dump());
 
     // At the moment, we allow only simple chains that only consist of reads,
     // accumulate their result with an integer add, and thus that don't write
     // memory, and simply bail if they do.
-    if (!MAC.ReadOnly)
+    if (!MAC->ReadOnly)
       return true;
 
     // Now for all writes in the basic block, check that they don't alias with
     // the memory locations accessed by our MAC-chain:
     for (auto *I : Writes) {
       LLVM_DEBUG(dbgs() << "- "; I->dump());
-      assert(MAC.MemLocs.size() >= 2 && "expecting at least 2 memlocs");
-      for (auto &MemLoc : MAC.MemLocs) {
+      assert(MAC->MemLocs.size() >= 2 && "expecting at least 2 memlocs");
+      for (auto &MemLoc : MAC->MemLocs) {
         if (isModOrRefSet(intersectModRef(AA->getModRefInfo(I, MemLoc),
                                           ModRefInfo::ModRef))) {
           LLVM_DEBUG(dbgs() << "Yes, aliases found\n");
@@ -522,24 +534,22 @@ static bool AreAliased(AliasAnalysis *AA, Instructions &Reads,
   return false;
 }
 
-static bool SetMemoryLocations(ParallelMACList &Candidates) {
-  const auto Size = MemoryLocation::UnknownSize;
-  for (auto &C : Candidates) {
+static bool CheckMACMemory(OpChainList &Candidates) {
+  for (auto *C : Candidates) {
     // A mul has 2 operands, and a narrow op consist of sext and a load; thus
     // we expect at least 4 items in this operand value list.
-    if (C.VL.size() < 4) {
+    if (C->size() < 4) {
       LLVM_DEBUG(dbgs() << "Operand list too short.\n");
       return false;
     }
+    C->SetMemoryLocations();
+    ValueList &LHS = static_cast<BinOpChain*>(C)->LHS;
+    ValueList &RHS = static_cast<BinOpChain*>(C)->RHS;
 
-    for (unsigned i = 0; i < C.VL.size(); i += 4) {
-      auto *LdOp0 = dyn_cast<LoadInst>(C.VL[i]);
-      auto *LdOp1 = dyn_cast<LoadInst>(C.VL[i+2]);
-      if (!LdOp0 || !LdOp1)
+    // Use +=2 to skip over the expected extend instructions.
+    for (unsigned i = 0, e = LHS.size(); i < e; i += 2) {
+      if (!isa<LoadInst>(LHS[i]) || !isa<LoadInst>(RHS[i]))
         return false;
-
-      C.MemLocs.push_back(MemoryLocation(LdOp0->getPointerOperand(), Size));
-      C.MemLocs.push_back(MemoryLocation(LdOp1->getPointerOperand(), Size));
     }
   }
   return true;
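
Two details of CheckMACMemory follow from how IsNarrowSequence lays out
the operand lists: each narrow mul operand contributes its load at an
even index and its extend at the following odd index, so one mul yields
at least two entries per side (at least four values in total), and
stepping by two visits each load while skipping the extends. A layout
sketch, inferred from the checks above, for a single accumulated
product:

    // Inferred layout (illustrative) for: acc += (int32_t)a[i] * (int32_t)b[i]
    //
    //   LHS = { load a[i], sext(load a[i]) }   // load at index 0, extend at 1
    //   RHS = { load b[i], sext(load b[i]) }
    //
    // Hence C->size() == LHS.size() + RHS.size() >= 4, and the i += 2
    // stride in the loop above lands on every load.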
@@ -584,17 +594,20 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
              dbgs() << "Loop info:\n\n"; L->dump());
 
   bool Changed = false;
-  ReductionList Reductions = MatchReductions(F, L, Header);
+  ReductionList Reductions;
+  MatchReductions(F, L, Header, Reductions);
 
   for (auto &R : Reductions) {
-    ParallelMACList MACCandidates = MatchParallelMACs(R);
-    if (!SetMemoryLocations(MACCandidates))
+    OpChainList MACCandidates;
+    MatchParallelMACSequences(R, MACCandidates);
+    if (!CheckMACMemory(MACCandidates))
       continue;
 
     R.MACCandidates = MACCandidates;
 
     LLVM_DEBUG(dbgs() << "MAC candidates:\n";
               for (auto &M : R.MACCandidates)
-                 M.Mul->dump();
+                 M->Root->dump();
               dbgs() << "\n";);
   }
@@ -609,6 +622,8 @@ bool ARMParallelDSP::MatchSMLAD(Function &F) {
       return false;
 
     PMACPairList PMACPairs = CreateParallelMACPairs(R.MACCandidates);
     Changed |= InsertParallelMACs(R, PMACPairs);
+    for (auto *C : R.MACCandidates)
+      delete C;
   }
 
   LLVM_DEBUG(if (Changed) dbgs() << "Header block:\n"; Header->dump(););
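
Note that MatchSMLAD now deletes the BinOpChain candidates through
OpChain pointers; for that delete to be well-defined C++, OpChain needs
a virtual destructor, which the struct above does not declare. A
minimal sketch of a hypothetical alternative ownership model (not what
this patch does) that sidesteps the manual delete loop:

    // Hypothetical alternative: let the container own the chains, so
    // they are freed automatically and no explicit delete is needed.
    #include <memory>
    #include <vector>

    struct OpChain { virtual ~OpChain() = default; };  // stub for the sketch
    struct BinOpChain : OpChain { };                   // stub for the sketch

    using OpChainList = std::vector<std::unique_ptr<OpChain>>;

    int main() {
      OpChainList Candidates;
      Candidates.push_back(std::make_unique<BinOpChain>());
      // Chains are destroyed when Candidates goes out of scope.
    }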