R600: Proper insert S_WAITCNT instructions

Some instructions like memory reads/writes are executed asynchronously, so we need to insert S_WAITCNT instructions to block before accessing their results. Previously we have just inserted S_WAITCNT instructions after each async instruction, this patch fixes this and adds a prober insertion pass. Patch by: Christian König Tested-by: Michel Dänzer <michel.daenzer@amd.com> Reviewed-by: Tom Stellard <thomas.stellard@amd.com> Signed-off-by: Christian König <deathsimple@vodafone.de> git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@172846 91177308-0d34-0410-b5e6-96231b3b80d8
2025-01-08 21:10:35 +00:00 · 2013-01-18 21:15:53 +00:00 · 2013-01-18 21:15:53 +00:00 · 82d3d4524f
commit 82d3d4524f
parent 935a91540b
8 changed files with 379 additions and 32 deletions
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@ -30,6 +30,7 @@ FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm);
 FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 FunctionPass *createSILowerLiteralConstantsPass(TargetMachine &tm);
+FunctionPass *createSIInsertWaits(TargetMachine &tm);

 // Passes common to R600 and SI
 Pass *createAMDGPUStructurizeCFGPass();
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@ -116,6 +116,11 @@ bool AMDGPUPassConfig::addPreRegAlloc() {
 }

 bool AMDGPUPassConfig::addPostRegAlloc() {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+
+  if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+    addPass(createSIInsertWaits(*TM));
+  }
  return false;
 }

--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@ -40,6 +40,7 @@ add_llvm_target(R600CodeGen
  R600RegisterInfo.cpp
  SIAnnotateControlFlow.cpp
  SIAssignInterpRegs.cpp
+  SIInsertWaits.cpp
  SIInstrInfo.cpp
  SIISelLowering.cpp
  SILowerLiteralConstants.cpp
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@ -66,11 +66,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
  MachineRegisterInfo & MRI = BB->getParent()->getRegInfo();
  MachineBasicBlock::iterator I = MI;

-  if (TII->get(MI->getOpcode()).TSFlags & SIInstrFlags::NEED_WAIT) {
-    AppendS_WAITCNT(MI, *BB, llvm::next(I));
-    return BB;
-  }
-
  switch (MI->getOpcode()) {
  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
@ -141,13 +136,6 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
  return BB;
 }

-void SITargetLowering::AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB,
-    MachineBasicBlock::iterator I) const {
-  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WAITCNT))
-          .addImm(0);
-}
-
-
 void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
    MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const {
  BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC)
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@ -23,11 +23,6 @@ namespace llvm {
 class SITargetLowering : public AMDGPUTargetLowering {
  const SIInstrInfo * TII;

-  /// Memory reads and writes are syncronized using the S_WAITCNT instruction.
-  /// This function takes the most conservative approach and inserts an
-  /// S_WAITCNT instruction after every read and write.
-  void AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB,
-              MachineBasicBlock::iterator I) const;
  void LowerMOV_IMM(MachineInstr *MI, MachineBasicBlock &BB,
              MachineBasicBlock::iterator I, unsigned Opocde) const;
  void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@ -0,0 +1,353 @@
+//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Insert wait instructions for memory reads and writes.
+///
+/// Memory reads and writes are issued asynchronously, so we need to insert
+/// S_WAITCNT instructions when we want to access any of their results or
+/// overwrite any register that's used asynchronously.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+/// \brief One variable for each of the hardware counters
+typedef union {
+  struct {
+    unsigned VM;
+    unsigned EXP;
+    unsigned LGKM;
+  } Named;
+  unsigned Array[3];
+
+} Counters;
+
+typedef Counters RegCounters[512];
+typedef std::pair<unsigned, unsigned> RegInterval;
+
+class SIInsertWaits : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const SIInstrInfo *TII;
+  const SIRegisterInfo &TRI;
+  const MachineRegisterInfo *MRI;
+
+  /// \brief Constant hardware limits
+  static const Counters WaitCounts;
+
+  /// \brief Constant zero value
+  static const Counters ZeroCounts;
+
+  /// \brief Counter values we have already waited on.
+  Counters WaitedOn;
+
+  /// \brief Counter values for last instruction issued.
+  Counters LastIssued;
+
+  /// \brief Registers used by async instructions.
+  RegCounters UsedRegs;
+
+  /// \brief Registers defined by async instructions.
+  RegCounters DefinedRegs;
+
+  /// \brief Different export instruction types seen since last wait.
+  unsigned ExpInstrTypesSeen;
+
+  /// \brief Get increment/decrement amount for this instruction.
+  Counters getHwCounts(MachineInstr &MI);
+
+  /// \brief Is operand relevant for async execution?
+  bool isOpRelevant(MachineOperand &Op);
+
+  /// \brief Get register interval an operand affects.
+  RegInterval getRegInterval(MachineOperand &Op);
+
+  /// \brief Handle instructions async components
+  void pushInstruction(MachineInstr &MI);
+
+  /// \brief Insert the actual wait instruction
+  bool insertWait(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator I,
+                  const Counters &Counts);
+
+  /// \brief Resolve all operand dependencies to counter requirements
+  Counters handleOperands(MachineInstr &MI);
+
+public:
+  SIInsertWaits(TargetMachine &tm) :
+    MachineFunctionPass(ID),
+    TII(static_cast<const SIInstrInfo*>(tm.getInstrInfo())),
+    TRI(TII->getRegisterInfo()) { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  const char *getPassName() const {
+    return "SI insert wait  instructions";
+  }
+
+};
+
+} // End anonymous namespace
+
+char SIInsertWaits::ID = 0;
+
+const Counters SIInsertWaits::WaitCounts = { { 15, 7, 7 } };
+const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
+
+FunctionPass *llvm::createSIInsertWaits(TargetMachine &tm) {
+  return new SIInsertWaits(tm);
+}
+
+Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
+
+  uint64_t TSFlags = TII->get(MI.getOpcode()).TSFlags;
+  Counters Result;
+
+  Result.Named.VM = !!(TSFlags & SIInstrFlags::VM_CNT);
+
+  // Only consider stores or EXP for EXP_CNT
+  Result.Named.EXP = !!(TSFlags & SIInstrFlags::EXP_CNT &&
+      (MI.getOpcode() == AMDGPU::EXP || !MI.getDesc().mayStore()));
+
+  // LGKM may uses larger values
+  if (TSFlags & SIInstrFlags::LGKM_CNT) {
+
+    MachineOperand &Op = MI.getOperand(0);
+    assert(Op.isReg() && "First LGKM operand must be a register!");
+
+    unsigned Reg = Op.getReg();
+    unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
+    Result.Named.LGKM = Size > 4 ? 2 : 1;
+
+  } else {
+    Result.Named.LGKM = 0;
+  }
+
+  return Result;
+}
+
+bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
+
+  // Constants are always irrelevant
+  if (!Op.isReg())
+    return false;
+
+  // Defines are always relevant
+  if (Op.isDef())
+    return true;
+
+  // For exports all registers are relevant
+  MachineInstr &MI = *Op.getParent();
+  if (MI.getOpcode() == AMDGPU::EXP)
+    return true;
+
+  // For stores the stored value is also relevant
+  if (!MI.getDesc().mayStore())
+    return false;
+
+  for (MachineInstr::mop_iterator I = MI.operands_begin(),
+       E = MI.operands_end(); I != E; ++I) {
+
+    if (I->isReg() && I->isUse())
+      return Op.isIdenticalTo(*I);
+  }
+
+  return false;
+}
+
+RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
+
+  if (!Op.isReg())
+    return std::make_pair(0, 0);
+
+  unsigned Reg = Op.getReg();
+  unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
+
+  assert(Size >= 4);
+
+  RegInterval Result;
+  Result.first = TRI.getEncodingValue(Reg);
+  Result.second = Result.first + Size / 4;
+
+  return Result;
+}
+
+void SIInsertWaits::pushInstruction(MachineInstr &MI) {
+
+  // Get the hardware counter increments and sum them up
+  Counters Increment = getHwCounts(MI);
+  unsigned Sum = 0;
+
+  for (unsigned i = 0; i < 3; ++i) {
+    LastIssued.Array[i] += Increment.Array[i];
+    Sum += Increment.Array[i];
+  }
+
+  // If we don't increase anything then that's it
+  if (Sum == 0)
+    return;
+
+  // Remember which export instructions we have seen
+  if (Increment.Named.EXP) {
+    ExpInstrTypesSeen |= MI.getOpcode() == AMDGPU::EXP ? 1 : 2;
+  }
+
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+
+    MachineOperand &Op = MI.getOperand(i);
+    if (!isOpRelevant(Op))
+      continue;
+
+    RegInterval Interval = getRegInterval(Op);
+    for (unsigned j = Interval.first; j < Interval.second; ++j) {
+
+      // Remember which registers we define
+      if (Op.isDef())
+        DefinedRegs[j] = LastIssued;
+
+      // and which one we are using
+      if (Op.isUse())
+        UsedRegs[j] = LastIssued;
+    }
+  }
+}
+
+bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I,
+                               const Counters &Required) {
+
+  // End of program? No need to wait on anything
+  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM)
+    return false;
+
+  // Figure out if the async instructions execute in order
+  bool Ordered[3];
+
+  // VM_CNT is always ordered
+  Ordered[0] = true;
+
+  // EXP_CNT is unordered if we have both EXP & VM-writes
+  Ordered[1] = ExpInstrTypesSeen == 3;
+
+  // LGKM_CNT is handled as always unordered. TODO: Handle LDS and GDS
+  Ordered[2] = false;
+
+  // The values we are going to put into the S_WAITCNT instruction
+  Counters Counts = WaitCounts;
+
+  // Do we really need to wait?
+  bool NeedWait = false;
+
+  for (unsigned i = 0; i < 3; ++i) {
+
+    if (Required.Array[i] <= WaitedOn.Array[i])
+      continue;
+
+    NeedWait = true;
+    
+    if (Ordered[i]) {
+      unsigned Value = LastIssued.Array[i] - Required.Array[i];
+
+      // adjust the value to the real hardware posibilities
+      Counts.Array[i] = std::min(Value, WaitCounts.Array[i]);
+
+    } else
+      Counts.Array[i] = 0;
+
+    // Remember on what we have waited on
+    WaitedOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
+  }
+
+  if (!NeedWait)
+    return false;
+
+  // Reset EXP_CNT instruction types
+  if (Counts.Named.EXP == 0)
+    ExpInstrTypesSeen = 0;
+
+  // Build the wait instruction
+  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+          .addImm((Counts.Named.VM & 0xF) |
+                  ((Counts.Named.EXP & 0x7) << 4) |
+                  ((Counts.Named.LGKM & 0x7) << 8));
+
+  return true;
+}
+
+/// \brief helper function for handleOperands
+static void increaseCounters(Counters &Dst, const Counters &Src) {
+
+  for (unsigned i = 0; i < 3; ++i)
+    Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
+}
+
+Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
+
+  Counters Result = ZeroCounts;
+
+  // For each register affected by this
+  // instruction increase the result sequence
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+
+    MachineOperand &Op = MI.getOperand(i);
+    RegInterval Interval = getRegInterval(Op);
+    for (unsigned j = Interval.first; j < Interval.second; ++j) {
+
+      if (Op.isDef())
+        increaseCounters(Result, UsedRegs[j]);
+
+      if (Op.isUse())
+        increaseCounters(Result, DefinedRegs[j]);
+    }
+  }
+
+  return Result;
+}
+
+bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
+
+  bool Changes = false;
+
+  MRI = &MF.getRegInfo();
+
+  WaitedOn = ZeroCounts;
+  LastIssued = ZeroCounts;
+
+  memset(&UsedRegs, 0, sizeof(UsedRegs));
+  memset(&DefinedRegs, 0, sizeof(DefinedRegs));
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+
+      Changes |= insertWait(MBB, I, handleOperands(*I));
+      pushInstruction(*I);
+    }
+
+    // Wait for everything at the end of the MBB
+    Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
+  }
+
+  return Changes;
+}
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@ -55,7 +55,9 @@ public:
 namespace SIInstrFlags {
  enum Flags {
    // First 4 bits are the instruction encoding
-    NEED_WAIT = 1 << 4
+    VM_CNT = 1 << 4,
+    EXP_CNT = 1 << 5,
+    LGKM_CNT = 1 << 6
  };
 }

--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@ -42,11 +42,14 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
    AMDGPUInst<outs, ins, asm, pattern> {

  field bits<4> EncodingType = 0;
-  field bits<1> NeedWait = 0;
+  field bits<1> VM_CNT = 0;
+  field bits<1> EXP_CNT = 0;
+  field bits<1> LGKM_CNT = 0;

  let TSFlags{3-0} = EncodingType;
-  let TSFlags{4} = NeedWait;
-
+  let TSFlags{4} = VM_CNT;
+  let TSFlags{5} = EXP_CNT;
+  let TSFlags{6} = LGKM_CNT;
 }

 class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> :
@ -140,8 +143,7 @@ def EXP : Enc64<
  let Inst{63-56} = VSRC3;
  let EncodingType = 0; //SIInstrEncodingType::EXP

-  let NeedWait = 1;
-  let usesCustomInserter = 1;
+  let EXP_CNT = 1;
 }

 class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
@ -174,11 +176,10 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
  let Inst{47-40} = VDATA;
  let Inst{52-48} = SRSRC;
  let Inst{57-53} = SSAMP;
-
  let EncodingType = 2; //SIInstrEncodingType::MIMG

-  let NeedWait = 1;
-  let usesCustomInserter = 1;
+  let VM_CNT = 1;
+  let EXP_CNT = 1;
 }

 class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
@ -215,8 +216,9 @@ class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> :
  let Inst{63-56} = SOFFSET;
  let EncodingType = 3; //SIInstrEncodingType::MTBUF

-  let NeedWait = 1;
-  let usesCustomInserter = 1;
+  let VM_CNT = 1;
+  let EXP_CNT = 1;
+
  let neverHasSideEffects = 1;
 }

@ -252,8 +254,9 @@ class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
  let Inst{63-56} = SOFFSET;
  let EncodingType = 4; //SIInstrEncodingType::MUBUF

-  let NeedWait = 1;
-  let usesCustomInserter = 1;
+  let VM_CNT = 1;
+  let EXP_CNT = 1;
+
  let neverHasSideEffects = 1;
 }

@ -276,8 +279,7 @@ class SMRD <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
  let Inst{31-27} = 0x18; //encoding
  let EncodingType = 5; //SIInstrEncodingType::SMRD

-  let NeedWait = 1;
-  let usesCustomInserter = 1;
+  let LGKM_CNT = 1;
 }

 class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :