AMDGPU: Do not clobber SCC in SIWholeQuadMode

Reviewers: arsenm, tstellarAMD, mareko
Subscribers: arsenm, llvm-commits, kzhuravl
Differential Revision: http://reviews.llvm.org/D22198
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@281230 91177308-0d34-0410-b5e6-96231b3b80d8
2025-04-04 14:22:26 +00:00 · 2016-09-12 16:25:20 +00:00 · 2016-09-12 16:25:20 +00:00 · 01a133c760
commit 01a133c760
parent ce3adfd9b4
3 changed files with 250 additions and 78 deletions
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@ -343,11 +343,6 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                              const DebugLoc &DL, unsigned DestReg,
                              unsigned SrcReg, bool KillSrc) const {
  // If we are trying to copy to or from SCC, there is a bug somewhere else in
  // the backend.  While it may be theoretically possible to do this, it should
  // never be necessary.
  assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);
  static const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
@ -392,6 +387,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
  ArrayRef<int16_t> SubIndices;
  if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
    if (SrcReg == AMDGPU::SCC) {
      BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
          .addImm(-1)
          .addImm(0);
      return;
    }
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
@ -418,6 +420,12 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;
  } else if (DestReg == AMDGPU::SCC) {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
        .addReg(SrcReg, getKillRegState(KillSrc))
        .addImm(0);
    return;
  } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
--- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@ -129,6 +129,14 @@ private:
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);
  bool requiresCorrectState(const MachineInstr &MI) const;
  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SaveWQM, unsigned LiveMaskReg);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
@ -398,32 +406,140 @@ char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  return GlobalFlags;
 }
 /// Whether \p MI really requires the exec state computed during analysis.
 ///
 /// Scalar instructions must occasionally be marked WQM for correct propagation
 /// (e.g. thread masks leading up to branches), but when it comes to actual
 /// execution, they don't care about EXEC.
 bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
  if (MI.isTerminator())
    return true;
  // Skip instructions that are not affected by EXEC
  if (TII->isScalarUnit(MI))
    return false;
  // Generic instructions such as COPY will either disappear by register
  // coalescing or be lowered to SALU or VALU instructions.
  if (MI.isTransient()) {
    if (MI.getNumExplicitOperands() >= 1) {
      const MachineOperand &Op = MI.getOperand(0);
      if (Op.isReg()) {
        if (TRI->isSGPRReg(*MRI, Op.getReg())) {
          // SGPR instructions are not affected by EXEC
          return false;
        }
      }
    }
  }
  return true;
 }
 // Emit a save/restore pair for SCC immediately in front of \p Before.
 //
 // The first COPY moves SCC into a freshly created SReg_32 virtual register
 // and the second COPY moves that register back into SCC. Both instructions
 // are registered with LiveIntervals, and the live interval of the new
 // virtual register is computed afterwards.
 //
 // Returns an iterator to the restoring COPY, so instructions inserted before
 // the returned position end up between the save and the restore.
 MachineBasicBlock::iterator
 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  unsigned SCCSaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  const MCInstrDesc &CopyDesc = TII->get(AMDGPU::COPY);

  // SCCSaveReg <- SCC
  MachineInstr *SaveMI =
      BuildMI(MBB, Before, DebugLoc(), CopyDesc, SCCSaveReg)
          .addReg(AMDGPU::SCC);
  // SCC <- SCCSaveReg
  MachineInstr *RestoreMI =
      BuildMI(MBB, Before, DebugLoc(), CopyDesc, AMDGPU::SCC)
          .addReg(SCCSaveReg);

  // Keep the slot index maps and live intervals in sync with the new code.
  LIS->InsertMachineInstrInMaps(*SaveMI);
  LIS->InsertMachineInstrInMaps(*RestoreMI);
  LIS->createAndComputeVirtRegInterval(SCCSaveReg);

  return MachineBasicBlock::iterator(RestoreMI);
 }
 // Pick a position in the inclusive range [First, Last] at which instructions
 // can be inserted safely, bearing in mind that some of the instructions we
 // want to add necessarily clobber SCC.
 //
 // If \p SaveSCC is false the range end selected by \p PreferLast is returned
 // unchanged. Otherwise the search starts at that end and, for as long as the
 // current slot lies inside a segment of the SCC live range, moves to the
 // start of that segment (PreferLast) or just past its end (!PreferLast),
 // stopping when that would leave [First, Last]. If the chosen slot is still
 // inside a segment, an SCC save/restore pair is emitted through saveSCC()
 // and the iterator it returns is handed back.
 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC) {
    if (PreferLast)
      return Last;
    return First;
  }

  LiveRange &SCCLive = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
  MachineBasicBlock::iterator BlockEnd = MBB.end();

  // Slot index of an iterator; the end iterator maps to the block end index.
  auto SlotOf = [&](MachineBasicBlock::iterator It) {
    return It != BlockEnd ? LIS->getInstructionIndex(*It)
                          : LIS->getMBBEndIdx(&MBB);
  };
  const SlotIndex LowIdx = SlotOf(First);
  const SlotIndex HighIdx = SlotOf(Last);

  SlotIndex Idx = PreferLast ? HighIdx : LowIdx;
  const LiveRange::Segment *Seg = nullptr;
  while ((Seg = SCCLive.getSegmentContaining(Idx))) {
    if (PreferLast) {
      // Walk backwards to where the segment containing Idx begins.
      const SlotIndex Candidate = Seg->start.getBaseIndex();
      if (Candidate < LowIdx)
        break;
      Idx = Candidate;
    } else {
      // Walk forwards to the first slot after the segment containing Idx.
      const SlotIndex Candidate = Seg->end.getNextIndex().getBaseIndex();
      if (Candidate > HighIdx)
        break;
      Idx = Candidate;
    }
  }

  MachineBasicBlock::iterator InsertPt = MBB.end();
  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx)) {
    InsertPt = MI;
  } else {
    // No instruction at this slot: it has to be the end of the block.
    assert(Idx == LIS->getMBBEndIdx(&MBB));
  }

  // A non-null segment means the search stopped while still inside the SCC
  // live range, so SCC has to be saved and restored around the insertion.
  return Seg ? saveSCC(MBB, InsertPt) : InsertPt;
 }
 // Insert, in front of \p Before, the instruction that combines EXEC with
 // \p LiveMaskReg, and register the new instruction with LiveIntervals.
 //
 // When \p SaveWQM is non-zero an S_AND_SAVEEXEC_B64 defining \p SaveWQM and
 // reading \p LiveMaskReg is emitted; otherwise an S_AND_B64 that writes
 // EXEC from EXEC and \p LiveMaskReg is emitted.
 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator Before,
                               unsigned SaveWQM, unsigned LiveMaskReg) {
  // Holds whichever instruction is built below so it can be added to the
  // slot index maps afterwards.
  MachineInstr *MI;
  if (SaveWQM) {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64),
-            SaveWQM)
+                 SaveWQM)
-        .addReg(LiveMaskReg);
+             .addReg(LiveMaskReg);
  } else {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64),
-            AMDGPU::EXEC)
+                 AMDGPU::EXEC)
-        .addReg(AMDGPU::EXEC)
+             .addReg(AMDGPU::EXEC)
-        .addReg(LiveMaskReg);
+             .addReg(LiveMaskReg);
  }
  LIS->InsertMachineInstrInMaps(*MI);
 }
 // Insert, in front of \p Before, the instruction that rewrites EXEC for
 // whole quad mode, and register the new instruction with LiveIntervals.
 //
 // When \p SavedWQM is non-zero EXEC is written by a COPY from that
 // register; otherwise an S_WQM_B64 that writes EXEC from EXEC is emitted.
 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator Before,
                             unsigned SavedWQM) {
  // Holds whichever instruction is built below so it can be added to the
  // slot index maps afterwards.
  MachineInstr *MI;
  if (SavedWQM) {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC)
-        .addReg(SavedWQM);
+             .addReg(SavedWQM);
  } else {
-    BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
+    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
-            AMDGPU::EXEC)
+                 AMDGPU::EXEC)
-        .addReg(AMDGPU::EXEC);
+             .addReg(AMDGPU::EXEC);
  }
  LIS->InsertMachineInstrInMaps(*MI);
 }
 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
@ -447,76 +563,77 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
  unsigned SavedWQMReg = 0;
  bool WQMFromExec = isEntry;
  char State = isEntry ? StateExact : StateWQM;
  MachineInstr *FirstNonWQM = nullptr;
  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
-  while (II != IE) {
+  if (isEntry)
-    MachineInstr &MI = *II;
+    ++II; // Skip the instruction that saves LiveMask
    ++II;
    // Skip instructions that are not affected by EXEC
    if (TII->isScalarUnit(MI) && !MI.isTerminator())
      continue;
    // Generic instructions such as COPY will either disappear by register
    // coalescing or be lowered to SALU or VALU instructions.
    if (TargetInstrInfo::isGenericOpcode(MI.getOpcode())) {
      if (MI.getNumExplicitOperands() >= 1) {
        const MachineOperand &Op = MI.getOperand(0);
        if (Op.isReg()) {
          if (TRI->isSGPRReg(*MRI, Op.getReg())) {
            // SGPR instructions are not affected by EXEC
            continue;
          }
        }
      }
    }
  MachineBasicBlock::iterator First = IE;
  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = 0;
    char OutNeeds = 0;
    auto InstrInfoIt = Instructions.find(&MI);
    if (InstrInfoIt != Instructions.end()) {
      Needs = InstrInfoIt->second.Needs;
      OutNeeds = InstrInfoIt->second.OutNeeds;
    }
-    // Keep track of the first consecutive non-WQM instruction, so that we
+    if (First == IE)
-    // switch away from WQM as soon as possible, potentially saving a small
+      First = II;
    // bit of bandwidth on loads.
    if (Needs == StateWQM)
      FirstNonWQM = nullptr;
    else if (!FirstNonWQM)
      FirstNonWQM = &MI;
-    // State switching
+    if (II != IE) {
-    if (Needs && State != Needs) {
+      MachineInstr &MI = *II;
      if (Needs == StateExact) {
        assert(!SavedWQMReg);
-        if (!WQMFromExec && (OutNeeds & StateWQM))
+      if (requiresCorrectState(MI)) {
-          SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+        auto III = Instructions.find(&MI);
-
+        if (III != Instructions.end()) {
-        toExact(MBB, FirstNonWQM, SavedWQMReg, LiveMaskReg);
+          Needs = III->second.Needs;
-      } else {
+          OutNeeds = III->second.OutNeeds;
-        assert(WQMFromExec == (SavedWQMReg == 0));
+        }
        toWQM(MBB, &MI, SavedWQMReg);
        SavedWQMReg = 0;
      }
-      State = Needs;
+      if (MI.isTerminator() && !Needs && OutNeeds == StateExact)
        Needs = StateExact;
      if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
        MI.getOperand(3).setImm(1);
      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
    }
-    if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
+    if (Needs) {
-      MI.getOperand(3).setImm(1);
+      if (Needs != State) {
-  }
+        MachineBasicBlock::iterator Before =
            prepareInsertion(MBB, First, II, Needs == StateWQM,
                             Needs == StateExact || WQMFromExec);
-  if ((BI.OutNeeds & StateWQM) && State != StateWQM) {
+        if (Needs == StateExact) {
-    assert(WQMFromExec == (SavedWQMReg == 0));
+          if (!WQMFromExec && (OutNeeds & StateWQM))
-    toWQM(MBB, MBB.end(), SavedWQMReg);
+            SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
-  } else if (BI.OutNeeds == StateExact && State != StateExact) {
+
-    toExact(MBB, FirstNonWQM ? MachineBasicBlock::iterator(FirstNonWQM)
+          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
-                             : MBB.getFirstTerminator(),
+        } else {
-            0, LiveMaskReg);
+          assert(WQMFromExec == (SavedWQMReg == 0));
          toWQM(MBB, Before, SavedWQMReg);
          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
        }
        State = Needs;
      }
      First = IE;
    }
    if (II == IE)
      break;
    II = Next;
  }
 }
@ -524,8 +641,11 @@ void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    unsigned Dest = MI->getOperand(0).getReg();
-    BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
+    MachineInstr *Copy =
-        .addReg(LiveMaskReg);
+        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);
    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
 }
@ -559,8 +679,10 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
      LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
-      BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
+      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
-          .addReg(AMDGPU::EXEC);
+                                 TII->get(AMDGPU::COPY), LiveMaskReg)
                             .addReg(AMDGPU::EXEC);
      LIS->InsertMachineInstrInMaps(*MI);
    }
    if (GlobalFlags == StateWQM) {
@ -583,5 +705,10 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  for (auto BII : Blocks)
    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));
  return true;
 }
--- a/test/CodeGen/AMDGPU/wqm.ll
+++ b/test/CodeGen/AMDGPU/wqm.ll
@ -466,6 +466,42 @@ else:
  ret <4 x float> %dtex
 }
 ; Test awareness that s_wqm_b64 clobbers SCC.
 ;
 ; The original exec mask is copied first and s_wqm_b64 is applied before the
 ; scalar compare; the compare has to be directly followed by the conditional
 ; branch on SCC, so nothing may be placed between the two. Each of the two
 ; branch targets then ANDs exec with the saved original mask before its
 ; image_sample.
 ;
 ; CHECK-LABEL: {{^}}test_scc:
 ; CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 ; CHECK: s_wqm_b64 exec, exec
 ; CHECK: s_cmp_
 ; CHECK-NEXT: s_cbranch_scc
 ; CHECK: ; %if
 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
 ; CHECK: image_sample
 ; CHECK: ; %else
 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
 ; CHECK: image_sample
 ; CHECK: ; %end
 define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
 main_body:
  ; Branch on a comparison of the inreg argument %sel.
  %cc = icmp sgt i32 %sel, 0
  br i1 %cc, label %if, label %else
 if:
  %r.if = call <4 x float> @llvm.SI.image.sample.i32(i32 0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  br label %end
 else:
  %r.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 0, i32 1>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  br label %end
 end:
  ; Merge the two sample results, then issue a buffer store before returning.
  %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
  call void @llvm.amdgcn.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
  ret <4 x float> %r
 }
 declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1
 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
@ -474,6 +510,7 @@ declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1,
 declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2
 declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
 declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
 declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
 declare void @llvm.AMDGPU.kill(float)