[AMDGPU] Add support for Whole Wavefront Mode
Summary:
Whole Wavefront Mode (WWM) is similar to WQM, except that all of the lanes are always enabled, regardless of control flow. This is required for implementing wavefront reductions in non-uniform control flow, where we need to use the inactive lanes to propagate intermediate results, so they need to be enabled.

We need to propagate WWM to uses (unless they're explicitly marked as exact) so that they also propagate intermediate results correctly. We do the analysis and exec mask munging during the WQM pass, since there are interactions with WQM for things that require both WQM and WWM.

For simplicity, WWM is entirely block-local -- a block is never WWM on entry or exit, and WWM is not propagated to the block level. This means that computations involving WWM cannot involve control flow, but we only ever plan to use WWM for a few limited purposes (none of which involve control flow) anyway.

Shaders can ask for WWM using the @llvm.amdgcn.wwm intrinsic. There isn't yet a way to turn WWM off -- that will be added in a future change.

Finally, it turns out that turning on inactive lanes causes a number of problems with register allocation. While the best long-term solution seems to be teaching LLVM's register allocator about predication, for now we need to add some hacks to prevent ourselves from getting into trouble due to constraints that aren't currently expressed in LLVM. For the gory details, see the comments at the top of SIFixWWMLiveness.cpp.

Reviewers: arsenm, nhaehnle, tpr

Subscribers: kzhuravl, wdng, mgorny, yaxunl, dstuttard, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D35524

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@310087 91177308-0d34-0410-b5e6-96231b3b80d8
parent 6ba4865928
commit ecf573917a
@@ -747,6 +747,15 @@ def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],
   [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
 >;
 
+// Copies the active channels of the source value to the destination value,
+// with the guarantee that the source value is computed as if the entire
+// program were executed in Whole Wavefront Mode, i.e. with all channels
+// enabled, with a few exceptions: - Phi nodes which require WWM return an
+// undefined value.
+def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
+  [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
+>;
+
 //===----------------------------------------------------------------------===//
 // CI+ Intrinsics
 //===----------------------------------------------------------------------===//
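As a usage sketch, this is roughly how a shader asks for WWM through the new intrinsic. It mirrors the test_wwm1 test added near the end of this commit; the function name @wwm_example is made up for illustration, everything else is the same API the tests use:

define amdgpu_ps float @wwm_example(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  %out = fadd float %src0, %src1
  ; %out.0 reads %out as if it had been computed with all channels enabled
  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
  ret float %out.0
}

declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1)
declare float @llvm.amdgcn.wwm.f32(float)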
@@ -50,6 +50,7 @@ FunctionPass *createSIMemoryLegalizerPass();
 FunctionPass *createSIDebuggerInsertNopsPass();
 FunctionPass *createSIInsertWaitsPass();
 FunctionPass *createSIInsertWaitcntsPass();
+FunctionPass *createSIFixWWMLivenessPass();
 FunctionPass *createAMDGPUCodeGenPreparePass();
 FunctionPass *createAMDGPUMachineCFGStructurizerPass();
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
@@ -120,6 +121,9 @@ extern char &SIInsertSkipsPassID;
 void initializeSIOptimizeExecMaskingPass(PassRegistry &);
 extern char &SIOptimizeExecMaskingID;
 
+void initializeSIFixWWMLivenessPass(PassRegistry &);
+extern char &SIFixWWMLivenessID;
+
 // Passes common to R600 and SI
 FunctionPass *createAMDGPUPromoteAlloca();
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
@@ -168,6 +168,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeSIMemoryLegalizerPass(*PR);
   initializeSIDebuggerInsertNopsPass(*PR);
   initializeSIOptimizeExecMaskingPass(*PR);
+  initializeSIFixWWMLivenessPass(*PR);
   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
   initializeAMDGPUAAWrapperPassPass(*PR);
 }
@@ -792,6 +793,10 @@ void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
   // SI_ELSE will introduce a copy of the tied operand source after the else.
   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
 
+  // This must be run after SILowerControlFlow, since it needs to use the
+  // machine-level CFG, but before register allocation.
+  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+
   TargetPassConfig::addFastRegAlloc(RegAllocPass);
 }
@@ -808,6 +813,10 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
   // SI_ELSE will introduce a copy of the tied operand source after the else.
   insertPass(&PHIEliminationID, &SILowerControlFlowID, false);
 
+  // This must be run after SILowerControlFlow, since it needs to use the
+  // machine-level CFG, but before register allocation.
+  insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);
+
   TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
 }
@@ -69,6 +69,7 @@ add_llvm_target(AMDGPUCodeGen
   SIFixControlFlowLiveIntervals.cpp
   SIFixSGPRCopies.cpp
   SIFixVGPRCopies.cpp
+  SIFixWWMLiveness.cpp
   SIFoldOperands.cpp
   SIFrameLowering.cpp
   SIInsertSkips.cpp
@@ -568,7 +568,8 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
     default:
       continue;
     case AMDGPU::COPY:
-    case AMDGPU::WQM: {
+    case AMDGPU::WQM:
+    case AMDGPU::WWM: {
       // If the destination register is a physical register there isn't really
       // much we can do to fix this.
       if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
lib/Target/AMDGPU/SIFixWWMLiveness.cpp (new file, 202 lines)
@@ -0,0 +1,202 @@
//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Computations in WWM can overwrite values in inactive channels for
/// variables that the register allocator thinks are dead. This pass adds fake
/// uses of those variables to WWM instructions to make sure that they aren't
/// overwritten.
///
/// As an example, consider this snippet:
/// %vgpr0 = V_MOV_B32_e32 0.0
/// if (...) {
///   %vgpr1 = ...
///   %vgpr2 = WWM %vgpr1<kill>
///   ... = %vgpr2<kill>
///   %vgpr0 = V_MOV_B32_e32 1.0
/// }
/// ... = %vgpr0
///
/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally,
/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since
/// writing %vgpr1 would only write to channels that would be clobbered by the
/// second write to %vgpr0 anyway. But if %vgpr1 is written with WWM enabled,
/// it would clobber even the inactive channels for which the if-condition is
/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
/// of %vgpr0 to the WWM instruction to make sure they aren't allocated to the
/// same register.
///
/// In general, we need to figure out which registers might have inactive
/// channels that are eventually used but accidentally clobbered by a WWM
/// instruction. We approximate this using two conditions:
///
/// 1. A definition of the variable reaches the WWM instruction.
/// 2. The variable would be live at the WWM instruction if all its defs were
///    partial defs (i.e. considered as a use), ignoring normal uses.
///
/// If a register matches both conditions, then we add an implicit use of it to
/// the WWM instruction. Condition #2 is the heart of the matter: every
/// definition is really a partial definition, since every VALU instruction is
/// implicitly predicated. We can usually ignore this, but WWM forces us not
/// to. Condition #1 prevents false positives if the variable is undefined at
/// the WWM instruction anyway. This is overly conservative in certain cases,
/// especially in uniform control flow, but it is a workaround until LLVM gains
/// the notion of predicated uses and definitions of variables.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SparseBitVector.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/Target/TargetRegisterInfo.h"

using namespace llvm;

#define DEBUG_TYPE "si-fix-wwm-liveness"

namespace {

class SIFixWWMLiveness : public MachineFunctionPass {
private:
  LiveIntervals *LIS = nullptr;
  const SIRegisterInfo *TRI;
  MachineRegisterInfo *MRI;

public:
  static char ID;

  SIFixWWMLiveness() : MachineFunctionPass(ID) {
    initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  bool runOnWWMInstruction(MachineInstr &MI);

  void addDefs(const MachineInstr &MI, SparseBitVector<> &set);

  StringRef getPassName() const override { return "SI Fix WWM Liveness"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // Should preserve the same set that TwoAddressInstructions does.
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.addPreservedID(LiveVariablesID);
    AU.addPreservedID(MachineLoopInfoID);
    AU.addPreservedID(MachineDominatorsID);
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIFixWWMLiveness, DEBUG_TYPE,
                "SI fix WWM liveness", false, false)

char SIFixWWMLiveness::ID = 0;

char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID;

FunctionPass *llvm::createSIFixWWMLivenessPass() {
  return new SIFixWWMLiveness();
}

void SIFixWWMLiveness::addDefs(const MachineInstr &MI, SparseBitVector<> &Regs)
{
  for (const MachineOperand &Op : MI.defs()) {
    if (Op.isReg()) {
      unsigned Reg = Op.getReg();
      if (TRI->isVGPR(*MRI, Reg))
        Regs.set(Reg);
    }
  }
}

bool SIFixWWMLiveness::runOnWWMInstruction(MachineInstr &WWM) {
  MachineBasicBlock *MBB = WWM.getParent();

  // Compute the registers that are live out of MI by figuring out which defs
  // are reachable from MI.
  SparseBitVector<> LiveOut;

  for (auto II = MachineBasicBlock::iterator(WWM), IE =
       MBB->end(); II != IE; ++II) {
    addDefs(*II, LiveOut);
  }

  for (df_iterator<MachineBasicBlock *> I = ++df_begin(MBB),
                                        E = df_end(MBB);
       I != E; ++I) {
    for (const MachineInstr &MI : **I) {
      addDefs(MI, LiveOut);
    }
  }

  // Compute the registers that reach MI.
  SparseBitVector<> Reachable;

  for (auto II = ++MachineBasicBlock::reverse_iterator(WWM), IE =
       MBB->rend(); II != IE; ++II) {
    addDefs(*II, Reachable);
  }

  for (idf_iterator<MachineBasicBlock *> I = ++idf_begin(MBB),
                                         E = idf_end(MBB);
       I != E; ++I) {
    for (const MachineInstr &MI : **I) {
      addDefs(MI, Reachable);
    }
  }

  // Find the intersection, and add implicit uses.
  LiveOut &= Reachable;

  bool Modified = false;
  for (unsigned Reg : LiveOut) {
    WWM.addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
    if (LIS) {
      // FIXME: is there a better way to update the live interval?
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
    }
    Modified = true;
  }

  return Modified;
}

bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
  bool Modified = false;

  // This doesn't actually need LiveIntervals, but we can preserve them.
  LIS = getAnalysisIfAvailable<LiveIntervals>();

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
        Modified |= runOnWWMInstruction(MI);
      }
    }
  }

  return Modified;
}
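The effect of the pass is easiest to see in the MIR test added below: after the run, the EXIT_WWM instruction carries an implicit use of the register whose inactive channels must survive the WWM region, which is exactly what the test's CHECK line verifies:

#CHECK: %exec = EXIT_WWM killed %19, implicit %21

In that test, %21 is defined before the divergent branch and read after the join, so without the implicit use the register allocator could have reused its register inside the WWM region.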
@@ -3947,6 +3947,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
                    0);
   }
+  case Intrinsic::amdgcn_wwm: {
+    SDValue Src = Op.getOperand(1);
+    return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
+                   0);
+  }
   default:
     return Op;
   }
@@ -1156,6 +1156,12 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MI.eraseFromParent();
     break;
   }
+  case AMDGPU::EXIT_WWM: {
+    // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
+    // is exited.
+    MI.setDesc(get(AMDGPU::S_MOV_B64));
+    break;
+  }
   }
   return true;
 }
@@ -2667,6 +2673,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
   case AMDGPU::PHI: return AMDGPU::PHI;
   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
   case AMDGPU::WQM: return AMDGPU::WQM;
+  case AMDGPU::WWM: return AMDGPU::WWM;
   case AMDGPU::S_MOV_B32:
     return MI.getOperand(1).isReg() ?
            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
@@ -3972,6 +3979,7 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
   case AMDGPU::REG_SEQUENCE:
   case AMDGPU::INSERT_SUBREG:
   case AMDGPU::WQM:
+  case AMDGPU::WWM:
     if (RI.hasVGPRs(NewDstRC))
       return nullptr;
@@ -117,12 +117,26 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
 def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
                                       (ins VSrc_b64:$src0)>;
 
-// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy
-// after the WQM pass processes them.
+// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
+// WQM pass processes it.
 def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
 
+// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so
+// that the @earlyclobber is respected. The @earlyclobber is to make sure that
+// the instruction that defines $src0 (which is run in WWM) doesn't
+// accidentally clobber inactive channels of $vdst.
+let Constraints = "@earlyclobber $vdst" in {
+def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+}
+
 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
 
+def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> {
+  let hasSideEffects = 0;
+  let mayLoad = 0;
+  let mayStore = 0;
+}
+
 let usesCustomInserter = 1, SALU = 1 in {
 def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
   [(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
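Putting the pseudos together, a WWM region in MIR looks like the following. This excerpt is taken from the fix-wwm-liveness.mir test added below; the trailing comments are added here for explanation and are not in the test:

%19 = COPY %exec
%exec = S_MOV_B64 -1                 ; enter WWM: all channels enabled
%16 = BUFFER_LOAD_DWORD_OFFSET %15, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4)
%17 = V_ADD_F32_e32 1065353216, killed %16, implicit %exec
%exec = EXIT_WWM killed %19          ; restore the saved exec mask
early-clobber %18 = WWM killed %17, implicit %exec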
@@ -9,7 +9,7 @@
 //
 /// \file
 /// \brief This pass adds instructions to enable whole quad mode for pixel
-/// shaders.
+/// shaders, and whole wavefront mode for all programs.
 ///
 /// Whole quad mode is required for derivative computations, but it interferes
 /// with shader side effects (stores and atomics). This pass is run on the
@@ -29,6 +29,13 @@
 ///   ...
 ///   S_MOV_B64 EXEC, Tmp
 ///
+/// We also compute when a sequence of instructions requires Whole Wavefront
+/// Mode (WWM) and insert instructions to save and restore it:
+///
+///   S_OR_SAVEEXEC_B64 Tmp, -1
+///   ...
+///   S_MOV_B64 EXEC, Tmp
+///
 /// In order to avoid excessive switching during sequences of Exact
 /// instructions, the pass first analyzes which instructions must be run in WQM
 /// (aka which instructions produce values that lead to derivative
@@ -85,7 +92,8 @@ namespace {
 
 enum {
   StateWQM = 0x1,
-  StateExact = 0x2,
+  StateWWM = 0x2,
+  StateExact = 0x4,
 };
 
 struct PrintState {
@@ -98,9 +106,14 @@ public:
 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
   if (PS.State & StateWQM)
     OS << "WQM";
-  if (PS.State & StateExact) {
+  if (PS.State & StateWWM) {
     if (PS.State & StateWQM)
       OS << '|';
+    OS << "WWM";
+  }
+  if (PS.State & StateExact) {
+    if (PS.State & (StateWQM | StateWWM))
+      OS << '|';
     OS << "Exact";
   }
 
@@ -130,6 +143,7 @@ struct WorkItem {
 
 class SIWholeQuadMode : public MachineFunctionPass {
 private:
+  CallingConv::ID CallingConv;
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   MachineRegisterInfo *MRI;
@@ -163,6 +177,10 @@ private:
                unsigned SaveWQM, unsigned LiveMaskReg);
   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
              unsigned SavedWQM);
+  void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+             unsigned SaveOrig);
+  void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
+               unsigned SavedOrig);
   void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);
 
   void lowerLiveMaskQueries(unsigned LiveMaskReg);
@@ -223,7 +241,7 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                       std::vector<WorkItem> &Worklist) {
   InstrInfo &II = Instructions[&MI];
 
-  assert(Flag == StateWQM);
+  assert(!(Flag & StateExact) && Flag != 0);
 
   // Remove any disabled states from the flag. The user that required it gets
   // an undefined value in the helper lanes. For example, this can happen if
@@ -243,7 +261,6 @@ void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
 /// Mark all instructions defining the uses in \p MI with \p Flag.
 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                           std::vector<WorkItem> &Worklist) {
-  assert(Flag == StateWQM);
   for (const MachineOperand &Use : MI.uses()) {
     if (!Use.isReg() || !Use.isUse())
       continue;
@@ -302,7 +319,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
     unsigned Opcode = MI.getOpcode();
     char Flags = 0;
 
-    if (TII->isDS(Opcode)) {
+    if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) {
       Flags = StateWQM;
     } else if (TII->isWQM(Opcode)) {
       // Sampling instructions don't need to produce results for all pixels
@@ -316,6 +333,14 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
       // correct, so we need it to be in WQM.
       Flags = StateWQM;
       LowerToCopyInstrs.push_back(&MI);
+    } else if (Opcode == AMDGPU::WWM) {
+      // The WWM intrinsic doesn't make the same guarantee, and it also needs
+      // to be executed in WQM or Exact so that its copy doesn't clobber
+      // inactive lanes.
+      markInstructionUses(MI, StateWWM, Worklist);
+      GlobalFlags |= StateWWM;
+      LowerToCopyInstrs.push_back(&MI);
+      continue;
     } else if (TII->isDisableWQM(MI)) {
       BBI.Needs |= StateExact;
       if (!(BBI.InNeeds & StateExact)) {
@@ -323,7 +348,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         Worklist.push_back(&MBB);
       }
       GlobalFlags |= StateExact;
-      III.Disabled = StateWQM;
+      III.Disabled = StateWQM | StateWWM;
       continue;
     } else {
       if (Opcode == AMDGPU::SI_PS_LIVE) {
@@ -383,7 +408,7 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
 
   // Propagate backwards within block
   if (MachineInstr *PrevMI = MI.getPrevNode()) {
-    char InNeeds = II.Needs | II.OutNeeds;
+    char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
     if (!PrevMI->isPHI()) {
       InstrInfo &PrevII = Instructions[PrevMI];
       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
@@ -589,6 +614,29 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
   LIS->InsertMachineInstrInMaps(*MI);
 }
 
+void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator Before,
+                            unsigned SaveOrig) {
+  MachineInstr *MI;
+
+  assert(SaveOrig);
+  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
+               SaveOrig)
+           .addImm(-1);
+  LIS->InsertMachineInstrInMaps(*MI);
+}
+
+void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator Before,
+                              unsigned SavedOrig) {
+  MachineInstr *MI;
+
+  assert(SavedOrig);
+  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), AMDGPU::EXEC)
+           .addReg(SavedOrig);
+  LIS->InsertMachineInstrInMaps(*MI);
+}
+
 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                    bool isEntry) {
   auto BII = Blocks.find(&MBB);
@@ -597,45 +645,63 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
 
   const BlockInfo &BI = BII->second;
 
-  if (!(BI.InNeeds & StateWQM))
-    return;
-
   // This is a non-entry block that is WQM throughout, so no need to do
   // anything.
-  if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)
+  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
     return;
 
   DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");
 
   unsigned SavedWQMReg = 0;
+  unsigned SavedNonWWMReg = 0;
   bool WQMFromExec = isEntry;
-  char State = isEntry ? StateExact : StateWQM;
+  char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
+  char NonWWMState = 0;
 
   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
   if (isEntry)
     ++II; // Skip the instruction that saves LiveMask
 
-  MachineBasicBlock::iterator First = IE;
+  // This stores the first instruction where it's safe to switch from WQM to
+  // Exact or vice versa.
+  MachineBasicBlock::iterator FirstWQM = IE;
+
+  // This stores the first instruction where it's safe to switch from WWM to
+  // Exact/WQM or to switch to WWM. It must always be the same as, or after,
+  // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
+  // switch to/from WQM as well.
+  MachineBasicBlock::iterator FirstWWM = IE;
   for (;;) {
     MachineBasicBlock::iterator Next = II;
-    char Needs = StateExact | StateWQM;
+    char Needs = StateExact | StateWQM; // WWM is disabled by default
    char OutNeeds = 0;
 
-    if (First == IE)
-      First = II;
+    if (FirstWQM == IE)
+      FirstWQM = II;
+
+    if (FirstWWM == IE)
+      FirstWWM = II;
 
+    // First, figure out the allowed states (Needs) based on the propagated
+    // flags.
     if (II != IE) {
       MachineInstr &MI = *II;
 
       if (requiresCorrectState(MI)) {
         auto III = Instructions.find(&MI);
         if (III != Instructions.end()) {
-          if (III->second.Needs & StateWQM)
+          if (III->second.Needs & StateWWM)
+            Needs = StateWWM;
+          else if (III->second.Needs & StateWQM)
             Needs = StateWQM;
           else
             Needs &= ~III->second.Disabled;
           OutNeeds = III->second.OutNeeds;
         }
+      } else {
+        // If the instruction doesn't actually need a correct EXEC, then we can
+        // safely leave WWM enabled.
+        Needs = StateExact | StateWQM | StateWWM;
       }
 
       if (MI.isTerminator() && OutNeeds == StateExact)
@@ -655,35 +721,63 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
         Needs = StateWQM | StateExact;
     }
 
     // Now, transition if necessary.
     if (!(Needs & State)) {
+      MachineBasicBlock::iterator First;
+      if (State == StateWWM || Needs == StateWWM) {
+        // We must switch to or from WWM
+        First = FirstWWM;
+      } else {
+        // We only need to switch to/from WQM, so we can use FirstWQM
+        First = FirstWQM;
+      }
+
       MachineBasicBlock::iterator Before =
           prepareInsertion(MBB, First, II, Needs == StateWQM,
                            Needs == StateExact || WQMFromExec);
 
-      if (Needs == StateExact) {
-        if (!WQMFromExec && (OutNeeds & StateWQM))
-          SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
-
-        toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
-        State = StateExact;
-      } else {
-        assert(Needs == StateWQM);
-        assert(WQMFromExec == (SavedWQMReg == 0));
-
-        toWQM(MBB, Before, SavedWQMReg);
-
-        if (SavedWQMReg) {
-          LIS->createAndComputeVirtRegInterval(SavedWQMReg);
-          SavedWQMReg = 0;
-        }
-        State = StateWQM;
+      if (State == StateWWM) {
+        assert(SavedNonWWMReg);
+        fromWWM(MBB, Before, SavedNonWWMReg);
+        State = NonWWMState;
       }
 
-      First = IE;
+      if (Needs == StateWWM) {
+        NonWWMState = State;
+        SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+        toWWM(MBB, Before, SavedNonWWMReg);
+        State = StateWWM;
+      } else {
+        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
+          if (!WQMFromExec && (OutNeeds & StateWQM))
+            SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
+          State = StateExact;
+        } else if (State == StateExact && (Needs & StateWQM) &&
+                   !(Needs & StateExact)) {
+          assert(WQMFromExec == (SavedWQMReg == 0));
+
+          toWQM(MBB, Before, SavedWQMReg);
+
+          if (SavedWQMReg) {
+            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
+            SavedWQMReg = 0;
+          }
+          State = StateWQM;
+        } else {
+          // We can get here if we transitioned from WWM to a non-WWM state that
+          // already matches our needs, but we shouldn't need to do anything.
+          assert(Needs & State);
+        }
+      }
     }
 
-    if (Needs != (StateExact | StateWQM))
-      First = IE;
+    if (Needs != (StateExact | StateWQM | StateWWM)) {
+      if (Needs != (StateExact | StateWQM))
+        FirstWQM = IE;
+      FirstWWM = IE;
+    }
 
     if (II == IE)
       break;
@@ -710,13 +804,11 @@ void SIWholeQuadMode::lowerCopyInstrs() {
 }
 
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
-  if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
-    return false;
-
   Instructions.clear();
   Blocks.clear();
   LiveMaskQueries.clear();
   LowerToCopyInstrs.clear();
+  CallingConv = MF.getFunction()->getCallingConv();
 
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
 
@@ -726,14 +818,13 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   LIS = &getAnalysis<LiveIntervals>();
 
   char GlobalFlags = analyzeFunction(MF);
+  unsigned LiveMaskReg = 0;
   if (!(GlobalFlags & StateWQM)) {
     lowerLiveMaskQueries(AMDGPU::EXEC);
-    return !LiveMaskQueries.empty();
-  }
-
-  // Store a copy of the original live mask when required
-  unsigned LiveMaskReg = 0;
-  {
+    if (!(GlobalFlags & StateWWM))
+      return !LiveMaskQueries.empty();
+  } else {
+    // Store a copy of the original live mask when required
     MachineBasicBlock &Entry = MF.front();
     MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
 
@@ -745,13 +836,14 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
       LIS->InsertMachineInstrInMaps(*MI);
     }
 
+  lowerLiveMaskQueries(LiveMaskReg);
+
   if (GlobalFlags == StateWQM) {
     // For a shader that needs only WQM, we can just set it once.
     BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
             AMDGPU::EXEC)
         .addReg(AMDGPU::EXEC);
 
-    lowerLiveMaskQueries(LiveMaskReg);
     lowerCopyInstrs();
     // EntryMI may become invalid here
     return true;
@@ -760,7 +852,6 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
 
   DEBUG(printInfo());
 
-  lowerLiveMaskQueries(LiveMaskReg);
   lowerCopyInstrs();
 
   // Handle the general case
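Both of the new MIR tests below drive a single pass in isolation through llc's -run-pass pipeline; their RUN lines (where %s is lit's substitution for the test file itself) are:

# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fix-wwm-liveness -o - %s | FileCheck %s
# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-wqm -o - %s | FileCheck %s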
test/CodeGen/AMDGPU/fix-wwm-liveness.mir (new file, 73 lines)
@@ -0,0 +1,73 @@
# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fix-wwm-liveness -o - %s | FileCheck %s
#CHECK: %exec = EXIT_WWM killed %19, implicit %21

---
name: test_wwm_liveness
alignment: 0
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
  - { id: 0, class: sreg_64, preferred-register: '' }
  - { id: 1, class: sgpr_32, preferred-register: '' }
  - { id: 2, class: sgpr_32, preferred-register: '' }
  - { id: 3, class: vgpr_32, preferred-register: '' }
  - { id: 4, class: vgpr_32, preferred-register: '' }
  - { id: 5, class: vgpr_32, preferred-register: '' }
  - { id: 6, class: vgpr_32, preferred-register: '' }
  - { id: 7, class: vgpr_32, preferred-register: '' }
  - { id: 8, class: sreg_64, preferred-register: '%vcc' }
  - { id: 9, class: sreg_64, preferred-register: '' }
  - { id: 10, class: sreg_32_xm0, preferred-register: '' }
  - { id: 11, class: sreg_64, preferred-register: '' }
  - { id: 12, class: sreg_32_xm0, preferred-register: '' }
  - { id: 13, class: sreg_32_xm0, preferred-register: '' }
  - { id: 14, class: sreg_32_xm0, preferred-register: '' }
  - { id: 15, class: sreg_128, preferred-register: '' }
  - { id: 16, class: vgpr_32, preferred-register: '' }
  - { id: 17, class: vgpr_32, preferred-register: '' }
  - { id: 18, class: vgpr_32, preferred-register: '' }
  - { id: 19, class: sreg_64, preferred-register: '' }
  - { id: 20, class: sreg_64, preferred-register: '' }
  - { id: 21, class: vgpr_32, preferred-register: '' }
  - { id: 22, class: sreg_64, preferred-register: '' }
  - { id: 23, class: sreg_64, preferred-register: '' }
liveins:
body: |
  bb.0:
    successors: %bb.1(0x40000000), %bb.2(0x40000000)

    %21 = V_MOV_B32_e32 0, implicit %exec
    %5 = V_MBCNT_LO_U32_B32_e64 -1, 0, implicit %exec
    %6 = V_MBCNT_HI_U32_B32_e32 -1, killed %5, implicit %exec
    %8 = V_CMP_GT_U32_e64 32, killed %6, implicit %exec
    %22 = COPY %exec, implicit-def %exec
    %23 = S_AND_B64 %22, %8, implicit-def dead %scc
    %0 = S_XOR_B64 %23, %22, implicit-def dead %scc
    %exec = S_MOV_B64_term killed %23
    SI_MASK_BRANCH %bb.2, implicit %exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.2(0x80000000)

    %13 = S_MOV_B32 61440
    %14 = S_MOV_B32 -1
    %15 = REG_SEQUENCE undef %12, 1, undef %10, 2, killed %14, 3, killed %13, 4
    %19 = COPY %exec
    %exec = S_MOV_B64 -1
    %16 = BUFFER_LOAD_DWORD_OFFSET %15, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4)
    %17 = V_ADD_F32_e32 1065353216, killed %16, implicit %exec
    %exec = EXIT_WWM killed %19
    %21 = V_MOV_B32_e32 1, implicit %exec
    early-clobber %18 = WWM killed %17, implicit %exec
    BUFFER_STORE_DWORD_OFFSET killed %18, killed %15, 0, 0, 0, 0, 0, implicit %exec :: (store 4)

  bb.2:
    %exec = S_OR_B64 %exec, killed %0, implicit-def %scc
    %vgpr0 = COPY killed %21
    SI_RETURN_TO_EPILOG killed %vgpr0

...
@@ -108,6 +108,154 @@ main_body:
   ret float %out.2
 }
 
+; Check that WWM is triggered by the wwm intrinsic.
+;
+;CHECK-LABEL: {{^}}test_wwm1:
+;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
+  ret float %out.0
+}
+
+; Same as above, but with an integer type.
+;
+;CHECK-LABEL: {{^}}test_wwm2:
+;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
+;CHECK: buffer_load_dword
+;CHECK: buffer_load_dword
+;CHECK: v_add_i32_e32
+define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %src0.0 = bitcast float %src0 to i32
+  %src1.0 = bitcast float %src1 to i32
+  %out = add i32 %src0.0, %src1.0
+  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
+  %out.1 = bitcast i32 %out.0 to float
+  ret float %out.1
+}
+
+; Check that we don't leave WWM on for computations that don't require WWM,
+; since that will lead to clobbering things that aren't supposed to be
+; clobbered in cases like this.
+;
+;CHECK-LABEL: {{^}}test_wwm3:
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: v_add_f32_e32
+define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
+main_body:
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+  %out = fadd float %src, %src
+  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
+  %out.1 = fadd float %src, %out.0
+  br label %endif
+
+endif:
+  %out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
+  ret float %out.2
+}
+
+; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
+; write could clobber disabled channels in the non-WWM one.
+;
+;CHECK-LABEL: {{^}}test_wwm4:
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK-NEXT: v_mov_b32_e32
+define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
+main_body:
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
+  %out = fadd float %src, %src
+  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
+  br label %endif
+
+endif:
+  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
+  ret float %out.1
+}
+
+; Make sure the transition from Exact to WWM then WQM works properly.
+;
+;CHECK-LABEL: {{^}}test_wwm5:
+;CHECK: buffer_load_dword
+;CHECK: buffer_store_dword
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;CHECK: buffer_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: s_wqm_b64 exec, exec
+define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
+main_body:
+  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
+  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
+  %temp = fadd float %src1, %src1
+  %temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
+  %out = fadd float %temp.0, %temp.0
+  %out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
+  ret float %out.0
+}
+
+; Check that WWM is turned on correctly across basic block boundaries.
+;
+;CHECK-LABEL: {{^}}test_wwm6:
+;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: s_mov_b64 exec, [[ORIG]]
+;CHECK: %if
+;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
+;SI-CHECK: buffer_load_dword
+;VI-CHECK: flat_load_dword
+;CHECK: v_add_f32_e32
+;CHECK: s_mov_b64 exec, [[ORIG2]]
+define amdgpu_ps float @test_wwm6() {
+main_body:
+  %src0 = load volatile float, float addrspace(1)* undef
+  ; use mbcnt to make sure the branch is divergent
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
+  %cc = icmp uge i32 %hi, 32
+  br i1 %cc, label %endif, label %if
+
+if:
+  %src1 = load volatile float, float addrspace(1)* undef
+  %out = fadd float %src0, %src1
+  %out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
+  br label %endif
+
+endif:
+  %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
+  ret float %out.1
+}
+
 ; Check a case of one branch of an if-else requiring WQM, the other requiring
 ; exact.
 ;
@@ -530,6 +678,10 @@ declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8
 declare void @llvm.AMDGPU.kill(float) #1
 declare float @llvm.amdgcn.wqm.f32(float) #3
 declare i32 @llvm.amdgcn.wqm.i32(i32) #3
+declare float @llvm.amdgcn.wwm.f32(float) #3
+declare i32 @llvm.amdgcn.wwm.i32(i32) #3
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
 
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readonly }
test/CodeGen/AMDGPU/wqm.mir (new file, 50 lines)
@@ -0,0 +1,50 @@
# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-wqm -o - %s | FileCheck %s

---
# Check for awareness that s_or_saveexec_b64 clobbers SCC
#
#CHECK: S_OR_SAVEEXEC_B64
#CHECK: S_CMP_LT_I32
#CHECK: S_CSELECT_B32
name: test_wwm_scc
alignment: 0
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
  - { id: 0, class: sgpr_32, preferred-register: '' }
  - { id: 1, class: sgpr_32, preferred-register: '' }
  - { id: 2, class: sgpr_32, preferred-register: '' }
  - { id: 3, class: vgpr_32, preferred-register: '' }
  - { id: 4, class: vgpr_32, preferred-register: '' }
  - { id: 5, class: sgpr_32, preferred-register: '' }
  - { id: 6, class: vgpr_32, preferred-register: '' }
  - { id: 7, class: vgpr_32, preferred-register: '' }
  - { id: 8, class: sreg_32_xm0, preferred-register: '' }
  - { id: 9, class: sreg_32, preferred-register: '' }
  - { id: 10, class: sreg_32, preferred-register: '' }
  - { id: 11, class: vgpr_32, preferred-register: '' }
  - { id: 12, class: vgpr_32, preferred-register: '' }
liveins:
  - { reg: '%sgpr0', virtual-reg: '%0' }
  - { reg: '%sgpr1', virtual-reg: '%1' }
  - { reg: '%sgpr2', virtual-reg: '%2' }
  - { reg: '%vgpr0', virtual-reg: '%3' }
body: |
  bb.0:
    liveins: %sgpr0, %sgpr1, %sgpr2, %vgpr0

    %3 = COPY %vgpr0
    %2 = COPY %sgpr2
    %1 = COPY %sgpr1
    %0 = COPY %sgpr0
    S_CMP_LT_I32 0, %0, implicit-def %scc
    %12 = V_ADD_I32_e32 %3, %3, implicit-def %vcc, implicit %exec
    %5 = S_CSELECT_B32 %2, %1, implicit %scc
    %11 = V_ADD_I32_e32 %5, %12, implicit-def %vcc, implicit %exec
    %vgpr0 = WWM %11, implicit %exec
    SI_RETURN_TO_EPILOG %vgpr0

...