AMDGPU: Constant fold when immediate is materialized

In future commits these patterns will appear after moveToVALU changes.

llvm-svn: 291615
Matt Arsenault 2017-01-10 23:32:04 +00:00
parent 740f03ad29
commit 6b917afcf9
2 changed files with 1091 additions and 146 deletions
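The fold this adds is easiest to see in the new MIR test below: when the sources of a 32-bit ALU op are immediates, including immediates materialized into registers by moves, SIFoldOperands now evaluates the op and rewrites it into a single move of the result. A minimal before/after sketch, taken from the s_fold_and_imm_regimm_32 case in the test (1234567 & 9999 == 1543):

    ; input
    %7 = S_MOV_B32 1234567
    %8 = S_MOV_B32 9999
    %9 = S_AND_B32 killed %7, killed %8, implicit-def dead %scc
    %10 = COPY %9

    ; checked result after si-fold-operands,dead-mi-elimination
    %10 = V_MOV_B32_e32 1543, implicit %exec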


@@ -25,25 +25,6 @@ using namespace llvm;
namespace {
class SIFoldOperands : public MachineFunctionPass {
public:
static char ID;
public:
SIFoldOperands() : MachineFunctionPass(ID) {
initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
}
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override { return "SI Fold Operands"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
struct FoldCandidate {
MachineInstr *UseMI;
union {
@@ -79,6 +60,36 @@ struct FoldCandidate {
}
};
class SIFoldOperands : public MachineFunctionPass {
public:
static char ID;
MachineRegisterInfo *MRI;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
void foldOperand(MachineOperand &OpToFold,
MachineInstr *UseMI,
unsigned UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
public:
SIFoldOperands() : MachineFunctionPass(ID) {
initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
}
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override { return "SI Fold Operands"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
} // End anonymous namespace.
INITIALIZE_PASS(SIFoldOperands, DEBUG_TYPE,
@@ -141,7 +152,7 @@ static bool updateOperand(FoldCandidate &Fold,
return false;
}
static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
const MachineInstr *MI) {
for (auto Candidate : FoldList) {
if (Candidate.UseMI == MI)
@@ -150,7 +161,7 @@ static bool isUseMIInFoldList(const std::vector<FoldCandidate> &FoldList,
return false;
}
static bool tryAddToFoldList(std::vector<FoldCandidate> &FoldList,
static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
MachineInstr *MI, unsigned OpNo,
MachineOperand *OpToFold,
const SIInstrInfo *TII) {
@@ -227,12 +238,12 @@ static bool isUseSafeToFold(const MachineInstr &MI,
//return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}
static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
unsigned UseOpIdx,
std::vector<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace,
const SIInstrInfo *TII, const SIRegisterInfo &TRI,
MachineRegisterInfo &MRI) {
void SIFoldOperands::foldOperand(
MachineOperand &OpToFold,
MachineInstr *UseMI,
unsigned UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
if (!isUseSafeToFold(*UseMI, UseOp))
@@ -264,7 +275,7 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
for (MachineRegisterInfo::use_iterator
RSUse = MRI.use_begin(RegSeqDstReg), RSE = MRI.use_end();
RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
RSUse != RSE; ++RSUse) {
MachineInstr *RSUseMI = RSUse->getParent();
@@ -272,7 +283,7 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
continue;
foldOperand(OpToFold, RSUseMI, RSUse.getOperandNo(), FoldList,
CopiesToReplace, TII, TRI, MRI);
CopiesToReplace);
}
return;
@@ -287,8 +298,8 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
unsigned DestReg = UseMI->getOperand(0).getReg();
const TargetRegisterClass *DestRC
= TargetRegisterInfo::isVirtualRegister(DestReg) ?
MRI.getRegClass(DestReg) :
TRI.getPhysRegClass(DestReg);
MRI->getRegClass(DestReg) :
TRI->getPhysRegClass(DestReg);
unsigned MovOp = TII->getMovOpcode(DestRC);
if (MovOp == AMDGPU::COPY)
@@ -318,7 +329,7 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
const TargetRegisterClass *FoldRC =
TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);
TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType),
OpToFold.getImm());
@@ -328,8 +339,8 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
unsigned UseReg = UseOp.getReg();
const TargetRegisterClass *UseRC
= TargetRegisterInfo::isVirtualRegister(UseReg) ?
MRI.getRegClass(UseReg) :
TRI.getPhysRegClass(UseReg);
MRI->getRegClass(UseReg) :
TRI->getPhysRegClass(UseReg);
assert(Imm.getBitWidth() == 64);
@@ -349,20 +360,51 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
}
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
int32_t LHS, int32_t RHS) {
uint32_t LHS, uint32_t RHS) {
switch (Opcode) {
case AMDGPU::V_AND_B32_e64:
case AMDGPU::V_AND_B32_e32:
case AMDGPU::S_AND_B32:
Result = LHS & RHS;
return true;
case AMDGPU::V_OR_B32_e64:
case AMDGPU::V_OR_B32_e32:
case AMDGPU::S_OR_B32:
Result = LHS | RHS;
return true;
case AMDGPU::V_XOR_B32_e64:
case AMDGPU::V_XOR_B32_e32:
case AMDGPU::S_XOR_B32:
Result = LHS ^ RHS;
return true;
case AMDGPU::V_LSHL_B32_e64:
case AMDGPU::V_LSHL_B32_e32:
case AMDGPU::S_LSHL_B32:
// The instruction ignores the high bits for out of bounds shifts.
Result = LHS << (RHS & 31);
return true;
case AMDGPU::V_LSHLREV_B32_e64:
case AMDGPU::V_LSHLREV_B32_e32:
Result = RHS << (LHS & 31);
return true;
case AMDGPU::V_LSHR_B32_e64:
case AMDGPU::V_LSHR_B32_e32:
case AMDGPU::S_LSHR_B32:
Result = LHS >> (RHS & 31);
return true;
case AMDGPU::V_LSHRREV_B32_e64:
case AMDGPU::V_LSHRREV_B32_e32:
Result = RHS >> (LHS & 31);
return true;
case AMDGPU::V_ASHR_I32_e64:
case AMDGPU::V_ASHR_I32_e32:
case AMDGPU::S_ASHR_I32:
Result = static_cast<int32_t>(LHS) >> (RHS & 31);
return true;
case AMDGPU::V_ASHRREV_I32_e64:
case AMDGPU::V_ASHRREV_I32_e32:
Result = static_cast<int32_t>(RHS) >> (LHS & 31);
return true;
default:
return false;
}
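// Example of the shift masking above, mirroring the V_LSHL_B32_e64 %6, 32 case
// in the new MIR test: with %6 = V_MOV_B32_e32 1, the fold computes
// 1 << (32 & 31) == 1 << 0 == 1, so the out-of-bounds shift becomes
// V_MOV_B32_e32 1 rather than 0.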
@@ -390,33 +432,47 @@ static void mutateCopyOp(MachineInstr &MI, const MCInstrDesc &NewDesc) {
stripExtraCopyOperands(MI);
}
// Try to simplify operations with a constant that may appear after instruction
// selection.
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
const SIInstrInfo *TII,
MachineInstr *MI) {
unsigned Opc = MI->getOpcode();
static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
MachineOperand &Op) {
if (Op.isReg()) {
// If this has a subregister, it obviously is a register source.
if (Op.getSubReg() != AMDGPU::NoSubRegister)
return &Op;
if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
Opc == AMDGPU::S_NOT_B32) {
MachineOperand &Src0 = MI->getOperand(1);
if (Src0.isImm()) {
Src0.setImm(~Src0.getImm());
mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
return true;
MachineInstr *Def = MRI.getVRegDef(Op.getReg());
if (Def->isMoveImmediate()) {
MachineOperand &ImmSrc = Def->getOperand(1);
if (ImmSrc.isImm())
return &ImmSrc;
}
return false;
}
if (!MI->isCommutable())
return &Op;
}
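// Example for getImmOrMaterializedImm, matching the new MIR test: given
//   %24 = V_MOV_B32_e32 982, implicit %exec
//   %27 = V_AND_B32_e64 %26, %24, implicit %exec
// looking through %24 yields the mov's immediate operand 982, so the AND can
// be constant folded even though %24 is a register operand.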
// Try to simplify operations with a constant that may appear after instruction
// selection.
// TODO: See if a frame index with a fixed offset can fold.
static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
const SIInstrInfo *TII,
MachineInstr *MI,
MachineOperand *ImmOp) {
unsigned Opc = MI->getOpcode();
if (Opc == AMDGPU::V_NOT_B32_e64 || Opc == AMDGPU::V_NOT_B32_e32 ||
Opc == AMDGPU::S_NOT_B32) {
MI->getOperand(1).ChangeToImmediate(~ImmOp->getImm());
mutateCopyOp(*MI, TII->get(getMovOpc(Opc == AMDGPU::S_NOT_B32)));
return true;
}
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
if (Src1Idx == -1)
return false;
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
MachineOperand *Src0 = getImmOrMaterializedImm(MRI, MI->getOperand(Src0Idx));
MachineOperand *Src1 = getImmOrMaterializedImm(MRI, MI->getOperand(Src1Idx));
MachineOperand *Src0 = &MI->getOperand(Src0Idx);
MachineOperand *Src1 = &MI->getOperand(Src1Idx);
if (!Src0->isImm() && !Src1->isImm())
return false;
@@ -431,19 +487,26 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
const SIRegisterInfo &TRI = TII->getRegisterInfo();
bool IsSGPR = TRI.isSGPRReg(MRI, MI->getOperand(0).getReg());
Src0->setImm(NewImm);
// Be careful to change the right operand; src0 may belong to a different
// instruction.
MI->getOperand(Src0Idx).ChangeToImmediate(NewImm);
MI->RemoveOperand(Src1Idx);
mutateCopyOp(*MI, TII->get(getMovOpc(IsSGPR)));
return true;
}
if (!MI->isCommutable())
return false;
if (Src0->isImm() && !Src1->isImm()) {
std::swap(Src0, Src1);
std::swap(Src0Idx, Src1Idx);
}
int32_t Src1Val = static_cast<int32_t>(Src1->getImm());
if (Opc == AMDGPU::V_OR_B32_e64 || Opc == AMDGPU::S_OR_B32) {
if (Opc == AMDGPU::V_OR_B32_e64 ||
Opc == AMDGPU::V_OR_B32_e32 ||
Opc == AMDGPU::S_OR_B32) {
if (Src1Val == 0) {
// y = or x, 0 => y = copy x
MI->RemoveOperand(Src1Idx);
@@ -459,6 +522,7 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
}
if (MI->getOpcode() == AMDGPU::V_AND_B32_e64 ||
MI->getOpcode() == AMDGPU::V_AND_B32_e32 ||
MI->getOpcode() == AMDGPU::S_AND_B32) {
if (Src1Val == 0) {
// y = and x, 0 => y = v_mov_b32 0
@@ -476,29 +540,136 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
}
if (MI->getOpcode() == AMDGPU::V_XOR_B32_e64 ||
MI->getOpcode() == AMDGPU::V_XOR_B32_e32 ||
MI->getOpcode() == AMDGPU::S_XOR_B32) {
if (Src1Val == 0) {
// y = xor x, 0 => y = copy x
MI->RemoveOperand(Src1Idx);
mutateCopyOp(*MI, TII->get(AMDGPU::COPY));
return true;
}
}
return false;
}
void SIFoldOperands::foldInstOperand(MachineInstr &MI,
MachineOperand &OpToFold) const {
// We need to mutate the operands of new mov instructions to add implicit
// uses of EXEC, but adding them invalidates the use_iterator, so defer
// this.
SmallVector<MachineInstr *, 4> CopiesToReplace;
SmallVector<FoldCandidate, 4> FoldList;
MachineOperand &Dst = MI.getOperand(0);
bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
if (FoldingImm) {
unsigned NumLiteralUses = 0;
MachineOperand *NonInlineUse = nullptr;
int NonInlineUseOpNo = -1;
MachineRegisterInfo::use_iterator NextUse, NextInstUse;
for (MachineRegisterInfo::use_iterator
Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
Use != E; Use = NextUse) {
NextUse = std::next(Use);
MachineInstr *UseMI = Use->getParent();
unsigned OpNo = Use.getOperandNo();
// Folding the immediate may reveal operations that can be constant
// folded or replaced with a copy. This can happen for example after
// frame indices are lowered to constants or from splitting 64-bit
// constants.
//
// We may also encounter cases where one or both operands are
// immediates materialized into a register, which would ordinarily not
// be folded due to multiple uses or operand constraints.
if (OpToFold.isImm() && tryConstantFoldOp(*MRI, TII, UseMI, &OpToFold)) {
DEBUG(dbgs() << "Constant folded " << *UseMI <<'\n');
// Some constant folding cases change the same immediate's use to a new
// instruction, e.g. and x, 0 -> 0. Make sure we re-visit the user
// again. The same constant folded instruction could also have a second
// use operand.
NextUse = MRI->use_begin(Dst.getReg());
continue;
}
// Try to fold any inline immediate uses, and then only fold other
// constants if they have one use.
//
// The legality of the inline immediate must be checked based on the use
// operand, not the defining instruction, because 32-bit instructions
// with 32-bit inline immediate sources may be used to materialize
// constants used in 16-bit operands.
//
// e.g. it is unsafe to fold:
// s_mov_b32 s0, 1.0 // materializes 0x3f800000
// v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
// Folding immediates with more than one use will increase program size.
// FIXME: This will also reduce register usage, which may be better
// in some cases. A better heuristic is needed.
if (TII->isInlineConstant(*UseMI, OpNo, OpToFold)) {
foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
} else {
if (++NumLiteralUses == 1) {
NonInlineUse = &*Use;
NonInlineUseOpNo = OpNo;
}
}
}
if (NumLiteralUses == 1) {
MachineInstr *UseMI = NonInlineUse->getParent();
foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList, CopiesToReplace);
}
} else {
// Folding register.
for (MachineRegisterInfo::use_iterator
Use = MRI->use_begin(Dst.getReg()), E = MRI->use_end();
Use != E; ++Use) {
MachineInstr *UseMI = Use->getParent();
foldOperand(OpToFold, UseMI, Use.getOperandNo(),
FoldList, CopiesToReplace);
}
}
MachineFunction *MF = MI.getParent()->getParent();
// Make sure we add EXEC uses to any new v_mov instructions created.
for (MachineInstr *Copy : CopiesToReplace)
Copy->addImplicitDefUseOperands(*MF);
for (FoldCandidate &Fold : FoldList) {
if (updateOperand(Fold, *TRI)) {
// Clear kill flags.
if (Fold.isReg()) {
assert(Fold.OpToFold && Fold.OpToFold->isReg());
// FIXME: Probably shouldn't bother trying to fold if not an
// SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
// copies.
MRI->clearKillFlags(Fold.OpToFold->getReg());
}
DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
}
}
}
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(*MF.getFunction()))
return false;
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
MRI = &MF.getRegInfo();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
BI != BE; ++BI) {
MachineBasicBlock &MBB = *BI;
MachineBasicBlock::iterator I, Next;
@@ -512,8 +683,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
MachineOperand &OpToFold = MI.getOperand(1);
bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
// FIXME: We could also be folding things like FrameIndexes and
// TargetIndexes.
// FIXME: We could also be folding things like TargetIndexes.
if (!FoldingImm && !OpToFold.isReg())
continue;
@@ -532,90 +702,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
!TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
continue;
// We need to mutate the operands of new mov instructions to add implicit
// uses of EXEC, but adding them invalidates the use_iterator, so defer
// this.
SmallVector<MachineInstr *, 4> CopiesToReplace;
std::vector<FoldCandidate> FoldList;
if (FoldingImm) {
unsigned NumLiteralUses = 0;
MachineOperand *NonInlineUse = nullptr;
int NonInlineUseOpNo = -1;
// Try to fold any inline immediate uses, and then only fold other
// constants if they have one use.
//
// The legality of the inline immediate must be checked based on the use
// operand, not the defining instruction, because 32-bit instructions
// with 32-bit inline immediate sources may be used to materialize
// constants used in 16-bit operands.
//
// e.g. it is unsafe to fold:
// s_mov_b32 s0, 1.0 // materializes 0x3f800000
// v_add_f16 v0, v1, s0 // 1.0 f16 inline immediate sees 0x00003c00
// Folding immediates with more than one use will increase program size.
// FIXME: This will also reduce register usage, which may be better
// in some cases. A better heuristic is needed.
for (MachineRegisterInfo::use_iterator
Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end();
Use != E; ++Use) {
MachineInstr *UseMI = Use->getParent();
unsigned OpNo = Use.getOperandNo();
if (TII->isInlineConstant(*UseMI, OpNo, OpToFold)) {
foldOperand(OpToFold, UseMI, OpNo, FoldList,
CopiesToReplace, TII, TRI, MRI);
} else {
if (++NumLiteralUses == 1) {
NonInlineUse = &*Use;
NonInlineUseOpNo = OpNo;
}
}
}
if (NumLiteralUses == 1) {
MachineInstr *UseMI = NonInlineUse->getParent();
foldOperand(OpToFold, UseMI, NonInlineUseOpNo, FoldList,
CopiesToReplace, TII, TRI, MRI);
}
} else {
// Folding register.
for (MachineRegisterInfo::use_iterator
Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end();
Use != E; ++Use) {
MachineInstr *UseMI = Use->getParent();
foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
CopiesToReplace, TII, TRI, MRI);
}
}
// Make sure we add EXEC uses to any new v_mov instructions created.
for (MachineInstr *Copy : CopiesToReplace)
Copy->addImplicitDefUseOperands(MF);
for (FoldCandidate &Fold : FoldList) {
if (updateOperand(Fold, TRI)) {
// Clear kill flags.
if (Fold.isReg()) {
assert(Fold.OpToFold && Fold.OpToFold->isReg());
// FIXME: Probably shouldn't bother trying to fold if not an
// SGPR. PeepholeOptimizer can eliminate redundant VGPR->VGPR
// copies.
MRI.clearKillFlags(Fold.OpToFold->getReg());
}
DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
// Folding the immediate may reveal operations that can be constant
// folded or replaced with a copy. This can happen for example after
// frame indices are lowered to constants or from splitting 64-bit
// constants.
tryConstantFoldOp(MRI, TII, Fold.UseMI);
}
}
foldInstOperand(MI, OpToFold);
}
}
return false;


@@ -0,0 +1,858 @@
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination -o - %s | FileCheck -check-prefix=GCN %s
--- |
define void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%and = and i32 %a, 1234567
store volatile i32 %and, i32 addrspace(1)* %out
ret void
}
define void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%idxprom = sext i32 %tid to i64
%gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
%a = load i32, i32 addrspace(1)* %gep.a
%and = and i32 %a, 1234567
store i32 %and, i32 addrspace(1)* %gep.out
ret void
}
define void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%shl = shl i32 %a, 12
store volatile i32 %shl, i32 addrspace(1)* %out
ret void
}
define void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%idxprom = sext i32 %tid to i64
%gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
%a = load i32, i32 addrspace(1)* %gep.a
%shl = shl i32 %a, 12
store i32 %shl, i32 addrspace(1)* %gep.out
ret void
}
define void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%ashr = ashr i32 %a, 12
store volatile i32 %ashr, i32 addrspace(1)* %out
ret void
}
define void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%idxprom = sext i32 %tid to i64
%gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
%a = load i32, i32 addrspace(1)* %gep.a
%ashr = ashr i32 %a, 12
store i32 %ashr, i32 addrspace(1)* %gep.out
ret void
}
define void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%lshr = lshr i32 %a, 12
store volatile i32 %lshr, i32 addrspace(1)* %out
ret void
}
define void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%idxprom = sext i32 %tid to i64
%gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i64 %idxprom
%a = load i32, i32 addrspace(1)* %gep.a
%lshr = lshr i32 %a, 12
store i32 %lshr, i32 addrspace(1)* %gep.out
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
...
---
# GCN-LABEL: name: s_fold_and_imm_regimm_32{{$}}
# GCN: %10 = V_MOV_B32_e32 1543, implicit %exec
# GCN: BUFFER_STORE_DWORD_OFFSET killed %10,
name: s_fold_and_imm_regimm_32
alignment: 0
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_64 }
- { id: 1, class: sreg_64_xexec }
- { id: 2, class: sreg_32_xm0 }
- { id: 3, class: sreg_32_xm0 }
- { id: 4, class: sreg_32_xm0 }
- { id: 5, class: sreg_32_xm0 }
- { id: 6, class: sreg_128 }
- { id: 7, class: sreg_32_xm0 }
- { id: 8, class: sreg_32_xm0 }
- { id: 9, class: sreg_32_xm0 }
- { id: 10, class: vgpr_32 }
liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 0
adjustsStack: false
hasCalls: false
maxCallFrameSize: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
bb.0 (%ir-block.0):
liveins: %sgpr0_sgpr1
%0 = COPY %sgpr0_sgpr1
%1 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
%2 = COPY %1.sub1
%3 = COPY %1.sub0
%4 = S_MOV_B32 61440
%5 = S_MOV_B32 -1
%6 = REG_SEQUENCE killed %2, 1, killed %3, 2, killed %4, 3, killed %5, 4
%7 = S_MOV_B32 1234567
%8 = S_MOV_B32 9999
%9 = S_AND_B32 killed %7, killed %8, implicit-def dead %scc
%10 = COPY %9
BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
S_ENDPGM
...
---
# GCN-LABEL: name: v_fold_and_imm_regimm_32{{$}}
# GCN: %9 = V_MOV_B32_e32 646, implicit %exec
# GCN: FLAT_STORE_DWORD %19, %9,
# GCN: %10 = V_MOV_B32_e32 646, implicit %exec
# GCN: FLAT_STORE_DWORD %19, %10
# GCN: %11 = V_MOV_B32_e32 646, implicit %exec
# GCN: FLAT_STORE_DWORD %19, %11,
# GCN: %12 = V_MOV_B32_e32 1234567, implicit %exec
# GCN: FLAT_STORE_DWORD %19, %12,
# GCN: %13 = V_MOV_B32_e32 63, implicit %exec
# GCN: FLAT_STORE_DWORD %19, %13,
name: v_fold_and_imm_regimm_32
alignment: 0
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_64 }
- { id: 1, class: sreg_32_xm0 }
- { id: 2, class: sgpr_32 }
- { id: 3, class: vgpr_32 }
- { id: 4, class: sreg_64_xexec }
- { id: 20, class: sreg_32_xm0 }
- { id: 24, class: vgpr_32 }
- { id: 25, class: vreg_64 }
- { id: 26, class: sreg_32_xm0 }
- { id: 27, class: vgpr_32 }
- { id: 28, class: vgpr_32 }
- { id: 29, class: vgpr_32 }
- { id: 30, class: vgpr_32 }
- { id: 31, class: vgpr_32 }
- { id: 32, class: vreg_64 }
- { id: 33, class: vreg_64 }
- { id: 34, class: vgpr_32 }
- { id: 35, class: vgpr_32 }
- { id: 36, class: vgpr_32 }
- { id: 37, class: vreg_64 }
- { id: 44, class: vgpr_32 }
liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%3' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 0
adjustsStack: false
hasCalls: false
maxCallFrameSize: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
bb.0 (%ir-block.0):
liveins: %sgpr0_sgpr1, %vgpr0
%3 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
%4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
%31 = V_ASHRREV_I32_e64 31, %3, implicit %exec
%32 = REG_SEQUENCE %3, 1, %31, 2
%33 = V_LSHLREV_B64 2, killed %32, implicit %exec
%20 = COPY %4.sub1
%44 = V_ADD_I32_e32 %4.sub0, %33.sub0, implicit-def %vcc, implicit %exec
%36 = COPY killed %20
%35 = V_ADDC_U32_e32 %33.sub1, %36, implicit-def %vcc, implicit %vcc, implicit %exec
%37 = REG_SEQUENCE %44, 1, killed %35, 2
%24 = V_MOV_B32_e32 982, implicit %exec
%26 = S_MOV_B32 1234567
%34 = V_MOV_B32_e32 63, implicit %exec
%27 = V_AND_B32_e64 %26, %24, implicit %exec
FLAT_STORE_DWORD %37, %27, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%28 = V_AND_B32_e64 %24, %26, implicit %exec
FLAT_STORE_DWORD %37, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%29 = V_AND_B32_e32 %26, %24, implicit %exec
FLAT_STORE_DWORD %37, %29, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%30 = V_AND_B32_e64 %26, %26, implicit %exec
FLAT_STORE_DWORD %37, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%31 = V_AND_B32_e64 %34, %34, implicit %exec
FLAT_STORE_DWORD %37, %31, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
S_ENDPGM
...
---
# GCN-LABEL: name: s_fold_shl_imm_regimm_32{{$}}
# GCN: %13 = V_MOV_B32_e32 4096, implicit %exec
# GCN: BUFFER_STORE_DWORD_OFFSET killed %13,
name: s_fold_shl_imm_regimm_32
alignment: 0
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_64 }
- { id: 1, class: sreg_32_xm0 }
- { id: 2, class: sgpr_32 }
- { id: 3, class: vgpr_32 }
- { id: 4, class: sreg_64_xexec }
- { id: 5, class: sreg_32_xm0_xexec }
- { id: 6, class: sreg_32_xm0 }
- { id: 7, class: sreg_32_xm0 }
- { id: 8, class: sreg_32_xm0 }
- { id: 9, class: sreg_32_xm0 }
- { id: 10, class: sreg_128 }
- { id: 11, class: sreg_32_xm0 }
- { id: 12, class: sreg_32_xm0 }
- { id: 13, class: vgpr_32 }
liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 0
adjustsStack: false
hasCalls: false
maxCallFrameSize: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
bb.0 (%ir-block.0):
liveins: %sgpr0_sgpr1
%0 = COPY %sgpr0_sgpr1
%4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
%5 = S_MOV_B32 1
%6 = COPY %4.sub1
%7 = COPY %4.sub0
%8 = S_MOV_B32 61440
%9 = S_MOV_B32 -1
%10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4
%12 = S_LSHL_B32 killed %5, 12, implicit-def dead %scc
%13 = COPY %12
BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
S_ENDPGM
...
---
# GCN-LABEL: name: v_fold_shl_imm_regimm_32{{$}}
# GCN: %11 = V_MOV_B32_e32 40955904, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %11,
# GCN: %12 = V_MOV_B32_e32 24, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %12,
# GCN: %13 = V_MOV_B32_e32 4096, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %13,
# GCN: %14 = V_MOV_B32_e32 24, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %14,
# GCN: %15 = V_MOV_B32_e32 0, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %15,
# GCN: %22 = V_MOV_B32_e32 4096, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %22,
# GCN: %23 = V_MOV_B32_e32 1, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %23,
# GCN: %25 = V_MOV_B32_e32 2, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %25,
# GCN: %26 = V_MOV_B32_e32 7927808, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %26,
# GCN: %28 = V_MOV_B32_e32 -8, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %28,
name: v_fold_shl_imm_regimm_32
alignment: 0
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_64 }
- { id: 1, class: sreg_32_xm0 }
- { id: 2, class: vgpr_32 }
- { id: 3, class: sreg_64_xexec }
- { id: 4, class: sreg_64_xexec }
- { id: 5, class: sreg_32_xm0 }
- { id: 6, class: vgpr_32 }
- { id: 7, class: sreg_32_xm0 }
- { id: 8, class: sreg_64 }
- { id: 9, class: sreg_32_xm0 }
- { id: 10, class: vgpr_32 }
- { id: 11, class: vgpr_32 }
- { id: 12, class: vgpr_32 }
- { id: 13, class: vgpr_32 }
- { id: 14, class: vgpr_32 }
- { id: 15, class: vgpr_32 }
- { id: 16, class: vreg_64 }
- { id: 17, class: vreg_64 }
- { id: 18, class: vgpr_32 }
- { id: 19, class: vgpr_32 }
- { id: 20, class: vreg_64 }
- { id: 21, class: vgpr_32 }
- { id: 22, class: vgpr_32 }
- { id: 23, class: vgpr_32 }
- { id: 24, class: vgpr_32 }
- { id: 25, class: vgpr_32 }
- { id: 26, class: vgpr_32 }
- { id: 27, class: sreg_32_xm0 }
- { id: 28, class: vgpr_32 }
liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%2' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 0
adjustsStack: false
hasCalls: false
maxCallFrameSize: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
bb.0 (%ir-block.0):
liveins: %sgpr0_sgpr1, %vgpr0
%2 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
%3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
%15 = V_ASHRREV_I32_e64 31, %2, implicit %exec
%16 = REG_SEQUENCE %2, 1, %15, 2
%17 = V_LSHLREV_B64 2, killed %16, implicit %exec
%9 = COPY %3.sub1
%21 = V_ADD_I32_e32 %3.sub0, %17.sub0, implicit-def %vcc, implicit %exec
%19 = COPY killed %9
%18 = V_ADDC_U32_e32 %17.sub1, %19, implicit-def %vcc, implicit %vcc, implicit %exec
%20 = REG_SEQUENCE %21, 1, killed %18, 2
%10 = V_MOV_B32_e32 9999, implicit %exec
%24 = V_MOV_B32_e32 3871, implicit %exec
%6 = V_MOV_B32_e32 1, implicit %exec
%7 = S_MOV_B32 1
%27 = S_MOV_B32 -4
%11 = V_LSHLREV_B32_e64 12, %10, implicit %exec
FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%12 = V_LSHLREV_B32_e64 %7, 12, implicit %exec
FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%13 = V_LSHL_B32_e64 %7, 12, implicit %exec
FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%14 = V_LSHL_B32_e64 12, %7, implicit %exec
FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%15 = V_LSHL_B32_e64 12, %24, implicit %exec
FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%22 = V_LSHL_B32_e64 %6, 12, implicit %exec
FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%23 = V_LSHL_B32_e64 %6, 32, implicit %exec
FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%25 = V_LSHL_B32_e32 %6, %6, implicit %exec
FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%26 = V_LSHLREV_B32_e32 11, %24, implicit %exec
FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%28 = V_LSHL_B32_e32 %27, %6, implicit %exec
FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
S_ENDPGM
...
---
# GCN-LABEL: name: s_fold_ashr_imm_regimm_32{{$}}
# GCN: %11 = V_MOV_B32_e32 243, implicit %exec
# GCN: BUFFER_STORE_DWORD_OFFSET killed %11, killed %8,
name: s_fold_ashr_imm_regimm_32
alignment: 0
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_64 }
- { id: 1, class: sreg_32_xm0 }
- { id: 4, class: sreg_64_xexec }
- { id: 5, class: sreg_32_xm0_xexec }
- { id: 6, class: sreg_32_xm0 }
- { id: 7, class: sreg_32_xm0 }
- { id: 8, class: sreg_32_xm0 }
- { id: 9, class: sreg_32_xm0 }
- { id: 10, class: sreg_128 }
- { id: 11, class: sreg_32_xm0 }
- { id: 12, class: sreg_32_xm0 }
- { id: 13, class: vgpr_32 }
liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 0
adjustsStack: false
hasCalls: false
maxCallFrameSize: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
bb.0 (%ir-block.0):
liveins: %sgpr0_sgpr1
%0 = COPY %sgpr0_sgpr1
%4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
%5 = S_MOV_B32 999123
%6 = COPY %4.sub1
%7 = COPY %4.sub0
%8 = S_MOV_B32 61440
%9 = S_MOV_B32 -1
%10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4
%12 = S_ASHR_I32 killed %5, 12, implicit-def dead %scc
%13 = COPY %12
BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
S_ENDPGM
...
---
# GCN-LABEL: name: v_fold_ashr_imm_regimm_32{{$}}
# GCN: %11 = V_MOV_B32_e32 3903258, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %11,
# GCN: %12 = V_MOV_B32_e32 62452139, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %12,
# GCN: %13 = V_MOV_B32_e32 1678031, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %13,
# GCN: %14 = V_MOV_B32_e32 3, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %14,
# GCN: %15 = V_MOV_B32_e32 -1, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %15,
# GCN: %22 = V_MOV_B32_e32 62500, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %22,
# GCN: %23 = V_MOV_B32_e32 500000, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %23,
# GCN: %25 = V_MOV_B32_e32 1920, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %25,
# GCN: %26 = V_MOV_B32_e32 487907, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %26,
# GCN: %28 = V_MOV_B32_e32 -1, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %28,
name: v_fold_ashr_imm_regimm_32
alignment: 0
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_64 }
- { id: 1, class: sreg_32_xm0 }
- { id: 2, class: vgpr_32 }
- { id: 3, class: sreg_64_xexec }
- { id: 4, class: sreg_64_xexec }
- { id: 5, class: sreg_32_xm0 }
- { id: 6, class: vgpr_32 }
- { id: 7, class: sreg_32_xm0 }
- { id: 8, class: sreg_32_xm0 }
- { id: 9, class: sreg_32_xm0 }
- { id: 10, class: vgpr_32 }
- { id: 11, class: vgpr_32 }
- { id: 12, class: vgpr_32 }
- { id: 13, class: vgpr_32 }
- { id: 14, class: vgpr_32 }
- { id: 15, class: vgpr_32 }
- { id: 16, class: vreg_64 }
- { id: 17, class: vreg_64 }
- { id: 18, class: vgpr_32 }
- { id: 19, class: vgpr_32 }
- { id: 20, class: vreg_64 }
- { id: 21, class: vgpr_32 }
- { id: 22, class: vgpr_32 }
- { id: 23, class: vgpr_32 }
- { id: 24, class: vgpr_32 }
- { id: 25, class: vgpr_32 }
- { id: 26, class: vgpr_32 }
- { id: 27, class: sreg_32_xm0 }
- { id: 28, class: vgpr_32 }
- { id: 32, class: sreg_32_xm0 }
- { id: 33, class: sreg_32_xm0 }
- { id: 34, class: vgpr_32 }
- { id: 35, class: vgpr_32 }
liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%2' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 0
adjustsStack: false
hasCalls: false
maxCallFrameSize: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
bb.0 (%ir-block.0):
liveins: %sgpr0_sgpr1, %vgpr0
%2 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
%3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
%15 = V_ASHRREV_I32_e64 31, %2, implicit %exec
%16 = REG_SEQUENCE %2, 1, %15, 2
%17 = V_LSHLREV_B64 2, killed %16, implicit %exec
%9 = COPY %3.sub1
%21 = V_ADD_I32_e32 %3.sub0, %17.sub0, implicit-def %vcc, implicit %exec
%19 = COPY killed %9
%18 = V_ADDC_U32_e32 %17.sub1, %19, implicit-def %vcc, implicit %vcc, implicit %exec
%20 = REG_SEQUENCE %21, 1, killed %18, 2
%10 = V_MOV_B32_e32 999234234, implicit %exec
%24 = V_MOV_B32_e32 3871, implicit %exec
%6 = V_MOV_B32_e32 1000000, implicit %exec
%7 = S_MOV_B32 13424252
%8 = S_MOV_B32 4
%27 = S_MOV_B32 -4
%32 = S_MOV_B32 1
%33 = S_MOV_B32 3841
%34 = V_MOV_B32_e32 3841, implicit %exec
%35 = V_MOV_B32_e32 2, implicit %exec
%11 = V_ASHRREV_I32_e64 8, %10, implicit %exec
FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%12 = V_ASHRREV_I32_e64 %8, %10, implicit %exec
FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%13 = V_ASHR_I32_e64 %7, 3, implicit %exec
FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%14 = V_ASHR_I32_e64 7, %32, implicit %exec
FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%15 = V_ASHR_I32_e64 %27, %24, implicit %exec
FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%22 = V_ASHR_I32_e64 %6, 4, implicit %exec
FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%23 = V_ASHR_I32_e64 %6, %33, implicit %exec
FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%25 = V_ASHR_I32_e32 %34, %34, implicit %exec
FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%26 = V_ASHRREV_I32_e32 11, %10, implicit %exec
FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%28 = V_ASHR_I32_e32 %27, %35, implicit %exec
FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
S_ENDPGM
...
---
# GCN-LABEL: name: s_fold_lshr_imm_regimm_32{{$}}
# GCN: %11 = V_MOV_B32_e32 1048332, implicit %exec
# GCN: BUFFER_STORE_DWORD_OFFSET killed %11, killed %8,
name: s_fold_lshr_imm_regimm_32
alignment: 0
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_64 }
- { id: 1, class: sreg_32_xm0 }
- { id: 4, class: sreg_64_xexec }
- { id: 5, class: sreg_32_xm0_xexec }
- { id: 6, class: sreg_32_xm0 }
- { id: 7, class: sreg_32_xm0 }
- { id: 8, class: sreg_32_xm0 }
- { id: 9, class: sreg_32_xm0 }
- { id: 10, class: sreg_128 }
- { id: 11, class: sreg_32_xm0 }
- { id: 12, class: sreg_32_xm0 }
- { id: 13, class: vgpr_32 }
liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 0
adjustsStack: false
hasCalls: false
maxCallFrameSize: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
bb.0 (%ir-block.0):
liveins: %sgpr0_sgpr1
%0 = COPY %sgpr0_sgpr1
%4 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
%5 = S_MOV_B32 -999123
%6 = COPY %4.sub1
%7 = COPY %4.sub0
%8 = S_MOV_B32 61440
%9 = S_MOV_B32 -1
%10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4
%12 = S_LSHR_B32 killed %5, 12, implicit-def dead %scc
%13 = COPY %12
BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into %ir.out)
S_ENDPGM
...
---
# GCN-LABEL: name: v_fold_lshr_imm_regimm_32{{$}}
# GCN: %11 = V_MOV_B32_e32 3903258, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %11,
# GCN: %12 = V_MOV_B32_e32 62452139, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %12,
# GCN: %13 = V_MOV_B32_e32 1678031, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %13,
# GCN: %14 = V_MOV_B32_e32 3, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %14,
# GCN: %15 = V_MOV_B32_e32 1, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %15,
# GCN: %22 = V_MOV_B32_e32 62500, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %22,
# GCN: %23 = V_MOV_B32_e32 500000, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %23,
# GCN: %25 = V_MOV_B32_e32 1920, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %25,
# GCN: %26 = V_MOV_B32_e32 487907, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %26,
# GCN: %28 = V_MOV_B32_e32 1073741823, implicit %exec
# GCN: FLAT_STORE_DWORD %20, %28,
name: v_fold_lshr_imm_regimm_32
alignment: 0
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: sgpr_64 }
- { id: 1, class: sreg_32_xm0 }
- { id: 2, class: vgpr_32 }
- { id: 3, class: sreg_64_xexec }
- { id: 4, class: sreg_64_xexec }
- { id: 5, class: sreg_32_xm0 }
- { id: 6, class: vgpr_32 }
- { id: 7, class: sreg_32_xm0 }
- { id: 8, class: sreg_32_xm0 }
- { id: 9, class: sreg_32_xm0 }
- { id: 10, class: vgpr_32 }
- { id: 11, class: vgpr_32 }
- { id: 12, class: vgpr_32 }
- { id: 13, class: vgpr_32 }
- { id: 14, class: vgpr_32 }
- { id: 15, class: vgpr_32 }
- { id: 16, class: vreg_64 }
- { id: 17, class: vreg_64 }
- { id: 18, class: vgpr_32 }
- { id: 19, class: vgpr_32 }
- { id: 20, class: vreg_64 }
- { id: 21, class: vgpr_32 }
- { id: 22, class: vgpr_32 }
- { id: 23, class: vgpr_32 }
- { id: 24, class: vgpr_32 }
- { id: 25, class: vgpr_32 }
- { id: 26, class: vgpr_32 }
- { id: 27, class: sreg_32_xm0 }
- { id: 28, class: vgpr_32 }
- { id: 32, class: sreg_32_xm0 }
- { id: 33, class: sreg_32_xm0 }
- { id: 34, class: vgpr_32 }
- { id: 35, class: vgpr_32 }
liveins:
- { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
- { reg: '%vgpr0', virtual-reg: '%2' }
frameInfo:
isFrameAddressTaken: false
isReturnAddressTaken: false
hasStackMap: false
hasPatchPoint: false
stackSize: 0
offsetAdjustment: 0
maxAlignment: 0
adjustsStack: false
hasCalls: false
maxCallFrameSize: 0
hasOpaqueSPAdjustment: false
hasVAStart: false
hasMustTailInVarArgFunc: false
body: |
bb.0 (%ir-block.0):
liveins: %sgpr0_sgpr1, %vgpr0
%2 = COPY %vgpr0
%0 = COPY %sgpr0_sgpr1
%3 = S_LOAD_DWORDX2_IMM %0, 36, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
%15 = V_ASHRREV_I32_e64 31, %2, implicit %exec
%16 = REG_SEQUENCE %2, 1, %15, 2
%17 = V_LSHLREV_B64 2, killed %16, implicit %exec
%9 = COPY %3.sub1
%21 = V_ADD_I32_e32 %3.sub0, %17.sub0, implicit-def %vcc, implicit %exec
%19 = COPY killed %9
%18 = V_ADDC_U32_e32 %17.sub1, %19, implicit-def %vcc, implicit %vcc, implicit %exec
%20 = REG_SEQUENCE %21, 1, killed %18, 2
%10 = V_MOV_B32_e32 999234234, implicit %exec
%24 = V_MOV_B32_e32 3871, implicit %exec
%6 = V_MOV_B32_e32 1000000, implicit %exec
%7 = S_MOV_B32 13424252
%8 = S_MOV_B32 4
%27 = S_MOV_B32 -4
%32 = S_MOV_B32 1
%33 = S_MOV_B32 3841
%34 = V_MOV_B32_e32 3841, implicit %exec
%35 = V_MOV_B32_e32 2, implicit %exec
%11 = V_LSHRREV_B32_e64 8, %10, implicit %exec
FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%12 = V_LSHRREV_B32_e64 %8, %10, implicit %exec
FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%13 = V_LSHR_B32_e64 %7, 3, implicit %exec
FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%14 = V_LSHR_B32_e64 7, %32, implicit %exec
FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%15 = V_LSHR_B32_e64 %27, %24, implicit %exec
FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%22 = V_LSHR_B32_e64 %6, 4, implicit %exec
FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%23 = V_LSHR_B32_e64 %6, %33, implicit %exec
FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%25 = V_LSHR_B32_e32 %34, %34, implicit %exec
FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%26 = V_LSHRREV_B32_e32 11, %10, implicit %exec
FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
%28 = V_LSHR_B32_e32 %27, %35, implicit %exec
FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out)
S_ENDPGM
...