From 1b216a87183f200226465e9b501fbc2fcc440f25 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek <kparzysz@codeaurora.org>
Date: Fri, 2 Sep 2016 19:48:55 +0000
Subject: [PATCH] Do not consider subreg defs as reads when computing subrange
 liveness

Subregister definitions are considered uses for the purpose of tracking
liveness of the whole register. At the same time, when calculating live
interval subranges, subregister defs should not be treated as uses.

Differential Revision: https://reviews.llvm.org/D24190

llvm-svn: 280532
---
 lib/CodeGen/LiveIntervalAnalysis.cpp          |   9 +-
 lib/CodeGen/LiveRangeCalc.cpp                 |   7 +-
 lib/CodeGen/LiveRangeCalc.h                   |   1 +
 lib/CodeGen/SplitKit.cpp                      |   7 +-
 .../AMDGPU/coalescer-subrange-crash.ll        |  62 ++++++++++
 test/CodeGen/AMDGPU/coalescer-subreg-join.mir |  75 ++++++++++++
 .../AMDGPU/scheduler-subrange-crash.ll        |  55 +++++++++
 test/CodeGen/AMDGPU/unigine-liveness-crash.ll | 115 ++++++++++++++++++
 8 files changed, 320 insertions(+), 11 deletions(-)
 create mode 100644 test/CodeGen/AMDGPU/coalescer-subrange-crash.ll
 create mode 100644 test/CodeGen/AMDGPU/coalescer-subreg-join.mir
 create mode 100644 test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
 create mode 100644 test/CodeGen/AMDGPU/unigine-liveness-crash.ll

diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index a3122763806..4195f007d43 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -506,20 +506,19 @@ void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) {
 
   // Visit all instructions reading Reg.
   SlotIndex LastIdx;
-  for (MachineOperand &MO : MRI->reg_operands(Reg)) {
-    MachineInstr *UseMI = MO.getParent();
-    if (UseMI->isDebugValue() || !MO.readsReg())
+  for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
+    // Skip "undef" uses.
+    if (!MO.readsReg())
       continue;
     // Maybe the operand is for a subregister we don't care about.
     unsigned SubReg = MO.getSubReg();
     if (SubReg != 0) {
       LaneBitmask LaneMask = TRI->getSubRegIndexLaneMask(SubReg);
-      if (MO.isDef())
-        LaneMask = ~LaneMask & MRI->getMaxLaneMaskForVReg(Reg);
       if ((LaneMask & SR.LaneMask) == 0)
         continue;
     }
     // We only need to visit each instruction once.
+    MachineInstr *UseMI = MO.getParent();
     SlotIndex Idx = getInstructionIndex(*UseMI).getRegSlot();
     if (Idx == LastIdx)
       continue;
diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp
index 25d22a02d33..98022d99cf2 100644
--- a/lib/CodeGen/LiveRangeCalc.cpp
+++ b/lib/CodeGen/LiveRangeCalc.cpp
@@ -163,13 +163,18 @@ void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg, LaneBitmask Mask,
     LI->computeSubRangeUndefs(Undefs, Mask, *MRI, *Indexes);
 
   // Visit all operands that read Reg. This may include partial defs.
+  bool IsSubRange = (Mask != ~0U);
   const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
   for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
     // Clear all kill flags. They will be reinserted after register allocation
     // by LiveIntervalAnalysis::addKillFlags().
     if (MO.isUse())
       MO.setIsKill(false);
-    if (!MO.readsReg())
+    // MachineOperand::readsReg returns "true" for subregister defs. This is
+    // for keeping liveness of the entire register (i.e. for the main range of
+    // the live interval). For subranges, defs are never treated as uses,
+    // regardless of which subregister they define.
+    if (!MO.readsReg() || (IsSubRange && MO.isDef()))
       continue;
 
     unsigned SubReg = MO.getSubReg();
diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h
index 84cc1cb2be7..892f535a5ae 100644
--- a/lib/CodeGen/LiveRangeCalc.h
+++ b/lib/CodeGen/LiveRangeCalc.h
@@ -160,6 +160,7 @@ class LiveRangeCalc {
   /// all uses must be jointly dominated by the definitions from @p LR
   /// together with definitions of other lanes where @p LR becomes undefined
   /// (via <def,read-undef> operands).
+  /// If @p LR is a main range, the @p LaneMask should be set to ~0.
   void extendToUses(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask,
                     LiveInterval *LI = nullptr);
 
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index 6d59f85d35f..e06bc4a3614 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -1206,7 +1206,8 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) {
       // defining the register. This is because a <def,read-undef> operand
       // will create an "undef" point, and we cannot extend any subranges
       // until all of them have been accounted for.
-      ExtPoints.push_back(ExtPoint(MO, RegIdx, Next));
+      if (MO.isUse())
+        ExtPoints.push_back(ExtPoint(MO, RegIdx, Next));
     } else {
       LiveRangeCalc &LRC = getLRCalc(RegIdx);
       LRC.extend(LI, Next, 0, ArrayRef<SlotIndex>());
@@ -1221,10 +1222,6 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) {
     unsigned Reg = EP.MO.getReg(), Sub = EP.MO.getSubReg();
     LaneBitmask LM = Sub != 0 ? TRI.getSubRegIndexLaneMask(Sub)
                               : MRI.getMaxLaneMaskForVReg(Reg);
-    // If this is a non-read-undef definition of a sub-register, extend
-    // subranges for everything except that sub-register.
-    if (Sub != 0 && EP.MO.isDef())
-      LM = MRI.getMaxLaneMaskForVReg(Reg) & ~LM;
     for (LiveInterval::SubRange &S : LI.subranges()) {
       if (!(S.LaneMask & LM))
         continue;
diff --git a/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll b/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll
new file mode 100644
index 00000000000..7ff133b86e7
--- /dev/null
+++ b/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll
@@ -0,0 +1,62 @@
+; RUN: llc -march=amdgcn < %s | FileCheck %s
+; REQUIRES: asserts
+;
+; This testcase used to cause the following crash:
+;
+; *** Couldn't join subrange!
+;
+; UNREACHABLE executed at lib/CodeGen/RegisterCoalescer.cpp:2666!
+;
+; The insertelement instructions became subregister definitions: one virtual
+; register was defined and re-defined by the first group of consecutive
+; insertelements, and another was defined by the second group.
+; Since a copy between the two full registers was present in the program,
+; the coalescer tried to merge them. The join algorithm for the main range
+; decided that it was correct to do so, while the subrange join unexpectedly
+; failed. This was caused by the live interval subranges not being computed
+; correctly: subregister defs are not uses for the purpose of subranges.
+;
+; Test for a valid output:
+; CHECK: image_sample_c_d_o
+
+target triple = "amdgcn--"
+
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg, [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg1, [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg2, [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg3, [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg4, float inreg %arg5, i32 inreg %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <3 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, <2 x i32> %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, i32 %arg20, float %arg21, i32 %arg22) #0 {
+main_body:
+  %tmp = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %arg6, <2 x i32> %arg8)
+  %tmp23 = fadd float %tmp, 0xBFA99999A0000000
+  %tmp24 = fadd float %tmp, 0x3FA99999A0000000
+  %tmp25 = bitcast float %tmp23 to i32
+  %tmp26 = insertelement <16 x i32> <i32 212739, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, i32 %tmp25, i32 1
+  %tmp27 = insertelement <16 x i32> %tmp26, i32 undef, i32 2
+  %tmp28 = insertelement <16 x i32> %tmp27, i32 undef, i32 3
+  %tmp29 = insertelement <16 x i32> %tmp28, i32 undef, i32 4
+  %tmp30 = insertelement <16 x i32> %tmp29, i32 0, i32 5
+  %tmp31 = insertelement <16 x i32> %tmp30, i32 undef, i32 6
+  %tmp32 = insertelement <16 x i32> %tmp31, i32 undef, i32 7
+  %tmp33 = insertelement <16 x i32> %tmp32, i32 undef, i32 8
+  %tmp34 = call <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32> %tmp33, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+  %tmp35 = extractelement <4 x float> %tmp34, i32 0
+  %tmp36 = bitcast float %tmp24 to i32
+  %tmp37 = insertelement <16 x i32> <i32 212739, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, i32 %tmp36, i32 1
+  %tmp38 = insertelement <16 x i32> %tmp37, i32 undef, i32 2
+  %tmp39 = insertelement <16 x i32> %tmp38, i32 undef, i32 3
+  %tmp40 = insertelement <16 x i32> %tmp39, i32 undef, i32 4
+  %tmp41 = insertelement <16 x i32> %tmp40, i32 0, i32 5
+  %tmp42 = insertelement <16 x i32> %tmp41, i32 undef, i32 6
+  %tmp43 = insertelement <16 x i32> %tmp42, i32 undef, i32 7
+  %tmp44 = insertelement <16 x i32> %tmp43, i32 undef, i32 8
+  %tmp45 = call <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32> %tmp44, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+  %tmp46 = extractelement <4 x float> %tmp45, i32 0
+  %tmp47 = fmul float %tmp35, %tmp46
+  %tmp48 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, float %tmp47, 14
+  %tmp49 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp48, float %arg21, 24
+  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp49
+}
+
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+attributes #0 = { "InitialPSInputAddr"="36983" "target-cpu"="tonga" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/coalescer-subreg-join.mir b/test/CodeGen/AMDGPU/coalescer-subreg-join.mir
new file mode 100644
index 00000000000..124f9f519c0
--- /dev/null
+++ b/test/CodeGen/AMDGPU/coalescer-subreg-join.mir
@@ -0,0 +1,75 @@
+# RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck %s
+# Check that %11 and %20 have been coalesced.
+# CHECK: IMAGE_SAMPLE_C_D_O_V1_V16 %[[REG:[0-9]+]]
+# CHECK: IMAGE_SAMPLE_C_D_O_V1_V16 %[[REG]]
+
+---
+name:            main
+alignment:       0
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: sreg_256 }
+  - { id: 4, class: sreg_128 }
+  - { id: 5, class: sreg_256 }
+  - { id: 6, class: sreg_128 }
+  - { id: 7, class: sreg_512 }
+  - { id: 9, class: vreg_512 }
+  - { id: 11, class: vreg_512 }
+  - { id: 18, class: vgpr_32 }
+  - { id: 20, class: vreg_512 }
+  - { id: 27, class: vgpr_32 }
+liveins:
+  - { reg: '%sgpr2_sgpr3', virtual-reg: '%0' }
+  - { reg: '%vgpr2', virtual-reg: '%1' }
+  - { reg: '%vgpr3', virtual-reg: '%2' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0:
+    liveins: %sgpr2_sgpr3, %vgpr2, %vgpr3
+
+    %0 = COPY %sgpr2_sgpr3
+    %1 = COPY %vgpr2
+    %2 = COPY %vgpr3
+    %3 = S_LOAD_DWORDX8_IMM %0, 0
+    %4 = S_LOAD_DWORDX4_IMM %0, 12
+    %5 = S_LOAD_DWORDX8_IMM %0, 16
+    %6 = S_LOAD_DWORDX4_IMM %0, 28
+    undef %7.sub0 = S_MOV_B32 212739
+    %20 = COPY %7
+    %11 = COPY %20
+    %11.sub1 = COPY %1
+    %11.sub2 = COPY %1
+    %11.sub3 = COPY %1
+    %11.sub4 = COPY %1
+    %11.sub5 = COPY %1
+    %11.sub6 = COPY %1
+    %11.sub7 = COPY %1
+    %11.sub8 = COPY %1
+    dead %18 = IMAGE_SAMPLE_C_D_O_V1_V16 %11, %3, %4, 1, 0, 0, 0, 0, 0, 0, -1, implicit %exec
+    %20.sub1 = COPY %2
+    %20.sub2 = COPY %2
+    %20.sub3 = COPY %2
+    %20.sub4 = COPY %2
+    %20.sub5 = COPY %2
+    %20.sub6 = COPY %2
+    %20.sub7 = COPY %2
+    %20.sub8 = COPY %2
+    dead %27 = IMAGE_SAMPLE_C_D_O_V1_V16 %20, %5, %6, 1, 0, 0, 0, 0, 0, 0, -1, implicit %exec
+
+...
diff --git a/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll b/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
new file mode 100644
index 00000000000..c0e8e58556f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/scheduler-subrange-crash.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=amdgcn < %s | FileCheck %s
+; REQUIRES: asserts
+;
+; This test used to crash with the following assertion:
+; llc: include/llvm/ADT/IntervalMap.h:632: unsigned int llvm::IntervalMapImpl::LeafNode<llvm::SlotIndex, llvm::LiveInterval *, 8, llvm::IntervalMapInfo<llvm::SlotIndex> >::insertFrom(unsigned int &, unsigned int, KeyT, KeyT, ValT) [KeyT = llvm::SlotIndex, ValT = llvm::LiveInterval *, N = 8, Traits = llvm::IntervalMapInfo<llvm::SlotIndex>]: Assertion `(i == Size || Traits::stopLess(b, start(i))) && "Overlapping insert"' failed.
+;
+; This was related to incorrectly calculating subregister live ranges
+; (i.e. live interval subranges): subregister defs are not uses for that
+; purpose.
+;
+; Check for a valid output:
+; CHECK: tbuffer_store_format_x
+
+target triple = "amdgcn--"
+
+define amdgpu_gs void @main(i32 inreg %arg) #0 {
+main_body:
+  %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 20)
+  %tmp1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 24)
+  %tmp2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 48)
+  %array_vector3 = insertelement <4 x float> zeroinitializer, float %tmp2, i32 3
+  %array_vector5 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %tmp, i32 1
+  %array_vector6 = insertelement <4 x float> %array_vector5, float undef, i32 2
+  %array_vector9 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %tmp1, i32 1
+  %array_vector10 = insertelement <4 x float> %array_vector9, float 0.000000e+00, i32 2
+  %array_vector11 = insertelement <4 x float> %array_vector10, float undef, i32 3
+  %tmp3 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8> undef, i32 undef, i32 4864, i32 0, i32 1, i32 0, i32 1, i32 0, i32 0)
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp3, i32 1, i32 36, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %bc = bitcast <4 x float> %array_vector3 to <4 x i32>
+  %tmp4 = extractelement <4 x i32> %bc, i32 undef
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp4, i32 1, i32 48, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %bc49 = bitcast <4 x float> %array_vector11 to <4 x i32>
+  %tmp5 = extractelement <4 x i32> %bc49, i32 undef
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp5, i32 1, i32 72, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %array_vector21 = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %tmp, i32 1
+  %array_vector22 = insertelement <4 x float> %array_vector21, float undef, i32 2
+  %array_vector23 = insertelement <4 x float> %array_vector22, float undef, i32 3
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 28, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  %bc52 = bitcast <4 x float> %array_vector23 to <4 x i32>
+  %tmp6 = extractelement <4 x i32> %bc52, i32 undef
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %tmp6, i32 1, i32 64, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 20, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 56, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 undef, i32 1, i32 92, i32 %arg, i32 0, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0)
+  ret void
+}
+
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #2
+declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) #3
+
+attributes #0 = { nounwind "target-cpu"="tonga" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+attributes #3 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/unigine-liveness-crash.ll b/test/CodeGen/AMDGPU/unigine-liveness-crash.ll
new file mode 100644
index 00000000000..732790ceb33
--- /dev/null
+++ b/test/CodeGen/AMDGPU/unigine-liveness-crash.ll
@@ -0,0 +1,115 @@
+; RUN: llc -march=amdgcn < %s | FileCheck %s
+; REQUIRES: asserts
+;
+; This test used to crash with the following assertion:
+; llc: include/llvm/ADT/IntervalMap.h:632: unsigned int llvm::IntervalMapImpl::LeafNode<llvm::SlotIndex, llvm::LiveInterval *, 8, llvm::IntervalMapInfo<llvm::SlotIndex> >::insertFrom(unsigned int &, unsigned int, KeyT, KeyT, ValT) [KeyT = llvm::SlotIndex, ValT = llvm::LiveInterval *, N = 8, Traits = llvm::IntervalMapInfo<llvm::SlotIndex>]: Assertion `(i == Size || Traits::stopLess(b, start(i))) && "Overlapping insert"' failed.
+;
+; This was related to incorrectly calculating subregister live ranges
+; (i.e. live interval subranges): subregister defs are not uses for that
+; purpose.
+;
+; Check for a valid output:
+; CHECK: image_sample_c
+
+target triple = "amdgcn--"
+
+@ddxy_lds = external addrspace(3) global [64 x i32]
+
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg, [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg1, [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg2, [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg3, [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg4, float inreg %arg5, i32 inreg %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <3 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, <2 x i32> %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, i32 %arg20, float %arg21, i32 %arg22) #0 {
+main_body:
+  %tmp = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %arg6, <2 x i32> %arg8)
+  %tmp23 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp24 = extractelement <4 x float> %tmp23, i32 3
+  %tmp25 = fmul float %tmp24, undef
+  %tmp26 = fmul float undef, %tmp
+  %tmp27 = fadd float %tmp26, undef
+  %tmp28 = bitcast float %tmp27 to i32
+  %tmp29 = insertelement <4 x i32> undef, i32 %tmp28, i32 0
+  %tmp30 = insertelement <4 x i32> %tmp29, i32 0, i32 1
+  %tmp31 = insertelement <4 x i32> %tmp30, i32 undef, i32 2
+  %tmp32 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp31, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp33 = extractelement <4 x float> %tmp32, i32 0
+  %tmp34 = fadd float undef, %tmp33
+  %tmp35 = fadd float %tmp34, undef
+  %tmp36 = fadd float %tmp35, undef
+  %tmp37 = fadd float %tmp36, undef
+  %tmp38 = fadd float %tmp37, undef
+  %tmp39 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp40 = extractelement <4 x float> %tmp39, i32 0
+  %tmp41 = extractelement <4 x float> %tmp39, i32 1
+  %tmp42 = extractelement <4 x float> %tmp39, i32 2
+  %tmp43 = extractelement <4 x float> %tmp39, i32 3
+  %tmp44 = fmul float %tmp40, undef
+  %tmp45 = fmul float %tmp41, undef
+  %tmp46 = fmul float %tmp42, undef
+  %tmp47 = fmul float %tmp43, undef
+  %tmp48 = fadd float undef, %tmp44
+  %tmp49 = fadd float undef, %tmp45
+  %tmp50 = bitcast float %tmp27 to i32
+  %tmp51 = bitcast float %tmp48 to i32
+  %tmp52 = bitcast float %tmp49 to i32
+  %tmp53 = insertelement <4 x i32> undef, i32 %tmp50, i32 0
+  %tmp54 = insertelement <4 x i32> %tmp53, i32 %tmp51, i32 1
+  %tmp55 = insertelement <4 x i32> %tmp54, i32 %tmp52, i32 2
+  %tmp56 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp55, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp57 = extractelement <4 x float> %tmp56, i32 0
+  %tmp58 = fadd float %tmp38, %tmp57
+  %tmp59 = fadd float undef, %tmp46
+  %tmp60 = fadd float undef, %tmp47
+  %tmp61 = bitcast float %tmp59 to i32
+  %tmp62 = bitcast float %tmp60 to i32
+  %tmp63 = insertelement <4 x i32> undef, i32 %tmp61, i32 1
+  %tmp64 = insertelement <4 x i32> %tmp63, i32 %tmp62, i32 2
+  %tmp65 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp64, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp66 = extractelement <4 x float> %tmp65, i32 0
+  %tmp67 = fadd float %tmp58, %tmp66
+  %tmp68 = fmul float %tmp67, 1.250000e-01
+  %tmp69 = fmul float %tmp68, undef
+  %tmp70 = fcmp une float %tmp69, 0.000000e+00
+  br i1 %tmp70, label %IF26, label %ENDIF25
+
+IF26:                                             ; preds = %main_body
+  %tmp71 = bitcast float %tmp27 to i32
+  %tmp72 = insertelement <4 x i32> undef, i32 %tmp71, i32 0
+  br label %LOOP
+
+ENDIF25:                                          ; preds = %IF29, %main_body
+  %.4 = phi float [ %tmp84, %IF29 ], [ %tmp68, %main_body ]
+  %tmp73 = fadd float %.4, undef
+  %tmp74 = call float @llvm.AMDGPU.clamp.(float %tmp73, float 0.000000e+00, float 1.000000e+00)
+  %tmp75 = fmul float undef, %tmp74
+  %tmp76 = fmul float %tmp75, undef
+  %tmp77 = fadd float %tmp76, undef
+  %tmp78 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, float %tmp77, 11
+  %tmp79 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp78, float undef, 12
+  %tmp80 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp79, float undef, 13
+  %tmp81 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp80, float %tmp25, 14
+  %tmp82 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp81, float undef, 15
+  %tmp83 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp82, float %arg21, 24
+  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp83
+
+LOOP:                                             ; preds = %ENDIF28, %IF26
+  %.5 = phi float [ undef, %IF26 ], [ %tmp89, %ENDIF28 ]
+  br i1 false, label %IF29, label %ENDIF28
+
+IF29:                                             ; preds = %LOOP
+  %tmp84 = fmul float %.5, 3.125000e-02
+  br label %ENDIF25
+
+ENDIF28:                                          ; preds = %LOOP
+  %tmp85 = insertelement <4 x i32> %tmp72, i32 undef, i32 1
+  %tmp86 = insertelement <4 x i32> %tmp85, i32 undef, i32 2
+  %tmp87 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp86, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp88 = extractelement <4 x float> %tmp87, i32 0
+  %tmp89 = fadd float undef, %tmp88
+  br label %LOOP
+}
+
+declare float @llvm.AMDGPU.clamp.(float, float, float) #1
+declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+
+attributes #0 = { "InitialPSInputAddr"="36983" "target-cpu"="tonga" }
+attributes #1 = { nounwind readnone }