From e9a23c4c57a235ead3003196c1ae1de05aa79fc6 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Tue, 1 Nov 2016 22:55:07 +0000
Subject: [PATCH] AMDGPU: Default to using scalar mov to materialize immediate

This is the conservatively correct way because it's easy to move or
replace a scalar immediate. This was incorrect in the case when the
register class wasn't known from the static instruction definition,
but still needed to be an SGPR. The main example of this is inline
asm with an SGPR constraint.

Also start verifying the register classes of inline asm operands.
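
For example (a minimal case mirroring the tests added below; the exact
registers are chosen by the allocator, so v0/s0 are only illustrative),
an immediate passed to an "s"-constrained inline asm operand:

  define void @inline_sreg_constraint_imm_i32() {
    tail call void asm sideeffect "; use $0", "s"(i32 32)
    ret void
  }

  ; before this patch: v_mov_b32_e32 v0, 32  ; VGPR result, wrong class
  ;                                          ; for the "s" constraint
  ; with this patch:   s_mov_b32 s0, 32      ; SGPR mov feeds the
  ;                                          ; constraint directly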

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@285762 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/SIInstrInfo.cpp               | 22 +++++++++++
 lib/Target/AMDGPU/SIInstrInfo.td                | 16 +++++---
 lib/Target/AMDGPU/SIInstructions.td             | 24 ++++++------
 .../AMDGPU/control-flow-fastregalloc.ll         | 20 +++++-----
 test/CodeGen/AMDGPU/inline-constraints.ll       | 37 ++++++++++++++++++-
 test/CodeGen/AMDGPU/insert_vector_elt.ll        |  2 +-
 6 files changed, 92 insertions(+), 29 deletions(-)

diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 905ee467319..108995a463f 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1943,6 +1943,28 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
     return false;
   }
 
+  if (MI.isInlineAsm()) {
+    // Verify register classes for inlineasm constraints.
+    for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
+         I != E; ++I) {
+      const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
+      if (!RC)
+        continue;
+
+      const MachineOperand &Op = MI.getOperand(I);
+      if (!Op.isReg())
+        continue;
+
+      unsigned Reg = Op.getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
+        ErrInfo = "inlineasm operand has incorrect register class.";
+        return false;
+      }
+    }
+
+    return true;
+  }
+
   // Make sure the register classes are correct.
   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
     if (MI.getOperand(i).isFPImm()) {
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index a56d909624f..f19e99e7cd1 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -265,19 +265,25 @@ class InlineFPImm <ValueType vt> : PatLeaf <(vt fpimm), [{
   return isInlineImmediate(N);
 }]>;
 
-class SGPRImm <dag frag> : PatLeaf<frag, [{
+class VGPRImm <dag frag> : PatLeaf<frag, [{
   if (Subtarget->getGeneration() < SISubtarget::SOUTHERN_ISLANDS) {
     return false;
   }
   const SIRegisterInfo *SIRI =
       static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+  unsigned Limit = 0;
   for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
-       U != E; ++U) {
+       Limit < 10 && U != E; ++U, ++Limit) {
     const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
-    if (RC && SIRI->isSGPRClass(RC))
-      return true;
+
+    // If the register class is unknown, it could be an unknown
+    // register class that needs to be an SGPR, e.g. an inline asm
+    // constraint
+    if (!RC || SIRI->isSGPRClass(RC))
+      return false;
   }
-  return false;
+
+  return Limit < 10;
 }]>;
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index f3ccee288fd..4122eb915f3 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -615,25 +615,25 @@ def : Pat <
 /********** ================== **********/
 
 def : Pat <
-  (SGPRImm<(i32 imm)>:$imm),
-  (S_MOV_B32 imm:$imm)
->;
-
-def : Pat <
-  (SGPRImm<(f32 fpimm)>:$imm),
-  (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
->;
-
-def : Pat <
-  (i32 imm:$imm),
+  (VGPRImm<(i32 imm)>:$imm),
   (V_MOV_B32_e32 imm:$imm)
 >;
 
 def : Pat <
-  (f32 fpimm:$imm),
+  (VGPRImm<(f32 fpimm)>:$imm),
   (V_MOV_B32_e32 (f32 (bitcast_fpimm_to_i32 $imm)))
 >;
 
+def : Pat <
+  (i32 imm:$imm),
+  (S_MOV_B32 imm:$imm)
+>;
+
+def : Pat <
+  (f32 fpimm:$imm),
+  (S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
+>;
+
 def : Pat <
   (i32 frameindex:$fi),
   (V_MOV_B32_e32 (i32 (frameindex_to_targetframeindex $fi)))
diff --git a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index ad5e8908340..27b3cc0e435 100644
--- a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -15,7 +15,7 @@
 ; GCN: s_mov_b32 m0, -1
 ; GCN: ds_read_b32 [[LOAD0:v[0-9]+]]
 
-; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0,
+; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], s{{[0-9]+}}, v0
 ; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
 ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, [[CMP0]]
 ; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
@@ -93,24 +93,24 @@ endif:
 ; GCN: s_mov_b32 m0, -1
 ; GCN: ds_read_b32 [[LOAD0:v[0-9]+]]
 
-; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0,
+; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], s{{[0-9]+}}, v0
 ; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
 ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]]
 ; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
 
+; Spill load
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 ; 4-byte Folded Spill
+
 ; Spill saved exec
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
 ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]]
 
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 ; 8-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:12 ; 8-byte Folded Spill
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 8-byte Folded Spill
-
-; Spill load
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:16 ; 8-byte Folded Spill
 
 ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
@@ -120,7 +120,7 @@ endif:
 
 ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]:
-; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
 ; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]]
 ; GCN: v_cmp_ne_u32_e32 vcc,
 ; GCN: s_and_b64 vcc, exec, vcc
@@ -133,11 +133,11 @@ endif:
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:12 ; 8-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:4 ; 8-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:16 ; 8-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
diff --git a/test/CodeGen/AMDGPU/inline-constraints.ll b/test/CodeGen/AMDGPU/inline-constraints.ll
index 7282e89e85b..3c0bb75a607 100644
--- a/test/CodeGen/AMDGPU/inline-constraints.ll
+++ b/test/CodeGen/AMDGPU/inline-constraints.ll
@@ -26,8 +26,43 @@ entry:
 ; GCN: s_mov_b32 m0, -1
 ; GCN-NOT: s_mov_b32 s{{[0-9]+}}, m0
 ; GCN: ; use m0
-define void @inline_sreg_constraint_m0(i32 addrspace(1)* %ptr) {
+define void @inline_sreg_constraint_m0() {
   %m0 = tail call i32 asm sideeffect "s_mov_b32 m0, -1", "={M0}"()
   tail call void asm sideeffect "; use $0", "s"(i32 %m0)
   ret void
 }
+
+; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i32:
+; GCN: s_mov_b32 [[REG:s[0-9]+]], 32
+; GCN: ; use [[REG]]
+define void @inline_sreg_constraint_imm_i32() {
+  tail call void asm sideeffect "; use $0", "s"(i32 32)
+  ret void
+}
+
+; GCN-LABEL: {{^}}inline_sreg_constraint_imm_f32:
+; GCN: s_mov_b32 [[REG:s[0-9]+]], 1.0
+; GCN: ; use [[REG]]
+define void @inline_sreg_constraint_imm_f32() {
+  tail call void asm sideeffect "; use $0", "s"(float 1.0)
+  ret void
+}
+
+; FIXME: Should be able to use s_mov_b64
+; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i64:
+; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], -4{{$}}
+; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], -1{{$}}
+; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
+define void @inline_sreg_constraint_imm_i64() {
+  tail call void asm sideeffect "; use $0", "s"(i64 -4)
+  ret void
+}
+
+; GCN-LABEL: {{^}}inline_sreg_constraint_imm_f64:
+; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0{{$}}
+; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 0x3ff00000{{$}}
+; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
+define void @inline_sreg_constraint_imm_f64() {
+  tail call void asm sideeffect "; use $0", "s"(double 1.0)
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 0cdb1c9fb3a..37da9c5d5ad 100644
--- a/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -15,7 +15,7 @@
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_mov_b32_e32 [[CONSTREG:v[0-9]+]], 0x40a00000
+; GCN-DAG: s_mov_b32 [[CONSTREG:s[0-9]+]], 0x40a00000
 ; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]]
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOW_REG]]:
 define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {