AMDGPU: add llvm.amdgcn.buffer.load/store intrinsics

Summary: They correspond to BUFFER_LOAD/STORE_DWORD[_X2,X3,X4] and mostly behave like llvm.amdgcn.buffer.load/store.format. They will be used by Mesa for SSBO and atomic counters at least when robust buffer access behavior is desired. (These instructions perform no format conversion and do buffer range checking per component.) As a side effect of sharing patterns with llvm.amdgcn.buffer.store.format, it has become trivial to add support for the f32 and v2f32 variants of that intrinsic, so the patch does so. Also DAG-ify (and fix) some tests that I noticed intermittent failures in while developing this patch. Some tests were (temporarily) adjusted for the required mayLoad/hasSideEffects changes to the BUFFER_STORE_DWORD* instructions. See also http://reviews.llvm.org/D18291. Reviewers: arsenm, tstellarAMD, mareko Subscribers: arsenm, llvm-commits Differential Revision: http://reviews.llvm.org/D18292 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@266126 91177308-0d34-0410-b5e6-96231b3b80d8
2025-02-03 19:15:30 +00:00 · 2016-04-12 21:18:10 +00:00 · 2016-04-12 21:18:10 +00:00 · 756309c45b
commit 756309c45b
parent a439f48cb3
10 changed files with 318 additions and 84 deletions
--- a/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/include/llvm/IR/IntrinsicsAMDGPU.td
@ -224,7 +224,7 @@ def int_amdgcn_image_atomic_cmpswap : Intrinsic <
   llvm_i1_ty],       // slc(imm)
  []>;

-def int_amdgcn_buffer_load_format : Intrinsic <
+class AMDGPUBufferLoad : Intrinsic <
  [llvm_anyfloat_ty],
  [llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
@ -232,16 +232,20 @@ def int_amdgcn_buffer_load_format : Intrinsic <
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  [IntrReadMem]>;
+def int_amdgcn_buffer_load_format : AMDGPUBufferLoad;
+def int_amdgcn_buffer_load : AMDGPUBufferLoad;

-def int_amdgcn_buffer_store_format : Intrinsic <
+class AMDGPUBufferStore : Intrinsic <
  [],
-  [llvm_anyfloat_ty,  // vdata(VGPR) -- can currently only select v4f32
+  [llvm_anyfloat_ty,  // vdata(VGPR) -- can currently only select f32, v2f32, v4f32
   llvm_v4i32_ty,     // rsrc(SGPR)
   llvm_i32_ty,       // vindex(VGPR)
   llvm_i32_ty,       // offset(SGPR/VGPR/imm)
   llvm_i1_ty,        // glc(imm)
   llvm_i1_ty],       // slc(imm)
  []>;
+def int_amdgcn_buffer_store_format : AMDGPUBufferStore;
+def int_amdgcn_buffer_store : AMDGPUBufferStore;

 class AMDGPUBufferAtomic : Intrinsic <
  [llvm_i32_ty],
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@ -996,6 +996,11 @@ defm BUFFER_STORE_SHORT : MUBUF_Store_Helper <
  mubuf<0x1a>, "buffer_store_short", VGPR_32, i32, truncstorei16_global
 >;

+// Without mayLoad and hasSideEffects, TableGen complains about the pattern
+// matching llvm.amdgcn.buffer.store. Eventually, we'll want a WriteOnly
+// property to express the effects of this intrinsic more precisely, see
+// http://reviews.llvm.org/D18291
+let mayLoad = 1, hasSideEffects = 1 in {
 defm BUFFER_STORE_DWORD : MUBUF_Store_Helper <
  mubuf<0x1c>, "buffer_store_dword", VGPR_32, i32, global_store
 >;
@ -1007,6 +1012,7 @@ defm BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
 defm BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
  mubuf<0x1e, 0x1f>, "buffer_store_dwordx4", VReg_128, v4i32, global_store
 >;
+}

 defm BUFFER_ATOMIC_SWAP : MUBUF_Atomic <
  mubuf<0x30, 0x40>, "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
@ -2140,41 +2146,36 @@ def : Pat <
 // buffer_load/store_format patterns
 //===----------------------------------------------------------------------===//

-multiclass MUBUF_LoadIntrinsicPat<ValueType vt, string opcode> {
+multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
+                                  string opcode> {
  def : Pat<
-    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, 0,
-                                       (MUBUFIntrinsicOffset i32:$soffset,
-                                                             i16:$offset),
-                                       imm:$glc, imm:$slc)),
+    (vt (name v4i32:$rsrc, 0,
+              (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+              imm:$glc, imm:$slc)),
    (!cast<MUBUF>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
      (as_i1imm $glc), (as_i1imm $slc), 0)
  >;

  def : Pat<
-    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$vindex,
-                                       (MUBUFIntrinsicOffset i32:$soffset,
-                                                             i16:$offset),
-                                       imm:$glc, imm:$slc)),
+    (vt (name v4i32:$rsrc, i32:$vindex,
+              (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+              imm:$glc, imm:$slc)),
    (!cast<MUBUF>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
      (as_i1imm $glc), (as_i1imm $slc), 0)
  >;

  def : Pat<
-    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, 0,
-                                       (MUBUFIntrinsicVOffset i32:$soffset,
-                                                              i16:$offset,
-                                                              i32:$voffset),
-                                       imm:$glc, imm:$slc)),
+    (vt (name v4i32:$rsrc, 0,
+              (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+              imm:$glc, imm:$slc)),
    (!cast<MUBUF>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
      (as_i1imm $glc), (as_i1imm $slc), 0)
  >;

  def : Pat<
-    (vt (int_amdgcn_buffer_load_format v4i32:$rsrc, i32:$vindex,
-                                       (MUBUFIntrinsicVOffset i32:$soffset,
-                                                              i16:$offset,
-                                                              i32:$voffset),
-                                       imm:$glc, imm:$slc)),
+    (vt (name v4i32:$rsrc, i32:$vindex,
+              (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+              imm:$glc, imm:$slc)),
    (!cast<MUBUF>(opcode # _BOTHEN)
      (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
      $rsrc, $soffset, (as_i16imm $offset),
@ -2182,50 +2183,59 @@ multiclass MUBUF_LoadIntrinsicPat<ValueType vt, string opcode> {
  >;
 }

-defm : MUBUF_LoadIntrinsicPat<f32, "BUFFER_LOAD_FORMAT_X">;
-defm : MUBUF_LoadIntrinsicPat<v2f32, "BUFFER_LOAD_FORMAT_XY">;
-defm : MUBUF_LoadIntrinsicPat<v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load_format, v4f32, "BUFFER_LOAD_FORMAT_XYZW">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, f32, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<int_amdgcn_buffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;

-def : Pat<
-  (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, 0,
-                                  (MUBUFIntrinsicOffset i32:$soffset,
-                                                        i16:$offset),
-                                  imm:$glc, imm:$slc),
-  (BUFFER_STORE_FORMAT_XYZW_OFFSET $vdata, $rsrc, $soffset, (as_i16imm $offset),
-    (as_i1imm $glc), (as_i1imm $slc), 0)
->;
+multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
+                                   string opcode> {
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, 0,
+          (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+          imm:$glc, imm:$slc),
+    (!cast<MUBUF>(opcode # _OFFSET) $vdata, $rsrc, $soffset, (as_i16imm $offset),
+                                    (as_i1imm $glc), (as_i1imm $slc), 0)
+  >;

-def : Pat<
-  (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, i32:$vindex,
-                                  (MUBUFIntrinsicOffset i32:$soffset,
-                                                        i16:$offset),
-                                  imm:$glc, imm:$slc),
-  (BUFFER_STORE_FORMAT_XYZW_IDXEN $vdata, $vindex, $rsrc, $soffset,
-    (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0)
->;
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
+          (MUBUFIntrinsicOffset i32:$soffset, i16:$offset),
+          imm:$glc, imm:$slc),
+    (!cast<MUBUF>(opcode # _IDXEN) $vdata, $vindex, $rsrc, $soffset,
+                                   (as_i16imm $offset), (as_i1imm $glc),
+                                   (as_i1imm $slc), 0)
+  >;

-def : Pat<
-  (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, 0,
-                                  (MUBUFIntrinsicVOffset i32:$soffset,
-                                                         i16:$offset,
-                                                         i32:$voffset),
-                                  imm:$glc, imm:$slc),
-  (BUFFER_STORE_FORMAT_XYZW_OFFEN $vdata, $voffset, $rsrc, $soffset,
-    (as_i16imm $offset), (as_i1imm $glc), (as_i1imm $slc), 0)
->;
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, 0,
+          (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+          imm:$glc, imm:$slc),
+    (!cast<MUBUF>(opcode # _OFFEN) $vdata, $voffset, $rsrc, $soffset,
+                                   (as_i16imm $offset), (as_i1imm $glc),
+                                   (as_i1imm $slc), 0)
+  >;

-def : Pat<
-  (int_amdgcn_buffer_store_format v4f32:$vdata, v4i32:$rsrc, i32:$vindex,
-                                  (MUBUFIntrinsicVOffset i32:$soffset,
-                                                         i16:$offset,
-                                                         i32:$voffset),
-                                  imm:$glc, imm:$slc),
-  (BUFFER_STORE_FORMAT_XYZW_BOTHEN
-    $vdata,
-    (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
-    $rsrc, $soffset, (as_i16imm $offset),
-    (as_i1imm $glc), (as_i1imm $slc), 0)
->;
+  def : Pat<
+    (name vt:$vdata, v4i32:$rsrc, i32:$vindex,
+          (MUBUFIntrinsicVOffset i32:$soffset, i16:$offset, i32:$voffset),
+          imm:$glc, imm:$slc),
+    (!cast<MUBUF>(opcode # _BOTHEN)
+      $vdata,
+      (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
+      $rsrc, $soffset, (as_i16imm $offset),
+      (as_i1imm $glc), (as_i1imm $slc), 0)
+  >;
+}
+
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, f32, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<int_amdgcn_buffer_store, v4f32, "BUFFER_STORE_DWORDX4">;

 //===----------------------------------------------------------------------===//
 // buffer_atomic patterns
--- a/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/test/CodeGen/AMDGPU/captured-frame-index.ll
@ -1,9 +1,9 @@
 ; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

 ; GCN-LABEL: {{^}}stored_fi_to_lds:
-; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
+; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
 ; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
 ; GCN: ds_write_b32  [[VLDSPTR]], [[ZERO0]]
--- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@ -33,8 +33,8 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8>
 ; SI-NOT: bfe
 ; SI-NOT: v_cvt_f32_ubyte3_e32
 ; SI-DAG: v_cvt_f32_ubyte2_e32
-; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
-; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
+; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]],
+; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
 ; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
 define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
  %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
--- a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
+++ b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
@ -32,7 +32,6 @@ define void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrs
 ; GCN-LABEL: {{^}}bitcast_int_to_fpvector_extract_0:
 ; GCN: buffer_load_dwordx2
 ; GCN: v_add_i32
-; GCN: v_addc_u32
 ; GCN: buffer_store_dword
 define void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
   %a = load i64, i64 addrspace(1)* %in
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll
@ -0,0 +1,108 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}buffer_load:
+;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], 0
+;CHECK: buffer_load_dwordx4 v[4:7], s[0:3], 0 glc
+;CHECK: buffer_load_dwordx4 v[8:11], s[0:3], 0 slc
+;CHECK: s_waitcnt
+define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
+  %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
+  %data_slc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
+  %r0 = insertvalue {<4 x float>, <4 x float>, <4 x float>} undef, <4 x float> %data, 0
+  %r1 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r0, <4 x float> %data_glc, 1
+  %r2 = insertvalue {<4 x float>, <4 x float>, <4 x float>} %r1, <4 x float> %data_slc, 2
+  ret {<4 x float>, <4 x float>, <4 x float>} %r2
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs:
+;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], 0 offset:42
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_immoffs_large:
+;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff
+;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], [[OFFSET]] offset:1
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 8192, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_idx:
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs:
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_ofs_imm:
+;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:58
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
+main_body:
+  %ofs = add i32 %1, 58
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_both:
+;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_both_reversed:
+;CHECK: v_mov_b32_e32 v2, v0
+;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) {
+main_body:
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0)
+  ret <4 x float> %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x1:
+;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+  %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0)
+  ret float %data
+}
+
+;CHECK-LABEL: {{^}}buffer_load_x2:
+;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen
+;CHECK: s_waitcnt
+define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) {
+main_body:
+  %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0)
+  ret <2 x float> %data
+}
+
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
+declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0
+
+attributes #0 = { nounwind readonly }
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll
@ -70,6 +70,24 @@ main_body:
  ret void
 }

+;CHECK-LABEL: {{^}}buffer_store_x1:
+;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x2:
+;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.buffer.store.format.f32(float, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
 declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1

--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll
@ -0,0 +1,95 @@
+;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+
+;CHECK-LABEL: {{^}}buffer_store:
+;CHECK: buffer_store_dwordx4 v[0:3], s[0:3], 0
+;CHECK: buffer_store_dwordx4 v[4:7], s[0:3], 0 glc
+;CHECK: buffer_store_dwordx4 v[8:11], s[0:3], 0 slc
+define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0)
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %3, <4 x i32> %0, i32 0, i32 0, i1 0, i1 1)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_immoffs:
+;CHECK: buffer_store_dwordx4 v[0:3], s[0:3], 0 offset:42
+define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_idx:
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_ofs:
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_both:
+;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
+define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_both_reversed:
+;CHECK: v_mov_b32_e32 v6, v4
+;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen
+define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0)
+  ret void
+}
+
+; Ideally, the register allocator would avoid the wait here
+;
+;CHECK-LABEL: {{^}}buffer_store_wait:
+;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0) expcnt(0)
+;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
+;CHECK: s_waitcnt vmcnt(0)
+;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0)
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0)
+  call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %data, <4 x i32> %0, i32 %4, i32 0, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x1:
+;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
+main_body:
+  call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}buffer_store_x2:
+;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
+define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 {
+main_body:
+  call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
--- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@ -11,8 +11,8 @@ declare void @llvm.amdgcn.s.barrier() #1

 ; FUNC-LABEL: @reorder_local_load_global_store_local_load
 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
+; CI-NEXT: buffer_store_dword
 ; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:8
-; CI: buffer_store_dword
 define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
  %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4

@ -71,9 +71,9 @@ define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace
 }

 ; FUNC-LABEL: @reorder_constant_load_global_store_constant_load
-; CI: buffer_store_dword
 ; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
 ; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; CI-DAG: buffer_store_dword
 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x2
 ; CI: buffer_store_dword
@ -184,11 +184,11 @@ define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspa
 }

 ; FUNC-LABEL: @reorder_global_offsets
+; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
 ; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
 ; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
-; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
-; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_load_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
 ; CI: buffer_store_dword {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:404
 ; CI: buffer_store_dword
 ; CI: s_endpgm
--- a/test/CodeGen/AMDGPU/sminmax.ll
+++ b/test/CodeGen/AMDGPU/sminmax.ll
@ -46,11 +46,11 @@ define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind
 }

 ; FUNC-LABEL: {{^}}v_abs_v2i32:
-; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]

-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]

 ; GCN: v_add_i32
 ; GCN: v_add_i32
@ -97,15 +97,15 @@ define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind
 }

 ; FUNC-LABEL: {{^}}v_abs_v4i32:
-; GCN: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
-; GCN: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG0:v[0-9]+]], vcc, 0, [[SRC0:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG1:v[0-9]+]], vcc, 0, [[SRC1:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG2:v[0-9]+]], vcc, 0, [[SRC2:v[0-9]+]]
+; GCN-DAG: v_sub_i32_e32 [[NEG3:v[0-9]+]], vcc, 0, [[SRC3:v[0-9]+]]

-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
-; GCN: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG0]], [[SRC0]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG1]], [[SRC1]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG2]], [[SRC2]]
+; GCN-DAG: v_max_i32_e32 {{v[0-9]+}}, [[NEG3]], [[SRC3]]

 ; GCN: v_add_i32
 ; GCN: v_add_i32