From faf7289db32265bb830bd7cecd97e0ca50cb5af3 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang@gmail.com>
Date: Thu, 26 May 2016 19:35:29 +0000
Subject: [PATCH] AMDGPU/SI: Enable load-store-opt by default.

Summary: Enable load-store-opt by default, and update LIT tests.

Reviewers: arsenm

Differential Revision: http://reviews.llvm.org/D20694

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@270894 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPUSubtarget.cpp          |  2 +-
 test/CodeGen/AMDGPU/address-space.ll           |  3 +--
 test/CodeGen/AMDGPU/ctpop.ll                   |  6 +++---
 .../AMDGPU/drop-mem-operand-move-smrd.ll       |  3 +--
 test/CodeGen/AMDGPU/fceil64.ll                 |  4 ++--
 test/CodeGen/AMDGPU/fmax3.ll                   | 12 ++++++------
 test/CodeGen/AMDGPU/fmin3.ll                   | 12 ++++++------
 test/CodeGen/AMDGPU/global-extload-i1.ll       |  4 ++--
 test/CodeGen/AMDGPU/global-extload-i16.ll      |  4 ++--
 test/CodeGen/AMDGPU/global-extload-i32.ll      |  4 ++--
 test/CodeGen/AMDGPU/global-extload-i8.ll       |  4 ++--
 test/CodeGen/AMDGPU/indirect-private-64.ll     | 12 ++++++++++--
 .../AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll      |  9 ++++-----
 test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll   |  9 ++++-----
 test/CodeGen/AMDGPU/llvm.memcpy.ll             | 12 ++++--------
 test/CodeGen/AMDGPU/load-i1.ll                 |  4 ++--
 test/CodeGen/AMDGPU/local-64.ll                | 18 ++++++------------
 .../CodeGen/AMDGPU/local-memory-two-objects.ll |  4 +---
 test/CodeGen/AMDGPU/missing-store.ll           |  4 ++--
 test/CodeGen/AMDGPU/reorder-stores.ll          |  6 ++----
 test/CodeGen/AMDGPU/sdiv.ll                    |  4 ++--
 test/CodeGen/AMDGPU/shift-i64-opts.ll          |  8 ++++----
 .../AMDGPU/si-triv-disjoint-mem-access.ll      | 10 ++++------
 test/CodeGen/AMDGPU/store.ll                   |  3 +--
 test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll |  6 +++---
 test/CodeGen/AMDGPU/xor.ll                     |  4 ++--
 26 files changed, 79 insertions(+), 92 deletions(-)

diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 030bebcd1c0..4c69cbdce44 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -56,7 +56,7 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
   // for SI has the unhelpful behavior that it unsets everything else if you
   // disable it.
 
-  SmallString<256> FullFS("+promote-alloca,+fp64-denormals,");
+  SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
     FullFS += "+flat-for-global,";
   FullFS += FS;
diff --git a/test/CodeGen/AMDGPU/address-space.ll b/test/CodeGen/AMDGPU/address-space.ll
index 3aa2f653bf9..de206f18991 100644
--- a/test/CodeGen/AMDGPU/address-space.ll
+++ b/test/CodeGen/AMDGPU/address-space.ll
@@ -8,8 +8,7 @@
 ; CHECK-LABEL: {{^}}do_as_ptr_calcs:
 ; CHECK: s_load_dword [[SREG1:s[0-9]+]],
 ; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
-; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12
-; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:20
+; CHECK-DAG: ds_read2_b32 v[{{[0-9+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5
 define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
 entry:
   %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll
index 32b071d18ab..e53ad13464e 100644
--- a/test/CodeGen/AMDGPU/ctpop.ll
+++ b/test/CodeGen/AMDGPU/ctpop.ll
@@ -62,7 +62,7 @@ define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace
 ; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
 ; GCN: s_waitcnt
 ; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
-; GCN-NEXT: buffer_store_dword [[RESULT]],
+; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
 define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
   %val0 = load i32, i32 addrspace(1)* %in0, align 4
@@ -203,8 +203,8 @@ define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out,
 }
 
 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
-; GCN: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
 ; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
 ; GCN: buffer_store_dword [[RESULT]],
diff --git a/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll b/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
index ac297cbcf9b..5e1ebfde3e1 100644
--- a/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
+++ b/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
@@ -7,8 +7,7 @@
 ; GCN-LABEL: {{^}}reschedule_global_load_lds_store:
 ; GCN: buffer_load_dword
 ; GCN: buffer_load_dword
-; GCN: ds_write_b32
-; GCN: ds_write_b32
+; GCN: ds_write2_b32
 ; GCN: s_endpgm
 define void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 {
 entry:
diff --git a/test/CodeGen/AMDGPU/fceil64.ll b/test/CodeGen/AMDGPU/fceil64.ll
index 6320f349e1a..92c39983964 100644
--- a/test/CodeGen/AMDGPU/fceil64.ll
+++ b/test/CodeGen/AMDGPU/fceil64.ll
@@ -13,8 +13,8 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
 ; CI: v_ceil_f64_e32
 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI-DAG: s_addk_i32 [[SEXP]], 0xfc01
-; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP]]
+; SI-DAG: s_add_i32 [[SEXP1:s[0-9]+]], [[SEXP]], 0xfffffc01
+; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]]
 ; SI-DAG: s_not_b64
 ; SI-DAG: s_and_b64
 ; SI-DAG: cmp_gt_i32
diff --git a/test/CodeGen/AMDGPU/fmax3.ll b/test/CodeGen/AMDGPU/fmax3.ll
index c3028a6217d..c0fde6e97f6 100644
--- a/test/CodeGen/AMDGPU/fmax3.ll
+++ b/test/CodeGen/AMDGPU/fmax3.ll
@@ -11,9 +11,9 @@ declare float @llvm.maxnum.f32(float, float) nounwind readnone
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
 define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
-  %a = load float, float addrspace(1)* %aptr, align 4
-  %b = load float, float addrspace(1)* %bptr, align 4
-  %c = load float, float addrspace(1)* %cptr, align 4
+  %a = load volatile  float, float addrspace(1)* %aptr, align 4
+  %b = load volatile float, float addrspace(1)* %bptr, align 4
+  %c = load volatile float, float addrspace(1)* %cptr, align 4
   %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
   %f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone
   store float %f1, float addrspace(1)* %out, align 4
@@ -29,9 +29,9 @@ define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
 define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
-  %a = load float, float addrspace(1)* %aptr, align 4
-  %b = load float, float addrspace(1)* %bptr, align 4
-  %c = load float, float addrspace(1)* %cptr, align 4
+  %a = load volatile float, float addrspace(1)* %aptr, align 4
+  %b = load volatile float, float addrspace(1)* %bptr, align 4
+  %c = load volatile float, float addrspace(1)* %cptr, align 4
   %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
   %f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone
   store float %f1, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/fmin3.ll b/test/CodeGen/AMDGPU/fmin3.ll
index 0a76699b43e..2d1facfc3a4 100644
--- a/test/CodeGen/AMDGPU/fmin3.ll
+++ b/test/CodeGen/AMDGPU/fmin3.ll
@@ -12,9 +12,9 @@ declare float @llvm.minnum.f32(float, float) nounwind readnone
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
 define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
-  %a = load float, float addrspace(1)* %aptr, align 4
-  %b = load float, float addrspace(1)* %bptr, align 4
-  %c = load float, float addrspace(1)* %cptr, align 4
+  %a = load volatile float, float addrspace(1)* %aptr, align 4
+  %b = load volatile float, float addrspace(1)* %bptr, align 4
+  %c = load volatile float, float addrspace(1)* %cptr, align 4
   %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
   %f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone
   store float %f1, float addrspace(1)* %out, align 4
@@ -30,9 +30,9 @@ define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
 define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
-  %a = load float, float addrspace(1)* %aptr, align 4
-  %b = load float, float addrspace(1)* %bptr, align 4
-  %c = load float, float addrspace(1)* %cptr, align 4
+  %a = load volatile float, float addrspace(1)* %aptr, align 4
+  %b = load volatile float, float addrspace(1)* %bptr, align 4
+  %c = load volatile float, float addrspace(1)* %cptr, align 4
   %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
   %f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone
   store float %f1, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/global-extload-i1.ll b/test/CodeGen/AMDGPU/global-extload-i1.ll
index bd9557d730f..c9d2c692e61 100644
--- a/test/CodeGen/AMDGPU/global-extload-i1.ll
+++ b/test/CodeGen/AMDGPU/global-extload-i1.ll
@@ -153,8 +153,8 @@ define void @sextload_global_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; }
 
 ; FUNC-LABEL: {{^}}zextload_global_i1_to_i64:
-; SI: buffer_load_ubyte [[LOAD:v[0-9]+]],
-; SI: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
+; SI-DAG: buffer_load_ubyte [[LOAD:v[0-9]+]],
+; SI-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
 ; SI: buffer_store_dwordx2
 define void @zextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %a = load i1, i1 addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/global-extload-i16.ll b/test/CodeGen/AMDGPU/global-extload-i16.ll
index 103a40dee27..1fd7f6b0335 100644
--- a/test/CodeGen/AMDGPU/global-extload-i16.ll
+++ b/test/CodeGen/AMDGPU/global-extload-i16.ll
@@ -154,8 +154,8 @@ define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64
 }
 
 ; FUNC-LABEL: {{^}}zextload_global_i16_to_i64:
-; SI: buffer_load_ushort v[[LO:[0-9]+]],
-; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; SI-DAG: buffer_load_ushort v[[LO:[0-9]+]],
+; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
 define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
   %a = load i16, i16 addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/global-extload-i32.ll b/test/CodeGen/AMDGPU/global-extload-i32.ll
index e5e6be2199c..01dfe6f72b9 100644
--- a/test/CodeGen/AMDGPU/global-extload-i32.ll
+++ b/test/CodeGen/AMDGPU/global-extload-i32.ll
@@ -3,8 +3,8 @@
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}zextload_global_i32_to_i64:
-; SI: buffer_load_dword v[[LO:[0-9]+]],
-; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; SI-DAG: buffer_load_dword v[[LO:[0-9]+]],
+; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
 define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %a = load i32, i32 addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/global-extload-i8.ll b/test/CodeGen/AMDGPU/global-extload-i8.ll
index 85739476944..40bc1ddfd1a 100644
--- a/test/CodeGen/AMDGPU/global-extload-i8.ll
+++ b/test/CodeGen/AMDGPU/global-extload-i8.ll
@@ -151,8 +151,8 @@ define void @sextload_global_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; }
 
 ; FUNC-LABEL: {{^}}zextload_global_i8_to_i64:
-; SI: buffer_load_ubyte v[[LO:[0-9]+]],
-; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; SI-DAG: buffer_load_ubyte v[[LO:[0-9]+]],
+; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
 define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
   %a = load i8, i8 addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/indirect-private-64.ll b/test/CodeGen/AMDGPU/indirect-private-64.ll
index ae2e3ba378f..1f851f9de53 100644
--- a/test/CodeGen/AMDGPU/indirect-private-64.ll
+++ b/test/CodeGen/AMDGPU/indirect-private-64.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA16 -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA4 -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA16 -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=CI-ALLOCA16 -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=CI-PROMOTE -check-prefix=SI %s
 
 declare void @llvm.amdgcn.s.barrier() #0
 
@@ -18,6 +18,8 @@ declare void @llvm.amdgcn.s.barrier() #0
 
 ; SI-PROMOTE: ds_write_b64
 ; SI-PROMOTE: ds_read_b64
+; CI-PROMOTE: ds_write_b64
+; CI-PROMOTE: ds_read_b64
 define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 {
   %val = load double, double addrspace(1)* %in, align 8
   %array = alloca [16 x double], align 8
@@ -47,6 +49,8 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double
 ; SI-PROMOTE: ds_write_b64
 ; SI-PROMOTE: ds_read_b64
 ; SI-PROMOTE: ds_read_b64
+; CI-PROMOTE: ds_write2_b64
+; CI-PROMOTE: ds_read2_b64
 define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 {
   %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
   %array = alloca [8 x <2 x double>], align 16
@@ -71,6 +75,8 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out
 
 ; SI-PROMOTE: ds_write_b64
 ; SI-PROMOTE: ds_read_b64
+; CI-PROMOTE: ds_write_b64
+; CI-PROMOTE: ds_read_b64
 define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 {
   %val = load i64, i64 addrspace(1)* %in, align 8
   %array = alloca [8 x i64], align 8
@@ -101,6 +107,8 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs
 ; SI-PROMOTE: ds_write_b64
 ; SI-PROMOTE: ds_read_b64
 ; SI-PROMOTE: ds_read_b64
+; CI-PROMOTE: ds_write2_b64
+; CI-PROMOTE: ds_read2_b64
 define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
   %array = alloca [8 x <2 x i64>], align 16
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll
index 9e7852506f8..70fbee44251 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll
@@ -8,13 +8,12 @@ declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone
 
 ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}]
 ; TODO: this constant should be folded:
-; VI-DAG: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
+; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], -1
 ; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
 ; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
-; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
-; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]]
-; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
-; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
+; VI-DAG: s_mov_b32 s[[LOW2:[0-9+]]], s[[LOW1]]
+; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
+; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
 
 define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind {
   %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
index cce5d1c67a6..6d8513cbbde 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
@@ -25,14 +25,13 @@ define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
 ; SI: v_rsq_clamp_f64_e32
 
 ; TODO: this constant should be folded:
-; VI-DAG: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
+; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], -1
 ; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
 ; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
-; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
 ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}
-; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]]
-; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
-; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
+; VI-DAG: s_mov_b32 s[[LOW2:[0-9+]]], s[[LOW1]]
+; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
+; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
 define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 {
   %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
   store double %rsq_clamp, double addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/llvm.memcpy.ll b/test/CodeGen/AMDGPU/llvm.memcpy.ll
index 2bd9e166496..8398309d752 100644
--- a/test/CodeGen/AMDGPU/llvm.memcpy.ll
+++ b/test/CodeGen/AMDGPU/llvm.memcpy.ll
@@ -153,15 +153,11 @@ define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %
 ; FIXME: Use 64-bit ops
 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8:
 
-; SI: ds_read_b64
-; SI: ds_read_b64
-; SI: ds_read_b64
-; SI: ds_read_b64
+; SI: ds_read2_b64
+; SI: ds_read2_b64
 
-; SI: ds_write_b64
-; SI: ds_write_b64
-; SI: ds_write_b64
-; SI: ds_write_b64
+; SI: ds_write2_b64
+; SI: ds_write2_b64
 
 ; SI-DAG: s_endpgm
 define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
diff --git a/test/CodeGen/AMDGPU/load-i1.ll b/test/CodeGen/AMDGPU/load-i1.ll
index 0ca49fde3e7..e71cf53e24b 100644
--- a/test/CodeGen/AMDGPU/load-i1.ll
+++ b/test/CodeGen/AMDGPU/load-i1.ll
@@ -85,8 +85,8 @@ define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_i1_to_i64:
-; SI: buffer_load_ubyte
-; SI: v_mov_b32_e32 {{v[0-9]+}}, 0
+; SI-DAG: buffer_load_ubyte
+; SI-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
 define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
diff --git a/test/CodeGen/AMDGPU/local-64.ll b/test/CodeGen/AMDGPU/local-64.ll
index 33f3159d13e..f63d6e08ef7 100644
--- a/test/CodeGen/AMDGPU/local-64.ll
+++ b/test/CodeGen/AMDGPU/local-64.ll
@@ -122,8 +122,7 @@ define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
 
 ; BOTH-LABEL: {{^}}local_v2i64_store:
 ; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:120
+; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:15 offset1:14
 ; BOTH: s_endpgm
 define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
   %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
@@ -133,8 +132,7 @@ define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
 
 ; BOTH-LABEL: {{^}}local_v2i64_store_0_offset:
 ; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
+; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:1
 ; BOTH: s_endpgm
 define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
   store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
@@ -143,10 +141,8 @@ define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
 
 ; BOTH-LABEL: {{^}}local_v4i64_store:
 ; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:232
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:248
+; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:31 offset1:30
+; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:29 offset1:28
 ; BOTH: s_endpgm
 define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
   %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
@@ -156,10 +152,8 @@ define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
 
 ; BOTH-LABEL: {{^}}local_v4i64_store_0_offset:
 ; BOTH-NOT: ADD
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16
-; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24
+; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:3 offset1:2
+; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:1
 ; BOTH: s_endpgm
 define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
   store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
diff --git a/test/CodeGen/AMDGPU/local-memory-two-objects.ll b/test/CodeGen/AMDGPU/local-memory-two-objects.ll
index c5deac39392..969769bea6f 100644
--- a/test/CodeGen/AMDGPU/local-memory-two-objects.ll
+++ b/test/CodeGen/AMDGPU/local-memory-two-objects.ll
@@ -32,9 +32,7 @@
 ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
 ; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], vcc, 16, v{{[0-9]+}}
 ; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]]
-; CI-DAG: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16
-; CI-DAG: ds_read_b32 {{v[0-9]+}}, [[ADDRR]]
-
+; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:4
 define void @local_memory_two_objects(i32 addrspace(1)* %out) {
 entry:
   %x.i = call i32 @llvm.r600.read.tidig.x() #0
diff --git a/test/CodeGen/AMDGPU/missing-store.ll b/test/CodeGen/AMDGPU/missing-store.ll
index d608fd14ccf..3d6d7fae0fd 100644
--- a/test/CodeGen/AMDGPU/missing-store.ll
+++ b/test/CodeGen/AMDGPU/missing-store.ll
@@ -7,8 +7,8 @@
 
 ; FUNC-LABEL: {{^}}missing_store_reduced:
 ; SI: ds_read_b64
-; SI: buffer_store_dword
-; SI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; SI-DAG: buffer_store_dword
+; SI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
 ; SI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
 ; SI: s_load_dword
 ; SI: s_nop 2
diff --git a/test/CodeGen/AMDGPU/reorder-stores.ll b/test/CodeGen/AMDGPU/reorder-stores.ll
index d5e10d0be88..ad8d00c3639 100644
--- a/test/CodeGen/AMDGPU/reorder-stores.ll
+++ b/test/CodeGen/AMDGPU/reorder-stores.ll
@@ -16,10 +16,8 @@ define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocap
 }
 
 ; SI-LABEL: {{^}}no_reorder_scalarized_v2f64_local_load_store:
-; SI: ds_read_b64
-; SI: ds_read_b64
-; SI: ds_write_b64
-; SI: ds_write_b64
+; SI: ds_read2_b64
+; SI: ds_write2_b64
 ; SI: s_endpgm
 define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
   %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16
diff --git a/test/CodeGen/AMDGPU/sdiv.ll b/test/CodeGen/AMDGPU/sdiv.ll
index 3970dd66a4b..29d893414c0 100644
--- a/test/CodeGen/AMDGPU/sdiv.ll
+++ b/test/CodeGen/AMDGPU/sdiv.ll
@@ -34,8 +34,8 @@ define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; working.
 
 ; FUNC-LABEL: {{^}}slow_sdiv_i32_3435:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
+; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]],
+; SI-DAG: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
 ; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[MAGIC]], [[VAL]]
 ; SI: v_add_i32
 ; SI: v_lshrrev_b32
diff --git a/test/CodeGen/AMDGPU/shift-i64-opts.ll b/test/CodeGen/AMDGPU/shift-i64-opts.ll
index 704a0263584..28a7b924904 100644
--- a/test/CodeGen/AMDGPU/shift-i64-opts.ll
+++ b/test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -40,8 +40,8 @@ define void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 }
 
 ; GCN-LABEL: {{^}}lshr_i64_32:
-; GCN: buffer_load_dword v[[LO:[0-9]+]]
-; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
+; GCN-DAG: buffer_load_dword v[[LO:[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
 define void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
@@ -81,8 +81,8 @@ define void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 }
 
 ; GCN-LABEL: {{^}}shl_i64_const_32:
-; GCN: buffer_load_dword v[[HI:[0-9]+]]
-; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
+; GCN-DAG: buffer_load_dword v[[HI:[0-9]+]]
+; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
 define void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index 2a9298176df..f91a6cf7312 100644
--- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -10,8 +10,7 @@ declare void @llvm.amdgcn.s.barrier() #1
 @stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8
 
 ; FUNC-LABEL: @reorder_local_load_global_store_local_load
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
-; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
+; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
 ; CI: buffer_store_dword
 define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
@@ -71,8 +70,8 @@ define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace
 }
 
 ; FUNC-LABEL: @reorder_constant_load_global_store_constant_load
-; CI: buffer_store_dword
-; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
+; CI-DAG: buffer_store_dword
+; CI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
 ; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
@@ -156,8 +155,7 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out,
 }
 
 ; FUNC-LABEL: @reorder_local_offsets
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
+; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102
 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
 ; CI: buffer_store_dword
diff --git a/test/CodeGen/AMDGPU/store.ll b/test/CodeGen/AMDGPU/store.ll
index e4034093f89..e107fe1a604 100644
--- a/test/CodeGen/AMDGPU/store.ll
+++ b/test/CodeGen/AMDGPU/store.ll
@@ -287,8 +287,7 @@ entry:
 ; CM: LDS_WRITE
 ; CM: LDS_WRITE
 
-; SI: ds_write_b64
-; SI: ds_write_b64
+; SI: ds_write2_b64
 define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(3)* %out
diff --git a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index d76a839886d..0435ed4d552 100644
--- a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -42,13 +42,13 @@ define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, floa
 }
 
 ; GCN-LABEL: {{^}}test_use_s_v_s:
+; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+
 ; GCN: buffer_load_dword [[VA0:v[0-9]+]]
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_load_dword [[VA1:v[0-9]+]]
 
-; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
-
 ; GCN-NOT: v_mov_b32
 ; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; GCN-NOT: v_mov_b32
diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll
index 655655d92f0..202170d6e22 100644
--- a/test/CodeGen/AMDGPU/xor.ll
+++ b/test/CodeGen/AMDGPU/xor.ll
@@ -64,8 +64,8 @@ define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float ad
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
 ; SI: buffer_store_byte [[RESULT]]
 define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
-  %a = load i1, i1 addrspace(1)* %in0
-  %b = load i1, i1 addrspace(1)* %in1
+  %a = load volatile i1, i1 addrspace(1)* %in0
+  %b = load volatile i1, i1 addrspace(1)* %in1
   %xor = xor i1 %a, %b
   store i1 %xor, i1 addrspace(1)* %out
   ret void