AMDGPU/SI: Enable lanemask tracking in misched

Summary:
This results in higher register usage, but should make it easier for
the compiler to hide latency.

This pass is a prerequisite for some more scheduler improvements, and I
think the increase register usage with this patch is acceptable, because
when combined with the scheduler improvements, the total register usage
will decrease.

shader-db stats:

2382 shaders in 478 tests
Totals:
SGPRS: 48672 -> 49088 (0.85 %)
VGPRS: 34148 -> 34847 (2.05 %)
Code Size: 1285816 -> 1289128 (0.26 %) bytes
LDS: 28 -> 28 (0.00 %) blocks
Scratch: 492544 -> 573440 (16.42 %) bytes per wave
Max Waves: 6856 -> 6846 (-0.15 %)
Wait states: 0 -> 0 (0.00 %)

Depends on D18451

Reviewers: nhaehnle, arsenm

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D18452

llvm-svn: 264876
This commit is contained in:
Tom Stellard 2016-03-30 16:35:09 +00:00
parent 23316f1822
commit 24f53ac119
31 changed files with 128 additions and 139 deletions

View File

@ -156,6 +156,10 @@ void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
// register spills than just using one of these approaches on its own.
Policy.OnlyTopDown = false;
Policy.OnlyBottomUp = false;
// Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
if (!enableSIScheduler())
Policy.ShouldTrackLaneMasks = true;
}
}

View File

@ -282,11 +282,11 @@ define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(
; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}}
; SI-NOT: and
; SI: v_and_b32_e32 v[[RESLO0:[0-9]+]], 63, v[[LO0]]
; SI-NOT: and
; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]]
; SI: v_and_b32_e32 v[[RESLO1:[0-9]+]], 63, v[[LO1]]
; SI-NOT: and
; SI: buffer_store_dwordx2
; SI-NOT: and
; SI: buffer_store_dwordx2
; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]]
define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load volatile i64, i64 addrspace(1)* %aptr
%b = load volatile i64, i64 addrspace(1)* %aptr

View File

@ -3,11 +3,11 @@
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SICI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; GCN: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
@ -21,12 +21,12 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs
}
; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i64_offset:
; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SICI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI: s_load_dwordx2 s{{\[}}[[LOSWAP:[0-9]+]]:[[HISWAP:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
; GCN-DAG: v_mov_b32_e32 v[[LOVCMP:[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 v[[HIVCMP:[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN-DAG: v_mov_b32_e32 v[[LOSWAPV:[0-9]+]], s[[LOSWAP]]
; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]

View File

@ -2,12 +2,10 @@
; GCN-LABEL: {{^}}stored_fi_to_lds:
; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
%tmp = alloca float
@ -19,7 +17,6 @@ define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
; Offset is applied
; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects:
; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
@ -27,6 +24,7 @@ define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
; GCN: buffer_store_dword v{{[0-9]+}}, [[FI1]]
; GCN-DAG: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]

View File

@ -7,7 +7,7 @@ declare float @llvm.fma.f32(float, float, float) nounwind readnone
; FUNC-LABEL: @commute_add_imm_fabs_f32
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_add_f32_e64 [[REG:v[0-9]+]], 2.0, |[[X]]|
; SI-NEXT: buffer_store_dword [[REG]]
; SI: buffer_store_dword [[REG]]
define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@ -21,7 +21,7 @@ define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(
; FUNC-LABEL: @commute_mul_imm_fneg_fabs_f32
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], -4.0, |[[X]]|
; SI-NEXT: buffer_store_dword [[REG]]
; SI: buffer_store_dword [[REG]]
define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@ -36,7 +36,7 @@ define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrs
; FUNC-LABEL: @commute_mul_imm_fneg_f32
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]]
; SI-NEXT: buffer_store_dword [[REG]]
; SI: buffer_store_dword [[REG]]
define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@ -52,7 +52,7 @@ define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]]
; SI-NEXT: buffer_store_dword [[REG]]
; SI: buffer_store_dword [[REG]]
define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@ -67,7 +67,7 @@ define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
; SI-NEXT: buffer_store_dword [[REG]]
; SI: buffer_store_dword [[REG]]
define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@ -84,7 +84,7 @@ define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)*
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
; SI-NEXT: buffer_store_dword [[REG]]
; SI: buffer_store_dword [[REG]]
define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@ -101,7 +101,7 @@ define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)*
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
; SI-NEXT: buffer_store_dword [[REG]]
; SI: buffer_store_dword [[REG]]
define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@ -120,7 +120,7 @@ define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
; SI-NEXT: buffer_store_dword [[REG]]
; SI: buffer_store_dword [[REG]]
define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@ -138,7 +138,7 @@ define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrs
; SI-DAG: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
; SI-NEXT: buffer_store_dword [[REG]]
; SI: buffer_store_dword [[REG]]
define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid

View File

@ -116,7 +116,7 @@ define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %
; SI-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
; SI-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
; SI-DAG: s_flbit_i32_b32 [[FFBH_HI:s[0-9]+]], s[[HI]]
; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[FFBH_LO]]
; SI-DAG: v_mov_b32_e32 [[VFFBH_LO:v[0-9]+]], [[ADD]]
; SI-DAG: v_mov_b32_e32 [[VFFBH_HI:v[0-9]+]], [[FFBH_HI]]
; SI-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
; SI-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}

View File

@ -149,7 +149,7 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1:
; SI: buffer_load_dword [[VAL:v[0-9]+]],
; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; SI-NEXT: buffer_store_dword [[RESULT]],
; SI: buffer_store_dword [[RESULT]],
define void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
@ -162,7 +162,7 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_neg1:
; SI: buffer_load_dword [[VAL:v[0-9]+]],
; SI: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; SI-NEXT: buffer_store_dword [[RESULT]],
; SI: buffer_store_dword [[RESULT]],
define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone

View File

@ -116,9 +116,10 @@ define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrs
; FUNC-LABEL: {{^}}ctpop_i64_in_br:
; SI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xd
; VI: s_load_dwordx2 s{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x34
; GCN: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
; GCN-DAG: s_bcnt1_i32_b64 [[RESULT:s[0-9]+]], {{s\[}}[[LOVAL]]:[[HIVAL]]{{\]}}
; GCN-DAG: s_mov_b32 [[ZERO:s[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[RESULT]]
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HIVAL]]
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[ZERO]]
; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}}
; GCN: s_endpgm
define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {

View File

@ -33,8 +33,8 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8>
; SI-NOT: bfe
; SI-NOT: v_cvt_f32_ubyte3_e32
; SI-DAG: v_cvt_f32_ubyte2_e32
; SI-DAG: v_cvt_f32_ubyte1_e32
; SI-DAG: v_cvt_f32_ubyte0_e32
; SI-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
; SI-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
; SI: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4

View File

@ -85,14 +85,8 @@ define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
}
; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align8:
; CI-DAG: ds_read2_b64 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
; FIXME: These moves shouldn't be necessary, it should be able to
; store the same register if offset1 was the non-zero offset.
; CI: v_mov_b32
; CI: v_mov_b32
; CI: buffer_store_dwordx4
; CI: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
; CI: buffer_store_dwordx4 [[REG_ZW]]
; CI: s_endpgm
define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@ -104,10 +98,8 @@ define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out)
}
; CI-LABEL: {{^}}simple_read2_v4f32_superreg:
; CI: ds_read2_b64 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
; CI: v_mov_b32
; CI: v_mov_b32
; CI: buffer_store_dwordx4
; CI-DAG: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
; CI: buffer_store_dwordx4 [[REG_ZW]]
; CI: s_endpgm
define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@ -120,14 +112,10 @@ define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
; FIXME: Extra moves shuffling superregister
; CI-LABEL: {{^}}simple_read2_v8f32_superreg:
; CI: ds_read2_b64 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT7:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}}
; CI: v_mov_b32
; CI: v_mov_b32
; CI: ds_read2_b64 v{{\[}}[[REG_ELT6:[0-9]+]]:[[REG_ELT5:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2{{$}}
; CI: v_mov_b32
; CI: v_mov_b32
; CI: buffer_store_dwordx4
; CI: buffer_store_dwordx4
; CI-DAG: ds_read2_b64 [[VEC_HI:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:2 offset1:3{{$}}
; CI-DAG: ds_read2_b64 [[VEC_LO:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
; CI-DAG: buffer_store_dwordx4 [[VEC_HI]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
; CI-DAG: buffer_store_dwordx4 [[VEC_LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}}
; CI: s_endpgm
define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@ -140,22 +128,15 @@ define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
; FIXME: Extra moves shuffling superregister
; CI-LABEL: {{^}}simple_read2_v16f32_superreg:
; CI: ds_read2_b64 v{{\[}}[[REG_ELT11:[0-9]+]]:[[REG_ELT15:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}}
; CI: v_mov_b32
; CI: v_mov_b32
; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:7{{$}}
; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:4{{$}}
; CI: v_mov_b32
; CI: v_mov_b32
; CI: ds_read2_b64 v{{\[}}[[REG_ELT12:[0-9]+]]:[[REG_ELT10:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2{{$}}
; CI: v_mov_b32
; CI: v_mov_b32
; CI-DAG: ds_read2_b64 [[VEC0_3:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
; CI-DAG: ds_read2_b64 [[VEC4_7:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:2 offset1:3{{$}}
; CI-DAG: ds_read2_b64 [[VEC8_11:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:4 offset1:5{{$}}
; CI-DAG: ds_read2_b64 [[VEC12_15:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset0:6 offset1:7{{$}}
; CI: s_waitcnt lgkmcnt(0)
; CI: buffer_store_dwordx4
; CI: buffer_store_dwordx4
; CI: buffer_store_dwordx4
; CI: buffer_store_dwordx4
; CI-DAG: buffer_store_dwordx4 [[VEC0_3]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}}
; CI-DAG: buffer_store_dwordx4 [[VEC4_7]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
; CI-DAG: buffer_store_dwordx4 [[VEC8_11]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32
; CI-DAG: buffer_store_dwordx4 [[VEC12_15]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48
; CI: s_endpgm
define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1

View File

@ -65,9 +65,9 @@ define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float add
; SI-LABEL: @simple_read2st64_f32_over_max_offset
; SI-NOT: ds_read2st64_b32
; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
; SI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
; SI: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]
; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
; SI-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
; SI-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
; SI: s_endpgm
define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
%x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1

View File

@ -179,8 +179,8 @@ define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float add
}
; SI-LABEL: @simple_write2_two_val_f32_x2
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset1:11
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
; SI: s_endpgm
define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@ -209,8 +209,8 @@ define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspac
}
; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
; SI-NEXT: ds_write2_b32 [[BASEADDR]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset0:3 offset1:11
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
; SI: s_endpgm
define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1

View File

@ -13,8 +13,8 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind r
; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
; GCN-DAG: s_mov_b32 [[SCONST:s[0-9]+]], 0x7fffffff
; GCN: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
; GCN: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN_HI]]
; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
; GCN: s_endpgm
define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {

View File

@ -13,8 +13,8 @@ declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
; FUNC-LABEL: {{^}}ffloor_f64:
; CI: v_floor_f64_e32
; SI: v_fract_f64_e32
; SI: v_min_f64
; SI: v_cmp_class_f64_e64
; SI-DAG: v_min_f64
; SI-DAG: v_cmp_class_f64_e64
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_add_f64
@ -28,8 +28,8 @@ define void @ffloor_f64(double addrspace(1)* %out, double %x) {
; FUNC-LABEL: {{^}}ffloor_f64_neg:
; CI: v_floor_f64_e64
; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT:s[[0-9]+:[0-9]+]]]
; SI: v_min_f64
; SI: v_cmp_class_f64_e64
; SI-DAG: v_min_f64
; SI-DAG: v_cmp_class_f64_e64
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]]
@ -44,8 +44,8 @@ define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) {
; FUNC-LABEL: {{^}}ffloor_f64_neg_abs:
; CI: v_floor_f64_e64
; SI: v_fract_f64_e64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT:s[[0-9]+:[0-9]+]]]|
; SI: v_min_f64
; SI: v_cmp_class_f64_e64
; SI-DAG: v_min_f64
; SI-DAG: v_cmp_class_f64_e64
; SI: v_cndmask_b32_e64
; SI: v_cndmask_b32_e64
; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]|

View File

@ -55,8 +55,8 @@ define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
}
; FUNC-LABEL: {{^}}fneg_fabs_f64:
; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
; SI: s_load_dwordx2
; SI: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}
; SI: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
; SI-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
; SI-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]

View File

@ -10,10 +10,11 @@ declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone
; TODO: this constant should be folded:
; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
; VI: s_mov_b32 s[[LOW:[0-9+]]], s[[ALLBITS]]
; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW]]:[[HIGH1]]]
; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW]]:[[HIGH2]]]
; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]]
; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind {
%rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone

View File

@ -68,7 +68,6 @@ main_body:
; create copies which we don't bother to track here.
;
;CHECK-LABEL: {{^}}test3:
;CHECK-DAG: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, s[0:3], 0 glc
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v2, s[0:3], 0 idxen glc
@ -79,6 +78,7 @@ main_body:
;CHECK: s_waitcnt vmcnt(0)
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:42 glc
;CHECK-DAG: s_waitcnt vmcnt(0)
;CHECK-DAG: s_movk_i32 [[SOFS:s[0-9]+]], 0x1fff
;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, s[0:3], [[SOFS]] offset:1 glc
define float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) #0 {
main_body:

View File

@ -187,7 +187,7 @@ define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i3
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
@ -202,7 +202,7 @@ define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
@ -218,7 +218,7 @@ define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
@ -234,7 +234,7 @@ define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {

View File

@ -28,10 +28,11 @@ define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
; TODO: this constant should be folded:
; VI: s_mov_b32 s[[ALLBITS:[0-9+]]], -1
; VI: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
; VI: s_mov_b32 s[[LOW:[0-9+]]], s[[ALLBITS]]
; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW]]:[[HIGH1]]]
; VI: s_mov_b32 s[[LOW1:[0-9+]]], s[[ALLBITS]]
; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
; VI: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW]]:[[HIGH2]]]
; VI: s_mov_b32 s[[LOW2:[0-9+]]], s[[ALLBITS]]
; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 {
%rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
store double %rsq_clamp, double addrspace(1)* %out

View File

@ -5,8 +5,8 @@
; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
; EG: LDS_WRXCHG_RET *
; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
; GCN: s_load_dword [[SPTR:s[0-9]+]],
; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
; GCN: buffer_store_dword [[RESULT]],
@ -31,8 +31,8 @@ define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
; XXX - Is it really necessary to load 4 into VGPR?
; FUNC-LABEL: {{^}}lds_atomic_add_ret_i32:
; EG: LDS_ADD_RET *
; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
; GCN: s_load_dword [[SPTR:s[0-9]+]],
; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
; GCN: buffer_store_dword [[RESULT]],

View File

@ -30,10 +30,10 @@ define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
}
; GCN-LABEL: {{^}}lds_atomic_add_ret_i64_offset:
; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
; SI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; GCN: v_mov_b32_e32 v[[LOVDATA:[0-9]+]], 9
; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
; GCN: buffer_store_dwordx2 [[RESULT]],

View File

@ -191,8 +191,8 @@ define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
}
; FUNC-LABEL: {{^}}s_test_umax_ugt_imm_v2i32:
; SI: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 15
; SI: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 23
; SI-DAG: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 15
; SI-DAG: s_max_u32 {{s[0-9]+}}, {{s[0-9]+}}, 23
define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
%cmp = icmp ugt <2 x i32> %a, <i32 15, i32 23>
%val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 15, i32 23>
@ -205,8 +205,8 @@ define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], [[B]]
; SI-NEXT: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
; SI-NEXT: buffer_store_dword [[VMAX]]
; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
; SI: buffer_store_dword [[VMAX]]
define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
%a.ext = zext i16 %a to i32
%b.ext = zext i16 %b to i32
@ -223,8 +223,8 @@ define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i1
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI: s_max_i32 [[MAX:s[0-9]+]], [[A]], [[B]]
; SI-NEXT: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
; SI-NEXT: buffer_store_dword [[VMAX]]
; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
; SI: buffer_store_dword [[VMAX]]
define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
%a.ext = sext i16 %a to i32
%b.ext = sext i16 %b to i32

View File

@ -301,8 +301,8 @@ define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]]
; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
; SI-NEXT: buffer_store_dword [[VMIN]]
; SI: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
; SI: buffer_store_dword [[VMIN]]
define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
%a.ext = zext i16 %a to i32
%b.ext = zext i16 %b to i32
@ -319,8 +319,8 @@ define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i1
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]]
; SI-NEXT: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
; SI-NEXT: buffer_store_dword [[VMIN]]
; SI: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
; SI: buffer_store_dword [[VMIN]]
define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
%a.ext = sext i16 %a to i32
%b.ext = sext i16 %b to i32

View File

@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone
; MUBUF load with an immediate byte offset that fits into 12-bits
; CHECK-LABEL: {{^}}mubuf_load0:
; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0
; CHECK: buffer_load_dword v{{[0-9]}}, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0
define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%0 = getelementptr i32, i32 addrspace(1)* %in, i64 1
@ -19,7 +19,7 @@ entry:
; MUBUF load with the largest possible immediate offset
; CHECK-LABEL: {{^}}mubuf_load1:
; CHECK: buffer_load_ubyte v{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0
; CHECK: buffer_load_ubyte v{{[0-9]}}, s[{{[0-9]+:[0-9]+}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0
define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095

View File

@ -216,7 +216,7 @@ define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
; GCN: buffer_load_sbyte [[B:v[0-9]+]]
; GCN: v_cmp_ne_i32_e32 vcc, -1, [[B]]{{$}}
; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
; GCN-NEXT: buffer_store_byte [[RESULT]]
; GCN: buffer_store_byte [[RESULT]]
; GCN: s_endpgm
define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind {
%b = load i8, i8 addrspace(1)* %b.ptr

View File

@ -59,7 +59,7 @@ entry:
; FUNC-LABEL: {{^}}f64_one:
; SI: v_cmp_lg_f64_e32 vcc
; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp one double %a, %b
@ -80,7 +80,7 @@ entry:
; FUNC-LABEL: {{^}}f64_ueq:
; SI: v_cmp_nlg_f64_e32 vcc
; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ueq double %a, %b
@ -92,7 +92,7 @@ entry:
; FUNC-LABEL: {{^}}f64_ugt:
; SI: v_cmp_nle_f64_e32 vcc
; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ugt double %a, %b
@ -103,7 +103,7 @@ entry:
; FUNC-LABEL: {{^}}f64_uge:
; SI: v_cmp_nlt_f64_e32 vcc
; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp uge double %a, %b
@ -114,7 +114,7 @@ entry:
; FUNC-LABEL: {{^}}f64_ult:
; SI: v_cmp_nge_f64_e32 vcc
; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ult double %a, %b
@ -125,7 +125,7 @@ entry:
; FUNC-LABEL: {{^}}f64_ule:
; SI: v_cmp_ngt_f64_e32 vcc
; SI-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) {
entry:
%0 = fcmp ule double %a, %b

View File

@ -1,4 +1,9 @@
; RUN: llc -march=amdgcn -mcpu=SI --misched=si < %s | FileCheck %s
; FIXME: The si scheduler crashes if when lane mask tracking is enabled, so
; we need to disable this when the si scheduler is being used.
; The only way the subtarget knows that the si machine scheduler is being used
; is to specify -mattr=si-scheduler. If we just pass --misched=si, the backend
; won't know what scheduler we are using.
; RUN: llc -march=amdgcn -mcpu=SI --misched=si -mattr=si-scheduler < %s | FileCheck %s
; The test checks the "si" machine scheduler pass works correctly.

View File

@ -230,8 +230,8 @@ define void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
; GCN-LABEL: {{^}}s_ashr_63_i64:
; GCN-DAG: s_load_dword s[[HI:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
; GCN: s_mov_b32 s[[COPYSHIFT:[0-9]+]], s[[SHIFT]]
; GCN: s_add_u32 {{s[0-9]+}}, s[[HI]], {{s[0-9]+}}
; GCN-DAG: s_mov_b32 s[[COPYSHIFT:[0-9]+]], s[[SHIFT]]
; GCN-DAG: s_add_u32 {{s[0-9]+}}, s[[HI]], {{s[0-9]+}}
; GCN: s_addc_u32 {{s[0-9]+}}, s[[COPYSHIFT]], {{s[0-9]+}}
define void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
%result = ashr i64 %a, 63

View File

@ -8,14 +8,12 @@ target triple="amdgcn--"
; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xb
; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: v_cmp_eq_i32_e32 vcc, 0, v1
; CHECK: v_cmp_eq_i32_e32 vcc, 0, v0
; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
; BB0_1:
; CHECK: s_load_dword s6, s[0:1], 0xa
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s6
; BB0_2:
; CHECK: s_or_b64 exec, exec, s[2:3]
; CHECK-NEXT: s_mov_b32 s7, 0xf000

View File

@ -22,7 +22,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspa
; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
; SI-NEXT: buffer_store_byte [[RESULT]]
; SI: buffer_store_byte [[RESULT]]
define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = zext i1 %load to i32
@ -45,7 +45,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspa
; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_1:
; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
; SI-NEXT: buffer_store_byte [[RESULT]]
; SI: buffer_store_byte [[RESULT]]
define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = zext i1 %load to i32
@ -57,7 +57,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspa
; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_neg1:
; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
; SI-NEXT: buffer_store_byte [[RESULT]]
; SI: buffer_store_byte [[RESULT]]
define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = sext i1 %load to i32
@ -81,7 +81,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addr
; FUNC-LABEL {{^}}sextload_i1_to_i32_trunc_cmp_ne_0:
; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
; SI-NEXT: buffer_store_byte [[RESULT]]
; SI: buffer_store_byte [[RESULT]]
define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = sext i1 %load to i32
@ -93,7 +93,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspa
; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_0:
; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
; SI-NEXT: buffer_store_byte [[RESULT]]
; SI: buffer_store_byte [[RESULT]]
define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = zext i1 %load to i32
@ -119,7 +119,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspa
; SI: v_cmp_eq_i32_e32 vcc, 1, [[TMP]]{{$}}
; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
; SI-NEXT: buffer_store_byte [[RESULT]]
; SI: buffer_store_byte [[RESULT]]
define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
%load = load i1, i1 addrspace(1)* %in
%ext = zext i1 %load to i32
@ -158,7 +158,7 @@ define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addr
; SI: buffer_load_sbyte [[LOAD:v[0-9]+]]
; SI: v_cmp_ne_i32_e32 vcc, -1, [[LOAD]]{{$}}
; SI-NEXT: v_cndmask_b32_e64
; SI-NEXT: buffer_store_byte
; SI: buffer_store_byte
define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
%load = load i8, i8 addrspace(1)* %in
%masked = and i8 %load, 255

View File

@ -28,10 +28,10 @@ define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a)
}
; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_a_b:
; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]]
; GCN: buffer_store_dword [[RESULT]]
@ -42,13 +42,13 @@ define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, floa
}
; GCN-LABEL: {{^}}test_use_s_v_s:
; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
; GCN: buffer_load_dword [[VA0:v[0-9]+]]
; GCN-NOT: v_mov_b32
; GCN: buffer_load_dword [[VA1:v[0-9]+]]
; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
; GCN-NOT: v_mov_b32
; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; GCN-NOT: v_mov_b32
@ -68,10 +68,10 @@ define void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float
}
; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_a_b_a:
; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
; GCN: buffer_store_dword [[RESULT]]
@ -82,10 +82,10 @@ define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, floa
}
; GCN-LABEL: {{^}}test_sgpr_use_twice_ternary_op_b_a_a:
; SI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; VI: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; SI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
; GCN: buffer_store_dword [[RESULT]]