mirror of
https://github.com/RPCSX/llvm.git
synced 2024-12-13 14:46:53 +00:00
AMDGPU: Split LDS vector loads
If properly aligned, this could allow using ds_read_b64.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@253975 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
04abf1ee5f
commit
25a68d8d25
@ -1484,7 +1484,7 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
|
||||
if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
|
||||
Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
|
||||
Store->getValue().getValueType().isVector()) {
|
||||
return ScalarizeVectorStore(Op, DAG);
|
||||
return SplitVectorStore(Op, DAG);
|
||||
}
|
||||
|
||||
EVT MemVT = Store->getMemoryVT();
|
||||
|
@ -1191,7 +1191,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
|
||||
break;
|
||||
// fall-through
|
||||
case AMDGPUAS::LOCAL_ADDRESS:
|
||||
return ScalarizeVectorLoad(Op, DAG);
|
||||
// If properly aligned, if we split we might be able to use ds_read_b64.
|
||||
return SplitVectorLoad(Op, DAG);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -89,8 +89,13 @@ define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
|
||||
}
|
||||
|
||||
; CI-LABEL: {{^}}simple_read2_v4f32_superreg_align8:
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
|
||||
; CI-DAG: ds_read2_b64 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
|
||||
|
||||
; FIXME: These moves shouldn't be necessary, it should be able to
|
||||
; store the same register if offset1 was the non-zero offset.
|
||||
|
||||
; CI: v_mov_b32
|
||||
; CI: v_mov_b32
|
||||
; CI: buffer_store_dwordx4
|
||||
; CI: s_endpgm
|
||||
define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 {
|
||||
@ -103,8 +108,9 @@ define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out)
|
||||
}
|
||||
|
||||
; CI-LABEL: {{^}}simple_read2_v4f32_superreg:
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_X:[0-9]+]]:[[REG_Y:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
|
||||
; CI: ds_read2_b64 v{{\[}}[[REG_W:[0-9]+]]:[[REG_Z:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI: v_mov_b32
|
||||
; CI: buffer_store_dwordx4
|
||||
; CI: s_endpgm
|
||||
define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
|
||||
@ -118,13 +124,11 @@ define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
|
||||
|
||||
; FIXME: Extra moves shuffling superregister
|
||||
; CI-LABEL: {{^}}simple_read2_v8f32_superreg:
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT7:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:7{{$}}
|
||||
; CI: ds_read2_b64 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT7:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT6:[0-9]+]]:[[REG_ELT5:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:1{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT4:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:5{{$}}
|
||||
; CI: ds_read2_b64 v{{\[}}[[REG_ELT6:[0-9]+]]:[[REG_ELT5:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI: buffer_store_dwordx4
|
||||
; CI: buffer_store_dwordx4
|
||||
@ -140,21 +144,15 @@ define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
|
||||
|
||||
; FIXME: Extra moves shuffling superregister
|
||||
; CI-LABEL: {{^}}simple_read2_v16f32_superreg:
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT11:[0-9]+]]:[[REG_ELT15:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:7{{$}}
|
||||
; CI: ds_read2_b64 v{{\[}}[[REG_ELT11:[0-9]+]]:[[REG_ELT15:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:15{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT12:[0-9]+]]:[[REG_ELT10:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:1{{$}}
|
||||
; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:7{{$}}
|
||||
; CI: ds_read2_b64 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:4{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT9:[0-9]+]]:[[REG_ELT8:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:5{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:10 offset1:9{{$}}
|
||||
; CI: ds_read2_b64 v{{\[}}[[REG_ELT12:[0-9]+]]:[[REG_ELT10:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:14 offset1:13{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:12 offset1:8{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4{{$}}
|
||||
; CI: v_mov_b32
|
||||
|
||||
; CI: s_waitcnt lgkmcnt(0)
|
||||
|
@ -29,14 +29,10 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double
|
||||
; SI-ALLOCA: buffer_store_dwordx4
|
||||
; SI-ALLOCA: buffer_load_dwordx4
|
||||
|
||||
; SI-PROMOTE: ds_write_b32
|
||||
; SI-PROMOTE: ds_write_b32
|
||||
; SI-PROMOTE: ds_write_b32
|
||||
; SI-PROMOTE: ds_write_b32
|
||||
; SI-PROMOTE: ds_read_b32
|
||||
; SI-PROMOTE: ds_read_b32
|
||||
; SI-PROMOTE: ds_read_b32
|
||||
; SI-PROMOTE: ds_read_b32
|
||||
; SI-PROMOTE: ds_write_b64
|
||||
; SI-PROMOTE: ds_write_b64
|
||||
; SI-PROMOTE: ds_read_b64
|
||||
; SI-PROMOTE: ds_read_b64
|
||||
define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
|
||||
%val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
|
||||
%array = alloca <2 x double>, i32 16, align 16
|
||||
@ -71,14 +67,10 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs
|
||||
; SI-ALLOCA: buffer_store_dwordx4
|
||||
; SI-ALLOCA: buffer_load_dwordx4
|
||||
|
||||
; SI-PROMOTE: ds_write_b32
|
||||
; SI-PROMOTE: ds_write_b32
|
||||
; SI-PROMOTE: ds_write_b32
|
||||
; SI-PROMOTE: ds_write_b32
|
||||
; SI-PROMOTE: ds_read_b32
|
||||
; SI-PROMOTE: ds_read_b32
|
||||
; SI-PROMOTE: ds_read_b32
|
||||
; SI-PROMOTE: ds_read_b32
|
||||
; SI-PROMOTE: ds_write_b64
|
||||
; SI-PROMOTE: ds_write_b64
|
||||
; SI-PROMOTE: ds_read_b64
|
||||
; SI-PROMOTE: ds_read_b64
|
||||
define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
|
||||
%val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
|
||||
%array = alloca <2 x i64>, i32 16, align 16
|
||||
|
@ -132,32 +132,15 @@ define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align4:
|
||||
; SI-DAG: ds_read_b32
|
||||
; SI-DAG: ds_write_b32
|
||||
; SI: ds_read2_b32
|
||||
; SI: ds_read2_b32
|
||||
; SI: ds_read2_b32
|
||||
; SI: ds_read2_b32
|
||||
|
||||
; SI-DAG: ds_read_b32
|
||||
; SI-DAG: ds_write_b32
|
||||
|
||||
; SI-DAG: ds_read_b32
|
||||
; SI-DAG: ds_write_b32
|
||||
|
||||
; SI-DAG: ds_read_b32
|
||||
; SI-DAG: ds_write_b32
|
||||
|
||||
; SI-DAG: ds_read_b32
|
||||
; SI-DAG: ds_write_b32
|
||||
|
||||
; SI-DAG: ds_read_b32
|
||||
; SI-DAG: ds_write_b32
|
||||
|
||||
; SI-DAG: ds_read_b32
|
||||
; SI-DAG: ds_write_b32
|
||||
|
||||
; SI-DAG: ds_read_b32
|
||||
; SI-DAG: ds_write_b32
|
||||
|
||||
; SI-DAG: ds_read_b32
|
||||
; SI-DAG: ds_write_b32
|
||||
; SI: ds_write2_b32
|
||||
; SI: ds_write2_b32
|
||||
; SI: ds_write2_b32
|
||||
; SI: ds_write2_b32
|
||||
|
||||
; SI: s_endpgm
|
||||
define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
|
||||
@ -170,32 +153,15 @@ define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %
|
||||
; FIXME: Use 64-bit ops
|
||||
; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8:
|
||||
|
||||
; SI-DAG: ds_read_b32
|
||||
; SI-DAG: ds_write_b32
|
||||
; SI: ds_read_b64
|
||||
; SI: ds_read_b64
|
||||
; SI: ds_read_b64
|
||||
; SI: ds_read_b64
|
||||
|
||||
; SI-DAG: ds_read_b32
|
||||
; SI-DAG: ds_write_b32
|
||||
|
||||
; SI-DAG: ds_read_b32
|
||||
; SI-DAG: ds_write_b32
|
||||
|
||||
; SI-DAG: ds_read_b32
|
||||
; SI-DAG: ds_write_b32
|
||||
|
||||
; SI-DAG: ds_read_b32
|
||||
; SI-DAG: ds_write_b32
|
||||
|
||||
; SI-DAG: ds_read_b32
|
||||
; SI-DAG: ds_write_b32
|
||||
|
||||
; SI-DAG: ds_read_b32
|
||||
; SI-DAG: ds_write_b32
|
||||
|
||||
; SI-DAG: ds_read_b32
|
||||
; SI-DAG: ds_write_b32
|
||||
|
||||
; SI-DAG: ds_read_b32
|
||||
; SI-DAG: ds_write_b32
|
||||
; SI: ds_write_b64
|
||||
; SI: ds_write_b64
|
||||
; SI: ds_write_b64
|
||||
; SI: ds_write_b64
|
||||
|
||||
; SI-DAG: s_endpgm
|
||||
define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
|
||||
|
@ -539,10 +539,15 @@ define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}merge_local_store_4_constants_i32:
|
||||
; GCN: ds_write_b32
|
||||
; GCN: ds_write_b32
|
||||
; GCN: ds_write_b32
|
||||
; GCN: ds_write_b32
|
||||
; GCN-DAG: v_mov_b32_e32 [[K2:v[0-9]+]], 0x1c8
|
||||
; GCN-DAG: v_mov_b32_e32 [[K3:v[0-9]+]], 0x14d
|
||||
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K2]], [[K3]] offset0:2 offset1:3
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x4d2
|
||||
; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x7b
|
||||
; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1
|
||||
|
||||
; GCN: s_endpgm
|
||||
define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
|
||||
%out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
|
||||
%out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
|
||||
|
@ -287,16 +287,33 @@ entry:
|
||||
; CM: LDS_WRITE
|
||||
; CM: LDS_WRITE
|
||||
|
||||
; SI: ds_write_b32
|
||||
; SI: ds_write_b32
|
||||
; SI: ds_write_b32
|
||||
; SI: ds_write_b32
|
||||
; SI: ds_write_b64
|
||||
; SI: ds_write_b64
|
||||
define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
|
||||
entry:
|
||||
store <4 x i32> %in, <4 x i32> addrspace(3)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}store_local_v4i32_align4:
|
||||
; EG: LDS_WRITE
|
||||
; EG: LDS_WRITE
|
||||
; EG: LDS_WRITE
|
||||
; EG: LDS_WRITE
|
||||
|
||||
; CM: LDS_WRITE
|
||||
; CM: LDS_WRITE
|
||||
; CM: LDS_WRITE
|
||||
; CM: LDS_WRITE
|
||||
|
||||
; SI: ds_write2_b32
|
||||
; SI: ds_write2_b32
|
||||
define void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
|
||||
entry:
|
||||
store <4 x i32> %in, <4 x i32> addrspace(3)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}store_local_i64_i8:
|
||||
; EG: LDS_BYTE_WRITE
|
||||
; SI: ds_write_b8
|
||||
|
Loading…
Reference in New Issue
Block a user