mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-12-04 01:11:44 +00:00
AMDGPU: Split x8 and x16 vector loads instead of scalarize
The one regression in the builtin tests is in the read2 test which now (again) has many extra copies, but this should be solved once the pass is replaced with a DAG combine. llvm-svn: 253974
This commit is contained in:
parent
266a7da4e3
commit
1c34836bdc
@ -394,6 +394,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM,
|
||||
|
||||
setFsqrtIsCheap(true);
|
||||
|
||||
// We want to find all load dependencies for long chains of stores to enable
|
||||
// merging into very wide vectors. The problem is with vectors with > 4
|
||||
// elements. MergeConsecutiveStores will attempt to merge these because x8/x16
|
||||
// vectors are a legal type, even though we have to split the loads
|
||||
// usually. When we can more precisely specify load legality per address
|
||||
// space, we should be able to make FindBetterChain/MergeConsecutiveStores
|
||||
// smarter so that they can figure out what to do in 2 iterations without all
|
||||
// N > 4 stores on the same chain.
|
||||
GatherAllAliasesMaxDepth = 16;
|
||||
|
||||
// FIXME: Need to really handle these.
|
||||
MaxStoresPerMemcpy = 4096;
|
||||
MaxStoresPerMemmove = 4096;
|
||||
|
@ -1178,10 +1178,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
|
||||
"Custom lowering for non-i32 vectors hasn't been implemented.");
|
||||
unsigned NumElements = Op.getValueType().getVectorNumElements();
|
||||
assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
|
||||
|
||||
switch (Load->getAddressSpace()) {
|
||||
default: break;
|
||||
case AMDGPUAS::GLOBAL_ADDRESS:
|
||||
case AMDGPUAS::PRIVATE_ADDRESS:
|
||||
if (NumElements >= 8)
|
||||
return SplitVectorLoad(Op, DAG);
|
||||
|
||||
// v4 loads are supported for private and global memory.
|
||||
if (NumElements <= 4)
|
||||
break;
|
||||
@ -1409,7 +1413,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
|
||||
return Ret;
|
||||
|
||||
if (VT.isVector() && VT.getVectorNumElements() >= 8)
|
||||
return ScalarizeVectorStore(Op, DAG);
|
||||
return SplitVectorStore(Op, DAG);
|
||||
|
||||
if (VT == MVT::i1)
|
||||
return DAG.getTruncStore(Store->getChain(), DL,
|
||||
|
@ -137,14 +137,8 @@ define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8>
|
||||
; SI-DAG: v_cvt_f32_ubyte0_e32 v{{[0-9]+}}, v[[HILOAD]]
|
||||
; SI-NOT: bfe
|
||||
; SI-NOT: lshr
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dwordx4
|
||||
; SI: buffer_store_dwordx4
|
||||
define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
|
||||
%load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
|
||||
%cvt = uitofp <8 x i8> %load to <8 x float>
|
||||
|
@ -116,19 +116,18 @@ define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; FIXME: Extra moves shuffling superregister
|
||||
; CI-LABEL: {{^}}simple_read2_v8f32_superreg:
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}}
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}}
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT7:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:7{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT6:[0-9]+]]:[[REG_ELT5:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:1{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT4:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:5{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI: buffer_store_dwordx4
|
||||
; CI: buffer_store_dwordx4
|
||||
; CI: s_endpgm
|
||||
define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
|
||||
%x.i = tail call i32 @llvm.r600.read.tidig.x() #1
|
||||
@ -139,41 +138,30 @@ define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; FIXME: Extra moves shuffling superregister
|
||||
; CI-LABEL: {{^}}simple_read2_v16f32_superreg:
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:15 offset1:14{{$}}
|
||||
; CI-NOT: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:13 offset1:12{{$}}
|
||||
; CI-NOT: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:10{{$}}
|
||||
; CI-NOT: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:9 offset1:8{{$}}
|
||||
; CI-NOT: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:7 offset1:6{{$}}
|
||||
; CI-NOT: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:5 offset1:4{{$}}
|
||||
; CI-NOT: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:2{{$}}
|
||||
; CI-NOT: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:1{{$}}
|
||||
; CI-NOT: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT11:[0-9]+]]:[[REG_ELT15:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:3 offset1:7{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT14:[0-9]+]]:[[REG_ELT13:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:11 offset1:15{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT12:[0-9]+]]:[[REG_ELT10:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:1{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT9:[0-9]+]]:[[REG_ELT8:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:6 offset1:5{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT7:[0-9]+]]:[[REG_ELT6:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:10 offset1:9{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT5:[0-9]+]]:[[REG_ELT4:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:14 offset1:13{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT3:[0-9]+]]:[[REG_ELT2:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:12 offset1:8{{$}}
|
||||
; CI: v_mov_b32
|
||||
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT1:[0-9]+]]:[[REG_ELT0:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:4{{$}}
|
||||
; CI: v_mov_b32
|
||||
|
||||
; CI: s_waitcnt lgkmcnt(0)
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dword
|
||||
; CI: buffer_store_dwordx4
|
||||
; CI: buffer_store_dwordx4
|
||||
; CI: buffer_store_dwordx4
|
||||
; CI: buffer_store_dwordx4
|
||||
; CI: s_endpgm
|
||||
define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
|
||||
%x.i = tail call i32 @llvm.r600.read.tidig.x() #1
|
||||
|
@ -106,14 +106,8 @@ define void @sextload_global_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}zextload_global_v8i32_to_v8i64:
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI-DAG: buffer_store_dwordx2
|
||||
; SI-DAG: buffer_store_dwordx2
|
||||
; SI-DAG: buffer_store_dwordx2
|
||||
@ -131,14 +125,8 @@ define void @zextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}sextload_global_v8i32_to_v8i64:
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
|
||||
; SI-DAG: v_ashrrev_i32
|
||||
; SI-DAG: v_ashrrev_i32
|
||||
@ -166,22 +154,10 @@ define void @sextload_global_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}sextload_global_v16i32_to_v16i64:
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
|
||||
; SI-DAG: v_ashrrev_i32
|
||||
; SI-DAG: v_ashrrev_i32
|
||||
@ -219,22 +195,10 @@ define void @sextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}zextload_global_v16i32_to_v16i64
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
|
||||
; SI: buffer_store_dwordx2
|
||||
; SI: buffer_store_dwordx2
|
||||
@ -262,41 +226,15 @@ define void @zextload_global_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}sextload_global_v32i32_to_v32i64:
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
|
||||
; SI-DAG: v_ashrrev_i32
|
||||
; SI-DAG: v_ashrrev_i32
|
||||
@ -376,41 +314,14 @@ define void @sextload_global_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}zextload_global_v32i32_to_v32i64:
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
|
||||
; SI-DAG: buffer_store_dwordx2
|
||||
; SI-DAG: buffer_store_dwordx2
|
||||
|
@ -105,6 +105,26 @@ define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x hal
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
|
||||
; GCN: buffer_store_dwordx4
|
||||
; GCN: buffer_store_dwordx4
|
||||
define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
|
||||
%ext = fpext <8 x half> %arg to <8 x float>
|
||||
store <8 x float> %ext, <8 x float> addrspace(1)* %out
|
||||
@ -298,6 +318,46 @@ define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f32:
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
|
||||
; GCN: buffer_store_dwordx4
|
||||
; GCN: buffer_store_dwordx4
|
||||
; GCN: buffer_store_dwordx4
|
||||
; GCN: buffer_store_dwordx4
|
||||
|
||||
; GCN: s_endpgm
|
||||
define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
|
||||
%val = load <16 x half>, <16 x half> addrspace(1)* %in
|
||||
%cvt = fpext <16 x half> %val to <16 x float>
|
||||
@ -426,14 +486,8 @@ define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16:
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dwordx4
|
||||
; GCN: buffer_load_dwordx4
|
||||
; GCN: v_cvt_f16_f32_e32
|
||||
; GCN: v_cvt_f16_f32_e32
|
||||
; GCN: v_cvt_f16_f32_e32
|
||||
@ -459,22 +513,10 @@ define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}global_truncstore_v16f32_to_v16f16:
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dword
|
||||
; GCN: buffer_load_dwordx4
|
||||
; GCN: buffer_load_dwordx4
|
||||
; GCN: buffer_load_dwordx4
|
||||
; GCN: buffer_load_dwordx4
|
||||
; GCN-DAG: v_cvt_f16_f32_e32
|
||||
; GCN-DAG: v_cvt_f16_f32_e32
|
||||
; GCN-DAG: v_cvt_f16_f32_e32
|
||||
|
@ -277,15 +277,9 @@ entry:
|
||||
; FUNC-LABEL: {{^}}load_v8i32:
|
||||
; R600: VTX_READ_128
|
||||
; R600: VTX_READ_128
|
||||
; XXX: We should be using DWORDX4 instructions on SI.
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
define void @load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) {
|
||||
entry:
|
||||
%0 = load <8 x i32>, <8 x i32> addrspace(1)* %in
|
||||
@ -298,23 +292,11 @@ entry:
|
||||
; R600: VTX_READ_128
|
||||
; R600: VTX_READ_128
|
||||
; R600: VTX_READ_128
|
||||
; XXX: We should be using DWORDX4 instructions on SI.
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
define void @load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) {
|
||||
entry:
|
||||
%0 = load <16 x i32>, <16 x i32> addrspace(1)* %in
|
||||
|
@ -613,22 +613,9 @@ define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
|
||||
ret void
|
||||
}
|
||||
|
||||
; FIXME: This should do 2 dwordx4 loads
|
||||
; GCN-LABEL: {{^}}merge_global_store_8_constants_i32:
|
||||
|
||||
; GCN-NOAA: buffer_store_dword v
|
||||
; GCN-NOAA: buffer_store_dword v
|
||||
; GCN-NOAA: buffer_store_dword v
|
||||
; GCN-NOAA: buffer_store_dword v
|
||||
; GCN-NOAA: buffer_store_dword v
|
||||
; GCN-NOAA: buffer_store_dword v
|
||||
; GCN-NOAA: buffer_store_dword v
|
||||
; GCN-NOAA: buffer_store_dword v
|
||||
|
||||
; GCN-AA: buffer_store_dwordx4
|
||||
; GCN-AA: buffer_store_dwordx2
|
||||
; GCN-AA: buffer_store_dwordx2
|
||||
|
||||
; GCN: buffer_store_dwordx4
|
||||
; GCN: buffer_store_dwordx4
|
||||
; GCN: s_endpgm
|
||||
define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
|
||||
store i32 34, i32 addrspace(1)* %out, align 4
|
||||
|
@ -34,46 +34,16 @@ define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace
|
||||
}
|
||||
|
||||
; SI-LABEL: {{^}}no_reorder_split_v8i32_global_load_store:
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dword
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
; SI: buffer_load_dwordx4
|
||||
|
||||
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dword
|
||||
; SI: buffer_store_dwordx4
|
||||
; SI: buffer_store_dwordx4
|
||||
; SI: buffer_store_dwordx4
|
||||
; SI: buffer_store_dwordx4
|
||||
; SI: s_endpgm
|
||||
define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
|
||||
%tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32
|
||||
|
@ -162,14 +162,8 @@ entry:
|
||||
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
|
||||
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
|
||||
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: buffer_store_dwordx4
|
||||
; GCN: buffer_store_dwordx4
|
||||
define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 {
|
||||
entry:
|
||||
%tmp = call i32 @llvm.r600.read.tidig.x() #0
|
||||
@ -184,7 +178,7 @@ entry:
|
||||
; FIXME: should use immediate offset instead of using s_add_i32 for adding to constant.
|
||||
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16:
|
||||
|
||||
; GCN: s_mov_b32 s[[OFFSET0:[0-9]+]], 0x13480{{$}}
|
||||
; GCN-DAG: s_mov_b32 s[[OFFSET0:[0-9]+]], 0x13480{{$}}
|
||||
; SI-DAG: s_add_i32 s[[OFFSET1:[0-9]+]], s[[OFFSET0]], 16
|
||||
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET0]]:{{[0-9]+}}], 0 addr64{{$}}
|
||||
|
||||
@ -197,6 +191,7 @@ entry:
|
||||
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET2]]:{{[0-9]+}}], 0 addr64{{$}}
|
||||
; GCN-DAG: s_add_i32 s[[OFFSET3:[0-9]+]], s[[OFFSET2]], 16
|
||||
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET3]]:{{[0-9]+}}], 0 addr64{{$}}
|
||||
|
||||
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
|
||||
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
|
||||
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
|
||||
@ -205,14 +200,12 @@ entry:
|
||||
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
|
||||
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
|
||||
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: buffer_store_dword
|
||||
; GCN: buffer_store_dwordx4
|
||||
; GCN: buffer_store_dwordx4
|
||||
; GCN: buffer_store_dwordx4
|
||||
; GCN: buffer_store_dwordx4
|
||||
|
||||
; GCN: s_endpgm
|
||||
define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 {
|
||||
entry:
|
||||
%tmp = call i32 @llvm.r600.read.tidig.x() #0
|
||||
|
Loading…
Reference in New Issue
Block a user