mirror of
https://github.com/RPCS3/llvm.git
synced 2025-02-15 08:19:51 +00:00
AMDGPU/R600: Replace barrier intrinsics
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@275870 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
865e2fa1dc
commit
40ca91a07a
@ -43,6 +43,8 @@ defm int_r600_read_tidig : AMDGPUReadPreloadRegisterIntrinsic_xyz;
|
||||
|
||||
def int_r600_read_workdim : AMDGPUReadPreloadRegisterIntrinsic;
|
||||
|
||||
def int_r600_group_barrier : GCCBuiltin<"__builtin_r600_group_barrier">,
|
||||
Intrinsic<[], [], [IntrConvergent]>;
|
||||
|
||||
// AS 7 is PARAM_I_ADDRESS, used for kernel arguments
|
||||
def int_r600_implicitarg_ptr :
|
||||
|
@ -30,10 +30,6 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
|
||||
[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]
|
||||
>;
|
||||
|
||||
// Deprecated in favor of llvm.amdgcn.s.barrier
|
||||
def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>;
|
||||
def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>;
|
||||
|
||||
// Deprecated in favor of llvm.amdgcn.read.workdim
|
||||
def int_AMDGPU_read_workdim : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
|
||||
}
|
||||
|
@ -394,7 +394,7 @@ def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> {
|
||||
def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>;
|
||||
|
||||
def GROUP_BARRIER : InstR600 <
|
||||
(outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>,
|
||||
(outs), (ins), " GROUP_BARRIER", [(int_r600_group_barrier)], AnyALU>,
|
||||
R600ALU_Word0,
|
||||
R600ALU_Word1_OP2 <0x54> {
|
||||
|
||||
@ -423,11 +423,6 @@ def GROUP_BARRIER : InstR600 <
|
||||
let ALUInst = 1;
|
||||
}
|
||||
|
||||
def : Pat <
|
||||
(int_AMDGPU_barrier_global),
|
||||
(GROUP_BARRIER)
|
||||
>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// LDS Instructions
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -2453,17 +2453,6 @@ def : Pat <
|
||||
(S_WAITCNT (as_i16imm $simm16))
|
||||
>;
|
||||
|
||||
// FIXME: These should be removed eventually
|
||||
def : Pat <
|
||||
(int_AMDGPU_barrier_global),
|
||||
(S_BARRIER)
|
||||
>;
|
||||
|
||||
def : Pat <
|
||||
(int_AMDGPU_barrier_local),
|
||||
(S_BARRIER)
|
||||
>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// VOP1 Patterns
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s
|
||||
;
|
||||
; This test checks that the lds input queue is empty at the end of
|
||||
; the ALU clause.
|
||||
@ -14,7 +14,7 @@ define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32
|
||||
entry:
|
||||
%0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
|
||||
%1 = load i32, i32 addrspace(3)* %0
|
||||
call void @llvm.AMDGPU.barrier.local()
|
||||
call void @llvm.r600.group.barrier()
|
||||
|
||||
; This will start a new clause for the vertex fetch
|
||||
%2 = load i32, i32 addrspace(1)* %in
|
||||
@ -23,7 +23,7 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.AMDGPU.barrier.local()
|
||||
declare void @llvm.r600.group.barrier() nounwind convergent
|
||||
|
||||
; The machine scheduler does not do proper alias analysis and assumes that
|
||||
; loads from global values (Note that a global value is different than a
|
||||
|
@ -1,30 +0,0 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
|
||||
; FUNC-LABEL: {{^}}test_barrier_global:
|
||||
; EG: GROUP_BARRIER
|
||||
; SI: buffer_store_dword
|
||||
; SI: s_waitcnt
|
||||
; SI: s_barrier
|
||||
|
||||
define void @test_barrier_global(i32 addrspace(1)* %out) {
|
||||
entry:
|
||||
%0 = call i32 @llvm.r600.read.tidig.x()
|
||||
%1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0
|
||||
store i32 %0, i32 addrspace(1)* %1
|
||||
call void @llvm.AMDGPU.barrier.global()
|
||||
%2 = call i32 @llvm.r600.read.local.size.x()
|
||||
%3 = sub i32 %2, 1
|
||||
%4 = sub i32 %3, %0
|
||||
%5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4
|
||||
%6 = load i32, i32 addrspace(1)* %5
|
||||
store i32 %6, i32 addrspace(1)* %1
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.AMDGPU.barrier.global()
|
||||
|
||||
declare i32 @llvm.r600.read.tidig.x() #0
|
||||
declare i32 @llvm.r600.read.local.size.x() #0
|
||||
|
||||
attributes #0 = { readnone }
|
@ -1,31 +0,0 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
|
||||
; FUNC-LABEL: {{^}}test_barrier_local:
|
||||
; EG: GROUP_BARRIER
|
||||
|
||||
; SI: buffer_store_dword
|
||||
; SI: s_waitcnt
|
||||
; SI: s_barrier
|
||||
|
||||
define void @test_barrier_local(i32 addrspace(1)* %out) {
|
||||
entry:
|
||||
%0 = call i32 @llvm.r600.read.tidig.x()
|
||||
%1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0
|
||||
store i32 %0, i32 addrspace(1)* %1
|
||||
call void @llvm.AMDGPU.barrier.local()
|
||||
%2 = call i32 @llvm.r600.read.local.size.x()
|
||||
%3 = sub i32 %2, 1
|
||||
%4 = sub i32 %3, %0
|
||||
%5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4
|
||||
%6 = load i32, i32 addrspace(1)* %5
|
||||
store i32 %6, i32 addrspace(1)* %1
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.AMDGPU.barrier.local()
|
||||
|
||||
declare i32 @llvm.r600.read.tidig.x() #0
|
||||
declare i32 @llvm.r600.read.local.size.x() #0
|
||||
|
||||
attributes #0 = { readnone }
|
31
test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
Normal file
31
test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
Normal file
@ -0,0 +1,31 @@
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
|
||||
|
||||
; EG-LABEL: {{^}}test_group_barrier:
|
||||
; EG: GROUP_BARRIER
|
||||
define void @test_group_barrier(i32 addrspace(1)* %out) #0 {
|
||||
entry:
|
||||
%tmp = call i32 @llvm.r600.read.tidig.x()
|
||||
%tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp
|
||||
store i32 %tmp, i32 addrspace(1)* %tmp1
|
||||
call void @llvm.r600.group.barrier()
|
||||
%tmp2 = call i32 @llvm.r600.read.local.size.x()
|
||||
%tmp3 = sub i32 %tmp2, 1
|
||||
%tmp4 = sub i32 %tmp3, %tmp
|
||||
%tmp5 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp4
|
||||
%tmp6 = load i32, i32 addrspace(1)* %tmp5
|
||||
store i32 %tmp6, i32 addrspace(1)* %tmp1
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.r600.group.barrier() #1
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.r600.read.tidig.x() #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.r600.read.local.size.x() #2
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { convergent nounwind }
|
||||
attributes #2 = { nounwind readnone }
|
92
test/CodeGen/AMDGPU/local-memory.amdgcn.ll
Normal file
92
test/CodeGen/AMDGPU/local-memory.amdgcn.ll
Normal file
@ -0,0 +1,92 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
|
||||
|
||||
@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
|
||||
|
||||
; Check that the LDS size is emitted correctly
|
||||
; SI: .long 47180
|
||||
; SI-NEXT: .long 65668
|
||||
; CI: .long 47180
|
||||
; CI-NEXT: .long 32900
|
||||
|
||||
; GCN-LABEL: {{^}}local_memory:
|
||||
|
||||
; GCN-NOT: s_wqm_b64
|
||||
; GCN: ds_write_b32
|
||||
|
||||
; GCN: s_barrier
|
||||
|
||||
; GCN: ds_read_b32 {{v[0-9]+}},
|
||||
define void @local_memory(i32 addrspace(1)* %out) #0 {
|
||||
entry:
|
||||
%y.i = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
|
||||
store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
|
||||
%add = add nsw i32 %y.i, 1
|
||||
%cmp = icmp eq i32 %add, 16
|
||||
%.add = select i1 %cmp, i32 0, i32 %add
|
||||
call void @llvm.amdgcn.s.barrier()
|
||||
%arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
|
||||
%tmp = load i32, i32 addrspace(3)* %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
|
||||
store i32 %tmp, i32 addrspace(1)* %arrayidx2, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
|
||||
@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
|
||||
|
||||
; Check that the LDS size is emitted correctly
|
||||
; EG: .long 166120
|
||||
; EG-NEXT: .long 8
|
||||
; GCN: .long 47180
|
||||
; GCN-NEXT: .long 32900
|
||||
|
||||
; GCN-LABEL: {{^}}local_memory_two_objects:
|
||||
; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
|
||||
; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16
|
||||
; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}}
|
||||
|
||||
; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
|
||||
|
||||
; SI-DAG: ds_write_b32 [[ADDRW]],
|
||||
; SI-DAG: ds_write_b32 [[ADDRW_OFF]],
|
||||
|
||||
; GCN: s_barrier
|
||||
|
||||
; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
|
||||
; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
|
||||
|
||||
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
|
||||
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
|
||||
|
||||
; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
|
||||
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
|
||||
define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
|
||||
entry:
|
||||
%x.i = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
|
||||
store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
|
||||
%mul = shl nsw i32 %x.i, 1
|
||||
%arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
|
||||
store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
|
||||
%sub = sub nsw i32 3, %x.i
|
||||
call void @llvm.amdgcn.s.barrier()
|
||||
%arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
|
||||
%tmp = load i32, i32 addrspace(3)* %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
|
||||
store i32 %tmp, i32 addrspace(1)* %arrayidx3, align 4
|
||||
%arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
|
||||
%tmp1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
|
||||
%add = add nsw i32 %x.i, 4
|
||||
%arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
|
||||
store i32 %tmp1, i32 addrspace(1)* %arrayidx5, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
declare void @llvm.amdgcn.s.barrier() #2
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
||||
attributes #2 = { convergent nounwind }
|
@ -1,57 +1,20 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
|
||||
@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
|
||||
|
||||
|
||||
; Check that the LDS size emitted correctly
|
||||
; EG: .long 166120
|
||||
; EG-NEXT: .long 128
|
||||
; SI: .long 47180
|
||||
; SI-NEXT: .long 65668
|
||||
; CI: .long 47180
|
||||
; CI-NEXT: .long 32900
|
||||
|
||||
; FUNC-LABEL: {{^}}local_memory:
|
||||
|
||||
; EG: LDS_WRITE
|
||||
; SI-NOT: s_wqm_b64
|
||||
; SI: ds_write_b32
|
||||
|
||||
; GROUP_BARRIER must be the last instruction in a clause
|
||||
; EG: GROUP_BARRIER
|
||||
; EG-NEXT: ALU clause
|
||||
; SI: s_barrier
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; SI: ds_read_b32 {{v[0-9]+}},
|
||||
|
||||
define void @local_memory(i32 addrspace(1)* %out) {
|
||||
entry:
|
||||
%y.i = call i32 @llvm.r600.read.tidig.x() #0
|
||||
%arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
|
||||
store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
|
||||
%add = add nsw i32 %y.i, 1
|
||||
%cmp = icmp eq i32 %add, 16
|
||||
%.add = select i1 %cmp, i32 0, i32 %add
|
||||
call void @llvm.AMDGPU.barrier.local()
|
||||
%arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
|
||||
%0 = load i32, i32 addrspace(3)* %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
|
||||
store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
@lds = addrspace(3) global [512 x i32] undef, align 4
|
||||
|
||||
; On SI we need to make sure that the base offset is a register and not
|
||||
; an immediate.
|
||||
; On SI we need to make sure that the base offset is a register and
|
||||
; not an immediate.
|
||||
|
||||
; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
|
||||
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
|
||||
; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4
|
||||
|
||||
; R600: LDS_READ_RET
|
||||
define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
|
||||
define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
|
||||
entry:
|
||||
%tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1
|
||||
%tmp1 = load i32, i32 addrspace(3)* %tmp0
|
||||
@ -67,7 +30,7 @@ entry:
|
||||
; R600: LDS_READ_RET
|
||||
; GCN-DAG: ds_read_b32
|
||||
; GCN-DAG: ds_read2_b32
|
||||
define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) {
|
||||
define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
|
||||
%scalar = load i32, i32 addrspace(3)* %in
|
||||
%tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)*
|
||||
%vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2
|
||||
@ -78,7 +41,4 @@ define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.r600.read.tidig.x() #0
|
||||
declare void @llvm.AMDGPU.barrier.local()
|
||||
|
||||
attributes #0 = { readnone }
|
||||
attributes #0 = { nounwind }
|
||||
|
@ -1,18 +1,45 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
|
||||
@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
|
||||
|
||||
; Check that the LDS size is emitted correctly
|
||||
; EG: .long 166120
|
||||
; EG-NEXT: .long 128
|
||||
|
||||
; FUNC-LABEL: {{^}}local_memory:
|
||||
|
||||
; EG: LDS_WRITE
|
||||
|
||||
; GROUP_BARRIER must be the last instruction in a clause
|
||||
; EG: GROUP_BARRIER
|
||||
; EG-NEXT: ALU clause
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
define void @local_memory(i32 addrspace(1)* %out) #0 {
|
||||
entry:
|
||||
%y.i = call i32 @llvm.r600.read.tidig.x() #1
|
||||
%arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
|
||||
store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
|
||||
%add = add nsw i32 %y.i, 1
|
||||
%cmp = icmp eq i32 %add, 16
|
||||
%.add = select i1 %cmp, i32 0, i32 %add
|
||||
call void @llvm.r600.group.barrier()
|
||||
%arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
|
||||
%tmp = load i32, i32 addrspace(3)* %arrayidx1, align 4
|
||||
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
|
||||
store i32 %tmp, i32 addrspace(1)* %arrayidx2, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
|
||||
@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
|
||||
|
||||
|
||||
; Check that the LDS size is emitted correctly
|
||||
; EG: .long 166120
|
||||
; EG-NEXT: .long 8
|
||||
; GCN: .long 47180
|
||||
; GCN-NEXT: .long 32900
|
||||
|
||||
|
||||
; FUNC-LABEL: {{^}}local_memory_two_objects:
|
||||
|
||||
; We would like to check the lds writes are using different
|
||||
@ -30,51 +57,31 @@
|
||||
; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
|
||||
; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
|
||||
|
||||
|
||||
; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
|
||||
; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16
|
||||
; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}}
|
||||
|
||||
|
||||
; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
|
||||
|
||||
; SI-DAG: ds_write_b32 [[ADDRW]],
|
||||
; SI-DAG: ds_write_b32 [[ADDRW_OFF]],
|
||||
|
||||
; GCN: s_barrier
|
||||
|
||||
; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
|
||||
; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
|
||||
|
||||
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
|
||||
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
|
||||
|
||||
; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
|
||||
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
|
||||
|
||||
define void @local_memory_two_objects(i32 addrspace(1)* %out) {
|
||||
define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
|
||||
entry:
|
||||
%x.i = call i32 @llvm.r600.read.tidig.x() #0
|
||||
%x.i = call i32 @llvm.r600.read.tidig.x() #1
|
||||
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
|
||||
store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
|
||||
%mul = shl nsw i32 %x.i, 1
|
||||
%arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
|
||||
store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
|
||||
%sub = sub nsw i32 3, %x.i
|
||||
call void @llvm.AMDGPU.barrier.local()
|
||||
call void @llvm.r600.group.barrier()
|
||||
%arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
|
||||
%0 = load i32, i32 addrspace(3)* %arrayidx2, align 4
|
||||
%tmp = load i32, i32 addrspace(3)* %arrayidx2, align 4
|
||||
%arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
|
||||
store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
|
||||
store i32 %tmp, i32 addrspace(1)* %arrayidx3, align 4
|
||||
%arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
|
||||
%1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
|
||||
%tmp1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
|
||||
%add = add nsw i32 %x.i, 4
|
||||
%arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
|
||||
store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
|
||||
store i32 %tmp1, i32 addrspace(1)* %arrayidx5, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.r600.read.tidig.x() #0
|
||||
declare void @llvm.AMDGPU.barrier.local()
|
||||
declare i32 @llvm.r600.read.tidig.x() #1
|
||||
declare void @llvm.r600.group.barrier() #2
|
||||
|
||||
attributes #0 = { readnone }
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
||||
attributes #2 = { convergent nounwind }
|
@ -1,12 +1,9 @@
|
||||
; XFAIL: *
|
||||
; REQUIRES: asserts
|
||||
; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
|
||||
; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
|
||||
; RUN: llc -O0 -march=amdgcn -verify-machineinstrs -mattr=+vgpr-spilling < %s | FileCheck -check-prefix=GCN %s
|
||||
; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+vgpr-spilling < %s | FileCheck -check-prefix=GCN %s
|
||||
|
||||
declare void @llvm.AMDGPU.barrier.local() nounwind convergent
|
||||
declare void @llvm.amdgcn.s.barrier() nounwind convergent
|
||||
|
||||
|
||||
; SI-LABEL: {{^}}main(
|
||||
; GCN-LABEL: {{^}}main:
|
||||
define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
|
||||
main_body:
|
||||
%0 = extractelement <4 x float> %reg1, i32 0
|
||||
@ -39,63 +36,63 @@ ENDIF: ; preds = %main_body, %Flow2
|
||||
%temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %101, %Flow2 ]
|
||||
%15 = extractelement <4 x float> %reg1, i32 1
|
||||
%16 = extractelement <4 x float> %reg1, i32 3
|
||||
%17 = load <4 x float>, <4 x float> addrspace(9)* null
|
||||
%17 = load <4 x float>, <4 x float> addrspace(2)* null
|
||||
%18 = extractelement <4 x float> %17, i32 0
|
||||
%19 = fmul float %18, %0
|
||||
%20 = load <4 x float>, <4 x float> addrspace(9)* null
|
||||
%20 = load <4 x float>, <4 x float> addrspace(2)* null
|
||||
%21 = extractelement <4 x float> %20, i32 1
|
||||
%22 = fmul float %21, %0
|
||||
%23 = load <4 x float>, <4 x float> addrspace(9)* null
|
||||
%23 = load <4 x float>, <4 x float> addrspace(2)* null
|
||||
%24 = extractelement <4 x float> %23, i32 2
|
||||
%25 = fmul float %24, %0
|
||||
%26 = load <4 x float>, <4 x float> addrspace(9)* null
|
||||
%26 = load <4 x float>, <4 x float> addrspace(2)* null
|
||||
%27 = extractelement <4 x float> %26, i32 3
|
||||
%28 = fmul float %27, %0
|
||||
%29 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
|
||||
%29 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
|
||||
%30 = extractelement <4 x float> %29, i32 0
|
||||
%31 = fmul float %30, %15
|
||||
%32 = fadd float %31, %19
|
||||
%33 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
|
||||
%33 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
|
||||
%34 = extractelement <4 x float> %33, i32 1
|
||||
%35 = fmul float %34, %15
|
||||
%36 = fadd float %35, %22
|
||||
%37 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
|
||||
%37 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
|
||||
%38 = extractelement <4 x float> %37, i32 2
|
||||
%39 = fmul float %38, %15
|
||||
%40 = fadd float %39, %25
|
||||
%41 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
|
||||
%41 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
|
||||
%42 = extractelement <4 x float> %41, i32 3
|
||||
%43 = fmul float %42, %15
|
||||
%44 = fadd float %43, %28
|
||||
%45 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
|
||||
%45 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
|
||||
%46 = extractelement <4 x float> %45, i32 0
|
||||
%47 = fmul float %46, %1
|
||||
%48 = fadd float %47, %32
|
||||
%49 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
|
||||
%49 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
|
||||
%50 = extractelement <4 x float> %49, i32 1
|
||||
%51 = fmul float %50, %1
|
||||
%52 = fadd float %51, %36
|
||||
%53 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
|
||||
%53 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
|
||||
%54 = extractelement <4 x float> %53, i32 2
|
||||
%55 = fmul float %54, %1
|
||||
%56 = fadd float %55, %40
|
||||
%57 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
|
||||
%57 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
|
||||
%58 = extractelement <4 x float> %57, i32 3
|
||||
%59 = fmul float %58, %1
|
||||
%60 = fadd float %59, %44
|
||||
%61 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
|
||||
%61 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
|
||||
%62 = extractelement <4 x float> %61, i32 0
|
||||
%63 = fmul float %62, %16
|
||||
%64 = fadd float %63, %48
|
||||
%65 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
|
||||
%65 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
|
||||
%66 = extractelement <4 x float> %65, i32 1
|
||||
%67 = fmul float %66, %16
|
||||
%68 = fadd float %67, %52
|
||||
%69 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
|
||||
%69 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
|
||||
%70 = extractelement <4 x float> %69, i32 2
|
||||
%71 = fmul float %70, %16
|
||||
%72 = fadd float %71, %56
|
||||
%73 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
|
||||
%73 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
|
||||
%74 = extractelement <4 x float> %73, i32 3
|
||||
%75 = fmul float %74, %16
|
||||
%76 = fadd float %75, %60
|
||||
@ -103,12 +100,12 @@ ENDIF: ; preds = %main_body, %Flow2
|
||||
%78 = insertelement <4 x float> %77, float %68, i32 1
|
||||
%79 = insertelement <4 x float> %78, float %72, i32 2
|
||||
%80 = insertelement <4 x float> %79, float %76, i32 3
|
||||
call void @llvm.AMDGPU.barrier.local()
|
||||
call void @llvm.amdgcn.s.barrier()
|
||||
%81 = insertelement <4 x float> undef, float %temp.0, i32 0
|
||||
%82 = insertelement <4 x float> %81, float %temp1.0, i32 1
|
||||
%83 = insertelement <4 x float> %82, float %temp2.0, i32 2
|
||||
%84 = insertelement <4 x float> %83, float %temp3.0, i32 3
|
||||
call void @llvm.AMDGPU.barrier.local()
|
||||
call void @llvm.amdgcn.s.barrier()
|
||||
ret void
|
||||
|
||||
LOOP: ; preds = %main_body, %Flow
|
||||
|
Loading…
x
Reference in New Issue
Block a user