mirror of
https://github.com/RPCSX/llvm.git
synced 2024-12-12 22:26:14 +00:00
AMDGPU: Switch barrier intrinsics to using convergent
noduplicate prevents unrolling of small loops that happen to have barriers in them. If a loop has a barrier in it, it is OK to duplicate the barrier when the loop is unrolled.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@256075 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
4b9d868cc7
commit
7aed0ccd46
@ -69,8 +69,8 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
|
||||
def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_barrier_local : Intrinsic<[], [], []>;
|
||||
def int_AMDGPU_barrier_global : Intrinsic<[], [], []>;
|
||||
def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>;
|
||||
def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>;
|
||||
}
|
||||
|
||||
// Legacy names for compatibility.
|
||||
|
@ -62,5 +62,5 @@ declare void @llvm.AMDGPU.barrier.local() #1
|
||||
declare i32 @llvm.r600.read.tidig.x() #3
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind noduplicate }
|
||||
attributes #1 = { nounwind convergent }
|
||||
attributes #3 = { nounwind readnone }
|
||||
|
@ -2,7 +2,7 @@
|
||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
|
||||
|
||||
declare i32 @llvm.SI.tid() nounwind readnone
|
||||
declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
|
||||
declare void @llvm.AMDGPU.barrier.local() nounwind convergent
|
||||
|
||||
; The required pointer calculations for the alloca'd actually requires
|
||||
; an add and won't be folded into the addressing, which fails with a
|
||||
@ -35,7 +35,7 @@ define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 add
|
||||
%alloca_ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b
|
||||
store i32 %result, i32* %alloca_ptr, align 4
|
||||
; Dummy call
|
||||
call void @llvm.AMDGPU.barrier.local() nounwind noduplicate
|
||||
call void @llvm.AMDGPU.barrier.local() nounwind convergent
|
||||
%reload = load i32, i32* %alloca_ptr, align 4
|
||||
%out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
|
||||
store i32 %reload, i32 addrspace(1)* %out_ptr, align 4
|
||||
|
@ -49,4 +49,4 @@ declare i32 @llvm.r600.read.tgid.x() #1
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
||||
attributes #2 = { noduplicate nounwind }
|
||||
attributes #2 = { convergent nounwind }
|
||||
|
@ -66,5 +66,5 @@ for.end: ; preds = %for.body
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
attributes #1 = { noduplicate nounwind }
|
||||
attributes #1 = { convergent nounwind }
|
||||
attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
|
@ -122,4 +122,4 @@ define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
attributes #1 = { nounwind }
|
||||
attributes #2 = { nounwind noduplicate convergent }
|
||||
attributes #2 = { nounwind convergent }
|
||||
|
@ -505,9 +505,9 @@ declare i32 @llvm.r600.read.tidig.x() #1
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.r600.read.tidig.y() #1
|
||||
|
||||
; Function Attrs: noduplicate nounwind
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.AMDGPU.barrier.local() #2
|
||||
|
||||
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { nounwind readnone }
|
||||
attributes #2 = { noduplicate nounwind }
|
||||
attributes #2 = { convergent nounwind }
|
||||
|
@ -229,9 +229,9 @@ declare i32 @llvm.r600.read.tidig.x() #1
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.r600.read.tidig.y() #1
|
||||
|
||||
; Function Attrs: noduplicate nounwind
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.AMDGPU.barrier.local() #2
|
||||
|
||||
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { nounwind readnone }
|
||||
attributes #2 = { noduplicate nounwind }
|
||||
attributes #2 = { convergent nounwind }
|
||||
|
@ -264,9 +264,5 @@ declare i32 @llvm.r600.read.tidig.x() #1
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.r600.read.tidig.y() #1
|
||||
|
||||
; Function Attrs: noduplicate nounwind
|
||||
declare void @llvm.AMDGPU.barrier.local() #2
|
||||
|
||||
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { nounwind readnone }
|
||||
attributes #2 = { noduplicate nounwind }
|
||||
|
@ -431,9 +431,9 @@ declare i32 @llvm.r600.read.tidig.x() #1
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.r600.read.tidig.y() #1
|
||||
|
||||
; Function Attrs: noduplicate nounwind
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.AMDGPU.barrier.local() #2
|
||||
|
||||
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { nounwind readnone }
|
||||
attributes #2 = { noduplicate nounwind }
|
||||
attributes #2 = { convergent nounwind }
|
||||
|
@ -109,9 +109,9 @@ declare i32 @llvm.r600.read.tidig.x() #1
|
||||
; Function Attrs: nounwind readnone
|
||||
declare i32 @llvm.r600.read.tidig.y() #1
|
||||
|
||||
; Function Attrs: noduplicate nounwind
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.AMDGPU.barrier.local() #2
|
||||
|
||||
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
|
||||
attributes #1 = { nounwind readnone }
|
||||
attributes #2 = { noduplicate nounwind }
|
||||
attributes #2 = { convergent nounwind }
|
||||
|
@ -131,5 +131,5 @@ declare void @llvm.AMDGPU.barrier.local() #1
|
||||
declare i32 @llvm.r600.read.tidig.x() #3
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind noduplicate }
|
||||
attributes #1 = { nounwind convergent }
|
||||
attributes #3 = { nounwind readnone }
|
||||
|
@ -4,7 +4,7 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
|
||||
|
||||
|
||||
declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
|
||||
declare void @llvm.AMDGPU.barrier.local() convergent nounwind
|
||||
|
||||
; SI-LABEL: {{^}}private_access_f64_alloca:
|
||||
|
||||
@ -18,7 +18,7 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double
|
||||
%array = alloca double, i32 16, align 8
|
||||
%ptr = getelementptr double, double* %array, i32 %b
|
||||
store double %val, double* %ptr, align 8
|
||||
call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
|
||||
call void @llvm.AMDGPU.barrier.local() convergent nounwind
|
||||
%result = load double, double* %ptr, align 8
|
||||
store double %result, double addrspace(1)* %out, align 8
|
||||
ret void
|
||||
@ -38,7 +38,7 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out
|
||||
%array = alloca <2 x double>, i32 16, align 16
|
||||
%ptr = getelementptr <2 x double>, <2 x double>* %array, i32 %b
|
||||
store <2 x double> %val, <2 x double>* %ptr, align 16
|
||||
call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
|
||||
call void @llvm.AMDGPU.barrier.local() convergent nounwind
|
||||
%result = load <2 x double>, <2 x double>* %ptr, align 16
|
||||
store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16
|
||||
ret void
|
||||
@ -56,7 +56,7 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs
|
||||
%array = alloca i64, i32 16, align 8
|
||||
%ptr = getelementptr i64, i64* %array, i32 %b
|
||||
store i64 %val, i64* %ptr, align 8
|
||||
call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
|
||||
call void @llvm.AMDGPU.barrier.local() convergent nounwind
|
||||
%result = load i64, i64* %ptr, align 8
|
||||
store i64 %result, i64 addrspace(1)* %out, align 8
|
||||
ret void
|
||||
@ -76,7 +76,7 @@ define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <
|
||||
%array = alloca <2 x i64>, i32 16, align 16
|
||||
%ptr = getelementptr <2 x i64>, <2 x i64>* %array, i32 %b
|
||||
store <2 x i64> %val, <2 x i64>* %ptr, align 16
|
||||
call void @llvm.AMDGPU.barrier.local() noduplicate nounwind
|
||||
call void @llvm.AMDGPU.barrier.local() convergent nounwind
|
||||
%result = load <2 x i64>, <2 x i64>* %ptr, align 16
|
||||
store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16
|
||||
ret void
|
||||
|
@ -4,7 +4,6 @@
|
||||
; FIXME: Enable for VI.
|
||||
|
||||
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
|
||||
declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate
|
||||
declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone
|
||||
declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone
|
||||
|
||||
|
@ -708,4 +708,4 @@ define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x dou
|
||||
declare void @llvm.AMDGPU.barrier.local() #1
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { noduplicate nounwind }
|
||||
attributes #1 = { convergent nounwind }
|
||||
|
@ -3,7 +3,7 @@
|
||||
; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
|
||||
; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
|
||||
|
||||
declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
|
||||
declare void @llvm.AMDGPU.barrier.local() nounwind convergent
|
||||
|
||||
|
||||
; SI-LABEL: {{^}}main(
|
||||
|
@ -234,4 +234,4 @@ define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrsp
|
||||
|
||||
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
|
||||
attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
|
||||
attributes #2 = { nounwind noduplicate }
|
||||
attributes #2 = { nounwind convergent }
|
||||
|
@ -36,7 +36,7 @@ bb:
|
||||
ret void
|
||||
}
|
||||
|
||||
; Function Attrs: noduplicate nounwind
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.AMDGPU.barrier.local() #2
|
||||
|
||||
attributes #2 = { noduplicate nounwind }
|
||||
attributes #2 = { convergent nounwind }
|
||||
|
@ -70,7 +70,7 @@ main_body:
|
||||
}
|
||||
|
||||
|
||||
; Function Attrs: noduplicate nounwind
|
||||
; Function Attrs: convergent nounwind
|
||||
declare void @llvm.AMDGPU.barrier.global() #1
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
@ -79,7 +79,7 @@ declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2
|
||||
declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
|
||||
|
||||
attributes #0 = { "ShaderType"="1" }
|
||||
attributes #1 = { noduplicate nounwind }
|
||||
attributes #1 = { convergent nounwind }
|
||||
attributes #2 = { nounwind readnone }
|
||||
|
||||
!0 = !{!1, !1, i64 0, i32 1}
|
||||
|
3
test/Transforms/LoopUnroll/AMDGPU/lit.local.cfg
Normal file
3
test/Transforms/LoopUnroll/AMDGPU/lit.local.cfg
Normal file
@ -0,0 +1,3 @@
|
||||
# Mark every test in this directory as unsupported unless 'AMDGPU' appears in
# config.root.targets.
if 'AMDGPU' not in config.root.targets:
    config.unsupported = True
|
||||
|
33
test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll
Normal file
33
test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll
Normal file
@ -0,0 +1,33 @@
|
||||
; RUN: opt -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -loop-unroll -S < %s | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: @test_unroll_convergent_barrier(
|
||||
; CHECK: call void @llvm.AMDGPU.barrier.global()
|
||||
; CHECK: call void @llvm.AMDGPU.barrier.global()
|
||||
; CHECK: call void @llvm.AMDGPU.barrier.global()
|
||||
; CHECK: call void @llvm.AMDGPU.barrier.global()
|
||||
; CHECK-NOT: br
|
||||
; Test kernel: a counted loop whose induction variable starts at 0 and exits
; once the incremented value equals 4. Each iteration loads from %in, calls the
; barrier intrinsic, accumulates the loaded value into a running sum, and
; stores that sum to %out.
define void @test_unroll_convergent_barrier(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(1)* noalias nocapture %in) #0 {
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %for.body, %entry
|
||||
%indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
|
||||
%sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ]
|
||||
%arrayidx.in = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %indvars.iv
|
||||
%arrayidx.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %indvars.iv
|
||||
%load = load i32, i32 addrspace(1)* %arrayidx.in
|
||||
; Barrier inside the loop body; the expected output contains one copy of this
; call per iteration.
call void @llvm.AMDGPU.barrier.global() #1
|
||||
%add = add i32 %load, %sum.02
|
||||
store i32 %add, i32 addrspace(1)* %arrayidx.out
|
||||
%indvars.iv.next = add i32 %indvars.iv, 1
|
||||
%exitcond = icmp eq i32 %indvars.iv.next, 4
|
||||
br i1 %exitcond, label %for.end, label %for.body
|
||||
|
||||
for.end: ; preds = %for.body
|
||||
ret void
|
||||
}
|
||||
|
||||
; Declaration of the barrier intrinsic called from the loop body above.
declare void @llvm.AMDGPU.barrier.global() #1
|
||||
|
||||
; #0 is attached to the test function; #1 is attached to the barrier
; declaration and its call site.
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind convergent }
|
Loading…
Reference in New Issue
Block a user