NVPTX: Replace uses of cuda.syncthreads with nvvm.barrier0
Everywhere where cuda.syncthreads or __syncthreads is used, use the properly namespaced nvvm.barrier0 instead.

llvm-svn: 274664
This commit is contained in:
parent c2d61e0b4b
commit 18054ad909
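At the IR level the change is a pure rename: both intrinsics take no arguments, carry IntrConvergent, and lower to the same PTX barrier. A minimal before/after sketch of a caller (illustrative, not taken from the patch itself):

    ; before this commit
    call void @llvm.cuda.syncthreads()
    ; after this commit
    call void @llvm.nvvm.barrier0()

Both forms emit "bar.sync 0;" in the generated PTX, as the NVPTXIntrinsics.td hunk below shows.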
@@ -1318,7 +1318,7 @@ example:
 The ``convergent`` attribute may appear on functions or call/invoke
 instructions. When it appears on a function, it indicates that calls to
 this function should not be made control-dependent on additional values.
-For example, the intrinsic ``llvm.cuda.syncthreads`` is ``convergent``, so
+For example, the intrinsic ``llvm.nvvm.barrier0`` is ``convergent``, so
 calls to this intrinsic cannot be made control-dependent on additional
 values.
 
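As a concrete illustration of the rule quoted above, here is a hedged LLVM IR sketch (the function and label names are invented for illustration): every thread reaches the barrier in the input, and sinking the convergent call into one arm of the branch would make it control-dependent on %c, which the attribute forbids.

    define void @kernel(i1 %c) {
    entry:
      ; Every thread executes this barrier, regardless of %c.
      call void @llvm.nvvm.barrier0()
      br i1 %c, label %then, label %exit
    then:
      br label %exit
    exit:
      ret void
    }

    declare void @llvm.nvvm.barrier0()

Moving the call into %then would leave threads with %c false never arriving at the barrier, so that transform is disallowed.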
@@ -566,7 +566,7 @@ Intrinsic CUDA Equivalent
 ``i32 @llvm.nvvm.read.ptx.sreg.ctaid.{x,y,z}``   blockIdx.{x,y,z}
 ``i32 @llvm.nvvm.read.ptx.sreg.ntid.{x,y,z}``    blockDim.{x,y,z}
 ``i32 @llvm.nvvm.read.ptx.sreg.nctaid.{x,y,z}``  gridDim.{x,y,z}
-``void @llvm.cuda.syncthreads()``                __syncthreads()
+``void @llvm.nvvm.barrier0()``                   __syncthreads()
 ================================================ ====================
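The table maps CUDA builtins onto NVVM intrinsics one-for-one. As a hedged usage sketch, the usual global-thread-id idiom looks like this in IR (the tid.x reader belongs to the same special-register family, though it sits outside the quoted excerpt):

    define i32 @global_thread_id() {
      %ctaid = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() ; blockIdx.x
      %ntid  = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()  ; blockDim.x
      %tid   = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()   ; threadIdx.x
      %base  = mul i32 %ctaid, %ntid
      %gid   = add i32 %base, %tid
      ret i32 %gid
    }

    declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
    declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
    declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()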
@@ -729,8 +729,6 @@ def llvm_anyi64ptr_ty : LLVMAnyPointerType<llvm_i64_ty>; // (space)i64*
                 [IntrArgMemOnly, NoCapture<0>]>;
 
 // Bar.Sync
-def int_cuda_syncthreads : GCCBuiltin<"__syncthreads">,
-    Intrinsic<[], [], [IntrConvergent]>;
 def int_nvvm_barrier0 : GCCBuiltin<"__nvvm_bar0">,
     Intrinsic<[], [], [IntrConvergent]>;
 def int_nvvm_barrier0_popc : GCCBuiltin<"__nvvm_bar0_popc">,
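TableGen derives the IR-level intrinsic name from the record name by replacing underscores with dots, so the int_nvvm_barrier0 record above is reachable from IR as:

    declare void @llvm.nvvm.barrier0()  ; convergent; selected to PTX "bar.sync 0;"

The GCCBuiltin<"__nvvm_bar0"> binding is what lets the front end map that builtin name directly onto this intrinsic.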
@@ -112,7 +112,7 @@ bool NVPTXInstrInfo::isStoreInstr(const MachineInstr &MI,
 
 bool NVPTXInstrInfo::CanTailMerge(const MachineInstr *MI) const {
   unsigned addrspace = 0;
-  if (MI->getOpcode() == NVPTX::INT_CUDA_SYNCTHREADS)
+  if (MI->getOpcode() == NVPTX::INT_BARRIER0)
     return false;
   if (isLoadInstr(*MI, addrspace))
     if (addrspace == NVPTX::PTXLdStInstCode::SHARED)
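CanTailMerge returning false for INT_BARRIER0 stops the branch folder from tail-merging blocks that end in a barrier. A hedged sketch of the CFG shape being protected (labels invented, mirroring the duplication test further down):

    L1:
      call void @llvm.nvvm.barrier0()
      br label %Ret
    L2:
      call void @llvm.nvvm.barrier0()
      br label %Ret

Folding the two identical tails into one shared block would alter which control paths the barrier sits on, so the backend conservatively refuses to tail-merge barriers (and, per the following lines, shared-memory loads).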
@@ -33,9 +33,6 @@ def immDouble1 : PatLeaf<(fpimm), [{
 // Synchronization and shuffle functions
 //-----------------------------------
 let isConvergent = 1 in {
-def INT_CUDA_SYNCTHREADS : NVPTXInst<(outs), (ins),
-                  "bar.sync \t0;",
-      [(int_cuda_syncthreads)]>;
 def INT_BARRIER0 : NVPTXInst<(outs), (ins),
                   "bar.sync \t0;",
       [(int_nvvm_barrier0)]>;
@@ -1,7 +1,7 @@
 ; RUN: llc < %s | FileCheck %s
 target triple = "nvptx64-nvidia-cuda"
 
-declare void @llvm.cuda.syncthreads()
+declare void @llvm.nvvm.barrier0()
 
 ; Load a value, then syncthreads. Branch, and use the loaded value only on one
 ; side of the branch. The load shouldn't be sunk beneath the call, because
@@ -11,7 +11,7 @@ Start:
 ; CHECK: ld.u32
   %ptr_val = load i32, i32* %ptr
 ; CHECK: bar.sync
-  call void @llvm.cuda.syncthreads()
+  call void @llvm.nvvm.barrier0()
   br i1 %cond, label %L1, label %L2
 L1:
   %ptr_val2 = add i32 %ptr_val, 100
@@ -2,7 +2,7 @@
 target triple = "nvptx64-nvidia-cuda"
 
 declare void @foo()
-declare void @llvm.cuda.syncthreads()
+declare void @llvm.nvvm.barrier0()
 
 ; syncthreads shouldn't be duplicated.
 ; CHECK: .func call_syncthreads
@@ -20,7 +20,7 @@ L2:
   store i32 1, i32* %a
   br label %L42
 L42:
-  call void @llvm.cuda.syncthreads()
+  call void @llvm.nvvm.barrier0()
   br label %Ret
 }
 
@@ -34,7 +34,7 @@ define void @ld_st_shared_f32(i32 %i, float %v) {
   store float %v, float* addrspacecast (float addrspace(3)* @scalar to float*), align 4
 ; PTX: st.shared.f32 [scalar], %f{{[0-9]+}};
   ; use syncthreads to disable optimizations across components
-  call void @llvm.cuda.syncthreads()
+  call void @llvm.nvvm.barrier0()
 ; PTX: bar.sync 0;
 
   ; cast; load
@@ -45,7 +45,7 @@ define void @ld_st_shared_f32(i32 %i, float %v) {
   ; cast; store
   store float %v, float* %2, align 4
 ; PTX: st.shared.f32 [scalar], %f{{[0-9]+}};
-  call void @llvm.cuda.syncthreads()
+  call void @llvm.nvvm.barrier0()
 ; PTX: bar.sync 0;
 
   ; load gep cast
@@ -55,7 +55,7 @@ define void @ld_st_shared_f32(i32 %i, float %v) {
   ; store gep cast
   store float %v, float* getelementptr inbounds ([10 x float], [10 x float]* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float]*), i32 0, i32 5), align 4
 ; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
-  call void @llvm.cuda.syncthreads()
+  call void @llvm.nvvm.barrier0()
 ; PTX: bar.sync 0;
 
   ; gep cast; load
@@ -66,7 +66,7 @@ define void @ld_st_shared_f32(i32 %i, float %v) {
   ; gep cast; store
   store float %v, float* %5, align 4
 ; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
-  call void @llvm.cuda.syncthreads()
+  call void @llvm.nvvm.barrier0()
 ; PTX: bar.sync 0;
 
   ; cast; gep; load
@@ -78,7 +78,7 @@ define void @ld_st_shared_f32(i32 %i, float %v) {
   ; cast; gep; store
   store float %v, float* %8, align 4
 ; PTX: st.shared.f32 [%{{(r|rl|rd)[0-9]+}}], %f{{[0-9]+}};
-  call void @llvm.cuda.syncthreads()
+  call void @llvm.nvvm.barrier0()
 ; PTX: bar.sync 0;
 
   ret void
@@ -181,7 +181,7 @@ exit:
   ret void
 }
 
-declare void @llvm.cuda.syncthreads() #3
+declare void @llvm.nvvm.barrier0() #3
 
 declare void @use(float)
 
@@ -3,8 +3,8 @@
 ; Make sure the call to syncthreads is not duplicate here by the LLVM
 ; optimizations, because it has the noduplicate attribute set.
 
-; CHECK: call void @llvm.cuda.syncthreads
-; CHECK-NOT: call void @llvm.cuda.syncthreads
+; CHECK: call void @llvm.nvvm.barrier0
+; CHECK-NOT: call void @llvm.nvvm.barrier0
 
 ; Function Attrs: nounwind
 define void @foo(float* %output) #1 {
@@ -37,7 +37,7 @@ if.else: ; preds = %entry
   br label %if.end
 
 if.end: ; preds = %if.else, %if.then
-  call void @llvm.cuda.syncthreads()
+  call void @llvm.nvvm.barrier0()
   %6 = load float*, float** %output.addr, align 8
   %arrayidx6 = getelementptr inbounds float, float* %6, i64 0
   %7 = load float, float* %arrayidx6, align 4
@@ -68,7 +68,7 @@ if.end17: ; preds = %if.else13, %if.then
 }
 
 ; Function Attrs: noduplicate nounwind
-declare void @llvm.cuda.syncthreads() #2
+declare void @llvm.nvvm.barrier0() #2
 
 !0 = !{void (float*)* @foo, !"kernel", i32 1}
 !1 = !{null, !"align", i32 8}
@@ -1,9 +1,9 @@
 ; RUN: llvm-as < %s | llvm-dis | FileCheck %s
 
 ; Make sure LLVM knows about the convergent attribute on the
-; llvm.cuda.syncthreads intrinsic.
+; llvm.nvvm.barrier0 intrinsic.
 
-declare void @llvm.cuda.syncthreads()
+declare void @llvm.nvvm.barrier0()
 
-; CHECK: declare void @llvm.cuda.syncthreads() #[[ATTRNUM:[0-9]+]]
+; CHECK: declare void @llvm.nvvm.barrier0() #[[ATTRNUM:[0-9]+]]
 ; CHECK: attributes #[[ATTRNUM]] = { convergent nounwind }
@@ -59,15 +59,15 @@ define i32 @indirect_non_convergent_call(i32 ()* %f) convergent norecurse {
 
 ; CHECK: Function Attrs
 ; CHECK-SAME: convergent
-; CHECK-NEXT: declare void @llvm.cuda.syncthreads()
-declare void @llvm.cuda.syncthreads() convergent
+; CHECK-NEXT: declare void @llvm.nvvm.barrier0()
+declare void @llvm.nvvm.barrier0() convergent
 
 ; CHECK: Function Attrs
 ; CHECK-SAME: convergent
 ; CHECK-NEXT: define i32 @intrinsic()
 define i32 @intrinsic() convergent {
   ; Implicitly convergent, because the intrinsic is convergent.
-  call void @llvm.cuda.syncthreads()
+  call void @llvm.nvvm.barrier0()
   ret i32 0
 }
 