From 496b39a5b9f212036912fa4222dd55bc74521add Mon Sep 17 00:00:00 2001 From: Nicolai Haehnle Date: Mon, 5 Aug 2019 09:36:06 +0000 Subject: [PATCH] AMDGPU: add missing llvm.amdgcn.{raw,struct}.buffer.atomic.{inc,dec} Summary: Wrapping increment/decrement. These aren't exposed by many APIs... Change-Id: I1df25c7889de5a5ba76468ad8e8a2597efa9af6c Reviewers: arsenm, tpr, dstuttard Subscribers: kzhuravl, jvesely, wdng, yaxunl, t-tye, jfb, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D65283 llvm-svn: 367821 --- include/llvm/IR/IntrinsicsAMDGPU.td | 7 ++++--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 2 ++ lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 ++ lib/Target/AMDGPU/AMDGPUSearchableTables.td | 4 ++++ lib/Target/AMDGPU/BUFInstructions.td | 4 ++++ lib/Target/AMDGPU/SIISelLowering.cpp | 20 +++++++++++++++++-- lib/Target/AMDGPU/SIInstrInfo.td | 2 ++ .../AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll | 10 +++++++++- .../llvm.amdgcn.struct.buffer.atomic.ll | 10 +++++++++- 9 files changed, 54 insertions(+), 7 deletions(-) diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td index 071e1b6bec1..337567acd72 100644 --- a/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/include/llvm/IR/IntrinsicsAMDGPU.td @@ -837,9 +837,6 @@ defset list AMDGPUImageDimAtomicIntrinsics = { defm int_amdgcn_image_atomic_and : AMDGPUImageDimAtomic<"ATOMIC_AND">; defm int_amdgcn_image_atomic_or : AMDGPUImageDimAtomic<"ATOMIC_OR">; defm int_amdgcn_image_atomic_xor : AMDGPUImageDimAtomic<"ATOMIC_XOR">; - - // TODO: INC/DEC are weird: they seem to have a vdata argument in hardware, - // even though it clearly shouldn't be needed defm int_amdgcn_image_atomic_inc : AMDGPUImageDimAtomic<"ATOMIC_INC">; defm int_amdgcn_image_atomic_dec : AMDGPUImageDimAtomic<"ATOMIC_DEC">; @@ -963,6 +960,8 @@ def int_amdgcn_raw_buffer_atomic_umax : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_and : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic; +def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic; +def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic; def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) @@ -994,6 +993,8 @@ def int_amdgcn_struct_buffer_atomic_umax : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_and : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic; +def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic; +def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic; def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic< [llvm_anyint_ty], [LLVMMatchType<0>, // src(VGPR) diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index a5098f42fca..90e0369f3bc 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4354,6 +4354,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(BUFFER_ATOMIC_AND) NODE_NAME_CASE(BUFFER_ATOMIC_OR) NODE_NAME_CASE(BUFFER_ATOMIC_XOR) + NODE_NAME_CASE(BUFFER_ATOMIC_INC) + NODE_NAME_CASE(BUFFER_ATOMIC_DEC) NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP) NODE_NAME_CASE(BUFFER_ATOMIC_FADD) NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD) diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index fe7ad694943..acafd6fbe5a 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -532,6 +532,8 @@ enum NodeType : unsigned { BUFFER_ATOMIC_AND, BUFFER_ATOMIC_OR, BUFFER_ATOMIC_XOR, + BUFFER_ATOMIC_INC, + BUFFER_ATOMIC_DEC, BUFFER_ATOMIC_CMPSWAP, BUFFER_ATOMIC_FADD, BUFFER_ATOMIC_PK_FADD, diff --git a/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/lib/Target/AMDGPU/AMDGPUSearchableTables.td index f8703c36127..26b8b784027 100644 --- a/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -81,6 +81,8 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; @@ -92,6 +94,8 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index bd2a2d834cf..38f427aee85 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -1316,6 +1316,8 @@ defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; @@ -1326,6 +1328,8 @@ defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; +defm : BufferAtomicPatterns; multiclass BufferAtomicPatterns_NO_RTN { diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 639d8193749..0e18fa2100c 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6414,7 +6414,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_buffer_atomic_umax: case Intrinsic::amdgcn_raw_buffer_atomic_and: case Intrinsic::amdgcn_raw_buffer_atomic_or: - case Intrinsic::amdgcn_raw_buffer_atomic_xor: { + case Intrinsic::amdgcn_raw_buffer_atomic_xor: + case Intrinsic::amdgcn_raw_buffer_atomic_inc: + case Intrinsic::amdgcn_raw_buffer_atomic_dec: { auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -6463,6 +6465,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_buffer_atomic_xor: Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; + case Intrinsic::amdgcn_raw_buffer_atomic_inc: + Opcode = AMDGPUISD::BUFFER_ATOMIC_INC; + break; + case Intrinsic::amdgcn_raw_buffer_atomic_dec: + Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC; + break; default: llvm_unreachable("unhandled atomic opcode"); } @@ -6479,7 +6487,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_struct_buffer_atomic_umax: case Intrinsic::amdgcn_struct_buffer_atomic_and: case Intrinsic::amdgcn_struct_buffer_atomic_or: - case Intrinsic::amdgcn_struct_buffer_atomic_xor: { + case Intrinsic::amdgcn_struct_buffer_atomic_xor: + case Intrinsic::amdgcn_struct_buffer_atomic_inc: + case Intrinsic::amdgcn_struct_buffer_atomic_dec: { auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -6528,6 +6538,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_struct_buffer_atomic_xor: Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; + case Intrinsic::amdgcn_struct_buffer_atomic_inc: + Opcode = AMDGPUISD::BUFFER_ATOMIC_INC; + break; + case Intrinsic::amdgcn_struct_buffer_atomic_dec: + Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC; + break; default: llvm_unreachable("unhandled atomic opcode"); } diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 71054561551..60d9e8f60fa 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -198,6 +198,8 @@ def SIbuffer_atomic_umax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMAX">; def SIbuffer_atomic_and : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_AND">; def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">; def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">; +def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">; +def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">; def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD", f32>; def SIbuffer_atomic_pk_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_PK_FADD", v2f16>; diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll index 08edaf98927..3c53950294d 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.atomic.ll @@ -44,6 +44,10 @@ main_body: ;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 offen glc slc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 offen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_inc v0, v1, s[0:3], 0 offen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_dec v0, v1, s[0:3], 0 offen glc define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %voffset) { main_body: %t1 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) @@ -55,7 +59,9 @@ main_body: %t7 = call i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32 %t6, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) %t8 = call i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32 %t7, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 2) %t9 = call i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32 %t8, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) - %out = bitcast i32 %t9 to float + %t10 = call i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32 %t9, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + %t11 = call i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32 %t10, <4 x i32> %rsrc, i32 %voffset, i32 0, i32 0) + %out = bitcast i32 %t11 to float ret float %out } @@ -110,6 +116,8 @@ declare i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i3 declare i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i32) #0 declare i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i32) #0 declare i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32, <4 x i32>, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32, <4 x i32>, i32, i32, i32) #0 declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32) #0 attributes #0 = { nounwind } diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll index 6c9ca9b17d2..e7668f0f76d 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.atomic.ll @@ -50,6 +50,10 @@ main_body: ;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 idxen glc slc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_inc v0, v1, s[0:3], 0 idxen glc +;CHECK: s_waitcnt vmcnt(0) +;CHECK: buffer_atomic_dec v0, v1, s[0:3], 0 idxen glc define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) { main_body: %t1 = call i32 @llvm.amdgcn.struct.buffer.atomic.add.i32(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) @@ -61,7 +65,9 @@ main_body: %t7 = call i32 @llvm.amdgcn.struct.buffer.atomic.and.i32(i32 %t6, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) %t8 = call i32 @llvm.amdgcn.struct.buffer.atomic.or.i32(i32 %t7, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 2) %t9 = call i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32 %t8, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) - %out = bitcast i32 %t9 to float + %t10 = call i32 @llvm.amdgcn.struct.buffer.atomic.inc.i32(i32 %t9, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %t11 = call i32 @llvm.amdgcn.struct.buffer.atomic.dec.i32(i32 %t10, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0) + %out = bitcast i32 %t11 to float ret float %out } @@ -122,6 +128,8 @@ declare i32 @llvm.amdgcn.struct.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, declare i32 @llvm.amdgcn.struct.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 declare i32 @llvm.amdgcn.struct.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 declare i32 @llvm.amdgcn.struct.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.inc.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 +declare i32 @llvm.amdgcn.struct.buffer.atomic.dec.i32(i32, <4 x i32>, i32, i32, i32, i32) #0 declare i32 @llvm.amdgcn.struct.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32, i32) #0 attributes #0 = { nounwind }