From 68916ffa694b944c78a419036a8aad4d89af4409 Mon Sep 17 00:00:00 2001 From: Michel Danzer Date: Wed, 10 Jul 2013 16:37:07 +0000 Subject: [PATCH] R600/SI: Initial local memory support Enough for the radeonsi driver to use it for calculating derivatives. Reviewed-by: Tom Stellard llvm-svn: 186012 --- lib/Target/R600/AMDGPUAsmPrinter.cpp | 7 ++ lib/Target/R600/AMDGPUISelLowering.cpp | 4 +- lib/Target/R600/R600ISelLowering.cpp | 2 + lib/Target/R600/SIDefines.h | 4 ++ lib/Target/R600/SIISelLowering.cpp | 5 ++ lib/Target/R600/SIInstructions.td | 15 +++++ test/CodeGen/R600/local-memory-two-objects.ll | 51 ++++++++++++++ test/CodeGen/R600/local-memory.ll | 67 +++++-------------- 8 files changed, 100 insertions(+), 55 deletions(-) create mode 100644 test/CodeGen/R600/local-memory-two-objects.ll diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index 996d2a61221..e039b773de7 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -233,7 +233,14 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) { OutStreamer.EmitIntValue(RsrcReg, 4); OutStreamer.EmitIntValue(S_00B028_VGPRS(MaxVGPR / 4) | S_00B028_SGPRS(MaxSGPR / 8), 4); + + if (MFI->ShaderType == ShaderType::COMPUTE) { + OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); + OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(RoundUpToAlignment(MFI->LDSSize, 256) >> 8), 4); + } if (MFI->ShaderType == ShaderType::PIXEL) { + OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); + OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(RoundUpToAlignment(MFI->LDSSize, 256) >> 8), 4); OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); OutStreamer.EmitIntValue(MFI->PSInputAddr, 4); } diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 4019a1f02ef..7fad3bbc6c8 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -72,8 +72,6 @@ 
AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : setOperationAction(ISD::UDIVREM, MVT::i32, Custom); setOperationAction(ISD::UREM, MVT::i32, Expand); - setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); - int types[] = { (int)MVT::v2i32, (int)MVT::v4i32 @@ -158,7 +156,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI, // XXX: Account for alignment? MFI->LDSSize += Size; - return DAG.getConstant(Offset, MVT::i32); + return DAG.getConstant(Offset, TD->getPointerSize() == 8 ? MVT::i64 : MVT::i32); } SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index ad4fd87b79a..7aef08a904d 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -92,6 +92,8 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::SELECT_CC); + setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); + setBooleanContents(ZeroOrNegativeOneBooleanContent); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); setSchedulingPreference(Sched::VLIW); diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h index 716b093fc69..147578ce8d2 100644 --- a/lib/Target/R600/SIDefines.h +++ b/lib/Target/R600/SIDefines.h @@ -12,11 +12,15 @@ #define SIDEFINES_H_ #define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028 +#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS 0x00B02C +#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8) #define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128 #define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228 #define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 #define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) #define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) +#define R_00B84C_COMPUTE_PGM_RSRC2 0x00B84C +#define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15) #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC #endif // SIDEFINES_H_ diff --git 
a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp index 9d4cfefee16..a314bc40c48 100644 --- a/lib/Target/R600/SIISelLowering.cpp +++ b/lib/Target/R600/SIISelLowering.cpp @@ -75,6 +75,8 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); + setTargetDAGCombine(ISD::SELECT_CC); setTargetDAGCombine(ISD::SETCC); @@ -310,11 +312,14 @@ MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const { //===----------------------------------------------------------------------===// SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + MachineFunction &MF = DAG.getMachineFunction(); + SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG); + case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG); case ISD::INTRINSIC_WO_CHAIN: { unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 30f2a4aa438..5a1bf305f29 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -1599,6 +1599,21 @@ def : Pat < (V_MAD_F32 $src0, $src1, $src2) >; +/********** ======================= **********/ +/********** Load/Store Patterns **********/ +/********** ======================= **********/ + +def : Pat < + (local_load i64:$src0), + (i32 (DS_READ_B32 0, (EXTRACT_SUBREG $src0, sub0), + (EXTRACT_SUBREG $src0, sub0), (EXTRACT_SUBREG $src0, sub0), 0, 0)) +>; + +def : Pat < + (local_store i32:$src1, i64:$src0), + (DS_WRITE_B32 0, (EXTRACT_SUBREG $src0, sub0), $src1, $src1, 0, 0) +>; + /********** ================== **********/ /********** SMRD Patterns 
**********/ /********** ================== **********/ diff --git a/test/CodeGen/R600/local-memory-two-objects.ll b/test/CodeGen/R600/local-memory-two-objects.ll new file mode 100644 index 00000000000..6d3610e101e --- /dev/null +++ b/test/CodeGen/R600/local-memory-two-objects.ll @@ -0,0 +1,51 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; TODO: Add RUN and CHECK lines for SI once this test works there + +@local_memory_two_objects.local_mem0 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4 +@local_memory_two_objects.local_mem1 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4 + +; CHECK: @local_memory_two_objects + +; Check that the LDS size emitted correctly +; CHECK: .long 166120 +; CHECK-NEXT: .long 8 + +; Make sure the lds writes are using different addresses. +; CHECK: LDS_WRITE {{[*]*}} {{PV|T}}[[ADDRW:[0-9]*\.[XYZW]]] +; CHECK-NOT: LDS_WRITE {{[*]*}} T[[ADDRW]] + +; GROUP_BARRIER must be the last instruction in a clause +; CHECK: GROUP_BARRIER +; CHECK-NEXT: ALU clause + +; Make sure the lds reads are using different addresses. 
+; CHECK: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]] +; CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]] + +define void @local_memory_two_objects(i32 addrspace(1)* %out) { +entry: + %x.i = call i32 @llvm.r600.read.tidig.x() #0 + %arrayidx = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i + store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4 + %mul = shl nsw i32 %x.i, 1 + %arrayidx1 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i + store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4 + %sub = sub nsw i32 3, %x.i + call void @llvm.AMDGPU.barrier.local() + %arrayidx2 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub + %0 = load i32 addrspace(3)* %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds i32 addrspace(1)* %out, i32 %x.i + store i32 %0, i32 addrspace(1)* %arrayidx3, align 4 + %arrayidx4 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub + %1 = load i32 addrspace(3)* %arrayidx4, align 4 + %add = add nsw i32 %x.i, 4 + %arrayidx5 = getelementptr inbounds i32 addrspace(1)* %out, i32 %add + store i32 %1, i32 addrspace(1)* %arrayidx5, align 4 + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #0 +declare void @llvm.AMDGPU.barrier.local() + +attributes #0 = { readnone } diff --git a/test/CodeGen/R600/local-memory.ll b/test/CodeGen/R600/local-memory.ll index 0ff38483087..5458fb90573 100644 --- a/test/CodeGen/R600/local-memory.ll +++ b/test/CodeGen/R600/local-memory.ll @@ -1,21 +1,27 @@ -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s +; RUN: llc < %s -march=r600 -mcpu=verde | FileCheck --check-prefix=SI-CHECK %s @local_memory.local_mem = internal addrspace(3) unnamed_addr global [16 x i32] zeroinitializer, align 4 -; CHECK: @local_memory 
+; EG-CHECK: @local_memory +; SI-CHECK: @local_memory ; Check that the LDS size emitted correctly -; CHECK: .long 166120 -; CHECK-NEXT: .long 16 +; EG-CHECK: .long 166120 +; EG-CHECK-NEXT: .long 16 +; SI-CHECK: .long 47180 +; SI-CHECK-NEXT: .long 32768 -; CHECK: LDS_WRITE +; EG-CHECK: LDS_WRITE +; SI-CHECK: DS_WRITE_B32 ; GROUP_BARRIER must be the last instruction in a clause -; CHECK: GROUP_BARRIER -; CHECK-NEXT: ALU clause +; EG-CHECK: GROUP_BARRIER +; EG-CHECK-NEXT: ALU clause +; SI-CHECK: S_BARRIER -; CHECK: LDS_READ_RET +; EG-CHECK: LDS_READ_RET +; SI-CHECK: DS_READ_B32 define void @local_memory(i32 addrspace(1)* %out) { entry: @@ -33,49 +39,6 @@ entry: ret void } -@local_memory_two_objects.local_mem0 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4 -@local_memory_two_objects.local_mem1 = internal addrspace(3) unnamed_addr global [4 x i32] zeroinitializer, align 4 - -; CHECK: @local_memory_two_objects - -; Check that the LDS size emitted correctly -; CHECK: .long 166120 -; CHECK-NEXT: .long 8 - -; Make sure the lds writes are using different addresses. -; CHECK: LDS_WRITE {{[*]*}} {{PV|T}}[[ADDRW:[0-9]*\.[XYZW]]] -; CHECK-NOT: LDS_WRITE {{[*]*}} T[[ADDRW]] - -; GROUP_BARRIER must be the last instruction in a clause -; CHECK: GROUP_BARRIER -; CHECK-NEXT: ALU clause - -; Make sure the lds reads are using different addresses. 
-; CHECK: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]] -; CHECK-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]] - -define void @local_memory_two_objects(i32 addrspace(1)* %out) { -entry: - %x.i = call i32 @llvm.r600.read.tidig.x() #0 - %arrayidx = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i - store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4 - %mul = shl nsw i32 %x.i, 1 - %arrayidx1 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i - store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4 - %sub = sub nsw i32 3, %x.i - call void @llvm.AMDGPU.barrier.local() - %arrayidx2 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub - %0 = load i32 addrspace(3)* %arrayidx2, align 4 - %arrayidx3 = getelementptr inbounds i32 addrspace(1)* %out, i32 %x.i - store i32 %0, i32 addrspace(1)* %arrayidx3, align 4 - %arrayidx4 = getelementptr inbounds [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub - %1 = load i32 addrspace(3)* %arrayidx4, align 4 - %add = add nsw i32 %x.i, 4 - %arrayidx5 = getelementptr inbounds i32 addrspace(1)* %out, i32 %add - store i32 %1, i32 addrspace(1)* %arrayidx5, align 4 - ret void -} - declare i32 @llvm.r600.read.tidig.x() #0 declare void @llvm.AMDGPU.barrier.local()