diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 0975f97998a..af12ce7360b 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -198,14 +198,6 @@ def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>; def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>; def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>; - -def FeatureEnableHugeScratchBuffer : SubtargetFeature< - "huge-scratch-buffer", - "EnableHugeScratchBuffer", - "true", - "Enable scratch buffer sizes greater than 128 GB" ->; - def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling", "EnableVGPRSpilling", "true", diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index fed46fe3715..96319151765 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -84,7 +84,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), HasSMemRealTime(false), Has16BitInsts(false), LDSBankCount(0), - IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false), + IsaVersion(ISAVersion0_0_0), EnableSIScheduler(false), FrameLowering(nullptr), InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index b9874f8a186..39228945d06 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -93,7 +93,6 @@ private: bool FeatureDisable; int LDSBankCount; unsigned IsaVersion; - bool EnableHugeScratchBuffer; bool EnableSIScheduler; std::unique_ptr FrameLowering; @@ -293,10 +292,6 @@ public: return false; } - bool enableHugeScratchBuffer() const { - return EnableHugeScratchBuffer; - } - bool enableSIScheduler() const { return EnableSIScheduler; } diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 46b73f7ed09..d6a69da7ba5 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1178,25 +1178,35 @@ SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const { FrameIndexSDNode *FINode = cast(Op); unsigned FrameIndex = FINode->getIndex(); - // A FrameIndex node represents a 32-bit offset into scratch memory. If - // the high bit of a frame index offset were to be set, this would mean - // that it represented an offset of ~2GB * 64 = ~128GB from the start of the - // scratch buffer, with 64 being the number of threads per wave. + // A FrameIndex node represents a 32-bit offset into scratch memory. If the + // high bit of a frame index offset were to be set, this would mean that it + // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch + // buffer, with 64 being the number of threads per wave. // - // If we know the machine uses less than 128GB of scratch, then we can - // amrk the high bit of the FrameIndex node as known zero, - // which is important, because it means in most situations we can - // prove that values derived from FrameIndex nodes are non-negative. - // This enables us to take advantage of more addressing modes when - // accessing scratch buffers, since for scratch reads/writes, the register - // offset must always be positive. + // The maximum private allocation for the entire GPU is 4G, and we are + // concerned with the largest the index could ever be for an individual + // workitem. This will occur with the minmum dispatch size. If a program + // requires more, the dispatch size will be reduced. + // + // With this limit, we can mark the high bit of the FrameIndex node as known + // zero, which is important, because it means in most situations we can prove + // that values derived from FrameIndex nodes are non-negative. This enables us + // to take advantage of more addressing modes when accessing scratch buffers, + // since for scratch reads/writes, the register offset must always be + // positive. + + uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024; + + // XXX - It is unclear if partial dispatch works. Assume it works at half wave + // granularity. It is probably a full wave. + uint64_t MinGranularity = 32; + + unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity); + EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits); SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32); - if (Subtarget->enableHugeScratchBuffer()) - return TFI; - return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI, - DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31))); + DAG.getValueType(ExtVT)); } bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const { diff --git a/test/CodeGen/AMDGPU/private-element-size.ll b/test/CodeGen/AMDGPU/private-element-size.ll index 63666971c87..7251f0cdbf5 100644 --- a/test/CodeGen/AMDGPU/private-element-size.ll +++ b/test/CodeGen/AMDGPU/private-element-size.ll @@ -33,9 +33,9 @@ ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}} define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -99,10 +99,14 @@ entry: ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:56{{$}} ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:60{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}} define void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll index d43de476605..c992eb817e5 100644 --- a/test/CodeGen/AMDGPU/scratch-buffer.ll +++ b/test/CodeGen/AMDGPU/scratch-buffer.ll @@ -1,7 +1,5 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck --check-prefix=GCN --check-prefix=DEFAULT-SCRATCH %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GCN --check-prefix=DEFAULT-SCRATCH %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+huge-scratch-buffer -mcpu=SI < %s | FileCheck --check-prefix=GCN --check-prefix=HUGE-SCRATCH %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+huge-scratch-buffer -mcpu=tonga < %s | FileCheck --check-prefix=GCN --check-prefix=HUGE-SCRATCH %s +; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s ; When a frame index offset is more than 12-bits, make sure we don't store ; it in mubuf's offset field. @@ -102,8 +100,7 @@ entry: } ; GCN-LABEL: @pos_vaddr_offse -; DEFAULT-SCRATCH: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16 -; HUGE-SCRATCH: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16 define void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) { entry: %array = alloca [8192 x i32]