mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-29 22:30:33 +00:00
AMDGPU: More bits of frame index are known to be zero
The maximum private allocation for the whole GPU is 4G, so the maximum possible index for a single workitem is the maximum size divided by the smallest granularity for a dispatch. This increases the number of known zero high bits, which enables more offset folding. The maximum private size per workitem with this is 128M but may be smaller still. llvm-svn: 262153
This commit is contained in:
parent
eedb835102
commit
cd69621a21
@ -198,14 +198,6 @@ def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>;
|
||||
def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
|
||||
def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
|
||||
|
||||
|
||||
def FeatureEnableHugeScratchBuffer : SubtargetFeature<
|
||||
"huge-scratch-buffer",
|
||||
"EnableHugeScratchBuffer",
|
||||
"true",
|
||||
"Enable scratch buffer sizes greater than 128 GB"
|
||||
>;
|
||||
|
||||
def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
|
||||
"EnableVGPRSpilling",
|
||||
"true",
|
||||
|
@ -84,7 +84,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
|
||||
GCN1Encoding(false), GCN3Encoding(false), CIInsts(false),
|
||||
HasSMemRealTime(false), Has16BitInsts(false),
|
||||
LDSBankCount(0),
|
||||
IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),
|
||||
IsaVersion(ISAVersion0_0_0),
|
||||
EnableSIScheduler(false), FrameLowering(nullptr),
|
||||
InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) {
|
||||
|
||||
|
@ -93,7 +93,6 @@ private:
|
||||
bool FeatureDisable;
|
||||
int LDSBankCount;
|
||||
unsigned IsaVersion;
|
||||
bool EnableHugeScratchBuffer;
|
||||
bool EnableSIScheduler;
|
||||
|
||||
std::unique_ptr<AMDGPUFrameLowering> FrameLowering;
|
||||
@ -293,10 +292,6 @@ public:
|
||||
return false;
|
||||
}
|
||||
|
||||
bool enableHugeScratchBuffer() const {
|
||||
return EnableHugeScratchBuffer;
|
||||
}
|
||||
|
||||
bool enableSIScheduler() const {
|
||||
return EnableSIScheduler;
|
||||
}
|
||||
|
@ -1178,25 +1178,35 @@ SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
|
||||
FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
|
||||
unsigned FrameIndex = FINode->getIndex();
|
||||
|
||||
// A FrameIndex node represents a 32-bit offset into scratch memory. If
|
||||
// the high bit of a frame index offset were to be set, this would mean
|
||||
// that it represented an offset of ~2GB * 64 = ~128GB from the start of the
|
||||
// scratch buffer, with 64 being the number of threads per wave.
|
||||
// A FrameIndex node represents a 32-bit offset into scratch memory. If the
|
||||
// high bit of a frame index offset were to be set, this would mean that it
|
||||
// represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
|
||||
// buffer, with 64 being the number of threads per wave.
|
||||
//
|
||||
// If we know the machine uses less than 128GB of scratch, then we can
|
||||
// mark the high bit of the FrameIndex node as known zero,
|
||||
// which is important, because it means in most situations we can
|
||||
// prove that values derived from FrameIndex nodes are non-negative.
|
||||
// This enables us to take advantage of more addressing modes when
|
||||
// accessing scratch buffers, since for scratch reads/writes, the register
|
||||
// offset must always be positive.
|
||||
// The maximum private allocation for the entire GPU is 4G, and we are
|
||||
// concerned with the largest the index could ever be for an individual
|
||||
// workitem. This will occur with the minimum dispatch size. If a program
|
||||
// requires more, the dispatch size will be reduced.
|
||||
//
|
||||
// With this limit, we can mark the high bit of the FrameIndex node as known
|
||||
// zero, which is important, because it means in most situations we can prove
|
||||
// that values derived from FrameIndex nodes are non-negative. This enables us
|
||||
// to take advantage of more addressing modes when accessing scratch buffers,
|
||||
// since for scratch reads/writes, the register offset must always be
|
||||
// positive.
|
||||
|
||||
uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
|
||||
|
||||
// XXX - It is unclear if partial dispatch works. Assume it works at half wave
|
||||
// granularity. It is probably a full wave.
|
||||
uint64_t MinGranularity = 32;
|
||||
|
||||
unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
|
||||
EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
|
||||
|
||||
SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
|
||||
if (Subtarget->enableHugeScratchBuffer())
|
||||
return TFI;
|
||||
|
||||
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
|
||||
DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), 31)));
|
||||
DAG.getValueType(ExtVT));
|
||||
}
|
||||
|
||||
bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
|
||||
|
@ -33,9 +33,9 @@
|
||||
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
|
||||
|
||||
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
|
||||
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
|
||||
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
|
||||
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
|
||||
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
|
||||
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
|
||||
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
|
||||
define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
|
||||
entry:
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
@ -99,10 +99,14 @@ entry:
|
||||
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:56{{$}}
|
||||
; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:60{{$}}
|
||||
|
||||
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
|
||||
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
|
||||
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
|
||||
; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
|
||||
; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
|
||||
; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
|
||||
; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
|
||||
; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
|
||||
; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}}
|
||||
; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
|
||||
; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
|
||||
; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
|
||||
define void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
|
||||
entry:
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
|
@ -1,7 +1,5 @@
|
||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI < %s | FileCheck --check-prefix=GCN --check-prefix=DEFAULT-SCRATCH %s
|
||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=GCN --check-prefix=DEFAULT-SCRATCH %s
|
||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+huge-scratch-buffer -mcpu=SI < %s | FileCheck --check-prefix=GCN --check-prefix=HUGE-SCRATCH %s
|
||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+huge-scratch-buffer -mcpu=tonga < %s | FileCheck --check-prefix=GCN --check-prefix=HUGE-SCRATCH %s
|
||||
; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
|
||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
|
||||
|
||||
; When a frame index offset is more than 12-bits, make sure we don't store
|
||||
; it in mubuf's offset field.
|
||||
@ -102,8 +100,7 @@ entry:
|
||||
}
|
||||
|
||||
; GCN-LABEL: @pos_vaddr_offse
|
||||
; DEFAULT-SCRATCH: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16
|
||||
; HUGE-SCRATCH: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}}
|
||||
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16
|
||||
define void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) {
|
||||
entry:
|
||||
%array = alloca [8192 x i32]
|
||||
|
Loading…
Reference in New Issue
Block a user