mirror of
https://github.com/RPCSX/llvm.git
synced 2025-05-13 10:56:01 +00:00
[AMDGPU] Account workgroup size in LDS occupancy limits
Functions matching LDS use to occupancy return results for a workgroup of 64 workitems. The numbers have to be adjusted for bigger workgroups. For example, a workgroup of size 256 already occupies 4 waves just by itself. Given that all numbers of LDS use in the compiler are per workgroup, occupancy shall be multiplied by 4 in this case. Each group of 64 workitems is still limited by the same number, but 4 subgroups of 64 workitems each can afford 4 times more LDS to get the same occupancy. In addition, this change initializes the LDS size in the subtarget to a real value for SI+ targets. This is required since LDS size is a variable in these calculations. Differential Revision: https://reviews.llvm.org/D29423 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@293837 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
faff8226b0
commit
a1d4ee75a4
@ -204,7 +204,8 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);
|
unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
|
||||||
|
F);
|
||||||
|
|
||||||
// Restrict local memory usage so that we don't drastically reduce occupancy,
|
// Restrict local memory usage so that we don't drastically reduce occupancy,
|
||||||
// unless it is already significantly reduced.
|
// unless it is already significantly reduced.
|
||||||
@ -225,7 +226,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
|
|||||||
|
|
||||||
// Round up to the next tier of usage.
|
// Round up to the next tier of usage.
|
||||||
unsigned MaxSizeWithWaveCount
|
unsigned MaxSizeWithWaveCount
|
||||||
= ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);
|
= ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
|
||||||
|
|
||||||
// Program is possibly broken by using more local mem than available.
|
// Program is possibly broken by using more local mem than available.
|
||||||
if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
|
if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
|
||||||
|
@ -132,62 +132,26 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
|
|||||||
initializeSubtargetDependencies(TT, GPU, FS);
|
initializeSubtargetDependencies(TT, GPU, FS);
|
||||||
}
|
}
|
||||||
|
|
||||||
// FIXME: These limits are for SI. Did they change with the larger maximum LDS
|
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
|
||||||
// size?
|
const Function &F) const {
|
||||||
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
|
if (NWaves == 1)
|
||||||
switch (NWaves) {
|
|
||||||
case 10:
|
|
||||||
return 1638;
|
|
||||||
case 9:
|
|
||||||
return 1820;
|
|
||||||
case 8:
|
|
||||||
return 2048;
|
|
||||||
case 7:
|
|
||||||
return 2340;
|
|
||||||
case 6:
|
|
||||||
return 2730;
|
|
||||||
case 5:
|
|
||||||
return 3276;
|
|
||||||
case 4:
|
|
||||||
return 4096;
|
|
||||||
case 3:
|
|
||||||
return 5461;
|
|
||||||
case 2:
|
|
||||||
return 8192;
|
|
||||||
default:
|
|
||||||
return getLocalMemorySize();
|
return getLocalMemorySize();
|
||||||
}
|
unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
|
||||||
|
unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
|
||||||
|
unsigned MaxWaves = getMaxWavesPerEU();
|
||||||
|
return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
|
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
|
||||||
if (Bytes <= 1638)
|
const Function &F) const {
|
||||||
return 10;
|
unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
|
||||||
|
unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
|
||||||
if (Bytes <= 1820)
|
unsigned MaxWaves = getMaxWavesPerEU();
|
||||||
return 9;
|
unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
|
||||||
|
unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
|
||||||
if (Bytes <= 2048)
|
NumWaves = std::min(NumWaves, MaxWaves);
|
||||||
return 8;
|
NumWaves = std::max(NumWaves, 1u);
|
||||||
|
return NumWaves;
|
||||||
if (Bytes <= 2340)
|
|
||||||
return 7;
|
|
||||||
|
|
||||||
if (Bytes <= 2730)
|
|
||||||
return 6;
|
|
||||||
|
|
||||||
if (Bytes <= 3276)
|
|
||||||
return 5;
|
|
||||||
|
|
||||||
if (Bytes <= 4096)
|
|
||||||
return 4;
|
|
||||||
|
|
||||||
if (Bytes <= 5461)
|
|
||||||
return 3;
|
|
||||||
|
|
||||||
if (Bytes <= 8192)
|
|
||||||
return 2;
|
|
||||||
|
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
|
std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
|
||||||
|
@ -274,11 +274,12 @@ public:
|
|||||||
|
|
||||||
/// Return the amount of LDS that can be used that will not restrict the
|
/// Return the amount of LDS that can be used that will not restrict the
|
||||||
/// occupancy lower than WaveCount.
|
/// occupancy lower than WaveCount.
|
||||||
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const;
|
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
|
||||||
|
const Function &) const;
|
||||||
|
|
||||||
/// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
|
/// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
|
||||||
/// the given LDS memory size is the only constraint.
|
/// the given LDS memory size is the only constraint.
|
||||||
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
|
unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
|
||||||
|
|
||||||
bool hasFP16Denormals() const {
|
bool hasFP16Denormals() const {
|
||||||
return FP64FP16Denormals;
|
return FP64FP16Denormals;
|
||||||
|
@ -35,7 +35,8 @@ static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
|
|||||||
unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs),
|
unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs),
|
||||||
ST.getOccupancyWithNumVGPRs(VGPRs));
|
ST.getOccupancyWithNumVGPRs(VGPRs));
|
||||||
return std::min(MinRegOccupancy,
|
return std::min(MinRegOccupancy,
|
||||||
ST.getOccupancyWithLocalMemSize(MFI->getLDSSize()));
|
ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
|
||||||
|
*MF.getFunction()));
|
||||||
}
|
}
|
||||||
|
|
||||||
void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
|
void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
|
||||||
|
@ -22,8 +22,8 @@ declare void @llvm.amdgcn.s.barrier() #0
|
|||||||
; CI-PROMOTE: ds_read_b64
|
; CI-PROMOTE: ds_read_b64
|
||||||
define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 {
|
define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 {
|
||||||
%val = load double, double addrspace(1)* %in, align 8
|
%val = load double, double addrspace(1)* %in, align 8
|
||||||
%array = alloca [16 x double], align 8
|
%array = alloca [8 x double], align 8
|
||||||
%ptr = getelementptr inbounds [16 x double], [16 x double]* %array, i32 0, i32 %b
|
%ptr = getelementptr inbounds [8 x double], [8 x double]* %array, i32 0, i32 %b
|
||||||
store double %val, double* %ptr, align 8
|
store double %val, double* %ptr, align 8
|
||||||
call void @llvm.amdgcn.s.barrier()
|
call void @llvm.amdgcn.s.barrier()
|
||||||
%result = load double, double* %ptr, align 8
|
%result = load double, double* %ptr, align 8
|
||||||
@ -53,8 +53,8 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double
|
|||||||
; CI-PROMOTE: ds_read2_b64
|
; CI-PROMOTE: ds_read2_b64
|
||||||
define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 {
|
define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 {
|
||||||
%val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
|
%val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
|
||||||
%array = alloca [8 x <2 x double>], align 16
|
%array = alloca [4 x <2 x double>], align 16
|
||||||
%ptr = getelementptr inbounds [8 x <2 x double>], [8 x <2 x double>]* %array, i32 0, i32 %b
|
%ptr = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* %array, i32 0, i32 %b
|
||||||
store <2 x double> %val, <2 x double>* %ptr, align 16
|
store <2 x double> %val, <2 x double>* %ptr, align 16
|
||||||
call void @llvm.amdgcn.s.barrier()
|
call void @llvm.amdgcn.s.barrier()
|
||||||
%result = load <2 x double>, <2 x double>* %ptr, align 16
|
%result = load <2 x double>, <2 x double>* %ptr, align 16
|
||||||
@ -111,8 +111,8 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs
|
|||||||
; CI-PROMOTE: ds_read2_b64
|
; CI-PROMOTE: ds_read2_b64
|
||||||
define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 {
|
define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 {
|
||||||
%val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
|
%val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
|
||||||
%array = alloca [8 x <2 x i64>], align 16
|
%array = alloca [4 x <2 x i64>], align 16
|
||||||
%ptr = getelementptr inbounds [8 x <2 x i64>], [8 x <2 x i64>]* %array, i32 0, i32 %b
|
%ptr = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* %array, i32 0, i32 %b
|
||||||
store <2 x i64> %val, <2 x i64>* %ptr, align 16
|
store <2 x i64> %val, <2 x i64>* %ptr, align 16
|
||||||
call void @llvm.amdgcn.s.barrier()
|
call void @llvm.amdgcn.s.barrier()
|
||||||
%result = load <2 x i64>, <2 x i64>* %ptr, align 16
|
%result = load <2 x i64>, <2 x i64>* %ptr, align 16
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck %s
|
; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck --check-prefix=SI --check-prefix=ALL %s
|
||||||
|
; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck --check-prefix=CI --check-prefix=ALL %s
|
||||||
|
|
||||||
; CHECK: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
|
; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
|
||||||
|
; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
|
||||||
|
|
||||||
define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
|
define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
|
||||||
entry:
|
entry:
|
||||||
@ -22,7 +24,7 @@ entry:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
|
; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
|
||||||
|
|
||||||
define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
|
define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
|
||||||
entry:
|
entry:
|
||||||
@ -44,7 +46,7 @@ entry:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
|
; ALL: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
|
||||||
|
|
||||||
define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
|
define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
|
||||||
entry:
|
entry:
|
||||||
@ -66,8 +68,8 @@ entry:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK-LABEL: @occupancy_0(
|
; ALL-LABEL: @occupancy_0(
|
||||||
; CHECK: alloca [5 x i32]
|
; ALL: alloca [5 x i32]
|
||||||
define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
|
define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
|
||||||
entry:
|
entry:
|
||||||
%stack = alloca [5 x i32], align 4
|
%stack = alloca [5 x i32], align 4
|
||||||
@ -88,8 +90,8 @@ entry:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK-LABEL: @occupancy_max(
|
; ALL-LABEL: @occupancy_max(
|
||||||
; CHECK: alloca [5 x i32]
|
; ALL: alloca [5 x i32]
|
||||||
define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
|
define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
|
||||||
entry:
|
entry:
|
||||||
%stack = alloca [5 x i32], align 4
|
%stack = alloca [5 x i32], align 4
|
||||||
@ -110,8 +112,10 @@ entry:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK-LABEL: @occupancy_6(
|
; SI-LABEL: @occupancy_6(
|
||||||
; CHECK-NOT: alloca
|
; CI-LABEL: @occupancy_6(
|
||||||
|
; SI: alloca
|
||||||
|
; CI-NOT: alloca
|
||||||
define void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
|
define void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
|
||||||
entry:
|
entry:
|
||||||
%stack = alloca [42 x i8], align 4
|
%stack = alloca [42 x i8], align 4
|
||||||
@ -134,8 +138,8 @@ entry:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK-LABEL: @occupancy_6_over(
|
; ALL-LABEL: @occupancy_6_over(
|
||||||
; CHECK: alloca [43 x i8]
|
; ALL: alloca [43 x i8]
|
||||||
define void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
|
define void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
|
||||||
entry:
|
entry:
|
||||||
%stack = alloca [43 x i8], align 4
|
%stack = alloca [43 x i8], align 4
|
||||||
@ -158,8 +162,10 @@ entry:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK-LABEL: @occupancy_8(
|
; SI-LABEL: @occupancy_8(
|
||||||
; CHECK-NOT: alloca
|
; CI-LABEL: @occupancy_8(
|
||||||
|
; SI: alloca
|
||||||
|
; CI-NOT: alloca
|
||||||
define void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
|
define void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
|
||||||
entry:
|
entry:
|
||||||
%stack = alloca [32 x i8], align 4
|
%stack = alloca [32 x i8], align 4
|
||||||
@ -182,8 +188,8 @@ entry:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK-LABEL: @occupancy_8_over(
|
; ALL-LABEL: @occupancy_8_over(
|
||||||
; CHECK: alloca [33 x i8]
|
; ALL: alloca [33 x i8]
|
||||||
define void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
|
define void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
|
||||||
entry:
|
entry:
|
||||||
%stack = alloca [33 x i8], align 4
|
%stack = alloca [33 x i8], align 4
|
||||||
@ -206,8 +212,10 @@ entry:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK-LABEL: @occupancy_9(
|
; SI-LABEL: @occupancy_9(
|
||||||
; CHECK-NOT: alloca
|
; CI-LABEL: @occupancy_9(
|
||||||
|
; SI: alloca
|
||||||
|
; CI-NOT: alloca
|
||||||
define void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
|
define void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
|
||||||
entry:
|
entry:
|
||||||
%stack = alloca [28 x i8], align 4
|
%stack = alloca [28 x i8], align 4
|
||||||
@ -230,8 +238,8 @@ entry:
|
|||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK-LABEL: @occupancy_9_over(
|
; ALL-LABEL: @occupancy_9_over(
|
||||||
; CHECK: alloca [29 x i8]
|
; ALL: alloca [29 x i8]
|
||||||
define void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
|
define void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
|
||||||
entry:
|
entry:
|
||||||
%stack = alloca [29 x i8], align 4
|
%stack = alloca [29 x i8], align 4
|
||||||
|
Loading…
x
Reference in New Issue
Block a user