Relands "[HIP] Change default --gpu-max-threads-per-block value to 1024"

This reverts commit e384e94fbe.
This commit is contained in:
Yaxun (Sam) Liu 2021-02-12 10:21:27 -05:00
parent 07c5a800dc
commit 053e61d54e
5 changed files with 10 additions and 6 deletions

View File

@ -242,7 +242,7 @@ LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr function
LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions")
LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code")
LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP")
LANGOPT(GPUMaxThreadsPerBlock, 32, 256, "default max threads per block for kernel launch bounds for HIP")
LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kernel launch bounds for HIP")
LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP")
LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP")

View File

@ -925,7 +925,7 @@ defm gpu_exclude_wrong_side_overloads : BoolFOption<"gpu-exclude-wrong-side-over
def gpu_max_threads_per_block_EQ : Joined<["--"], "gpu-max-threads-per-block=">,
Flags<[CC1Option]>,
HelpText<"Default max threads per block for kernel launch bounds for HIP">,
MarshallingInfoStringInt<LangOpts<"GPUMaxThreadsPerBlock">, "256">,
MarshallingInfoStringInt<LangOpts<"GPUMaxThreadsPerBlock">, "1024">,
ShouldParseIf<hip.KeyPath>;
def gpu_instrument_lib_EQ : Joined<["--"], "gpu-instrument-lib=">,
HelpText<"Instrument device library for HIP, which is a LLVM bitcode containing "

View File

@ -8998,9 +8998,13 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes(
assert(Max == 0 && "Max must be zero");
} else if (IsOpenCLKernel || IsHIPKernel) {
// By default, restrict the maximum size to a value specified by
// --gpu-max-threads-per-block=n or its default value.
// --gpu-max-threads-per-block=n or its default value for HIP.
const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
const unsigned DefaultMaxWorkGroupSize =
IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
: M.getLangOpts().GPUMaxThreadsPerBlock;
std::string AttrVal =
std::string("1,") + llvm::utostr(M.getLangOpts().GPUMaxThreadsPerBlock);
std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
}

View File

@ -39,7 +39,7 @@ __global__ void num_vgpr_64() {
// NAMD-NOT: "amdgpu-num-vgpr"
// NAMD-NOT: "amdgpu-num-sgpr"
// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"{{.*}}"uniform-work-group-size"="true"
// DEFAULT-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"{{.*}}"uniform-work-group-size"="true"
// MAX1024-DAG: attributes [[FLAT_WORK_GROUP_SIZE_DEFAULT]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"
// CHECK-DAG: attributes [[FLAT_WORK_GROUP_SIZE_32_64]] = {{.*}}"amdgpu-flat-work-group-size"="32,64"
// CHECK-DAG: attributes [[WAVES_PER_EU_2]] = {{.*}}"amdgpu-waves-per-eu"="2"

View File

@ -39,4 +39,4 @@ int main() {
launch((void*)D.Empty());
return 0;
}
// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,256"
// CHECK: attributes #[[ATTR]] = {{.*}}"amdgpu-flat-work-group-size"="1,1024"