From c58a6a6fb4110ee1ffd0e45ad98872e55855b310 Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" Date: Fri, 30 Apr 2021 14:54:03 -0400 Subject: [PATCH] [HIP] Fix device lib selection Choose optimized device lib bitcode by fp options for performance. Reviewed by: Artem Belevich, Fangrui Song Differential Revision: https://reviews.llvm.org/D101654 --- clang/include/clang/Basic/CodeGenOptions.def | 3 +- clang/include/clang/Driver/Options.td | 9 +++- clang/lib/CodeGen/CGExprScalar.cpp | 6 ++- clang/lib/Driver/ToolChains/HIP.cpp | 16 ++++--- .../test/CodeGenCUDA/correctly-rounded-div.cu | 35 +++++++++++++++ clang/test/Driver/hip-device-libs.hip | 44 +++++++++++++++++++ 6 files changed, 104 insertions(+), 9 deletions(-) create mode 100644 clang/test/CodeGenCUDA/correctly-rounded-div.cu diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index e0bfcf6ef3e4..4b1ff8e70afd 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -172,7 +172,8 @@ CODEGENOPT(NoInlineLineTables, 1, 0) ///< Whether debug info should contain CODEGENOPT(StackClashProtector, 1, 0) ///< Set when -fstack-clash-protection is enabled. CODEGENOPT(NoImplicitFloat , 1, 0) ///< Set when -mno-implicit-float is enabled. CODEGENOPT(NullPointerIsValid , 1, 0) ///< Assume Null pointer deference is defined. -CODEGENOPT(CorrectlyRoundedDivSqrt, 1, 0) ///< -cl-fp32-correctly-rounded-divide-sqrt +CODEGENOPT(OpenCLCorrectlyRoundedDivSqrt, 1, 0) ///< -cl-fp32-correctly-rounded-divide-sqrt +CODEGENOPT(HIPCorrectlyRoundedDivSqrt, 1, 1) ///< -fno-hip-fp32-correctly-rounded-divide-sqrt CODEGENOPT(UniqueInternalLinkageNames, 1, 0) ///< Internal Linkage symbols get unique names. CODEGENOPT(SplitMachineFunctions, 1, 0) ///< Split machine functions using profile information. diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index a8a7177c82cc..ea1c861f4ac3 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -834,7 +834,7 @@ def cl_denorms_are_zero : Flag<["-"], "cl-denorms-are-zero">, Group; def cl_fp32_correctly_rounded_divide_sqrt : Flag<["-"], "cl-fp32-correctly-rounded-divide-sqrt">, Group, Flags<[CC1Option]>, HelpText<"OpenCL only. Specify that single precision floating-point divide and sqrt used in the program source are correctly rounded.">, - MarshallingInfoFlag>; + MarshallingInfoFlag>; def cl_uniform_work_group_size : Flag<["-"], "cl-uniform-work-group-size">, Group, Flags<[CC1Option]>, HelpText<"OpenCL only. Defines that the global work-size be a multiple of the work-group size specified to clEnqueueNDRangeKernel">, MarshallingInfoFlag>; @@ -939,6 +939,13 @@ defm hip_new_launch_api : BoolFOption<"hip-new-launch-api", LangOpts<"HIPUseNewLaunchAPI">, DefaultFalse, PosFlag, NegFlag, BothFlags<[], " new kernel launching API for HIP">>; +defm hip_fp32_correctly_rounded_divide_sqrt : BoolFOption<"hip-fp32-correctly-rounded-divide-sqrt", + CodeGenOpts<"HIPCorrectlyRoundedDivSqrt">, DefaultTrue, + PosFlag, + NegFlag, + BothFlags<[], " that single precision floating-point divide and sqrt used in " + "the program source are correctly rounded (HIP device compilation only)">>, + ShouldParseIf; defm gpu_allow_device_init : BoolFOption<"gpu-allow-device-init", LangOpts<"GPUAllowDeviceInit">, DefaultFalse, PosFlag, NegFlag, diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 42d07c1f5233..b469a4454f15 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -3216,8 +3216,10 @@ Value *ScalarExprEmitter::EmitDiv(const BinOpInfo &Ops) { llvm::Value *Val; CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, Ops.FPFeatures); Val = Builder.CreateFDiv(Ops.LHS, Ops.RHS, "div"); - if (CGF.getLangOpts().OpenCL && - !CGF.CGM.getCodeGenOpts().CorrectlyRoundedDivSqrt) { + if ((CGF.getLangOpts().OpenCL && + !CGF.CGM.getCodeGenOpts().OpenCLCorrectlyRoundedDivSqrt) || + (CGF.getLangOpts().HIP && CGF.getLangOpts().CUDAIsDevice && + !CGF.CGM.getCodeGenOpts().HIPCorrectlyRoundedDivSqrt)) { // OpenCL v1.1 s7.4: minimum accuracy of single precision / is 2.5ulp // OpenCL v1.2 s5.6.4.2: The -cl-fp32-correctly-rounded-divide-sqrt // build option allows an application to specify that single precision diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp index 126eb65e3c4d..e483102593e1 100644 --- a/clang/lib/Driver/ToolChains/HIP.cpp +++ b/clang/lib/Driver/ToolChains/HIP.cpp @@ -404,11 +404,17 @@ HIPToolChain::getHIPDeviceLibs(const llvm::opt::ArgList &DriverArgs) const { bool DAZ = DriverArgs.hasFlag(options::OPT_fgpu_flush_denormals_to_zero, options::OPT_fno_gpu_flush_denormals_to_zero, getDefaultDenormsAreZeroForTarget(Kind)); - // TODO: Check standard C++ flags? - bool FiniteOnly = false; - bool UnsafeMathOpt = false; - bool FastRelaxedMath = false; - bool CorrectSqrt = true; + bool FiniteOnly = + DriverArgs.hasFlag(options::OPT_ffinite_math_only, + options::OPT_fno_finite_math_only, false); + bool UnsafeMathOpt = + DriverArgs.hasFlag(options::OPT_funsafe_math_optimizations, + options::OPT_fno_unsafe_math_optimizations, false); + bool FastRelaxedMath = DriverArgs.hasFlag( + options::OPT_ffast_math, options::OPT_fno_fast_math, false); + bool CorrectSqrt = DriverArgs.hasFlag( + options::OPT_fhip_fp32_correctly_rounded_divide_sqrt, + options::OPT_fno_hip_fp32_correctly_rounded_divide_sqrt); bool Wave64 = isWave64(DriverArgs, Kind); if (DriverArgs.hasFlag(options::OPT_fgpu_sanitize, diff --git a/clang/test/CodeGenCUDA/correctly-rounded-div.cu b/clang/test/CodeGenCUDA/correctly-rounded-div.cu new file mode 100644 index 000000000000..4f17220534ca --- /dev/null +++ b/clang/test/CodeGenCUDA/correctly-rounded-div.cu @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 %s -emit-llvm -o - -triple -amdgcn-amd-amdhsa \ +// RUN: -target-cpu gfx906 -fcuda-is-device -x hip \ +// RUN: | FileCheck --check-prefixes=COMMON,CRDIV %s +// RUN: %clang_cc1 %s -emit-llvm -o - -triple -amdgcn-amd-amdhsa \ +// RUN: -target-cpu gfx906 -fcuda-is-device -x hip \ +// RUN: -fno-hip-fp32-correctly-rounded-divide-sqrt \ +// RUN: | FileCheck --check-prefixes=COMMON,NCRDIV %s + +#include "Inputs/cuda.h" + +typedef __attribute__(( ext_vector_type(4) )) float float4; + +// COMMON-LABEL: @_Z11spscalardiv +// COMMON: fdiv{{.*}}, +// NCRDIV: !fpmath ![[MD:[0-9]+]] +// CRDIV-NOT: !fpmath +__device__ float spscalardiv(float a, float b) { + return a / b; +} + +// COMMON-LABEL: @_Z11spvectordiv +// COMMON: fdiv{{.*}}, +// NCRDIV: !fpmath ![[MD]] +// CRDIV-NOT: !fpmath +__device__ float4 spvectordiv(float4 a, float4 b) { + return a / b; +} + +// COMMON-LABEL: @_Z11dpscalardiv +// COMMON-NOT: !fpmath +__device__ double dpscalardiv(double a, double b) { + return a / b; +} + +// NCRDIV: ![[MD]] = !{float 2.500000e+00} diff --git a/clang/test/Driver/hip-device-libs.hip b/clang/test/Driver/hip-device-libs.hip index 62556bf9621a..1f95fc2e3c35 100644 --- a/clang/test/Driver/hip-device-libs.hip +++ b/clang/test/Driver/hip-device-libs.hip @@ -113,6 +113,30 @@ // RUN: %S/Inputs/hip_multiple_inputs/b.hip \ // RUN: 2>&1 | FileCheck %s --check-prefixes=ALL,INST +// Test -fast-math +// RUN: %clang -### -target x86_64-linux-gnu --offload-arch=gfx900 \ +// RUN: -ffast-math --rocm-path=%S/Inputs/rocm \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefixes=FAST + +// Test -ffinite-math-only +// RUN: %clang -### -target x86_64-linux-gnu --offload-arch=gfx900 \ +// RUN: -ffinite-math-only --rocm-path=%S/Inputs/rocm \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefixes=FINITE + +// Test -funsafe-math-optimizations +// RUN: %clang -### -target x86_64-linux-gnu --offload-arch=gfx900 \ +// RUN: -funsafe-math-optimizations --rocm-path=%S/Inputs/rocm \ +// RUN: %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefixes=UNSAFE + +// Test -fno-hip-fp32-correctly-rounded-divide-sqrt +// RUN: %clang -### -target x86_64-linux-gnu --offload-arch=gfx900 \ +// RUN: -fno-hip-fp32-correctly-rounded-divide-sqrt \ +// RUN: --rocm-path=%S/Inputs/rocm %S/Inputs/hip_multiple_inputs/b.hip \ +// RUN: 2>&1 | FileCheck %s --check-prefixes=DIVSQRT + // ALL-NOT: error: // ALL: {{"[^"]*clang[^"]*"}} // ALL-SAME: "-mlink-builtin-bitcode" "{{.*}}hip.bc" @@ -128,3 +152,23 @@ // ALL-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_wavefrontsize64_on.bc" // ALL-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_isa_version_{{[0-9]+}}.bc" // INST-SAME: "-mlink-builtin-bitcode" "{{.*}}instrument.bc" + +// FAST: "-mlink-builtin-bitcode" "{{.*}}oclc_daz_opt_off.bc" +// FAST-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_unsafe_math_on.bc" +// FAST-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_finite_only_on.bc" +// FAST-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_correctly_rounded_sqrt_on.bc" + +// FINITE: "-mlink-builtin-bitcode" "{{.*}}oclc_daz_opt_off.bc" +// FINITE-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_unsafe_math_off.bc" +// FINITE-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_finite_only_on.bc" +// FINITE-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_correctly_rounded_sqrt_on.bc" + +// UNSAFE: "-mlink-builtin-bitcode" "{{.*}}oclc_daz_opt_off.bc" +// UNSAFE-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_unsafe_math_on.bc" +// UNSAFE-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_finite_only_off.bc" +// UNSAFE-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_correctly_rounded_sqrt_on.bc" + +// DIVSQRT: "-mlink-builtin-bitcode" "{{.*}}oclc_daz_opt_off.bc" +// DIVSQRT-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_unsafe_math_off.bc" +// DIVSQRT-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_finite_only_off.bc" +// DIVSQRT-SAME: "-mlink-builtin-bitcode" "{{.*}}oclc_correctly_rounded_sqrt_off.bc"