[OpenMP][DeviceRTL][AMDGPU] Support code object version 5

Update DeviceRTL and the AMDGPU plugin to support code
object version 5. Default is code object version 4.

CodeGen for __builtin_amdgpu_workgroup_size generates code
for cov4 as well as cov5 if -mcode-object-version=none
is specified. DeviceRTL compilation passes this argument
via Xclang option to generate abi-agnostic code.

Generated code for the above builtin uses a clang
control constant "llvm.amdgcn.abi.version" to branch on
the abi version, which is available during linking of
user's OpenMP code. Load of this constant gets eliminated
during linking.

AMDGPU plugin queries the ELF for code object version
and then prepares various implicitargs accordingly.

Differential Revision: https://reviews.llvm.org/D139730

Reviewed By: jhuber6, yaxunl
This commit is contained in:
Saiyedul Islam 2023-08-17 07:45:40 -05:00
parent 30b6b27385
commit f616c3eeb4
No known key found for this signature in database
GPG Key ID: 060B71C5AF15714F
14 changed files with 290 additions and 39 deletions

View File

@ -27,6 +27,7 @@
#include "clang/AST/OSLog.h"
#include "clang/Basic/TargetBuiltins.h"
#include "clang/Basic/TargetInfo.h"
#include "clang/Basic/TargetOptions.h"
#include "clang/CodeGen/CGFunctionInfo.h"
#include "clang/Frontend/FrontendDiagnostic.h"
#include "llvm/ADT/APFloat.h"
@ -17098,24 +17099,61 @@ Value *EmitAMDGPUImplicitArgPtr(CodeGenFunction &CGF) {
}
// \p Index is 0, 1, and 2 for x, y, and z dimension, respectively.
/// Emit code based on Code Object ABI version.
/// COV_4 : Emit code to use dispatch ptr
/// COV_5 : Emit code to use implicitarg ptr
/// COV_NONE : Emit code to load a global variable "llvm.amdgcn.abi.version"
/// and use its value for COV_4 or COV_5 approach. It is used for
/// compiling device libraries in an ABI-agnostic way.
///
/// Note: "llvm.amdgcn.abi.version" is supposed to be emitted and initialized by
/// clang during compilation of user code.
Value *EmitAMDGPUWorkGroupSize(CodeGenFunction &CGF, unsigned Index) {
bool IsCOV_5 = CGF.getTarget().getTargetOpts().CodeObjectVersion ==
clang::TargetOptions::COV_5;
Constant *Offset;
Value *DP;
if (IsCOV_5) {
llvm::LoadInst *LD;
auto Cov = CGF.getTarget().getTargetOpts().CodeObjectVersion;
if (Cov == clang::TargetOptions::COV_None) {
auto *ABIVersionC = CGF.CGM.GetOrCreateLLVMGlobal(
"llvm.amdgcn.abi.version", CGF.Int32Ty, LangAS::Default, nullptr,
CodeGen::NotForDefinition);
// This load will be eliminated by the IPSCCP because it is constant
// weak_odr without externally_initialized. Either changing it to weak or
// adding externally_initialized will keep the load.
Value *ABIVersion = CGF.Builder.CreateAlignedLoad(CGF.Int32Ty, ABIVersionC,
CGF.CGM.getIntAlign());
Value *IsCOV5 = CGF.Builder.CreateICmpSGE(
ABIVersion,
llvm::ConstantInt::get(CGF.Int32Ty, clang::TargetOptions::COV_5));
// Indexing the implicit kernarg segment.
Offset = llvm::ConstantInt::get(CGF.Int32Ty, 12 + Index * 2);
DP = EmitAMDGPUImplicitArgPtr(CGF);
} else {
Value *ImplicitGEP = CGF.Builder.CreateConstGEP1_32(
CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
// Indexing the HSA kernel_dispatch_packet struct.
Offset = llvm::ConstantInt::get(CGF.Int32Ty, 4 + Index * 2);
DP = EmitAMDGPUDispatchPtr(CGF);
Value *DispatchGEP = CGF.Builder.CreateConstGEP1_32(
CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
auto Result = CGF.Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
LD = CGF.Builder.CreateLoad(
Address(Result, CGF.Int16Ty, CharUnits::fromQuantity(2)));
} else {
Value *GEP = nullptr;
if (Cov == clang::TargetOptions::COV_5) {
// Indexing the implicit kernarg segment.
GEP = CGF.Builder.CreateConstGEP1_32(
CGF.Int8Ty, EmitAMDGPUImplicitArgPtr(CGF), 12 + Index * 2);
} else {
// Indexing the HSA kernel_dispatch_packet struct.
GEP = CGF.Builder.CreateConstGEP1_32(
CGF.Int8Ty, EmitAMDGPUDispatchPtr(CGF), 4 + Index * 2);
}
LD = CGF.Builder.CreateLoad(
Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
}
auto *GEP = CGF.Builder.CreateGEP(CGF.Int8Ty, DP, Offset);
auto *LD = CGF.Builder.CreateLoad(
Address(GEP, CGF.Int16Ty, CharUnits::fromQuantity(2)));
llvm::MDBuilder MDHelper(CGF.getLLVMContext());
llvm::MDNode *RNode = MDHelper.createRange(APInt(16, 1),
APInt(16, CGF.getTarget().getMaxOpenCLWorkGroupSize() + 1));

View File

@ -1203,6 +1203,8 @@ void CodeGenModule::Release() {
getModule().addModuleFlag(llvm::Module::Error, "MaxTLSAlign",
getContext().getTargetInfo().getMaxTLSAlign());
getTargetCodeGenInfo().emitTargetGlobals(*this);
getTargetCodeGenInfo().emitTargetMetadata(*this, MangledDeclNames);
EmitBackendOptionsMetadata(getCodeGenOpts());

View File

@ -1571,6 +1571,11 @@ public:
void handleAMDGPUWavesPerEUAttr(llvm::Function *F,
const AMDGPUWavesPerEUAttr *A);
llvm::Constant *
GetOrCreateLLVMGlobal(StringRef MangledName, llvm::Type *Ty, LangAS AddrSpace,
const VarDecl *D,
ForDefinition_t IsForDefinition = NotForDefinition);
private:
llvm::Constant *GetOrCreateLLVMFunction(
StringRef MangledName, llvm::Type *Ty, GlobalDecl D, bool ForVTable,
@ -1593,11 +1598,6 @@ private:
void UpdateMultiVersionNames(GlobalDecl GD, const FunctionDecl *FD,
StringRef &CurName);
llvm::Constant *
GetOrCreateLLVMGlobal(StringRef MangledName, llvm::Type *Ty, LangAS AddrSpace,
const VarDecl *D,
ForDefinition_t IsForDefinition = NotForDefinition);
bool GetCPUAndFeaturesAttributes(GlobalDecl GD,
llvm::AttrBuilder &AttrBuilder,
bool SetTargetFeatures = true);

View File

@ -81,6 +81,9 @@ public:
CodeGen::CodeGenModule &CGM,
const llvm::MapVector<GlobalDecl, StringRef> &MangledDeclNames) const {}
/// Provides a convenient hook to handle extra target-specific globals.
virtual void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const {}
/// Any further codegen related checks that need to be done on a function call
/// in a target specific manner.
virtual void checkFunctionCallABI(CodeGenModule &CGM, SourceLocation CallLoc,

View File

@ -8,6 +8,7 @@
#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "clang/Basic/TargetOptions.h"
using namespace clang;
using namespace clang::CodeGen;
@ -274,6 +275,8 @@ public:
void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
CodeGenModule &CGM) const;
void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;
void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
CodeGen::CodeGenModule &M) const override;
unsigned getOpenCLKernelCallingConv() const override;
@ -354,6 +357,28 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
}
}
/// Emits control constants used to change per-architecture behaviour in the
/// AMDGPU ROCm device libraries.
void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
    CodeGen::CodeGenModule &CGM) const {
  StringRef Name = "llvm.amdgcn.abi.version";
  // Emit the control constant at most once per module.
  if (CGM.getModule().getNamedGlobal(Name))
    return;

  llvm::Module &M = CGM.getModule();
  auto *Int32Ty = llvm::IntegerType::getIntNTy(M.getContext(), 32);
  llvm::Constant *Init = llvm::ConstantInt::get(
      Int32Ty, CGM.getTarget().getTargetOpts().CodeObjectVersion);

  // The global must be a constant with weak_odr linkage and must NOT be
  // externally_initialized, so that IPSCCP can fold away loads of it.
  unsigned AS = CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant);
  auto *GV = new llvm::GlobalVariable(
      M, Int32Ty, /*isConstant=*/true, llvm::GlobalValue::WeakODRLinkage, Init,
      Name, /*InsertBefore=*/nullptr,
      llvm::GlobalValue::ThreadLocalMode::NotThreadLocal, AS);
  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);
}
void AMDGPUTargetCodeGenInfo::setTargetAttributes(
const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
if (requiresAMDGPUProtectedVisibility(D, GV)) {

View File

@ -1370,7 +1370,10 @@ llvm::opt::DerivedArgList *ToolChain::TranslateOpenMPTargetArgs(
// matches the current toolchain triple. If it is not present
// at all, target and host share a toolchain.
if (A->getOption().matches(options::OPT_m_Group)) {
if (SameTripleAsHost)
// Pass code object version to device toolchain
// to correctly set metadata in intermediate files.
if (SameTripleAsHost ||
A->getOption().matches(options::OPT_mcode_object_version_EQ))
DAL->append(A);
else
Modified = true;

View File

@ -8645,6 +8645,14 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("--device-debug");
}
// code-object-version=X needs to be passed to clang-linker-wrapper to ensure
// that it is used by lld.
if (const Arg *A = Args.getLastArg(options::OPT_mcode_object_version_EQ)) {
CmdArgs.push_back(Args.MakeArgString("-mllvm"));
CmdArgs.push_back(Args.MakeArgString(
Twine("--amdhsa-code-object-version=") + A->getValue()));
}
for (const auto &A : Args.getAllArgValues(options::OPT_Xcuda_ptxas))
CmdArgs.push_back(Args.MakeArgString("--ptxas-arg=" + A));

View File

@ -0,0 +1,96 @@
// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm-bc \
// RUN: -mcode-object-version=4 -DUSER -x hip -o %t_4.bc %s
// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm-bc \
// RUN: -mcode-object-version=5 -DUSER -x hip -o %t_5.bc %s
// RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm-bc \
// RUN: -mcode-object-version=none -DDEVICELIB -x hip -o %t_0.bc %s
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -O3 \
// RUN: %t_4.bc -mlink-builtin-bitcode %t_0.bc -o - |\
// RUN: FileCheck -check-prefix=LINKED4 %s
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -emit-llvm -O3 \
// RUN: %t_5.bc -mlink-builtin-bitcode %t_0.bc -o - |\
// RUN: FileCheck -check-prefix=LINKED5 %s
#include "Inputs/cuda.h"
// LINKED4: @llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400
// LINKED4-LABEL: bar
// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
// LINKED4-NOT: icmp sge i32 %{{.*}}, 500
// LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// LINKED4: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
// LINKED4: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
// LINKED4: [[GEP_4_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 4
// LINKED4: select i1 false, ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
// LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
// LINKED4-NOT: icmp sge i32 %{{.*}}, 500
// LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// LINKED4: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
// LINKED4: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
// LINKED4: [[GEP_4_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 6
// LINKED4: select i1 false, ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
// LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
// LINKED4-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
// LINKED4-NOT: icmp sge i32 %{{.*}}, 500
// LINKED4: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// LINKED4: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
// LINKED4: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
// LINKED4: [[GEP_4_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 8
// LINKED4: select i1 false, ptr addrspace(4) [[GEP_5_Z]], ptr addrspace(4) [[GEP_4_Z]]
// LINKED4: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
// LINKED4: "amdgpu_code_object_version", i32 400
// LINKED5: llvm.amdgcn.abi.version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
// LINKED5-LABEL: bar
// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
// LINKED5-NOT: icmp sge i32 %{{.*}}, 500
// LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// LINKED5: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
// LINKED5: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
// LINKED5: [[GEP_4_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 4
// LINKED5: select i1 true, ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
// LINKED5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
// LINKED5-NOT: icmp sge i32 %{{.*}}, 500
// LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// LINKED5: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
// LINKED5: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
// LINKED5: [[GEP_4_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 6
// LINKED5: select i1 true, ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
// LINKED5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
// LINKED5-NOT: load i32, ptr addrspacecast (ptr addrspace(4) @llvm.amdgcn.abi.version to ptr), align {{.*}}
// LINKED5-NOT: icmp sge i32 %{{.*}}, 500
// LINKED5: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// LINKED5: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
// LINKED5: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
// LINKED5: [[GEP_4_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 8
// LINKED5: select i1 true, ptr addrspace(4) [[GEP_5_Z]], ptr addrspace(4) [[GEP_4_Z]]
// LINKED5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
// LINKED5: "amdgpu_code_object_version", i32 500
#ifdef DEVICELIB
// "Device library" half of the test: compiled with -mcode-object-version=none
// (see the RUN lines above), so each __builtin_amdgcn_workgroup_size_* call
// emits ABI-agnostic code that selects between the COV4 and COV5 paths via
// the llvm.amdgcn.abi.version control constant.
__device__ void bar(int *x, int *y, int *z)
{
*x = __builtin_amdgcn_workgroup_size_x();
*y = __builtin_amdgcn_workgroup_size_y();
*z = __builtin_amdgcn_workgroup_size_z();
}
#endif
#ifdef USER
// "User code" half: compiled with a concrete code object version (4 or 5) and
// linked against the device-library bitcode via -mlink-builtin-bitcode. After
// linking, the ABI-version select folds to a constant (select i1 false/true in
// the LINKED4/LINKED5 checks above). The pointers are deliberately left
// uninitialized — only the emitted IR shape is checked, nothing is executed.
__device__ void bar(int *x, int *y, int *z);
__device__ void foo()
{
int *x, *y, *z;
bar(x, y, z);
}
#endif

View File

@ -7,6 +7,10 @@
// RUN: -fcuda-is-device -mcode-object-version=5 -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefix=COV5 %s
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \
// RUN: -fcuda-is-device -mcode-object-version=none -emit-llvm -o - -x hip %s \
// RUN: | FileCheck -check-prefix=COVNONE %s
#include "Inputs/cuda.h"
// PRECOV5-LABEL: test_get_workgroup_size
@ -26,6 +30,36 @@
// COV5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
// COV5: getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
// COV5: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
// COVNONE-LABEL: test_get_workgroup_size
// COVNONE: load i32, ptr addrspacecast (ptr addrspace(1) @llvm.amdgcn.abi.version to ptr), align {{.*}}
// COVNONE: [[ABI5_X:%.*]] = icmp sge i32 %{{.*}}, 500
// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// COVNONE: [[GEP_5_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 12
// COVNONE: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
// COVNONE: [[GEP_4_X:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 4
// COVNONE: select i1 [[ABI5_X]], ptr addrspace(4) [[GEP_5_X]], ptr addrspace(4) [[GEP_4_X]]
// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
// COVNONE: load i32, ptr addrspacecast (ptr addrspace(1) @llvm.amdgcn.abi.version to ptr), align {{.*}}
// COVNONE: [[ABI5_Y:%.*]] = icmp sge i32 %{{.*}}, 500
// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// COVNONE: [[GEP_5_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 14
// COVNONE: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
// COVNONE: [[GEP_4_Y:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 6
// COVNONE: select i1 [[ABI5_Y]], ptr addrspace(4) [[GEP_5_Y]], ptr addrspace(4) [[GEP_4_Y]]
// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
// COVNONE: load i32, ptr addrspacecast (ptr addrspace(1) @llvm.amdgcn.abi.version to ptr), align {{.*}}
// COVNONE: [[ABI5_Z:%.*]] = icmp sge i32 %{{.*}}, 500
// COVNONE: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
// COVNONE: [[GEP_5_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 16
// COVNONE: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
// COVNONE: [[GEP_4_Z:%.*]] = getelementptr i8, ptr addrspace(4) %{{.*}}, i32 8
// COVNONE: select i1 [[ABI5_Z]], ptr addrspace(4) [[GEP_5_Z]], ptr addrspace(4) [[GEP_4_Z]]
// COVNONE: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef
__device__ void test_get_workgroup_size(int d, int *out)
{
switch (d) {

View File

@ -1,5 +1,5 @@
// RUN: %clang_cc1 -cl-std=CL2.0 %s -triple "spir-unknown-unknown" -emit-llvm -o - -O0 | FileCheck %s --check-prefixes=CHECK-COM,CHECK-SPIR
// RUN: %clang_cc1 -cl-std=CL2.0 %s -triple "amdgcn--amdhsa" -emit-llvm -o - -O0 | FileCheck %s --check-prefixes=CHECK-COM,CHECK-AMDGCN
// RUN: %clang_cc1 -cl-std=CL2.0 %s -triple "spir-unknown-unknown" -emit-llvm -o - -O0 | FileCheck %s --check-prefix=CHECK-SPIR
// RUN: %clang_cc1 -cl-std=CL2.0 %s -triple "amdgcn--amdhsa" -emit-llvm -o - -O0 | FileCheck %s --check-prefix=CHECK-AMDGCN
#define CLK_ADDRESS_CLAMP_TO_EDGE 2
#define CLK_NORMALIZED_COORDS_TRUE 1
@ -7,7 +7,6 @@
#define CLK_FILTER_LINEAR 0x20
constant sampler_t glb_smp = CLK_ADDRESS_CLAMP_TO_EDGE|CLK_NORMALIZED_COORDS_TRUE|CLK_FILTER_NEAREST;
// CHECK-COM-NOT: constant i32
void fnc1(image1d_t img) {}
// CHECK-SPIR: @fnc1(target("spirv.Image", void, 0, 0, 0, 0, 0, 0, 0)

View File

@ -403,6 +403,12 @@ Expected<StringRef> clang(ArrayRef<StringRef> InputFiles, const ArgList &Args) {
llvm::copy(LinkerArgs, std::back_inserter(CmdArgs));
}
// Pass on -mllvm options to the clang invocation.
for (const opt::Arg *Arg : Args.filtered(OPT_mllvm)) {
CmdArgs.push_back("-mllvm");
CmdArgs.push_back(Arg->getValue());
}
if (Args.hasArg(OPT_debug))
CmdArgs.push_back("-g");

View File

@ -288,7 +288,7 @@ add_custom_target(omptarget.devicertl.nvptx)
add_custom_target(omptarget.devicertl.amdgpu)
foreach(gpu_arch ${LIBOMPTARGET_DEVICE_ARCHITECTURES})
if("${gpu_arch}" IN_LIST all_amdgpu_architectures)
compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa)
compileDeviceRTLLibrary(${gpu_arch} amdgpu amdgcn-amd-amdhsa -Xclang -mcode-object-version=none)
elseif("${gpu_arch}" IN_LIST all_nvptx_architectures)
compileDeviceRTLLibrary(${gpu_arch} nvptx nvptx64-nvidia-cuda --cuda-feature=+ptx61)
else()

View File

@ -381,6 +381,9 @@ struct AMDGPUDeviceImageTy : public DeviceImageTy {
/// Get the executable.
hsa_executable_t getExecutable() const { return Executable; }
/// Get the Code Object Version of the ELF
uint16_t getELFABIVersion() const { return ELFABIVersion; }
/// Find an HSA device symbol by its name on the executable.
Expected<hsa_executable_symbol_t>
findDeviceSymbol(GenericDeviceTy &Device, StringRef SymbolName) const;
@ -401,6 +404,7 @@ private:
hsa_executable_t Executable;
hsa_code_object_t CodeObject;
StringMap<utils::KernelMetaDataTy> KernelInfoMap;
uint16_t ELFABIVersion;
};
/// Class implementing the AMDGPU kernel functionalities which derives from the
@ -408,8 +412,7 @@ private:
struct AMDGPUKernelTy : public GenericKernelTy {
/// Create an AMDGPU kernel with a name and an execution mode.
AMDGPUKernelTy(const char *Name, OMPTgtExecModeFlags ExecutionMode)
: GenericKernelTy(Name, ExecutionMode),
ImplicitArgsSize(sizeof(utils::AMDGPUImplicitArgsTy)) {}
: GenericKernelTy(Name, ExecutionMode) {}
/// Initialize the AMDGPU kernel.
Error initImpl(GenericDeviceTy &Device, DeviceImageTy &Image) override {
@ -450,6 +453,9 @@ struct AMDGPUKernelTy : public GenericKernelTy {
// TODO: Read the kernel descriptor for the max threads per block. May be
// read from the image.
ImplicitArgsSize = utils::getImplicitArgsSize(AMDImage.getELFABIVersion());
DP("ELFABIVersion: %d\n", AMDImage.getELFABIVersion());
// Get additional kernel info read from image
KernelInfo = AMDImage.getKernelInfo(getName());
if (!KernelInfo.has_value())
@ -476,6 +482,10 @@ struct AMDGPUKernelTy : public GenericKernelTy {
/// Get the HSA kernel object representing the kernel function.
uint64_t getKernelObject() const { return KernelObject; }
/// Get the size of implicitargs based on the code object version
/// @return 56 for cov4 and 256 for cov5
uint32_t getImplicitArgsSize() const { return ImplicitArgsSize; }
private:
/// The kernel object to execute.
uint64_t KernelObject;
@ -486,7 +496,7 @@ private:
uint32_t PrivateSize;
/// The size of implicit kernel arguments.
const uint32_t ImplicitArgsSize;
uint32_t ImplicitArgsSize;
/// Additional Info for the AMD GPU Kernel
std::optional<utils::KernelMetaDataTy> KernelInfo;
@ -2627,8 +2637,8 @@ Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
if (Result)
return Plugin::error("Loaded HSA executable does not validate");
if (auto Err =
utils::readAMDGPUMetaDataFromImage(getMemoryBuffer(), KernelInfoMap))
if (auto Err = utils::readAMDGPUMetaDataFromImage(
getMemoryBuffer(), KernelInfoMap, ELFABIVersion))
return Err;
return Plugin::success();
@ -2993,6 +3003,15 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (GenericDevice.getRPCServer())
Stream->setRPCServer(GenericDevice.getRPCServer());
// Only COV5 implicitargs needs to be set. COV4 implicitargs are not used.
if (getImplicitArgsSize() == sizeof(utils::AMDGPUImplicitArgsTy)) {
ImplArgs->BlockCountX = NumBlocks;
ImplArgs->GroupSizeX = NumThreads;
ImplArgs->GroupSizeY = 1;
ImplArgs->GroupSizeZ = 1;
ImplArgs->GridDims = 1;
}
// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
GroupSize, ArgsMemoryManager);

View File

@ -25,6 +25,7 @@
#include "llvm/Support/MemoryBufferRef.h"
#include "llvm/Support/YAMLTraits.h"
using namespace llvm::ELF;
namespace llvm {
namespace omp {
@ -32,19 +33,29 @@ namespace target {
namespace plugin {
namespace utils {
// The implicit arguments of AMDGPU kernels.
// The implicit arguments of COV5 AMDGPU kernels.
struct AMDGPUImplicitArgsTy {
uint64_t OffsetX;
uint64_t OffsetY;
uint64_t OffsetZ;
uint64_t HostcallPtr;
uint64_t Unused0;
uint64_t Unused1;
uint64_t Unused2;
uint32_t BlockCountX;
uint32_t BlockCountY;
uint32_t BlockCountZ;
uint16_t GroupSizeX;
uint16_t GroupSizeY;
uint16_t GroupSizeZ;
uint8_t Unused0[46]; // 46 byte offset.
uint16_t GridDims;
uint8_t Unused1[190]; // 190 byte offset.
};
static_assert(sizeof(AMDGPUImplicitArgsTy) == 56,
"Unexpected size of implicit arguments");
// Dummy struct for COV4 implicitargs.
struct AMDGPUImplicitArgsTyCOV4 {
uint8_t Unused[56];
};
/// Returns the size of the implicit kernel-argument block matching the given
/// ELF code-object ABI version: the 56-byte COV4 layout for anything older
/// than HSA v5, otherwise the full COV5 AMDGPUImplicitArgsTy layout.
uint32_t getImplicitArgsSize(uint16_t Version) {
  if (Version < ELF::ELFABIVERSION_AMDGPU_HSA_V5)
    return sizeof(AMDGPUImplicitArgsTyCOV4);
  return sizeof(AMDGPUImplicitArgsTy);
}
/// Parse a TargetID to get processor arch and feature map.
/// Returns processor subarch.
@ -295,7 +306,8 @@ private:
/// Reads the AMDGPU specific metadata from the ELF file and propagates the
/// KernelInfoMap
Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
StringMap<KernelMetaDataTy> &KernelInfoMap) {
StringMap<KernelMetaDataTy> &KernelInfoMap,
uint16_t &ELFABIVersion) {
Error Err = Error::success(); // Used later as out-parameter
auto ELFOrError = object::ELF64LEFile::create(MemBuffer.getBuffer());
@ -305,6 +317,12 @@ Error readAMDGPUMetaDataFromImage(MemoryBufferRef MemBuffer,
const object::ELF64LEFile ELFObj = ELFOrError.get();
ArrayRef<object::ELF64LE::Shdr> Sections = cantFail(ELFObj.sections());
KernelInfoReader Reader(KernelInfoMap);
// Read the code object version from ELF image header
auto Header = ELFObj.getHeader();
ELFABIVersion = (uint8_t)(Header.e_ident[ELF::EI_ABIVERSION]);
DP("ELFABIVERSION Version: %u\n", ELFABIVersion);
for (const auto &S : Sections) {
if (S.sh_type != ELF::SHT_NOTE)
continue;