[HIP] Add hip input kind and codegen for kernel launching

HIP is a language similar to CUDA (https://github.com/ROCm-Developer-Tools/HIP/blob/master/docs/markdown/hip_kernel_language.md ). The language syntax is very similar, which allows a hip program to be compiled as a CUDA program by Clang. The main difference is the host API. HIP has a set of vendor neutral host API which can be implemented on different platforms. Currently there is open source implementation of HIP runtime on amdgpu target (https://github.com/ROCm-Developer-Tools/HIP). This patch adds support of input kind and language standard hip. When hip file is compiled, both LangOpts.CUDA and LangOpts.HIP is turned on. This allows compilation of hip program as CUDA in most cases and only special handling of hip program is needed LangOpts.HIP is checked. This patch also adds support of kernel launching of HIP program using HIP host API. When -x hip is not specified, there is no behaviour change for CUDA. Patch by Greg Rodgers. Revised and lit test added by Yaxun Liu. Differential Revision: https://reviews.llvm.org/D44984 llvm-svn: 330790
2024-12-04 20:20:54 +00:00 · 2018-04-25 01:10:37 +00:00 · 2018-04-25 01:10:37 +00:00 · 887c569bcb
commit 887c569bcb
parent 7282d320b7
13 changed files with 149 additions and 65 deletions
--- a/clang/include/clang/Basic/IdentifierTable.h
+++ b/clang/include/clang/Basic/IdentifierTable.h
@ -98,6 +98,12 @@ public:
           memcmp(getNameStart(), Str, StrLen-1) == 0;
  }

+  /// \brief Return true if this is the identifier for the specified StringRef.
+  bool isStr(llvm::StringRef Str) const {
+    llvm::StringRef ThisStr(getNameStart(), getLength());
+    return ThisStr == Str;
+  }
+
  /// \brief Return the beginning of the actual null-terminated string for this
  /// identifier.
  const char *getNameStart() const {
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@ -195,6 +195,7 @@ LANGOPT(NativeHalfType    , 1, 0, "Native half type support")
 LANGOPT(NativeHalfArgsAndReturns, 1, 0, "Native half args and returns")
 LANGOPT(HalfArgsAndReturns, 1, 0, "half args and returns")
 LANGOPT(CUDA              , 1, 0, "CUDA")
+LANGOPT(HIP               , 1, 0, "HIP")
 LANGOPT(OpenMP            , 32, 0, "OpenMP support and version of OpenMP (31, 40 or 45)")
 LANGOPT(OpenMPSimd        , 1, 0, "Use SIMD only OpenMP support.")
 LANGOPT(OpenMPUseTLS      , 1, 0, "Use TLS for threadprivates or runtime calls")
--- a/clang/include/clang/Frontend/FrontendOptions.h
+++ b/clang/include/clang/Frontend/FrontendOptions.h
@ -161,6 +161,7 @@ public:
    OpenCL,
    CUDA,
    RenderScript,
+    HIP,
    ///@}
  };

--- a/clang/include/clang/Frontend/LangStandards.def
+++ b/clang/include/clang/Frontend/LangStandards.def
@ -168,6 +168,10 @@ LANGSTANDARD_ALIAS_DEPR(opencl20, "CL2.0")
 LANGSTANDARD(cuda, "cuda", CUDA, "NVIDIA CUDA(tm)",
             LineComment | CPlusPlus | Digraphs)

+// HIP
+LANGSTANDARD(hip, "hip", HIP, "HIP",
+             LineComment | CPlusPlus | Digraphs)
+
 #undef LANGSTANDARD
 #undef LANGSTANDARD_ALIAS
 #undef LANGSTANDARD_ALIAS_DEPR
--- a/clang/lib/CodeGen/CGCUDANV.cpp
+++ b/clang/lib/CodeGen/CGCUDANV.cpp
@ -55,6 +55,8 @@ private:
  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
+  std::string addPrefixToName(StringRef FuncName) const;
+  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();
@ -114,6 +116,18 @@ public:

 }

+std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
+  if (CGM.getLangOpts().HIP)
+    return ((Twine("hip") + Twine(FuncName)).str());
+  return ((Twine("cuda") + Twine(FuncName)).str());
+}
+std::string
+CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
+  if (CGM.getLangOpts().HIP)
+    return ((Twine("__hip") + Twine(FuncName)).str());
+  return ((Twine("__cuda") + Twine(FuncName)).str());
+}
+
 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
@ -133,15 +147,21 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
 llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
-  return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy,
-                                                           Params, false),
-                                   "cudaSetupArgument");
+  return CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(IntTy, Params, false),
+      addPrefixToName("SetupArgument"));
 }

 llvm::Constant *CGNVCUDARuntime::getLaunchFn() const {
-  // cudaError_t cudaLaunch(char *)
-  return CGM.CreateRuntimeFunction(
-      llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
+  if (CGM.getLangOpts().HIP) {
+    // hipError_t hipLaunchByPtr(char *);
+    return CGM.CreateRuntimeFunction(
+        llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
+  } else {
+    // cudaError_t cudaLaunch(char *);
+    return CGM.CreateRuntimeFunction(
+        llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
+  }
 }

 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
@ -222,7 +242,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
-      "__cuda_register_globals", &TheModule);
+      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
@ -235,7 +255,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
  llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
-      "__cudaRegisterFunction");
+      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle passed as the first argument passed to
  // __cuda_register_globals() and generate __cudaRegisterFunction() call for
@ -259,7 +279,7 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
                                     IntTy,        IntTy};
  llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterVarParams, false),
-      "__cudaRegisterVar");
+      addUnderscoredPrefixToName("RegisterVar"));
  for (auto &Pair : DeviceVars) {
    llvm::GlobalVariable *Var = Pair.first;
    unsigned Flags = Pair.second;
@ -305,7 +325,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  // void ** __cudaRegisterFatBinary(void *);
  llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
-      "__cudaRegisterFatBinary");
+      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
@ -324,7 +344,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
-      llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule);
+      llvm::GlobalValue::InternalLinkage,
+      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);
@ -357,7 +378,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
-      "__cuda_fatbin_wrapper", CGM.getPointerAlign(),
+      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

@ -370,7 +391,9 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
+        llvm::ConstantPointerNull::get(VoidPtrPtrTy),
+        addUnderscoredPrefixToName("_gpubin_handle"));
+
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

@ -392,7 +415,8 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {

    // void __cudaRegisterLinkedBinary%NVModuleID%(void (*)(void *), void *,
    // void *, void (*)(void **))
-    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
+    SmallString<128> RegisterLinkedBinaryName(
+        addUnderscoredPrefixToName("RegisterLinkedBinary"));
    RegisterLinkedBinaryName += NVModuleID;
    llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
@ -424,11 +448,13 @@ llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
-      "__cudaUnregisterFatBinary");
+      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
-      llvm::GlobalValue::InternalLinkage, "__cuda_module_dtor", &TheModule);
+      llvm::GlobalValue::InternalLinkage,
+      addUnderscoredPrefixToName("_module_dtor"), &TheModule);
+
  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@ -1608,6 +1608,7 @@ static InputKind ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args,
                .Case("c", InputKind::C)
                .Case("cl", InputKind::OpenCL)
                .Case("cuda", InputKind::CUDA)
+                .Case("hip", InputKind::HIP)
                .Case("c++", InputKind::CXX)
                .Case("objective-c", InputKind::ObjC)
                .Case("objective-c++", InputKind::ObjCXX)
@ -1887,6 +1888,9 @@ void CompilerInvocation::setLangDefaults(LangOptions &Opts, InputKind IK,
    case InputKind::RenderScript:
      LangStd = LangStandard::lang_c99;
      break;
+    case InputKind::HIP:
+      LangStd = LangStandard::lang_hip;
+      break;
    }
  }

@ -1934,7 +1938,8 @@ void CompilerInvocation::setLangDefaults(LangOptions &Opts, InputKind IK,
    }
  }

-  Opts.CUDA = IK.getLanguage() == InputKind::CUDA;
+  Opts.HIP = IK.getLanguage() == InputKind::HIP;
+  Opts.CUDA = IK.getLanguage() == InputKind::CUDA || Opts.HIP;
  if (Opts.CUDA)
    // Set default FP_CONTRACT to FAST.
    Opts.setDefaultFPContractMode(LangOptions::FPC_Fast);
@ -2005,6 +2010,10 @@ static bool IsInputCompatibleWithStandard(InputKind IK,
    return S.getLanguage() == InputKind::CUDA ||
           S.getLanguage() == InputKind::CXX;

+  case InputKind::HIP:
+    return S.getLanguage() == InputKind::CXX ||
+           S.getLanguage() == InputKind::HIP;
+
  case InputKind::Asm:
    // Accept (and ignore) all -std= values.
    // FIXME: The -std= value is not ignored; it affects the tokenization
@ -2032,6 +2041,8 @@ static const StringRef GetInputKindName(InputKind IK) {
    return "CUDA";
  case InputKind::RenderScript:
    return "RenderScript";
+  case InputKind::HIP:
+    return "HIP";

  case InputKind::Asm:
    return "Asm";
--- a/clang/lib/Frontend/FrontendActions.cpp
+++ b/clang/lib/Frontend/FrontendActions.cpp
@ -733,6 +733,7 @@ void PrintPreambleAction::ExecuteAction() {
  case InputKind::ObjCXX:
  case InputKind::OpenCL:
  case InputKind::CUDA:
+  case InputKind::HIP:
    break;
      
  case InputKind::Unknown:
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@ -471,8 +471,10 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
  // Not "standard" per se, but available even with the -undef flag.
  if (LangOpts.AsmPreprocessor)
    Builder.defineMacro("__ASSEMBLER__");
-  if (LangOpts.CUDA)
+  if (LangOpts.CUDA && !LangOpts.HIP)
    Builder.defineMacro("__CUDA__");
+  if (LangOpts.HIP)
+    Builder.defineMacro("__HIP__");
 }

 /// Initialize the predefined C++ language feature test macros defined in
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@ -42,8 +42,9 @@ ExprResult Sema::ActOnCUDAExecConfigExpr(Scope *S, SourceLocation LLLLoc,
                                         SourceLocation GGGLoc) {
  FunctionDecl *ConfigDecl = Context.getcudaConfigureCallDecl();
  if (!ConfigDecl)
-    return ExprError(Diag(LLLLoc, diag::err_undeclared_var_use)
-                     << "cudaConfigureCall");
+    return ExprError(
+        Diag(LLLLoc, diag::err_undeclared_var_use)
+        << (getLangOpts().HIP ? "hipConfigureCall" : "cudaConfigureCall"));
  QualType ConfigQTy = ConfigDecl->getType();

  DeclRefExpr *ConfigDR = new (Context)
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@ -9056,11 +9056,13 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC,

  if (getLangOpts().CUDA) {
    IdentifierInfo *II = NewFD->getIdentifier();
-    if (II && II->isStr("cudaConfigureCall") && !NewFD->isInvalidDecl() &&
+    if (II &&
+        II->isStr(getLangOpts().HIP ? "hipConfigureCall"
+                                    : "cudaConfigureCall") &&
+        !NewFD->isInvalidDecl() &&
        NewFD->getDeclContext()->getRedeclContext()->isTranslationUnit()) {
      if (!R->getAs<FunctionType>()->getReturnType()->isScalarType())
        Diag(NewFD->getLocation(), diag::err_config_scalar_return);
-
      Context.setcudaConfigureCallDecl(NewFD);
    }

--- a/clang/test/CodeGenCUDA/Inputs/cuda.h
+++ b/clang/test/CodeGenCUDA/Inputs/cuda.h
@ -16,7 +16,12 @@ struct dim3 {

 typedef struct cudaStream *cudaStream_t;

+#ifdef __HIP__
+int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
+                     cudaStream_t stream = 0);
+#else
 int cudaConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0,
                      cudaStream_t stream = 0);
+#endif

 extern "C" __device__ int printf(const char*, ...);
--- a/clang/test/CodeGenCUDA/device-stub.cu
+++ b/clang/test/CodeGenCUDA/device-stub.cu
@ -1,16 +1,28 @@
 // RUN: echo "GPU binary would be here" > %t
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-include-gpubinary %t -o - \
-// RUN:   | FileCheck %s --check-prefixes=ALL,NORDC
+// RUN:   | FileCheck %s --check-prefixes=ALL,NORDC,CUDA
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-include-gpubinary %t -o -  -DNOGLOBALS \
 // RUN:   | FileCheck %s -check-prefix=NOGLOBALS
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-rdc -fcuda-include-gpubinary %t -o - \
-// RUN:   | FileCheck %s --check-prefixes=ALL,RDC
+// RUN:   | FileCheck %s --check-prefixes=ALL,RDC,CUDA
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \
 // RUN:   | FileCheck %s -check-prefix=NOGPUBIN

+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN:     -fcuda-include-gpubinary %t -o - -x hip\
+// RUN:   | FileCheck %s --check-prefixes=ALL,NORDC,HIP
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN:     -fcuda-include-gpubinary %t -o -  -DNOGLOBALS -x hip \
+// RUN:   | FileCheck %s -check-prefix=NOGLOBALS
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
+// RUN:     -fcuda-rdc -fcuda-include-gpubinary %t -o - -x hip \
+// RUN:   | FileCheck %s --check-prefixes=ALL,RDC,HIP
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
+// RUN:   | FileCheck %s -check-prefix=NOGPUBIN
+
 #include "Inputs/cuda.h"

 #ifndef NOGLOBALS
@ -56,80 +68,83 @@ void use_pointers() {
 // NORDC-SAME: section ".nv_fatbin", align 8
 // RDC-SAME: section "__nv_relfatbin", align 8
 // * constant struct that wraps GPU binary
-// ALL: @__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } 
+// CUDA: @__[[PREFIX:cuda]]_fatbin_wrapper = internal constant
+// CUDA-SAME: { i32, i32, i8*, i8* }
+// HIP: @__[[PREFIX:hip]]_fatbin_wrapper = internal constant
+// HIP-SAME:  { i32, i32, i8*, i8* }
 // ALL-SAME: { i32 1180844977, i32 1, {{.*}}, i8* null }
 // ALL-SAME: section ".nvFatBinSegment"
 // * variable to save GPU binary handle after initialization
-// NORDC: @__cuda_gpubin_handle = internal global i8** null
+// NORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
 // * constant unnamed string with NVModuleID
 // RDC: [[MODULE_ID_GLOBAL:@.*]] = private unnamed_addr constant
 // RDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
 // * Make sure our constructor was added to global ctor list.
-// ALL: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor
+// ALL: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor
 // * In separate mode we also register a destructor.
-// NORDC: @llvm.global_dtors = appending global {{.*}}@__cuda_module_dtor
+// NORDC: @llvm.global_dtors = appending global {{.*}}@__[[PREFIX]]_module_dtor
 // * Alias to global symbol containing the NVModuleID.
 // RDC: @__fatbinwrap[[MODULE_ID]] = alias { i32, i32, i8*, i8* }
-// RDC-SAME: { i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper
+// RDC-SAME: { i32, i32, i8*, i8* }* @__[[PREFIX]]_fatbin_wrapper

 // Test that we build the correct number of calls to cudaSetupArgument followed
 // by a call to cudaLaunch.

 // ALL: define{{.*}}kernelfunc
-// ALL: call{{.*}}cudaSetupArgument
-// ALL: call{{.*}}cudaSetupArgument
-// ALL: call{{.*}}cudaSetupArgument
-// ALL: call{{.*}}cudaLaunch
+// ALL: call{{.*}}[[PREFIX]]SetupArgument
+// ALL: call{{.*}}[[PREFIX]]SetupArgument
+// ALL: call{{.*}}[[PREFIX]]SetupArgument
+// ALL: call{{.*}}[[PREFIX]]Launch
 __global__ void kernelfunc(int i, int j, int k) {}

 // Test that we've built correct kernel launch sequence.
 // ALL: define{{.*}}hostfunc
-// ALL: call{{.*}}cudaConfigureCall
+// ALL: call{{.*}}[[PREFIX]]ConfigureCall
 // ALL: call{{.*}}kernelfunc
 void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
 #endif

 // Test that we've built a function to register kernels and global vars.
-// ALL: define internal void @__cuda_register_globals
-// ALL: call{{.*}}cudaRegisterFunction(i8** %0, {{.*}}kernelfunc
-// ALL-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}device_var{{.*}}i32 0, i32 4, i32 0, i32 0
-// ALL-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}constant_var{{.*}}i32 0, i32 4, i32 1, i32 0
-// ALL-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_device_var{{.*}}i32 1, i32 4, i32 0, i32 0
-// ALL-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0
+// ALL: define internal void @__[[PREFIX]]_register_globals
+// ALL: call{{.*}}[[PREFIX]]RegisterFunction(i8** %0, {{.*}}kernelfunc
+// ALL-DAG: call{{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}device_var{{.*}}i32 0, i32 4, i32 0, i32 0
+// ALL-DAG: call{{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}constant_var{{.*}}i32 0, i32 4, i32 1, i32 0
+// ALL-DAG: call{{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}ext_device_var{{.*}}i32 1, i32 4, i32 0, i32 0
+// ALL-DAG: call{{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0
 // ALL: ret void

 // Test that we've built a constructor.
-// ALL: define internal void @__cuda_module_ctor
+// ALL: define internal void @__[[PREFIX]]_module_ctor

-// In separate mode it calls __cudaRegisterFatBinary(&__cuda_fatbin_wrapper)
-// NORDC: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper
-//   .. stores return value in __cuda_gpubin_handle
-// NORDC-NEXT: store{{.*}}__cuda_gpubin_handle
-//   .. and then calls __cuda_register_globals
-// NORDC-NEXT: call void @__cuda_register_globals
+// In separate mode it calls __[[PREFIX]]RegisterFatBinary(&__[[PREFIX]]_fatbin_wrapper)
+// NORDC: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
+//   .. stores return value in __[[PREFIX]]_gpubin_handle
+// NORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
+//   .. and then calls __[[PREFIX]]_register_globals
+// NORDC-NEXT: call void @__[[PREFIX]]_register_globals

-// With relocatable device code we call __cudaRegisterLinkedBinary%NVModuleID%
-// RDC: call{{.*}}__cudaRegisterLinkedBinary[[MODULE_ID]](
-// RDC-SAME: __cuda_register_globals, {{.*}}__cuda_fatbin_wrapper
+// With relocatable device code we call __[[PREFIX]]RegisterLinkedBinary%NVModuleID%
+// RDC: call{{.*}}__[[PREFIX]]RegisterLinkedBinary[[MODULE_ID]](
+// RDC-SAME: __[[PREFIX]]_register_globals, {{.*}}__[[PREFIX]]_fatbin_wrapper
 // RDC-SAME: [[MODULE_ID_GLOBAL]]

 // Test that we've created destructor.
-// NORDC: define internal void @__cuda_module_dtor
-// NORDC: load{{.*}}__cuda_gpubin_handle
-// NORDC-NEXT: call void @__cudaUnregisterFatBinary
+// NORDC: define internal void @__[[PREFIX]]_module_dtor
+// NORDC: load{{.*}}__[[PREFIX]]_gpubin_handle
+// NORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary

-// There should be no __cuda_register_globals if we have no
+// There should be no __[[PREFIX]]_register_globals if we have no
 // device-side globals, but we still need to register GPU binary.
 // Skip GPU binary string first.
 // NOGLOBALS: @0 = private unnamed_addr constant{{.*}}
-// NOGLOBALS-NOT: define internal void @__cuda_register_globals
-// NOGLOBALS: define internal void @__cuda_module_ctor
-// NOGLOBALS: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper
-// NOGLOBALS-NOT: call void @__cuda_register_globals
-// NOGLOBALS: define internal void @__cuda_module_dtor
-// NOGLOBALS: call void @__cudaUnregisterFatBinary
+// NOGLOBALS-NOT: define internal void @__{{.*}}_register_globals
+// NOGLOBALS: define internal void @__[[PREFIX:.*]]_module_ctor
+// NOGLOBALS: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
+// NOGLOBALS-NOT: call void @__[[PREFIX]]_register_globals
+// NOGLOBALS: define internal void @__[[PREFIX]]_module_dtor
+// NOGLOBALS: call void @__[[PREFIX]]UnregisterFatBinary

 // There should be no constructors/destructors if we have no GPU binary.
-// NOGPUBIN-NOT: define internal void @__cuda_register_globals
-// NOGPUBIN-NOT: define internal void @__cuda_module_ctor
-// NOGPUBIN-NOT: define internal void @__cuda_module_dtor
+// NOGPUBIN-NOT: define internal void @__[[PREFIX]]_register_globals
+// NOGPUBIN-NOT: define internal void @__[[PREFIX]]_module_ctor
+// NOGPUBIN-NOT: define internal void @__[[PREFIX]]_module_dtor
--- a/clang/test/CodeGenCUDA/kernel-call.cu
+++ b/clang/test/CodeGenCUDA/kernel-call.cu
@ -1,11 +1,20 @@
-// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CUDA,CHECK
+// RUN: %clang_cc1 -x hip -emit-llvm %s -o - | FileCheck %s --check-prefixes=HIP,CHECK
+

 #include "Inputs/cuda.h"

+// CHECK-LABEL: define void @_Z2g1i(i32 %x)
+// HIP: call{{.*}}hipSetupArgument
+// HIP: call{{.*}}hipLaunchByPtr
+// CUDA: call{{.*}}cudaSetupArgument
+// CUDA: call{{.*}}cudaLaunch
 __global__ void g1(int x) {}

+// CHECK-LABEL: define i32 @main
 int main(void) {
-  // CHECK: call{{.*}}cudaConfigureCall
+  // HIP: call{{.*}}hipConfigureCall
+  // CUDA: call{{.*}}cudaConfigureCall
  // CHECK: icmp
  // CHECK: br
  // CHECK: call{{.*}}g1