Reland "[CUDA][HIP] Fix overloading resolution in global var init" (#65606)

https://reviews.llvm.org/D158247 caused regressions for HIP on Windows and was reverted. A reduced test case is: ``` typedef void (__stdcall* funcTy)(); void invoke(funcTy f); static void __stdcall callee() noexcept { } void foo() { invoke(callee); } ``` It is due to clang missing handling host/device attributes for calling convention at a few places This patch fixes that.
2024-11-23 05:40:09 +00:00 · 2023-09-07 23:18:30 -04:00 · 2023-09-07 23:18:30 -04:00 · 9b7763821a
commit 9b7763821a
parent 4d2536c82f
13 changed files with 272 additions and 93 deletions
--- a/.gitignore
+++ b/.gitignore
@ -70,3 +70,4 @@ pythonenv*
 /clang/utils/analyzer/projects/*/RefScanBuildResults
 # automodapi puts generated documentation files here.
 /lldb/docs/python_api/
+/Debug/
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@ -1012,6 +1012,14 @@ public:
    }
  } DelayedDiagnostics;

+  enum CUDAFunctionTarget {
+    CFT_Device,
+    CFT_Global,
+    CFT_Host,
+    CFT_HostDevice,
+    CFT_InvalidTarget
+  };
+
  /// A RAII object to temporarily push a declaration context.
  class ContextRAII {
  private:
@ -4753,8 +4761,13 @@ public:
  bool isValidPointerAttrType(QualType T, bool RefOkay = false);

  bool CheckRegparmAttr(const ParsedAttr &attr, unsigned &value);
+
+  /// Check validaty of calling convention attribute \p attr. If \p FD
+  /// is not null pointer, use \p FD to determine the CUDA/HIP host/device
+  /// target. Otherwise, it is specified by \p CFT.
  bool CheckCallingConvAttr(const ParsedAttr &attr, CallingConv &CC,
-                            const FunctionDecl *FD = nullptr);
+                            const FunctionDecl *FD = nullptr,
+                            CUDAFunctionTarget CFT = CFT_InvalidTarget);
  bool CheckAttrTarget(const ParsedAttr &CurrAttr);
  bool CheckAttrNoArgs(const ParsedAttr &CurrAttr);
  bool checkStringLiteralArgumentAttr(const AttributeCommonInfo &CI,
@ -13266,14 +13279,6 @@ public:
  void checkTypeSupport(QualType Ty, SourceLocation Loc,
                        ValueDecl *D = nullptr);

-  enum CUDAFunctionTarget {
-    CFT_Device,
-    CFT_Global,
-    CFT_Host,
-    CFT_HostDevice,
-    CFT_InvalidTarget
-  };
-
  /// Determines whether the given function is a CUDA device/host/kernel/etc.
  /// function.
  ///
@ -13292,6 +13297,29 @@ public:
  /// Determines whether the given variable is emitted on host or device side.
  CUDAVariableTarget IdentifyCUDATarget(const VarDecl *D);

+  /// Defines kinds of CUDA global host/device context where a function may be
+  /// called.
+  enum CUDATargetContextKind {
+    CTCK_Unknown,       /// Unknown context
+    CTCK_InitGlobalVar, /// Function called during global variable
+                        /// initialization
+  };
+
+  /// Define the current global CUDA host/device context where a function may be
+  /// called. Only used when a function is called outside of any functions.
+  struct CUDATargetContext {
+    CUDAFunctionTarget Target = CFT_HostDevice;
+    CUDATargetContextKind Kind = CTCK_Unknown;
+    Decl *D = nullptr;
+  } CurCUDATargetCtx;
+
+  struct CUDATargetContextRAII {
+    Sema &S;
+    CUDATargetContext SavedCtx;
+    CUDATargetContextRAII(Sema &S_, CUDATargetContextKind K, Decl *D);
+    ~CUDATargetContextRAII() { S.CurCUDATargetCtx = SavedCtx; }
+  };
+
  /// Gets the CUDA target for the current context.
  CUDAFunctionTarget CurrentCUDATarget() {
    return IdentifyCUDATarget(dyn_cast<FunctionDecl>(CurContext));
--- a/clang/lib/Parse/ParseDecl.cpp
+++ b/clang/lib/Parse/ParseDecl.cpp
@ -2571,6 +2571,7 @@ Decl *Parser::ParseDeclarationAfterDeclaratorAndAttributes(
    }
  }

+  Sema::CUDATargetContextRAII X(Actions, Sema::CTCK_InitGlobalVar, ThisDecl);
  switch (TheInitKind) {
  // Parse declarator '=' initializer.
  case InitKind::Equal: {
--- a/clang/lib/Sema/SemaCUDA.cpp
+++ b/clang/lib/Sema/SemaCUDA.cpp
@ -105,19 +105,37 @@ Sema::IdentifyCUDATarget(const ParsedAttributesView &Attrs) {
 }

 template <typename A>
-static bool hasAttr(const FunctionDecl *D, bool IgnoreImplicitAttr) {
+static bool hasAttr(const Decl *D, bool IgnoreImplicitAttr) {
  return D->hasAttrs() && llvm::any_of(D->getAttrs(), [&](Attr *Attribute) {
           return isa<A>(Attribute) &&
                  !(IgnoreImplicitAttr && Attribute->isImplicit());
         });
 }

+Sema::CUDATargetContextRAII::CUDATargetContextRAII(Sema &S_,
+                                                   CUDATargetContextKind K,
+                                                   Decl *D)
+    : S(S_) {
+  SavedCtx = S.CurCUDATargetCtx;
+  assert(K == CTCK_InitGlobalVar);
+  auto *VD = dyn_cast_or_null<VarDecl>(D);
+  if (VD && VD->hasGlobalStorage() && !VD->isStaticLocal()) {
+    auto Target = CFT_Host;
+    if ((hasAttr<CUDADeviceAttr>(VD, /*IgnoreImplicit=*/true) &&
+         !hasAttr<CUDAHostAttr>(VD, /*IgnoreImplicit=*/true)) ||
+        hasAttr<CUDASharedAttr>(VD, /*IgnoreImplicit=*/true) ||
+        hasAttr<CUDAConstantAttr>(VD, /*IgnoreImplicit=*/true))
+      Target = CFT_Device;
+    S.CurCUDATargetCtx = {Target, K, VD};
+  }
+}
+
 /// IdentifyCUDATarget - Determine the CUDA compilation target for this function
 Sema::CUDAFunctionTarget Sema::IdentifyCUDATarget(const FunctionDecl *D,
                                                  bool IgnoreImplicitHDAttr) {
-  // Code that lives outside a function is run on the host.
+  // Code that lives outside a function gets the target from CurCUDATargetCtx.
  if (D == nullptr)
-    return CFT_Host;
+    return CurCUDATargetCtx.Target;

  if (D->hasAttr<CUDAInvalidTargetAttr>())
    return CFT_InvalidTarget;
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@ -5132,7 +5132,8 @@ static void handleCallConvAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
  // Diagnostic is emitted elsewhere: here we store the (valid) AL
  // in the Decl node for syntactic reasoning, e.g., pretty-printing.
  CallingConv CC;
-  if (S.CheckCallingConvAttr(AL, CC, /*FD*/nullptr))
+  if (S.CheckCallingConvAttr(AL, CC, /*FD*/ nullptr,
+                             S.IdentifyCUDATarget(dyn_cast<FunctionDecl>(D))))
    return;

  if (!isa<ObjCMethodDecl>(D)) {
@ -5317,7 +5318,8 @@ static void handleNoRandomizeLayoutAttr(Sema &S, Decl *D,
 }

 bool Sema::CheckCallingConvAttr(const ParsedAttr &Attrs, CallingConv &CC,
-                                const FunctionDecl *FD) {
+                                const FunctionDecl *FD,
+                                CUDAFunctionTarget CFT) {
  if (Attrs.isInvalid())
    return true;

@ -5416,7 +5418,8 @@ bool Sema::CheckCallingConvAttr(const ParsedAttr &Attrs, CallingConv &CC,
  // on their host/device attributes.
  if (LangOpts.CUDA) {
    auto *Aux = Context.getAuxTargetInfo();
-    auto CudaTarget = IdentifyCUDATarget(FD);
+    assert(FD || CFT != CFT_InvalidTarget);
+    auto CudaTarget = FD ? IdentifyCUDATarget(FD) : CFT;
    bool CheckHost = false, CheckDevice = false;
    switch (CudaTarget) {
    case CFT_HostDevice:
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@ -6697,17 +6697,19 @@ void Sema::AddOverloadCandidate(
  }

  // (CUDA B.1): Check for invalid calls between targets.
-  if (getLangOpts().CUDA)
-    if (const FunctionDecl *Caller = getCurFunctionDecl(/*AllowLambda=*/true))
-      // Skip the check for callers that are implicit members, because in this
-      // case we may not yet know what the member's target is; the target is
-      // inferred for the member automatically, based on the bases and fields of
-      // the class.
-      if (!Caller->isImplicit() && !IsAllowedCUDACall(Caller, Function)) {
-        Candidate.Viable = false;
-        Candidate.FailureKind = ovl_fail_bad_target;
-        return;
-      }
+  if (getLangOpts().CUDA) {
+    const FunctionDecl *Caller = getCurFunctionDecl(/*AllowLambda=*/true);
+    // Skip the check for callers that are implicit members, because in this
+    // case we may not yet know what the member's target is; the target is
+    // inferred for the member automatically, based on the bases and fields of
+    // the class.
+    if (!(Caller && Caller->isImplicit()) &&
+        !IsAllowedCUDACall(Caller, Function)) {
+      Candidate.Viable = false;
+      Candidate.FailureKind = ovl_fail_bad_target;
+      return;
+    }
+  }

  if (Function->getTrailingRequiresClause()) {
    ConstraintSatisfaction Satisfaction;
@ -7219,12 +7221,11 @@ Sema::AddMethodCandidate(CXXMethodDecl *Method, DeclAccessPair FoundDecl,

  // (CUDA B.1): Check for invalid calls between targets.
  if (getLangOpts().CUDA)
-    if (const FunctionDecl *Caller = getCurFunctionDecl(/*AllowLambda=*/true))
-      if (!IsAllowedCUDACall(Caller, Method)) {
-        Candidate.Viable = false;
-        Candidate.FailureKind = ovl_fail_bad_target;
-        return;
-      }
+    if (!IsAllowedCUDACall(getCurFunctionDecl(/*AllowLambda=*/true), Method)) {
+      Candidate.Viable = false;
+      Candidate.FailureKind = ovl_fail_bad_target;
+      return;
+    }

  if (Method->getTrailingRequiresClause()) {
    ConstraintSatisfaction Satisfaction;
@ -12495,10 +12496,12 @@ private:
      return false;

    if (FunctionDecl *FunDecl = dyn_cast<FunctionDecl>(Fn)) {
-      if (S.getLangOpts().CUDA)
-        if (FunctionDecl *Caller = S.getCurFunctionDecl(/*AllowLambda=*/true))
-          if (!Caller->isImplicit() && !S.IsAllowedCUDACall(Caller, FunDecl))
-            return false;
+      if (S.getLangOpts().CUDA) {
+        FunctionDecl *Caller = S.getCurFunctionDecl(/*AllowLambda=*/true);
+        if (!(Caller && Caller->isImplicit()) &&
+            !S.IsAllowedCUDACall(Caller, FunDecl))
+          return false;
+      }
      if (FunDecl->isMultiVersion()) {
        const auto *TA = FunDecl->getAttr<TargetAttr>();
        if (TA && !TA->isDefaultVersion())
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@ -366,12 +366,14 @@ enum TypeAttrLocation {
  TAL_DeclName
 };

-static void processTypeAttrs(TypeProcessingState &state, QualType &type,
-                             TypeAttrLocation TAL,
-                             const ParsedAttributesView &attrs);
+static void
+processTypeAttrs(TypeProcessingState &state, QualType &type,
+                 TypeAttrLocation TAL, const ParsedAttributesView &attrs,
+                 Sema::CUDAFunctionTarget CFT = Sema::CFT_HostDevice);

 static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr,
-                                   QualType &type);
+                                   QualType &type,
+                                   Sema::CUDAFunctionTarget CFT);

 static bool handleMSPointerTypeQualifierAttr(TypeProcessingState &state,
                                             ParsedAttr &attr, QualType &type);
@ -617,7 +619,8 @@ static void distributeFunctionTypeAttr(TypeProcessingState &state,
 /// distributed, false if no location was found.
 static bool distributeFunctionTypeAttrToInnermost(
    TypeProcessingState &state, ParsedAttr &attr,
-    ParsedAttributesView &attrList, QualType &declSpecType) {
+    ParsedAttributesView &attrList, QualType &declSpecType,
+    Sema::CUDAFunctionTarget CFT) {
  Declarator &declarator = state.getDeclarator();

  // Put it on the innermost function chunk, if there is one.
@ -629,19 +632,20 @@ static bool distributeFunctionTypeAttrToInnermost(
    return true;
  }

-  return handleFunctionTypeAttr(state, attr, declSpecType);
+  return handleFunctionTypeAttr(state, attr, declSpecType, CFT);
 }

 /// A function type attribute was written in the decl spec.  Try to
 /// apply it somewhere.
-static void distributeFunctionTypeAttrFromDeclSpec(TypeProcessingState &state,
-                                                   ParsedAttr &attr,
-                                                   QualType &declSpecType) {
+static void
+distributeFunctionTypeAttrFromDeclSpec(TypeProcessingState &state,
+                                       ParsedAttr &attr, QualType &declSpecType,
+                                       Sema::CUDAFunctionTarget CFT) {
  state.saveDeclSpecAttrs();

  // Try to distribute to the innermost.
  if (distributeFunctionTypeAttrToInnermost(
-          state, attr, state.getCurrentAttributes(), declSpecType))
+          state, attr, state.getCurrentAttributes(), declSpecType, CFT))
    return;

  // If that failed, diagnose the bad attribute when the declarator is
@ -653,14 +657,14 @@ static void distributeFunctionTypeAttrFromDeclSpec(TypeProcessingState &state,
 /// Try to apply it somewhere.
 /// `Attrs` is the attribute list containing the declaration (either of the
 /// declarator or the declaration).
-static void distributeFunctionTypeAttrFromDeclarator(TypeProcessingState &state,
-                                                     ParsedAttr &attr,
-                                                     QualType &declSpecType) {
+static void distributeFunctionTypeAttrFromDeclarator(
+    TypeProcessingState &state, ParsedAttr &attr, QualType &declSpecType,
+    Sema::CUDAFunctionTarget CFT) {
  Declarator &declarator = state.getDeclarator();

  // Try to distribute to the innermost.
  if (distributeFunctionTypeAttrToInnermost(
-          state, attr, declarator.getAttributes(), declSpecType))
+          state, attr, declarator.getAttributes(), declSpecType, CFT))
    return;

  // If that failed, diagnose the bad attribute when the declarator is
@ -682,7 +686,8 @@ static void distributeFunctionTypeAttrFromDeclarator(TypeProcessingState &state,
 /// `Attrs` is the attribute list containing the declaration (either of the
 /// declarator or the declaration).
 static void distributeTypeAttrsFromDeclarator(TypeProcessingState &state,
-                                              QualType &declSpecType) {
+                                              QualType &declSpecType,
+                                              Sema::CUDAFunctionTarget CFT) {
  // The called functions in this loop actually remove things from the current
  // list, so iterating over the existing list isn't possible.  Instead, make a
  // non-owning copy and iterate over that.
@ -699,7 +704,7 @@ static void distributeTypeAttrsFromDeclarator(TypeProcessingState &state,
      break;

    FUNCTION_TYPE_ATTRS_CASELIST:
-      distributeFunctionTypeAttrFromDeclarator(state, attr, declSpecType);
+      distributeFunctionTypeAttrFromDeclarator(state, attr, declSpecType, CFT);
      break;

    MS_TYPE_ATTRS_CASELIST:
@ -3544,7 +3549,8 @@ static QualType GetDeclSpecTypeForDeclarator(TypeProcessingState &state,
  // Note: We don't need to distribute declaration attributes (i.e.
  // D.getDeclarationAttributes()) because those are always C++11 attributes,
  // and those don't get distributed.
-  distributeTypeAttrsFromDeclarator(state, T);
+  distributeTypeAttrsFromDeclarator(
+      state, T, SemaRef.IdentifyCUDATarget(D.getAttributes()));

  // Find the deduced type in this type. Look in the trailing return type if we
  // have one, otherwise in the DeclSpec type.
@ -4055,7 +4061,8 @@ static CallingConv getCCForDeclaratorChunk(
      // function type.  We'll diagnose the failure to apply them in
      // handleFunctionTypeAttr.
      CallingConv CC;
-      if (!S.CheckCallingConvAttr(AL, CC) &&
+      if (!S.CheckCallingConvAttr(AL, CC, /*FunctionDecl=*/nullptr,
+                                  S.IdentifyCUDATarget(D.getAttributes())) &&
          (!FTI.isVariadic || supportsVariadicCall(CC))) {
        return CC;
      }
@ -5727,7 +5734,8 @@ static TypeSourceInfo *GetFullTypeForDeclarator(TypeProcessingState &state,
    }

    // See if there are any attributes on this declarator chunk.
-    processTypeAttrs(state, T, TAL_DeclChunk, DeclType.getAttrs());
+    processTypeAttrs(state, T, TAL_DeclChunk, DeclType.getAttrs(),
+                     S.IdentifyCUDATarget(D.getAttributes()));

    if (DeclType.Kind != DeclaratorChunk::Paren) {
      if (ExpectNoDerefChunk && !IsNoDerefableChunk(DeclType))
@ -7801,7 +7809,8 @@ static bool checkMutualExclusion(TypeProcessingState &state,
 /// Process an individual function attribute.  Returns true to
 /// indicate that the attribute was handled, false if it wasn't.
 static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr,
-                                   QualType &type) {
+                                   QualType &type,
+                                   Sema::CUDAFunctionTarget CFT) {
  Sema &S = state.getSema();

  FunctionTypeUnwrapper unwrapped(S, type);
@ -8032,7 +8041,7 @@ static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr,

  // Otherwise, a calling convention.
  CallingConv CC;
-  if (S.CheckCallingConvAttr(attr, CC))
+  if (S.CheckCallingConvAttr(attr, CC, /*FunctionDecl=*/nullptr, CFT))
    return true;

  const FunctionType *fn = unwrapped.get();
@ -8584,7 +8593,8 @@ static void HandleLifetimeBoundAttr(TypeProcessingState &State,

 static void processTypeAttrs(TypeProcessingState &state, QualType &type,
                             TypeAttrLocation TAL,
-                             const ParsedAttributesView &attrs) {
+                             const ParsedAttributesView &attrs,
+                             Sema::CUDAFunctionTarget CFT) {

  state.setParsedNoDeref(false);
  if (attrs.empty())
@ -8826,7 +8836,7 @@ static void processTypeAttrs(TypeProcessingState &state, QualType &type,
      // appertain to and hence should not use the "distribution" logic below.
      if (attr.isStandardAttributeSyntax() ||
          attr.isRegularKeywordAttribute()) {
-        if (!handleFunctionTypeAttr(state, attr, type)) {
+        if (!handleFunctionTypeAttr(state, attr, type, CFT)) {
          diagnoseBadTypeAttribute(state.getSema(), attr, type);
          attr.setInvalid();
        }
@ -8836,10 +8846,10 @@ static void processTypeAttrs(TypeProcessingState &state, QualType &type,
      // Never process function type attributes as part of the
      // declaration-specifiers.
      if (TAL == TAL_DeclSpec)
-        distributeFunctionTypeAttrFromDeclSpec(state, attr, type);
+        distributeFunctionTypeAttrFromDeclSpec(state, attr, type, CFT);

      // Otherwise, handle the possible delays.
-      else if (!handleFunctionTypeAttr(state, attr, type))
+      else if (!handleFunctionTypeAttr(state, attr, type, CFT))
        distributeFunctionTypeAttr(state, attr, type);
      break;
    case ParsedAttr::AT_AcquireHandle: {
--- a/clang/test/CodeGenCUDA/global-initializers.cu
+++ b/clang/test/CodeGenCUDA/global-initializers.cu
@ -0,0 +1,51 @@
+// RUN: %clang_cc1 %s -triple x86_64-linux-unknown -emit-llvm -o - \
+// RUN:   | FileCheck -check-prefix=HOST %s
+// RUN: %clang_cc1 %s -fcuda-is-device \
+// RUN:   -emit-llvm -o - -triple nvptx64 \
+// RUN:   -aux-triple x86_64-unknown-linux-gnu | FileCheck \
+// RUN:   -check-prefix=DEV %s
+
+#include "Inputs/cuda.h"
+
+// Check host/device-based overloding resolution in global variable initializer.
+double pow(double, double) { return 1.0; }
+
+__device__ double pow(double, int) { return 2.0; }
+
+// HOST-DAG: call {{.*}}double @_Z3powdd(double noundef 1.000000e+00, double noundef 1.000000e+00)
+double X = pow(1.0, 1);
+
+constexpr double cpow(double, double) { return 11.0; }
+
+constexpr __device__ double cpow(double, int) { return 12.0; }
+
+// HOST-DAG: @CX = global double 1.100000e+01
+double CX = cpow(11.0, 1);
+
+// DEV-DAG: @CY = addrspace(1) externally_initialized global double 1.200000e+01
+__device__ double CY = cpow(12.0, 1);
+
+struct A {
+  double pow(double, double) { return 3.0; }
+
+  __device__ double pow(double, int) { return 4.0; }
+};
+
+A a;
+
+// HOST-DAG: call {{.*}}double @_ZN1A3powEdd(ptr {{.*}}@a, double noundef 3.000000e+00, double noundef 1.000000e+00)
+double AX = a.pow(3.0, 1);
+
+struct CA {
+  constexpr double cpow(double, double) const { return 13.0; }
+
+  constexpr __device__ double cpow(double, int) const { return 14.0; }
+};
+
+const CA ca;
+
+// HOST-DAG: @CAX = global double 1.300000e+01
+double CAX = ca.cpow(13.0, 1);
+
+// DEV-DAG: @CAY = addrspace(1) externally_initialized global double 1.400000e+01
+__device__ double CAY = ca.cpow(14.0, 1);
--- a/clang/test/SemaCUDA/amdgpu-windows-vectorcall.cu
+++ b/clang/test/SemaCUDA/amdgpu-windows-vectorcall.cu
@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-pc-windows-msvc -fms-compatibility -fcuda-is-device -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -fms-compatibility -fsyntax-only -verify %s

 __cdecl void hostf1();
 __vectorcall void (*hostf2)() = hostf1; // expected-error {{cannot initialize a variable of type 'void ((*))() __attribute__((vectorcall))' with an lvalue of type 'void () __attribute__((cdecl))'}}
--- a/clang/test/SemaCUDA/function-overload.cu
+++ b/clang/test/SemaCUDA/function-overload.cu
@ -222,7 +222,13 @@ __host__ __device__ void hostdevicef() {
 // Test for address of overloaded function resolution in the global context.
 HostFnPtr fp_h = h;
 HostFnPtr fp_ch = ch;
+#if defined (__CUDA_ARCH__)
+__device__
+#endif
 CurrentFnPtr fp_dh = dh;
+#if defined (__CUDA_ARCH__)
+__device__
+#endif
 CurrentFnPtr fp_cdh = cdh;
 GlobalFnPtr fp_g = g;

--- a/clang/test/SemaCUDA/global-initializers-host.cu
+++ b/clang/test/SemaCUDA/global-initializers-host.cu
@ -1,32 +0,0 @@
-// RUN: %clang_cc1 %s --std=c++11 -triple x86_64-linux-unknown -fsyntax-only -o - -verify
-
-#include "Inputs/cuda.h"
-
-// Check that we get an error if we try to call a __device__ function from a
-// module initializer.
-
-struct S {
-  __device__ S() {}
-  // expected-note@-1 {{'S' declared here}}
-};
-
-S s;
-// expected-error@-1 {{reference to __device__ function 'S' in global initializer}}
-
-struct T {
-  __host__ __device__ T() {}
-};
-T t;  // No error, this is OK.
-
-struct U {
-  __host__ U() {}
-  __device__ U(int) {}
-  // expected-note@-1 {{'U' declared here}}
-};
-U u(42);
-// expected-error@-1 {{reference to __device__ function 'U' in global initializer}}
-
-__device__ int device_fn() { return 42; }
-// expected-note@-1 {{'device_fn' declared here}}
-int n = device_fn();
-// expected-error@-1 {{reference to __device__ function 'device_fn' in global initializer}}
--- a/clang/test/SemaCUDA/global-initializers.cu
+++ b/clang/test/SemaCUDA/global-initializers.cu
@ -0,0 +1,72 @@
+// RUN: %clang_cc1 %s -triple x86_64-linux-unknown -fsyntax-only -o - -verify
+// RUN: %clang_cc1 %s -fcuda-is-device -triple nvptx -fsyntax-only -o - -verify
+
+#include "Inputs/cuda.h"
+
+// Check that we get an error if we try to call a __device__ function from a
+// module initializer.
+
+struct S {
+  // expected-note@-1 {{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 0 were provided}}
+  // expected-note@-2 {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 0 were provided}}
+  __device__ S() {}
+  // expected-note@-1 {{candidate constructor not viable: call to __device__ function from __host__ function}}
+};
+
+S s;
+// expected-error@-1 {{no matching constructor for initialization of 'S'}}
+
+struct T {
+  __host__ __device__ T() {}
+};
+T t;  // No error, this is OK.
+
+struct U {
+  // expected-note@-1 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int' to 'const U' for 1st argument}}
+  // expected-note@-2 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'int' to 'U' for 1st argument}}
+  __host__ U() {}
+  // expected-note@-1 {{candidate constructor not viable: requires 0 arguments, but 1 was provided}}
+  __device__ U(int) {}
+  // expected-note@-1 {{candidate constructor not viable: call to __device__ function from __host__ function}}
+};
+U u(42);
+// expected-error@-1 {{no matching constructor for initialization of 'U'}}
+
+__device__ int device_fn() { return 42; }
+// expected-note@-1 {{candidate function not viable: call to __device__ function from __host__ function}}
+int n = device_fn();
+// expected-error@-1 {{no matching function for call to 'device_fn'}}
+
+// Check host/device-based overloding resolution in global variable initializer.
+double pow(double, double);
+
+__device__ double pow(double, int);
+
+double X = pow(1.0, 1);
+__device__ double Y = pow(2.0, 2); // expected-error{{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
+
+constexpr double cpow(double, double) { return 1.0; }
+
+constexpr __device__ double cpow(double, int) { return 2.0; }
+
+const double CX = cpow(1.0, 1);
+const __device__ double CY = cpow(2.0, 2);
+
+struct A {
+  double pow(double, double);
+
+  __device__ double pow(double, int);
+
+  constexpr double cpow(double, double) const { return 1.0; }
+
+  constexpr __device__ double cpow(double, int) const { return 1.0; }
+
+};
+
+A a;
+double AX = a.pow(1.0, 1);
+__device__ double AY = a.pow(2.0, 2); // expected-error{{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}}
+
+const A ca;
+const double CAX = ca.cpow(1.0, 1);
+const __device__ double CAY = ca.cpow(2.0, 2);
--- a/clang/test/SemaCUDA/windows-calling-conv.cu
+++ b/clang/test/SemaCUDA/windows-calling-conv.cu
@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -aux-triple \
+// RUN:   x86_64-pc-windows-msvc -fms-compatibility -fcuda-is-device \
+// RUN:   -fsyntax-only -verify -x hip %s
+// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -fms-compatibility \
+// RUN:   -fsyntax-only -verify -x hip %s
+
+// expected-no-diagnostics
+
+typedef void (__stdcall* funcTy)();
+void invoke(funcTy f);
+
+static void __stdcall callee() noexcept {
+}
+
+void foo() {
+   invoke(callee);
+}