[OpenMP][deviceRTLs] Build the deviceRTLs with OpenMP instead of a target-dependent language

With this patch (plus some previously landed patches), `deviceRTLs` is treated as a regular OpenMP program containing just `declare target` regions. Ideally, `deviceRTLs` can then be written directly in OpenMP: no more CUDA, no more HIP. (AMD is still working on getting this to work; for now, AMDGCN is still compiled the original way.) Some target-specific functions are still required, but they are no longer written in a target-specific language. For example, the CUDA parts have all been refined by replacing CUDA intrinsics and builtins with LLVM/Clang/NVVM intrinsics.
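As an illustration (a hypothetical sketch, not code from this patch), a pure-OpenMP device runtime source wraps everything in a `declare target` region and uses Clang/NVVM builtins where something target-specific is unavoidable:

```cpp
#pragma omp declare target

// Hypothetical helper: instead of a CUDA intrinsic, use the corresponding
// Clang NVPTX builtin, so this file compiles without any CUDA headers.
static inline unsigned GetLaneId() { return __nvvm_read_ptx_sreg_laneid(); }

#pragma omp end declare target
```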
Here is a list of the changes in this patch:
1. For NVPTX, `DEVICE` is defined as empty in order to keep the common parts working with AMDGCN. Later, once AMDGCN is also available, we will remove `DEVICE` completely, and probably some other macros as well.
2. Shared variables are implemented with the OpenMP allocator, which is defined in `allocator.h`. Again, this feature is not yet available on AMDGCN, so the two macros are redefined accordingly (see the sketch after this list).
3. The CUDA header `cuda.h` is dropped from the source code. To cope with code differences across CUDA versions, we build one bitcode library for each supported CUDA version. For each CUDA version, the highest PTX version it supports is used, just as we currently do for CUDA compilation.
4. Correspondingly, the compiler driver is updated to support the CUDA version encoded in the name of the bitcode library. The bitcode library for NVPTX is now named `libomptarget-nvptx-cuda_[cuda_version]-sm_[sm_number].bc`, e.g. `libomptarget-nvptx-cuda_80-sm_20.bc`.
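To make item 2 concrete, here is a minimal sketch of how the allocator-based macro places a variable in team-shared memory (the macro body mirrors the `allocator.h` hunk further down; the variable name is made up):

```cpp
#define __PRAGMA(STR) _Pragma(#STR)
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)

// Expands to an uninitialized definition plus an allocate pragma that asks
// the OpenMP allocator for per-team storage (CUDA "shared" memory).
#define SHARED(NAME)                                                           \
  NAME [[clang::loader_uninitialized]];                                        \
  OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))

// Hypothetical usage: one counter per team, placed in shared memory.
static unsigned SHARED(TeamCounter);
```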

With this change, there are also multiple follow-ups to be expected in the near future:
1. CUDA will be dropped completely when compiling OpenMP. By then, we will also build bitcode libraries for all supported SM versions multiplied by all supported CUDA versions.
2. Atomic operations used in `deviceRTLs` can be replaced by `omp atomic` once the OpenMP 5.1 features are fully supported. For now, the IR generated for it is totally wrong (see the sketch after this list for the intended form).
3. Target-specific parts will be wrapped in `declare variant` with the `isa` selector once that works properly. No target-specific macros will be needed anymore.
4. (Maybe more...)
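For future item 2, the intended direction is roughly the following (a hypothetical sketch; today `deviceRTLs` calls target-specific atomic intrinsics instead, and the device codegen for this form is not yet correct):

```cpp
#include <cstdint>

// Portable replacement for an atomicAdd-style intrinsic: fetch-and-add
// expressed with the OpenMP atomic capture construct.
uint32_t FetchAdd(uint32_t *Counter, uint32_t Val) {
  uint32_t Old;
#pragma omp atomic capture
  {
    Old = *Counter;
    *Counter += Val;
  }
  return Old;
}
```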

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D94745
Shilei Tian 2021-01-26 12:28:15 -05:00
parent 90b8ae016b
commit 7c03f7d7d0
12 changed files with 204 additions and 116 deletions


@@ -712,33 +712,30 @@ void CudaToolChain::addClangTargetOptions(
CC1Args.push_back("-mlink-builtin-bitcode");
CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
std::string CudaVersionStr;
// New CUDA versions often introduce new instructions that are only supported
// by new PTX version, so we need to raise PTX level to enable them in NVPTX
// back-end.
const char *PtxFeature = nullptr;
switch (CudaInstallation.version()) {
case CudaVersion::CUDA_110:
PtxFeature = "+ptx70";
break;
case CudaVersion::CUDA_102:
PtxFeature = "+ptx65";
break;
case CudaVersion::CUDA_101:
PtxFeature = "+ptx64";
break;
case CudaVersion::CUDA_100:
PtxFeature = "+ptx63";
break;
case CudaVersion::CUDA_92:
PtxFeature = "+ptx61";
break;
case CudaVersion::CUDA_91:
PtxFeature = "+ptx61";
break;
case CudaVersion::CUDA_90:
PtxFeature = "+ptx60";
#define CASE_CUDA_VERSION(CUDA_VER, PTX_VER) \
case CudaVersion::CUDA_##CUDA_VER: \
CudaVersionStr = #CUDA_VER; \
PtxFeature = "+ptx" #PTX_VER; \
break;
CASE_CUDA_VERSION(110, 70);
CASE_CUDA_VERSION(102, 65);
CASE_CUDA_VERSION(101, 64);
CASE_CUDA_VERSION(100, 63);
CASE_CUDA_VERSION(92, 61);
CASE_CUDA_VERSION(91, 61);
CASE_CUDA_VERSION(90, 60);
#undef CASE_CUDA_VERSION
default:
// If unknown CUDA version, we take it as CUDA 8.0. Same assumption is also
// made in libomptarget/deviceRTLs.
CudaVersionStr = "80";
PtxFeature = "+ptx42";
}
CC1Args.append({"-target-feature", PtxFeature});
@@ -784,8 +781,9 @@ void CudaToolChain::addClangTargetOptions(
} else {
bool FoundBCLibrary = false;
std::string LibOmpTargetName =
"libomptarget-nvptx-" + GpuArch.str() + ".bc";
std::string LibOmpTargetName = "libomptarget-nvptx-cuda_" +
CudaVersionStr + "-" + GpuArch.str() +
".bc";
for (StringRef LibraryPath : LibraryPaths) {
SmallString<128> LibOmpTargetFile(LibraryPath);


@@ -164,7 +164,7 @@
// RUN: -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHK-BCLIB-USER %s
// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-sm_20.bc
// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-cuda_80-sm_20.bc
// CHK-BCLIB-USER: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-test.bc
// CHK-BCLIB-NOT: {{error:|warning:}}
@@ -177,7 +177,7 @@
// RUN: -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHK-BCLIB-WARN %s
// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-sm_20.bc' found in the default clang lib directory or in LIBRARY_PATH. Please use --libomptarget-nvptx-bc-path to specify nvptx bitcode library.
// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-cuda_80-sm_20.bc' found in the default clang lib directory or in LIBRARY_PATH. Please use --libomptarget-nvptx-bc-path to specify nvptx bitcode library.
/// ###########################################################################


@@ -26,7 +26,8 @@
#define DEVICE __attribute__((device))
#define INLINE inline DEVICE
#define NOINLINE __attribute__((noinline)) DEVICE
#define SHARED __attribute__((shared))
#define SHARED(NAME) __attribute__((shared)) NAME
#define EXTERN_SHARED(NAME) __attribute__((shared)) NAME
#define ALIGN(N) __attribute__((aligned(N)))
////////////////////////////////////////////////////////////////////////////////


@@ -0,0 +1,44 @@
//===--------- allocator.h - OpenMP target memory allocator ------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Macros for allocating variables in different address spaces.
//
//===----------------------------------------------------------------------===//
#ifndef OMPTARGET_ALLOCATOR_H
#define OMPTARGET_ALLOCATOR_H
#if _OPENMP
// Follows the pattern in interface.h
// Clang sema checks this type carefully, needs to closely match that from omp.h
typedef enum omp_allocator_handle_t {
omp_null_allocator = 0,
omp_default_mem_alloc = 1,
omp_large_cap_mem_alloc = 2,
omp_const_mem_alloc = 3,
omp_high_bw_mem_alloc = 4,
omp_low_lat_mem_alloc = 5,
omp_cgroup_mem_alloc = 6,
omp_pteam_mem_alloc = 7,
omp_thread_mem_alloc = 8,
KMP_ALLOCATOR_MAX_HANDLE = ~(0U)
} omp_allocator_handle_t;
#define __PRAGMA(STR) _Pragma(#STR)
#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
#define SHARED(NAME) \
NAME [[clang::loader_uninitialized]]; \
OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
#define EXTERN_SHARED(NAME) \
NAME; \
OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
#endif
#endif // OMPTARGET_ALLOCATOR_H


@@ -14,11 +14,12 @@
#ifndef OMPTARGET_H
#define OMPTARGET_H
#include "target_impl.h"
#include "common/debug.h" // debug
#include "interface.h" // interfaces with omp, compiler, and user
#include "common/allocator.h"
#include "common/debug.h" // debug
#include "common/state-queue.h"
#include "common/support.h"
#include "interface.h" // interfaces with omp, compiler, and user
#include "target_impl.h"
#define OMPTARGET_NVPTX_VERSION 1.1
@@ -71,8 +72,8 @@ private:
uint32_t nArgs;
};
extern DEVICE SHARED omptarget_nvptx_SharedArgs
omptarget_nvptx_globalArgs;
extern DEVICE
omptarget_nvptx_SharedArgs EXTERN_SHARED(omptarget_nvptx_globalArgs);
// Worker slot type which is initialized with the default worker slot
// size of 4*32 bytes.
@@ -94,7 +95,7 @@ struct DataSharingStateTy {
__kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number];
};
extern DEVICE SHARED DataSharingStateTy DataSharingState;
extern DEVICE DataSharingStateTy EXTERN_SHARED(DataSharingState);
////////////////////////////////////////////////////////////////////////////////
// task ICV and (implicit & explicit) task state
@@ -273,9 +274,9 @@ private:
/// Memory manager for statically allocated memory.
class omptarget_nvptx_SimpleMemoryManager {
private:
ALIGN(128) struct MemDataTy {
struct MemDataTy {
volatile unsigned keys[OMP_STATE_COUNT];
} MemData[MAX_SM];
} MemData[MAX_SM] ALIGN(128);
INLINE static uint32_t hash(unsigned key) {
return key & (OMP_STATE_COUNT - 1);
@@ -294,18 +295,23 @@ public:
extern DEVICE omptarget_nvptx_SimpleMemoryManager
omptarget_nvptx_simpleMemoryManager;
extern DEVICE SHARED uint32_t usedMemIdx;
extern DEVICE SHARED uint32_t usedSlotIdx;
extern DEVICE SHARED uint8_t
parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
extern DEVICE SHARED uint16_t threadLimit;
extern DEVICE SHARED uint16_t threadsInTeam;
extern DEVICE SHARED uint16_t nThreads;
extern DEVICE SHARED
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
extern DEVICE uint32_t EXTERN_SHARED(usedMemIdx);
extern DEVICE uint32_t EXTERN_SHARED(usedSlotIdx);
#if _OPENMP
extern DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
#else
extern DEVICE
uint8_t EXTERN_SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE];
#endif
extern DEVICE uint16_t EXTERN_SHARED(threadLimit);
extern DEVICE uint16_t EXTERN_SHARED(threadsInTeam);
extern DEVICE uint16_t EXTERN_SHARED(nThreads);
extern DEVICE omptarget_nvptx_ThreadPrivateContext *
EXTERN_SHARED(omptarget_nvptx_threadPrivateContext);
extern DEVICE SHARED uint32_t execution_param;
extern DEVICE SHARED void *ReductionScratchpadPtr;
extern DEVICE uint32_t EXTERN_SHARED(execution_param);
extern DEVICE void *EXTERN_SHARED(ReductionScratchpadPtr);
////////////////////////////////////////////////////////////////////////////////
// work function (outlined parallel/simd functions) and arguments.
@@ -313,8 +319,8 @@ extern DEVICE SHARED void *ReductionScratchpadPtr;
////////////////////////////////////////////////////////////////////////////////
typedef void *omptarget_nvptx_WorkFn;
extern volatile DEVICE SHARED omptarget_nvptx_WorkFn
omptarget_nvptx_workFn;
extern volatile DEVICE
omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn);
////////////////////////////////////////////////////////////////////////////////
// get private data structures


@@ -11,8 +11,9 @@
//===----------------------------------------------------------------------===//
#pragma omp declare target
#include "common/omptarget.h"
#include "common/allocator.h"
#include "common/device_environment.h"
#include "common/omptarget.h"
////////////////////////////////////////////////////////////////////////////////
// global device environment
@@ -28,44 +29,44 @@ DEVICE
omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
omptarget_nvptx_device_State[MAX_SM];
DEVICE omptarget_nvptx_SimpleMemoryManager
omptarget_nvptx_simpleMemoryManager;
DEVICE SHARED uint32_t usedMemIdx;
DEVICE SHARED uint32_t usedSlotIdx;
DEVICE omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager;
DEVICE uint32_t SHARED(usedMemIdx);
DEVICE uint32_t SHARED(usedSlotIdx);
DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
DEVICE SHARED uint16_t threadLimit;
DEVICE SHARED uint16_t threadsInTeam;
DEVICE SHARED uint16_t nThreads;
DEVICE uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
DEVICE uint16_t SHARED(threadLimit);
DEVICE uint16_t SHARED(threadsInTeam);
DEVICE uint16_t SHARED(nThreads);
// Pointer to this team's OpenMP state object
DEVICE SHARED
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
DEVICE omptarget_nvptx_ThreadPrivateContext *
SHARED(omptarget_nvptx_threadPrivateContext);
////////////////////////////////////////////////////////////////////////////////
// The team master sets the outlined parallel function in this variable to
// communicate with the workers. Since it is in shared memory, there is one
// copy of these variables for each kernel, instance, and team.
////////////////////////////////////////////////////////////////////////////////
volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn;
volatile DEVICE omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn);
////////////////////////////////////////////////////////////////////////////////
// OpenMP kernel execution parameters
////////////////////////////////////////////////////////////////////////////////
DEVICE SHARED uint32_t execution_param;
DEVICE uint32_t SHARED(execution_param);
////////////////////////////////////////////////////////////////////////////////
// Data sharing state
////////////////////////////////////////////////////////////////////////////////
DEVICE SHARED DataSharingStateTy DataSharingState;
DEVICE DataSharingStateTy SHARED(DataSharingState);
////////////////////////////////////////////////////////////////////////////////
// Scratchpad for teams reduction.
////////////////////////////////////////////////////////////////////////////////
DEVICE SHARED void *ReductionScratchpadPtr;
DEVICE void *SHARED(ReductionScratchpadPtr);
////////////////////////////////////////////////////////////////////////////////
// Data sharing related variables.
////////////////////////////////////////////////////////////////////////////////
DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
DEVICE omptarget_nvptx_SharedArgs SHARED(omptarget_nvptx_globalArgs);
#pragma omp end declare target


@@ -208,8 +208,8 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
: /*Master thread only*/ 1;
uint32_t TeamId = GetBlockIdInKernel();
uint32_t NumTeams = GetNumberOfBlocksInKernel();
static SHARED unsigned Bound;
static SHARED unsigned ChunkTeamCount;
static unsigned SHARED(Bound);
static unsigned SHARED(ChunkTeamCount);
// Block progress for teams greater than the current upper
// limit. We always only allow a number of teams less or equal


@@ -10,6 +10,21 @@
#
##===----------------------------------------------------------------------===##
# TODO: This part needs to be refined when libomptarget is going to support
# Windows!
# TODO: This part can also be removed if we can change the clang driver to make
# it support device only compilation.
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
set(aux_triple x86_64-unknown-linux-gnu)
elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le")
set(aux_triple powerpc64le-unknown-linux-gnu)
elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
set(aux_triple aarch64-unknown-linux-gnu)
else()
libomptarget_say("Not building CUDA offloading device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
return()
endif()
get_filename_component(devicertl_base_directory
${CMAKE_CURRENT_SOURCE_DIR}
DIRECTORY)
@@ -79,61 +94,91 @@ if(LIBOMPTARGET_DEP_CUDA_FOUND)
)
# Set flags for LLVM Bitcode compilation.
set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS}
set(bc_flags -S -x c++
-target nvptx64
-Xclang -emit-llvm-bc
-Xclang -aux-triple -Xclang ${aux_triple}
-fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
-D__CUDACC__
-I${devicertl_base_directory}
-I${devicertl_nvptx_directory}/src)
if(${LIBOMPTARGET_NVPTX_DEBUG})
set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1)
list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1)
else()
set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0)
list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=0)
endif()
# Create target to build all Bitcode libraries.
add_custom_target(omptarget-nvptx-bc)
# Generate a Bitcode library for all the compute capabilities the user requested.
# This map is from clang/lib/Driver/ToolChains/Cuda.cpp.
# The last element is the default case.
set(cuda_version_list 110 102 101 100 92 91 90 80)
set(ptx_feature_list 70 65 64 63 61 61 60 42)
# The following two lines of ugly code are not needed once the minimal CMake
# version requirement is 3.17+.
list(LENGTH cuda_version_list num_version_supported)
math(EXPR loop_range "${num_version_supported} - 1")
# Generate a Bitcode library for all the compute capabilities the user
# requested and all PTX versions we know of for now.
foreach(sm ${nvptx_sm_list})
set(cuda_arch --cuda-gpu-arch=sm_${sm})
set(sm_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
# Compile CUDA files to bitcode.
set(bc_files "")
foreach(src ${cuda_src_files})
get_filename_component(infile ${src} ABSOLUTE)
get_filename_component(outfile ${src} NAME)
# Uncomment the following code and remove that ugly part once the feature
# is available.
# foreach(cuda_version ptx_num IN ZIP_LISTS cuda_version_list ptx_feature_list)
foreach(itr RANGE ${loop_range})
list(GET cuda_version_list ${itr} cuda_version)
list(GET ptx_feature_list ${itr} ptx_num)
set(cuda_flags ${sm_flags})
list(APPEND cuda_flags -Xclang -target-feature -Xclang +ptx${ptx_num})
list(APPEND cuda_flags "-DCUDA_VERSION=${cuda_version}00")
add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc
COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} ${MAX_SM_DEFINITION}
-c ${infile} -o ${outfile}-sm_${sm}.bc
DEPENDS ${infile}
IMPLICIT_DEPENDS CXX ${infile}
COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc"
VERBATIM
set(bc_files "")
foreach(src ${cuda_src_files})
get_filename_component(infile ${src} ABSOLUTE)
get_filename_component(outfile ${src} NAME)
set(outfile "${outfile}-cuda_${cuda_version}-sm_${sm}.bc")
add_custom_command(OUTPUT ${outfile}
COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags}
${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
DEPENDS ${infile}
IMPLICIT_DEPENDS CXX ${infile}
COMMENT "Building LLVM bitcode ${outfile}"
VERBATIM
)
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})
list(APPEND bc_files ${outfile})
endforeach()
set(bclib_name "libomptarget-nvptx-cuda_${cuda_version}-sm_${sm}.bc")
# Link to a bitcode library.
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
-o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
DEPENDS ${bc_files}
COMMENT "Linking LLVM bitcode ${bclib_name}"
)
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc)
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})
list(APPEND bc_files ${outfile}-sm_${sm}.bc)
set(bclib_target_name "omptarget-nvptx-cuda_${cuda_version}-sm_${sm}-bc")
add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
add_dependencies(omptarget-nvptx-bc ${bclib_target_name})
# Copy library to destination.
add_custom_command(TARGET ${bclib_target_name} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
${LIBOMPTARGET_LIBRARY_DIR})
# Install bitcode library under the lib destination folder.
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
endforeach()
# Link to a bitcode library.
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc
COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
-o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files}
DEPENDS ${bc_files}
COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc"
)
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc)
add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc)
add_dependencies(omptarget-nvptx-bc omptarget-nvptx-${sm}-bc)
# Copy library to destination.
add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc
${LIBOMPTARGET_LIBRARY_DIR})
# Install bitcode library under the lib destination folder.
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}")
endforeach()
endif()


@@ -11,7 +11,8 @@
#include <stdint.h>
#define EXTERN extern "C" __device__
#define EXTERN extern "C"
typedef uint32_t __kmpc_impl_lanemask_t;
typedef uint32_t omp_lock_t; /* arbitrary type of the right length */


@@ -14,8 +14,6 @@
#include "target_impl.h"
#include "common/debug.h"
#include <cuda.h>
DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
}


@@ -13,18 +13,16 @@
#define _TARGET_IMPL_H_
#include <assert.h>
#include <cuda.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include "nvptx_interface.h"
#define DEVICE __device__
#define INLINE __forceinline__ DEVICE
#define NOINLINE __noinline__ DEVICE
#define SHARED __shared__
#define ALIGN(N) __align__(N)
#define DEVICE
#define INLINE inline __attribute__((always_inline))
#define NOINLINE __attribute__((noinline))
#define ALIGN(N) __attribute__((aligned(N)))
////////////////////////////////////////////////////////////////////////////////
// Kernel options
@@ -96,10 +94,6 @@ DEVICE double __kmpc_impl_get_wtime();
INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); }
INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); }
#ifndef CUDA_VERSION
#error CUDA_VERSION macro is undefined, something wrong with cuda.
#endif
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask();
DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,