mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2024-11-24 06:10:12 +00:00
[libomptarget][amdgcn] Build amdgcn devicertl as openmp
[libomptarget][amdgcn] Build amdgcn devicertl as openmp

Change cmake to build as openmp and fix up some minor errors in the code.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D96533
This commit is contained in:
parent
06f53f2f09
commit
6f04addc8b
@ -90,22 +90,26 @@ else()
|
||||
endif()
|
||||
|
||||
# create libraries
|
||||
set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900)
|
||||
set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900 gfx906)
|
||||
if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST)
|
||||
set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST})
|
||||
endif()
|
||||
|
||||
macro(add_cuda_bc_library)
|
||||
set(cu_cmd ${AOMP_BINDIR}/clang++
|
||||
-xc++
|
||||
-c
|
||||
-std=c++14
|
||||
-fcuda-rdc
|
||||
-target amdgcn
|
||||
-emit-llvm
|
||||
-Xclang -aux-triple -Xclang x86_64-unknown-linux-gnu # see nvptx
|
||||
-fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
|
||||
-D__AMDGCN__
|
||||
-Xclang -target-cpu -Xclang ${mcpu}
|
||||
-fvisibility=default
|
||||
--cuda-device-only
|
||||
-Wno-unused-value
|
||||
-x hip
|
||||
-nogpulib -nogpuinc
|
||||
-nogpulib
|
||||
-O${optimization_level}
|
||||
--cuda-gpu-arch=${mcpu}
|
||||
${CUDA_DEBUG}
|
||||
-I${CMAKE_CURRENT_SOURCE_DIR}/src
|
||||
-I${devicertl_base_directory})
|
||||
|
@ -11,7 +11,7 @@
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#define EXTERN extern "C" __attribute__((device))
|
||||
#define EXTERN extern "C"
|
||||
typedef uint64_t __kmpc_impl_lanemask_t;
|
||||
typedef uint32_t omp_lock_t; /* arbitrary type of the right length */
|
||||
|
||||
|
@ -26,6 +26,6 @@ DEVICE void __kmpc_impl_init_lock(omp_lock_t *) { warn(); }
|
||||
DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); }
|
||||
DEVICE void __kmpc_impl_set_lock(omp_lock_t *) { warn(); }
|
||||
DEVICE void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); }
|
||||
DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) { warn(); }
|
||||
DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) { warn(); return 0;}
|
||||
|
||||
#pragma omp end declare target
|
||||
|
@ -53,7 +53,7 @@ enum {
|
||||
// bound on how many compute units are available. Some values in this
|
||||
// range may never be returned if there are fewer than 2^CU_ID_SIZE CUs.
|
||||
|
||||
DEVICE uint32_t __kmpc_impl_smid() {
|
||||
EXTERN uint32_t __kmpc_impl_smid() {
|
||||
uint32_t cu_id = __builtin_amdgcn_s_getreg(
|
||||
ENCODE_HWREG(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));
|
||||
uint32_t se_id = __builtin_amdgcn_s_getreg(
|
||||
|
@ -22,11 +22,9 @@
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#define DEVICE __attribute__((device))
|
||||
#define DEVICE
|
||||
#define INLINE inline DEVICE
|
||||
#define NOINLINE __attribute__((noinline)) DEVICE
|
||||
#define SHARED(NAME) __attribute__((shared)) NAME
|
||||
#define EXTERN_SHARED(NAME) __attribute__((shared)) NAME
|
||||
#define ALIGN(N) __attribute__((aligned(N)))
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -11,7 +11,9 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
#pragma omp declare target
|
||||
|
||||
#include "common/omptarget.h"
|
||||
#include "target_impl.h"
|
||||
#include "target_interface.h"
|
||||
|
||||
// Implementations initially derived from hcc
|
||||
|
||||
@ -66,11 +68,12 @@ EXTERN int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var,
|
||||
return __builtin_amdgcn_ds_bpermute(index << 2, var);
|
||||
}
|
||||
|
||||
static DEVICE SHARED uint32_t L1_Barrier;
|
||||
uint32_t __kmpc_L1_Barrier [[clang::loader_uninitialized]];
|
||||
#pragma allocate(__kmpc_L1_Barrier) allocator(omp_pteam_mem_alloc)
|
||||
|
||||
EXTERN void __kmpc_impl_target_init() {
|
||||
// Don't have global ctors, and shared memory is not zero init
|
||||
__atomic_store_n(&L1_Barrier, 0u, __ATOMIC_RELEASE);
|
||||
__atomic_store_n(&__kmpc_L1_Barrier, 0u, __ATOMIC_RELEASE);
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
|
||||
@ -94,8 +97,8 @@ EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
|
||||
bool isLowest = GetLaneId() == lowestActiveThread;
|
||||
|
||||
if (isLowest) {
|
||||
uint32_t load =
|
||||
__atomic_fetch_add(&L1_Barrier, 1, __ATOMIC_RELAXED); // commutative
|
||||
uint32_t load = __atomic_fetch_add(&__kmpc_L1_Barrier, 1,
|
||||
__ATOMIC_RELAXED); // commutative
|
||||
|
||||
// Record the number of times the barrier has been passed
|
||||
uint32_t generation = load & 0xffff0000u;
|
||||
@ -107,12 +110,12 @@ EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
|
||||
load &= 0xffff0000u; // because bits zeroed second
|
||||
|
||||
// Reset the wave counter and release the waiting waves
|
||||
__atomic_store_n(&L1_Barrier, load, __ATOMIC_RELAXED);
|
||||
__atomic_store_n(&__kmpc_L1_Barrier, load, __ATOMIC_RELAXED);
|
||||
} else {
|
||||
// more waves still to go, spin until generation counter changes
|
||||
do {
|
||||
__builtin_amdgcn_s_sleep(0);
|
||||
load = __atomic_load_n(&L1_Barrier, __ATOMIC_RELAXED);
|
||||
load = __atomic_load_n(&__kmpc_L1_Barrier, __ATOMIC_RELAXED);
|
||||
} while ((load & 0xffff0000u) == generation);
|
||||
}
|
||||
}
|
||||
|
@ -337,8 +337,9 @@ INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); }
|
||||
|
||||
INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); }
|
||||
INLINE uint32_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); }
|
||||
INLINE uint32_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); }
|
||||
|
||||
#include "common/omptargeti.h"
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user