[OpenMP][VE] Support OpenMP runtime on VE

Support the OpenMP runtime library on VE.  This patch makes the OpenMP
runtime compilable for the VE architecture.  Almost all tests run correctly on VE.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D159401
This commit is contained in:
Kazushi (Jam) Marukawa 2021-11-20 21:45:52 +09:00
parent 52b4bec939
commit 18b6724355
12 changed files with 243 additions and 9 deletions

View File

@ -30,7 +30,7 @@ if(${OPENMP_STANDALONE_BUILD})
# If adding a new architecture, take a look at cmake/LibompGetArchitecture.cmake
libomp_get_architecture(LIBOMP_DETECTED_ARCH)
set(LIBOMP_ARCH ${LIBOMP_DETECTED_ARCH} CACHE STRING
"The architecture to build for (x86_64/i386/arm/ppc64/ppc64le/aarch64/mic/mips/mips64/riscv64/loongarch64).")
"The architecture to build for (x86_64/i386/arm/ppc64/ppc64le/aarch64/mic/mips/mips64/riscv64/loongarch64/ve).")
# Should assertions be enabled? They are on by default.
set(LIBOMP_ENABLE_ASSERTIONS TRUE CACHE BOOL
"enable assertions?")
@ -63,6 +63,8 @@ else() # Part of LLVM build
set(LIBOMP_ARCH riscv64)
elseif(LIBOMP_NATIVE_ARCH MATCHES "loongarch64")
set(LIBOMP_ARCH loongarch64)
elseif(LIBOMP_NATIVE_ARCH MATCHES "ve")
set(LIBOMP_ARCH ve)
else()
# last ditch effort
libomp_get_architecture(LIBOMP_ARCH)
@ -83,7 +85,7 @@ if(LIBOMP_ARCH STREQUAL "aarch64")
endif()
endif()
libomp_check_variable(LIBOMP_ARCH 32e x86_64 32 i386 arm ppc64 ppc64le aarch64 aarch64_a64fx mic mips mips64 riscv64 loongarch64)
libomp_check_variable(LIBOMP_ARCH 32e x86_64 32 i386 arm ppc64 ppc64le aarch64 aarch64_a64fx mic mips mips64 riscv64 loongarch64 ve)
set(LIBOMP_LIB_TYPE normal CACHE STRING
"Performance,Profiling,Stubs library (normal/profile/stubs)")
@ -162,6 +164,7 @@ set(MIPS64 FALSE)
set(MIPS FALSE)
set(RISCV64 FALSE)
set(LOONGARCH64 FALSE)
set(VE FALSE)
if("${LIBOMP_ARCH}" STREQUAL "i386" OR "${LIBOMP_ARCH}" STREQUAL "32") # IA-32 architecture
set(IA32 TRUE)
elseif("${LIBOMP_ARCH}" STREQUAL "x86_64" OR "${LIBOMP_ARCH}" STREQUAL "32e") # Intel(R) 64 architecture
@ -188,6 +191,8 @@ elseif("${LIBOMP_ARCH}" STREQUAL "riscv64") # RISCV64 architecture
set(RISCV64 TRUE)
elseif("${LIBOMP_ARCH}" STREQUAL "loongarch64") # LoongArch64 architecture
set(LOONGARCH64 TRUE)
elseif("${LIBOMP_ARCH}" STREQUAL "ve") # VE architecture
set(VE TRUE)
endif()
# Set some flags based on build_type

View File

@ -49,6 +49,8 @@ function(libomp_get_architecture return_arch)
#error ARCHITECTURE=riscv64
#elif defined(__loongarch__) && __loongarch_grlen == 64
#error ARCHITECTURE=loongarch64
#elif defined(__ve__)
#error ARCHITECTURE=ve
#else
#error ARCHITECTURE=UnknownArchitecture
#endif

View File

@ -111,6 +111,8 @@ function(libomp_get_legal_arch return_arch_string)
set(${return_arch_string} "RISCV64" PARENT_SCOPE)
elseif(${LOONGARCH64})
set(${return_arch_string} "LOONGARCH64" PARENT_SCOPE)
elseif(${VE})
set(${return_arch_string} "VE" PARENT_SCOPE)
else()
set(${return_arch_string} "${LIBOMP_ARCH}" PARENT_SCOPE)
libomp_warning_say("libomp_get_legal_arch(): Warning: Unknown architecture: Using ${LIBOMP_ARCH}")

View File

@ -1170,6 +1170,10 @@ extern void __kmp_init_target_task();
#elif KMP_ARCH_X86_64
#define KMP_DEFAULT_STKSIZE ((size_t)(4 * 1024 * 1024))
#define KMP_BACKUP_STKSIZE ((size_t)(2 * 1024 * 1024))
#elif KMP_ARCH_VE
// Minimum stack size for pthread for VE is 4MB.
// https://www.hpc.nec/documents/veos/en/glibc/Difference_Points_glibc.htm
#define KMP_DEFAULT_STKSIZE ((size_t)(4 * 1024 * 1024))
#else
#define KMP_DEFAULT_STKSIZE ((size_t)(1024 * 1024))
#endif

View File

@ -286,6 +286,17 @@ public:
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_VE
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */

View File

@ -178,7 +178,7 @@ typedef unsigned long long kmp_uint64;
#if KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_MIPS
#define KMP_SIZE_T_SPEC KMP_UINT32_SPEC
#elif KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE
#define KMP_SIZE_T_SPEC KMP_UINT64_SPEC
#else
#error "Can't determine size_t printf format specifier."
@ -1043,7 +1043,7 @@ extern kmp_real64 __kmp_xchg_real64(volatile kmp_real64 *p, kmp_real64 v);
#endif /* KMP_OS_WINDOWS */
#if KMP_ARCH_PPC64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS || \
KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE
#if KMP_OS_WINDOWS
#undef KMP_MB
#define KMP_MB() std::atomic_thread_fence(std::memory_order_seq_cst)

View File

@ -93,6 +93,7 @@
#define KMP_ARCH_MIPS64 0
#define KMP_ARCH_RISCV64 0
#define KMP_ARCH_LOONGARCH64 0
#define KMP_ARCH_VE 0
#if KMP_OS_WINDOWS
#if defined(_M_AMD64) || defined(__x86_64)
@ -142,6 +143,9 @@
#elif defined __loongarch__ && __loongarch_grlen == 64
#undef KMP_ARCH_LOONGARCH64
#define KMP_ARCH_LOONGARCH64 1
#elif defined __ve__
#undef KMP_ARCH_VE
#define KMP_ARCH_VE 1
#endif
#endif
@ -206,7 +210,7 @@
// TODO: Fixme - This is clever, but really fugly
#if (1 != KMP_ARCH_X86 + KMP_ARCH_X86_64 + KMP_ARCH_ARM + KMP_ARCH_PPC64 + \
KMP_ARCH_AARCH64 + KMP_ARCH_MIPS + KMP_ARCH_MIPS64 + \
KMP_ARCH_RISCV64 + KMP_ARCH_LOONGARCH64)
KMP_ARCH_RISCV64 + KMP_ARCH_LOONGARCH64 + KMP_ARCH_VE)
#error Unknown or unsupported architecture
#endif

View File

@ -8830,7 +8830,7 @@ __kmp_determine_reduction_method(
int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE
#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

View File

@ -162,6 +162,10 @@
#define ITT_ARCH_ARM64 6
#endif /* ITT_ARCH_ARM64 */
#ifndef ITT_ARCH_VE
#define ITT_ARCH_VE 8
#endif /* ITT_ARCH_VE */
#ifndef ITT_ARCH
#if defined _M_IX86 || defined __i386__
#define ITT_ARCH ITT_ARCH_IA32
@ -175,6 +179,8 @@
#define ITT_ARCH ITT_ARCH_ARM64
#elif defined __powerpc64__
#define ITT_ARCH ITT_ARCH_PPC64
#elif defined __ve__
#define ITT_ARCH ITT_ARCH_VE
#endif
#endif

View File

@ -2060,6 +2060,198 @@ __kmp_invoke_microtask:
#endif /* KMP_ARCH_LOONGARCH64 */
#if KMP_ARCH_VE
//------------------------------------------------------------------------
//
// VE implementation of __kmp_invoke_microtask: unpacks the p_argv array into
// an argument list and calls the outlined function pkfn with pointers to
// local copies of gtid and tid followed by those arguments.
//
// typedef void (*microtask_t)(int *gtid, int *tid, ...);
//
// int __kmp_invoke_microtask(microtask_t pkfn, int gtid, int tid, int argc,
// void *p_argv[]
// #if OMPT_SUPPORT
// ,
// void **exit_frame_ptr
// #endif
// ) {
// #if OMPT_SUPPORT
// *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
// #endif
//
// (*pkfn)(&gtid, &tid, argv[0], ...);
//
// return 1;
// }
//
// Parameters:
// s0: pkfn
// s1: gtid
// s2: tid
// s3: argc
// s4: p_argv
// s5: exit_frame_ptr
//
// Locals:
// __gtid: gtid param pushed on stack so can pass &gtid to pkfn
// __tid: tid param pushed on stack so can pass &tid to pkfn
//
// Temp. registers:
//
// s34: used to calculate the dynamic stack size
// s35: used as temporary for stack placement calculation
// s36: used as temporary for stack arguments
// s37: used as temporary for number of remaining pkfn parms
// s38: used to traverse p_argv array
//
// return: s0 (always 1/TRUE)
//
// Offsets, relative to %fp, of the stack slots that hold gtid and tid.
__gtid = -4
__tid = -8
// -- Begin __kmp_invoke_microtask
// mark_begin;
.text
.globl __kmp_invoke_microtask
// Functions must be aligned to 8 bytes (2^3).
.p2align 3
.type __kmp_invoke_microtask,@function
__kmp_invoke_microtask:
.cfi_startproc
// First, save fp and lr. VE stores them at caller stack frame.
st %fp, 0(, %sp)
st %lr, 8(, %sp)
or %fp, 0, %sp
.cfi_def_cfa %fp, 0
.cfi_offset %lr, 8
.cfi_offset %fp, 0
// Compute the dynamic stack size:
//
// - We need 8 bytes for storing 'gtid' and 'tid', so we can pass them
// by reference
// - We need 8 bytes for each argument. We have two + 'argc'
// arguments (consider &gtid and &tid). We need to reserve
// (argc + 2) * 8 bytes.
// - We need 176 bytes for RSA and others
//
// The total number of bytes is then (argc + 2) * 8 + 8 + 176.
//
// |------------------------------|
// | return address of callee | 8(%fp)
// |------------------------------|
// | frame pointer of callee | 0(%fp)
// |------------------------------| <------------------ %fp
// | __tid / __gtid | -8(%fp) / -4(%fp)
// |------------------------------|
// | argc+2 for arguments | 176(%sp)
// |------------------------------|
// | RSA |
// |------------------------------|
// | return address |
// |------------------------------|
// | frame pointer |
// |------------------------------| <------------------ %sp
// s34 = (argc + 2) * 8 + 184, where 184 = 8 (gtid/tid) + 176 (RSA etc.).
adds.w.sx %s34, 2, %s3
sll %s34, %s34, 3
lea %s34, 184(, %s34)
subs.l %sp, %sp, %s34
// Align the stack to 16 bytes.
and %sp, -16, %sp
// Save pkfn.
or %s12, 0, %s0
// Call host to allocate stack if it is necessary.
// The request is skipped when the new %sp is still >= %sl.
// NOTE(review): the ld/lea/shm.l/monc sequence below is presumably the
// standard VE stack-extension request (it writes 0x13b, %sl and %sp to the
// area whose address is loaded from 24(%tp)) -- confirm against the VE ABI.
brge.l %sp, %sl, .L_kmp_pass
ld %s61, 24(, %tp)
lea %s63, 0x13b
shm.l %s63, 0(%s61)
shm.l %sl, 8(%s61)
shm.l %sp, 16(%s61)
monc
.L_kmp_pass:
// s35 = start of the outgoing argument area, s37 = argc, s38 = p_argv.
lea %s35, 176(, %sp)
adds.w.sx %s37, 0, %s3
or %s38, 0, %s4
#if OMPT_SUPPORT
// Save frame pointer into exit_frame.
st %fp, 0(%s5)
#endif
// Prepare arguments for the pkfn function (the first 8 go in the s0-s7
// registers, but they must also be stored on the stack because pkfn is
// variadic).
stl %s1, __gtid(%fp)
stl %s2, __tid(%fp)
// Argument 0 = &gtid, argument 1 = &tid.
adds.l %s0, __gtid, %fp
st %s0, 0(, %s35)
adds.l %s1, __tid, %fp
st %s1, 8(, %s35)
// Copy up to six p_argv entries into s2-s7 and their stack slots, leaving
// as soon as all argc entries have been consumed.
breq.l 0, %s37, .L_kmp_call
ld %s2, 0(, %s38)
st %s2, 16(, %s35)
breq.l 1, %s37, .L_kmp_call
ld %s3, 8(, %s38)
st %s3, 24(, %s35)
breq.l 2, %s37, .L_kmp_call
ld %s4, 16(, %s38)
st %s4, 32(, %s35)
breq.l 3, %s37, .L_kmp_call
ld %s5, 24(, %s38)
st %s5, 40(, %s35)
breq.l 4, %s37, .L_kmp_call
ld %s6, 32(, %s38)
st %s6, 48(, %s35)
breq.l 5, %s37, .L_kmp_call
ld %s7, 40(, %s38)
st %s7, 56(, %s35)
breq.l 6, %s37, .L_kmp_call
// Prepare any additional argument passed through the stack.
// Skip the six entries already handled: s37 = remaining count, s38 and s35
// advance past the consumed p_argv entries and the eight filled stack slots.
adds.l %s37, -6, %s37
lea %s38, 48(, %s38)
lea %s35, 64(, %s35)
.L_kmp_loop:
// Copy one remaining p_argv entry to the stack per iteration.
ld %s36, 0(, %s38)
st %s36, 0(, %s35)
adds.l %s37, -1, %s37
adds.l %s38, 8, %s38
adds.l %s35, 8, %s35
brne.l 0, %s37, .L_kmp_loop
.L_kmp_call:
// Call pkfn function.
bsic %lr, (, %s12)
// Return value.
lea %s0, 1
// Restore stack and return.
or %sp, 0, %fp
ld %lr, 8(, %sp)
ld %fp, 0(, %sp)
b.l.t (, %lr)
.Lfunc_end0:
.size __kmp_invoke_microtask, .Lfunc_end0-__kmp_invoke_microtask
.cfi_endproc
// -- End __kmp_invoke_microtask
#endif /* KMP_ARCH_VE */
#if KMP_ARCH_ARM || KMP_ARCH_MIPS
.data
COMMON .gomp_critical_user_, 32, 3
@ -2073,7 +2265,8 @@ __kmp_unnamed_critical_addr:
#endif
#endif /* KMP_ARCH_ARM */
#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64
#if KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 || \
KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE
#ifndef KMP_PREFIX_UNDERSCORE
# define KMP_PREFIX_UNDERSCORE(x) x
#endif
@ -2088,7 +2281,7 @@ KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr):
.size KMP_PREFIX_UNDERSCORE(__kmp_unnamed_critical_addr),8
#endif
#endif /* KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 ||
KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 */
KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || KMP_ARCH_VE */
#if KMP_OS_LINUX
# if KMP_ARCH_ARM || KMP_ARCH_AARCH64

View File

@ -2456,7 +2456,7 @@ finish: // Clean up and exit.
#if !(KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_MIC || \
((KMP_OS_LINUX || KMP_OS_DARWIN) && KMP_ARCH_AARCH64) || \
KMP_ARCH_PPC64 || KMP_ARCH_RISCV64 || KMP_ARCH_LOONGARCH64 || \
KMP_ARCH_ARM)
KMP_ARCH_ARM || KMP_ARCH_VE)
// we really only need the case with 1 argument, because CLANG always build
// a struct of pointers to shared variables referenced in the outlined function

View File

@ -221,6 +221,13 @@ ompt_label_##id:
printf("%" PRIu64 ": current_address=%p or %p or %p\n", \
ompt_get_thread_data()->value, ((char *)addr) - 4, \
((char *)addr) - 8, ((char *)addr) - 12)
#elif KMP_ARCH_VE
// On VE the NOP instruction is 8 bytes long. In addition, the compiler may
// insert an extra instruction for non-void runtime functions; which
// instruction and how many bytes it takes were left undetermined ("???" and
// "?") by the original author.
// NOTE(review): both candidate addresses printed below use the same offset
// (addr - 8), so the "or" alternative is currently redundant -- confirm the
// intended second offset once the extra instruction's size is known.
#define print_possible_return_addresses(addr) \
printf("%" PRIu64 ": current_address=%p or %p\n", \
ompt_get_thread_data()->value, ((char *)addr) - 8, \
((char *)addr) - 8)
#else
#error Unsupported target architecture, cannot determine address offset!
#endif