[OpenMP] Initial implementation of OpenMP offloading library - libomptarget device RTLs.
This patch implements the device runtime library whose interface is used in the code generation for OpenMP offloading devices. Currently there is a single device RTL, written in CUDA, meant for CUDA-enabled GPUs. The interface is a variation of the kmpc interface that includes some extra calls to do thread and storage management that only make sense for a GPU target. Differential revision: https://reviews.llvm.org/D14254 llvm-svn: 323649
parent e7264106d4
commit 0dd6ed74fd
@ -166,7 +166,7 @@ Options for ``libomp``
|
||||
Create the Fortran modules (requires Fortran compiler).
|
||||
|
||||
macOS* Fat Libraries
|
||||
""""""""""""""""""
|
||||
""""""""""""""""""""
|
||||
On macOS* machines, it is possible to build universal (or fat) libraries which
|
||||
include both i386 and x86_64 architecture objects in a single archive.
|
||||
|
||||
@ -254,6 +254,40 @@ Options for ``libomptarget``
|
||||
Path of the folder that contains ``libomp.so``. This is required for testing
|
||||
out-of-tree builds.
|
||||
|
||||
Options for ``NVPTX device RTL``
|
||||
--------------------------------
|
||||
|
||||
**LIBOMPTARGET_NVPTX_ENABLE_BCLIB** = ``OFF|ON``
|
||||
Enable CUDA LLVM bitcode offloading device RTL. This is used for link time
|
||||
optimization of the OMP runtime and application code.
|
||||
|
||||
**LIBOMPTARGET_NVPTX_CUDA_COMPILER** = ``""``
|
||||
Location of a CUDA compiler capable of emitting LLVM bitcode. Currently only
|
||||
the Clang compiler is supported. This is only used when building the CUDA LLVM
|
||||
bitcode offloading device RTL. If unspecified and the CMake C compiler is
|
||||
Clang, then Clang is used.
|
||||
|
||||
**LIBOMPTARGET_NVPTX_BC_LINKER** = ``""``
|
||||
Location of a linker capable of linking LLVM bitcode objects. This is only
|
||||
used when building the CUDA LLVM bitcode offloading device RTL. If unspecified
|
||||
and the CMake C compiler is Clang and there exists an llvm-link binary in the
|
||||
directory containing Clang, then this llvm-link binary is used.
|
||||
|
||||
**LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER** = ``""``
|
||||
Host compiler to use with NVCC. This compiler is not going to be used to
|
||||
produce any binary. Instead, this is used to overcome the input compiler
|
||||
checks done by NVCC. E.g. if using a default host compiler that is not
|
||||
compatible with NVCC, this option can be used to pass NVCC a valid compiler
|
||||
to avoid the error.
|
||||
|
||||
**LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY** = ``35``
|
||||
CUDA compute capability that should be supported by the NVPTX device RTL. E.g.
|
||||
for compute capability 6.0, the option "60" should be used. Compute capability
|
||||
3.5 is the minimum required.
|
||||
|
||||
**LIBOMPTARGET_NVPTX_DEBUG** = ``OFF|ON``
|
||||
Enable printing of debug messages from the NVPTX device RTL.
|
||||
|
||||
Example Usages of CMake
|
||||
=======================
|
||||
|
||||
|
@ -67,6 +67,7 @@ endif()
|
||||
|
||||
# Build offloading plugins and device RTLs if they are available.
|
||||
add_subdirectory(plugins)
|
||||
add_subdirectory(deviceRTLs)
|
||||
|
||||
# Add tests.
|
||||
add_subdirectory(test)
|
||||
|
14
openmp/libomptarget/deviceRTLs/CMakeLists.txt
Normal file
@ -0,0 +1,14 @@
|
||||
##===----------------------------------------------------------------------===##
|
||||
#
|
||||
# The LLVM Compiler Infrastructure
|
||||
#
|
||||
# This file is dual licensed under the MIT and the University of Illinois Open
|
||||
# Source Licenses. See LICENSE.txt for details.
|
||||
#
|
||||
##===----------------------------------------------------------------------===##
|
||||
#
|
||||
# Build a device RTL for each available machine.
|
||||
#
|
||||
##===----------------------------------------------------------------------===##
|
||||
|
||||
add_subdirectory(nvptx)
|
200
openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
Normal file
@ -0,0 +1,200 @@
|
||||
##===----------------------------------------------------------------------===##
|
||||
#
|
||||
# The LLVM Compiler Infrastructure
|
||||
#
|
||||
# This file is dual licensed under the MIT and the University of Illinois Open
|
||||
# Source Licenses. See LICENSE.txt for details.
|
||||
#
|
||||
##===----------------------------------------------------------------------===##
|
||||
#
|
||||
# Build the NVPTX (CUDA) Device RTL if the CUDA tools are available
|
||||
#
|
||||
##===----------------------------------------------------------------------===##
|
||||
|
||||
set(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER "" CACHE STRING
|
||||
"Path to alternate NVCC host compiler to be used by the NVPTX device RTL.")
|
||||
|
||||
if(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER)
|
||||
find_program(ALTERNATE_CUDA_HOST_COMPILER NAMES ${LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER})
|
||||
if(NOT ALTERNATE_CUDA_HOST_COMPILER)
|
||||
libomptarget_say("Not building CUDA offloading device RTL: invalid NVPTX alternate host compiler.")
|
||||
endif()
|
||||
set(CUDA_HOST_COMPILER ${ALTERNATE_CUDA_HOST_COMPILER} CACHE FILEPATH "" FORCE)
|
||||
endif()
|
||||
|
||||
# We can't use clang as nvcc host preprocessor, so we attempt to replace it with
|
||||
# gcc.
|
||||
if(CUDA_HOST_COMPILER MATCHES clang)
|
||||
|
||||
find_program(LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER NAMES gcc)
|
||||
|
||||
if(NOT LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER)
|
||||
libomptarget_say("Not building CUDA offloading device RTL: clang is not supported as NVCC host compiler.")
|
||||
libomptarget_say("Please include gcc in your path or set LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER to the full path of of valid compiler.")
|
||||
return()
|
||||
endif()
|
||||
set(CUDA_HOST_COMPILER "${LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER}" CACHE FILEPATH "" FORCE)
|
||||
endif()
|
||||
|
||||
if(LIBOMPTARGET_DEP_CUDA_FOUND)
|
||||
libomptarget_say("Building CUDA offloading device RTL.")
|
||||
|
||||
# We really don't have any host code, so we don't need to care about
|
||||
# propagating host flags.
|
||||
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
|
||||
|
||||
set(cuda_src_files
|
||||
src/cancel.cu
|
||||
src/critical.cu
|
||||
src/data_sharing.cu
|
||||
src/libcall.cu
|
||||
src/loop.cu
|
||||
src/omptarget-nvptx.cu
|
||||
src/parallel.cu
|
||||
src/reduction.cu
|
||||
src/sync.cu
|
||||
src/task.cu
|
||||
)
|
||||
|
||||
set(omp_data_objects src/omp_data.cu)
|
||||
|
||||
# Get the compute capability the user requested or use SM_35 by default.
|
||||
# SM_35 is what clang uses by default.
|
||||
set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY 35 CACHE STRING
|
||||
"CUDA Compute Capability to be used to compile the NVPTX device RTL.")
|
||||
set(CUDA_ARCH -arch sm_${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY})
|
||||
|
||||
# Activate RTL message dumps if requested by the user.
|
||||
set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL
|
||||
"Activate NVPTX device RTL debug messages.")
|
||||
if(${LIBOMPTARGET_NVPTX_DEBUG})
|
||||
set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g --ptxas-options=-v)
|
||||
endif()
|
||||
|
||||
# NVPTX runtime library has to be statically linked. Dynamic linking is not
|
||||
# yet supported by the CUDA toolchain on the device.
|
||||
set(BUILD_SHARED_LIBS OFF)
|
||||
set(CUDA_SEPARABLE_COMPILATION ON)
|
||||
|
||||
cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects}
|
||||
OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG})
|
||||
|
||||
# Install device RTL under the lib destination folder.
|
||||
install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "lib")
|
||||
|
||||
target_link_libraries(omptarget-nvptx ${CUDA_LIBRARIES})
|
||||
|
||||
# Check if we can create an LLVM bitcode implementation of the runtime library
|
||||
# that could be inlined in the user implementation.
|
||||
set(LIBOMPTARGET_NVPTX_ENABLE_BCLIB FALSE CACHE BOOL
|
||||
"Enable CUDA LLVM bitcode offloading device RTL.")
|
||||
if (${LIBOMPTARGET_NVPTX_ENABLE_BCLIB})
|
||||
|
||||
# Find a clang compiler capable of compiling cuda files to LLVM bitcode and
|
||||
# an LLVM linker.
|
||||
# We use the one provided by the user, attempt to use the one used to build
|
||||
# libomptarget or just fail.
|
||||
|
||||
set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING
|
||||
"Location of a CUDA compiler capable of emitting LLVM bitcode.")
|
||||
set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING
|
||||
"Location of a linker capable of linking LLVM bitcode objects.")
|
||||
|
||||
if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "")
|
||||
set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER})
|
||||
elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
|
||||
set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER})
|
||||
else()
|
||||
libomptarget_error_say("Cannot find a CUDA compiler capable of emitting LLVM bitcode.")
|
||||
libomptarget_error_say("Please configure with flag -DLIBOMPTARGET_NVPTX_CUDA_COMPILER")
|
||||
endif()
|
||||
|
||||
# Get compiler directory to try to locate a suitable linker
|
||||
get_filename_component(COMPILER_DIR ${CMAKE_C_COMPILER} DIRECTORY)
|
||||
|
||||
if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "")
|
||||
set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER})
|
||||
elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang" AND EXISTS "${COMPILER_DIR}/llvm-link")
|
||||
# Use llvm-link from the directory containing clang
|
||||
set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${COMPILER_DIR}/llvm-link)
|
||||
else()
|
||||
libomptarget_error_say("Cannot find a linker capable of linking LLVM bitcode objects.")
|
||||
libomptarget_error_say("Please configure with flag -DLIBOMPTARGET_NVPTX_BC_LINKER")
|
||||
endif()
|
||||
|
||||
if(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER AND LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER)
|
||||
libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.")
|
||||
|
||||
# Decide which ptx version to use. Same choices as Clang.
|
||||
if(CUDA_VERSION_MAJOR GREATER 9 OR CUDA_VERSION_MAJOR EQUAL 9)
|
||||
set(CUDA_PTX_VERSION ptx60)
|
||||
else()
|
||||
set(CUDA_PTX_VERSION ptx42)
|
||||
endif()
|
||||
|
||||
# Set flags for Clang cuda compilation. Only Clang is supported because there is
|
||||
# no other compiler capable of generating bitcode from cuda sources.
|
||||
set(CUDA_FLAGS
|
||||
-emit-llvm
|
||||
-O1
|
||||
-Xclang -target-feature
|
||||
-Xclang +${CUDA_PTX_VERSION}
|
||||
--cuda-device-only
|
||||
-DOMPTARGET_NVPTX_TEST=0 -DOMPTARGET_NVPTX_DEBUG=0
|
||||
)
|
||||
|
||||
# CUDA 9 header files use the nv_weak attribute which clang is not yet prepared
|
||||
# to handle. Therefore, we use 'weak' instead. We are compiling only for the
|
||||
# device, so it should be equivalent.
|
||||
if(CUDA_VERSION_MAJOR EQUAL 9)
|
||||
set(CUDA_FLAGS ${CUDA_FLAGS} -Dnv_weak=weak)
|
||||
endif()
|
||||
|
||||
# Get the compute capability the user requested or use SM_35 by default.
|
||||
set(CUDA_ARCH "")
|
||||
set(CUDA_ARCH --cuda-gpu-arch=sm_${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY})
|
||||
|
||||
# Compile cuda files to bitcode.
|
||||
set(bc_files "")
|
||||
foreach(src ${cuda_src_files})
|
||||
get_filename_component(infile ${src} ABSOLUTE)
|
||||
get_filename_component(outfile ${src} NAME)
|
||||
|
||||
add_custom_command(OUTPUT ${outfile}.bc
|
||||
COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${CUDA_FLAGS} ${CUDA_ARCH} ${CUDA_INCLUDES}
|
||||
-c ${infile} -o ${outfile}.bc
|
||||
DEPENDS ${infile}
|
||||
IMPLICIT_DEPENDS CXX ${infile}
|
||||
COMMENT "Building LLVM bitcode ${outfile}.bc"
|
||||
VERBATIM
|
||||
)
|
||||
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}.bc)
|
||||
|
||||
list(APPEND bc_files ${outfile}.bc)
|
||||
endforeach()
|
||||
|
||||
# Link to a bitcode library.
|
||||
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc
|
||||
COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER}
|
||||
-o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc ${bc_files}
|
||||
DEPENDS ${bc_files}
|
||||
COMMENT "Linking LLVM bitcode libomptarget-nvptx.bc"
|
||||
)
|
||||
set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx.bc)
|
||||
|
||||
add_custom_target(omptarget-nvptx-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc)
|
||||
|
||||
# Copy library to destination.
|
||||
add_custom_command(TARGET omptarget-nvptx-bc POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc
|
||||
$<TARGET_FILE_DIR:omptarget-nvptx>)
|
||||
|
||||
# Install device RTL under the lib destination folder.
|
||||
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx.bc DESTINATION "lib")
|
||||
|
||||
endif()
|
||||
endif()
|
||||
|
||||
else()
|
||||
libomptarget_say("Not building CUDA offloading device RTL: CUDA tools not found in the system.")
|
||||
endif()
|
523
openmp/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt
Normal file
@ -0,0 +1,523 @@
|
||||
|
||||
**Design document for OpenMP reductions on the GPU**
|
||||
|
||||
//Abstract: //In this document we summarize the new design for an OpenMP
|
||||
implementation of reductions on NVIDIA GPUs. This document comprises
|
||||
* a succinct background review,
|
||||
* an introduction to the decoupling of reduction algorithm and
|
||||
data-structure-specific processing routines,
|
||||
* detailed illustrations of reduction algorithms used and
|
||||
* a brief overview of steps we have made beyond the last implementation.
|
||||
|
||||
**Problem Review**
|
||||
|
||||
Consider a typical OpenMP program with reduction pragma.
|
||||
|
||||
```
|
||||
double foo, bar;
|
||||
#pragma omp parallel for reduction(+:foo, bar)
|
||||
for (int i = 0; i < N; i++) {
|
||||
foo+=A[i]; bar+=B[i];
|
||||
}
|
||||
```
|
||||
where 'foo' and 'bar' are reduced across all threads in the parallel region.
|
||||
Our primary goal is to efficiently aggregate the values of foo and bar in
|
||||
such a manner that
|
||||
* makes the compiler logically concise.
|
||||
* efficiently reduces within warps, threads, blocks and the device.
|
||||
|
||||
**Introduction to Decoupling**
|
||||
In this section we address the problem of making the compiler
|
||||
//logically concise// by partitioning the task of reduction into two broad
|
||||
categories: data-structure specific routines and algorithmic routines.
|
||||
|
||||
The previous reduction implementation was highly coupled with
|
||||
the specificity of the reduction element data structures (e.g., sizes, data
|
||||
types) and operators of the reduction (e.g., addition, multiplication). In
|
||||
our implementation we strive to decouple them. In our final implementation,
|
||||
we could remove all template functions in our runtime system.
|
||||
|
||||
The (simplified) pseudo code generated by LLVM is as follows:
|
||||
|
||||
```
|
||||
1. Create private copies of variables: foo_p, bar_p
|
||||
2. Each thread reduces the chunk of A and B assigned to it and writes
|
||||
to foo_p and bar_p respectively.
|
||||
3. ret = kmpc_nvptx_reduce_nowait(..., reduceData, shuffleReduceFn,
|
||||
interWarpCpyFn)
|
||||
where:
|
||||
struct ReduceData {
|
||||
double *foo;
|
||||
double *bar;
|
||||
} reduceData
|
||||
reduceData.foo = &foo_p
|
||||
reduceData.bar = &bar_p
|
||||
|
||||
shuffleReduceFn and interWarpCpyFn are two auxiliary functions
|
||||
generated to aid the runtime performing algorithmic steps
|
||||
while being data-structure agnostic about ReduceData.
|
||||
|
||||
In particular, shuffleReduceFn is a function that takes the following
|
||||
inputs:
|
||||
a. local copy of ReduceData
|
||||
b. its lane_id
|
||||
c. the offset of the lane_id which hosts a remote ReduceData
|
||||
relative to the current one
|
||||
d. an algorithm version parameter determining which reduction
|
||||
algorithm to use.
|
||||
This shuffleReduceFn retrieves the remote ReduceData through shuffle
|
||||
intrinsics and reduces, using the algorithm specified by the 4th
|
||||
parameter, the local ReduceData with the remote ReduceData element
|
||||
wise, and places the resultant values into the local ReduceData.
|
||||
|
||||
Different reduction algorithms are implemented with different runtime
|
||||
functions, but they all make calls to this same shuffleReduceFn to
|
||||
perform the essential reduction step. Therefore, based on the 4th
|
||||
parameter, this shuffleReduceFn will behave slightly differently to
|
||||
cooperate with the runtime function to ensure correctness under
|
||||
different circumstances.
|
||||
|
||||
InterWarpCpyFn, as the name suggests, is a function that copies data
|
||||
across warps. Its function is to tunnel all the thread private
|
||||
ReduceData that is already reduced within a warp to a lane in the first
|
||||
warp with minimal shared memory footprint. This is an essential step to
|
||||
prepare for the last step of a block reduction.
|
||||
|
||||
(Warp, block, device level reduction routines that utilize these
|
||||
auxiliary functions will be discussed in the next section.)
|
||||
|
||||
4. if ret == 1:
|
||||
The master thread stores the reduced result in the globals.
|
||||
foo += reduceData.foo; bar += reduceData.bar
|
||||
```
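
As an illustration only (not the code Clang actually emits), a shuffleReduceFn
for the foo/bar ReduceData above could look roughly as follows in CUDA; the
helper names and signatures here are hypothetical:

```
// Hypothetical sketch of a compiler-generated shuffleReduceFn for the
// example above; illustrative only, not the real generated ABI.
struct ReduceData {
  double *foo;
  double *bar;
};

// Shuffle a double down the warp by splitting it into two 32-bit halves.
__device__ static double shuffleDownDouble(double val, int offset) {
  int lo = __double2loint(val);
  int hi = __double2hiint(val);
  lo = __shfl_down(lo, offset, 32);
  hi = __shfl_down(hi, offset, 32);
  return __hiloint2double(hi, lo);
}

__device__ static void shuffleReduceFn(void *data, int16_t lane_id,
                                        int16_t offset, int16_t algo_version) {
  ReduceData *rd = (ReduceData *)data;
  double remote_foo = shuffleDownDouble(*rd->foo, offset);
  double remote_bar = shuffleDownDouble(*rd->bar, offset);
  // Version 0: full warp, always reduce. Version 1: contiguous partial warp,
  // lower lanes reduce, upper lanes just copy the remote element.
  if (algo_version == 0 || lane_id < offset) {
    *rd->foo += remote_foo; // reduce_elem = reduce_elem @ remote_elem
    *rd->bar += remote_bar;
  } else {
    *rd->foo = remote_foo;  // reduce_elem = remote_elem
    *rd->bar = remote_bar;
  }
}
```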
|
||||
|
||||
**Reduction Algorithms**
|
||||
|
||||
On the warp level, we have three versions of the algorithms:
|
||||
|
||||
1. Full Warp Reduction
|
||||
|
||||
```
|
||||
gpu_regular_warp_reduce(void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr ShuffleReduceFn) {
|
||||
for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
|
||||
ShuffleReduceFn(reduce_data, 0, offset, 0);
|
||||
}
|
||||
```
|
||||
ShuffleReduceFn is used here with lane_id set to 0 because it is not used;
therefore we save instructions by not retrieving lane_id from the corresponding
special registers. The 4th parameter, which represents the version of the
algorithm being used here, is set to 0 to signify full warp reduction.
|
||||
|
||||
In this version specified (=0), the ShuffleReduceFn behaves, per element, as
|
||||
follows:
|
||||
|
||||
```
|
||||
//reduce_elem refers to an element in the local ReduceData
|
||||
//remote_elem is retrieved from a remote lane
|
||||
remote_elem = shuffle_down(reduce_elem, offset, 32);
|
||||
reduce_elem = reduce_elem @ remote_elem;
|
||||
|
||||
```
|
||||
|
||||
An illustration of this algorithm operating on a hypothetical 8-lane full-warp
|
||||
would be:
|
||||
{F74}
|
||||
The coloring invariant is that elements with the same color will be
combined and reduced in the next reduction step. As can be observed, no overhead
is present: exactly log(2, N) steps are needed.
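
For intuition, the per-element effect of the loop above on a single double is
the classic warp shuffle reduction; a minimal sketch, reusing the hypothetical
shuffleDownDouble helper from the earlier sketch:

```
// Per-element view of gpu_regular_warp_reduce for one double value.
__device__ double warpReduceSum(double val) {
  for (int offset = 32 / 2; offset > 0; offset /= 2)
    val += shuffleDownDouble(val, offset); // reduce_elem = reduce_elem @ remote_elem
  return val; // after log(2, 32) = 5 steps, lane 0 holds the full-warp sum
}
```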
|
||||
|
||||
2. Contiguous Full Warp Reduction
|
||||
```
|
||||
gpu_irregular_warp_reduce(void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr ShuffleReduceFn, int size,
|
||||
int lane_id) {
|
||||
int curr_size;
|
||||
int offset;
|
||||
curr_size = size;
|
||||
offset = curr_size/2;
|
||||
while (offset>0) {
|
||||
ShuffleReduceFn(reduce_data, lane_id, offset, 1);
|
||||
curr_size = (curr_size+1)/2;
|
||||
offset = curr_size/2;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
In this version specified (=1), the ShuffleReduceFn behaves, per element, as
|
||||
follows:
|
||||
```
|
||||
//reduce_elem refers to an element in the local ReduceData
|
||||
//remote_elem is retrieved from a remote lane
|
||||
remote_elem = shuffle_down(reduce_elem, offset, 32);
|
||||
if (lane_id < offset) {
|
||||
reduce_elem = reduce_elem @ remote_elem
|
||||
} else {
|
||||
reduce_elem = remote_elem
|
||||
}
|
||||
```
|
||||
|
||||
An important invariant (also a restriction on the starting state of the
|
||||
reduction) is that this algorithm assumes that all ReduceData that remain to be reduced are
|
||||
located in a contiguous subset of threads in a warp starting from lane 0.
|
||||
|
||||
With the presence of a trailing active lane with an odd-numbered lane
|
||||
id, its value will not be aggregated with any other lane. Therefore,
|
||||
in order to preserve the invariant, such ReduceData is copied to the first lane
|
||||
whose thread-local ReduceData has already been used in a previous reduction
|
||||
and would therefore be useless otherwise.
|
||||
|
||||
An illustration of this algorithm operating on a hypothetical 8-lane partial
|
||||
warp would be:
|
||||
{F75}
|
||||
|
||||
As illustrated, this version of the algorithm introduces overhead whenever
|
||||
we have an odd number of participating lanes in any reduction step to
|
||||
copy data between lanes.
|
||||
|
||||
3. Dispersed Partial Warp Reduction
|
||||
```
|
||||
gpu_irregular_simt_reduce(void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr ShuffleReduceFn) {
|
||||
int size, remote_id;
|
||||
int logical_lane_id = find_number_of_dispersed_active_lanes_before_me() * 2;
|
||||
do {
|
||||
remote_id = find_the_next_active_lane_id_right_after_me();
|
||||
// the above function returns 0 if no active lane
|
||||
// is present right after the current thread.
|
||||
size = get_number_of_active_lanes_in_this_warp();
|
||||
logical_lane_id /= 2;
|
||||
ShuffleReduceFn(reduce_data, logical_lane_id, remote_id-1-threadIdx.x, 2);
|
||||
} while (logical_lane_id % 2 == 0 && size > 1);
}
|
||||
```
|
||||
|
||||
There is no assumption made about the initial state of the reduction.
|
||||
Any number of lanes (>=1) could be active at any position. The reduction
|
||||
result is kept in the first active lane.
|
||||
|
||||
In this version specified (=2), the ShuffleReduceFn behaves, per element, as
|
||||
follows:
|
||||
```
|
||||
//reduce_elem refers to an element in the local ReduceData
|
||||
//remote_elem is retrieved from a remote lane
|
||||
remote_elem = shuffle_down(reduce_elem, offset, 32);
|
||||
if (LaneId % 2 == 0 && Offset > 0) {
|
||||
reduce_elem = reduce_elem @ remote_elem
|
||||
} else {
|
||||
reduce_elem = remote_elem
|
||||
}
|
||||
```
|
||||
We will proceed with a brief explanation of some of the arguments passed in.
In this section we introduce the concept of logical_lane_id, which is
important to distinguish from the physical lane_id as defined by nvidia.
|
||||
1. //logical_lane_id//: as the name suggests, it refers to the calculated
|
||||
lane_id (instead of the physical one defined by nvidia) that would make
|
||||
our algorithm logically concise. A thread with logical_lane_id k means
|
||||
there are (k-1) threads before it.
|
||||
2. //remote_id-1-threadIdx.x//: remote_id is indeed the nvidia-defined lane
|
||||
id of the remote lane from which we will retrieve the ReduceData. We
|
||||
subtract (threadIdx+1) from it because we would like to maintain only one
|
||||
underlying shuffle intrinsic (which is used to communicate among lanes in a
|
||||
warp). The particular shuffle intrinsic we use accepts only
offsets, not absolute lane_ids. Therefore the subtraction is performed
|
||||
on the absolute lane_id we calculated to obtain the offset.
|
||||
|
||||
This algorithm differs from the previous two in two ways and is not, conceptually, a
|
||||
generalization of the above algorithms.
|
||||
1. It reduces elements close to each other. For instance, values in the 0th lane
|
||||
are to be combined with those of the 1st lane; values in the 2nd lane are to be
|
||||
combined with that of the 3rd lane. We did not use the previous algorithm
|
||||
where the first half of the (partial) warp is reduced with the second half
|
||||
of the (partial) warp. This is because the mapping
|
||||
f(x): logical_lane_id -> physical_lane_id;
|
||||
can be easily calculated whereas its inverse
|
||||
f^-1(x): physical_lane_id -> logical_lane_id
|
||||
cannot and performing such reduction requires the inverse to be known.
|
||||
2. Because this algorithm is agnostic about the positions of the lanes that are
|
||||
active, we do not need to perform the copying step as in the second
|
||||
algorithm.
|
||||
An illustrative run would look like
|
||||
{F76}
|
||||
As observed, overhead is high because in each and every step of reduction,
|
||||
logical_lane_id is recalculated; so is the remote_id.
|
||||
|
||||
On a block level, we have implemented the following block reduce algorithm:
|
||||
|
||||
```
|
||||
gpu_irregular_block_reduce(void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shuflReduceFn,
|
||||
kmp_InterWarpCopyFctPtr interWarpCpyFn,
|
||||
int size) {
|
||||
|
||||
int wid = threadIdx.x/WARPSIZE;
|
||||
int lane_id = threadIdx.x%WARPSIZE;
|
||||
|
||||
int warp_needed = (size+WARPSIZE-1)/WARPSIZE; //ceiling of division
|
||||
|
||||
unsigned tnum = __ballot(1);
|
||||
int thread_num = __popc(tnum);
|
||||
|
||||
//full warp reduction
|
||||
if (thread_num == WARPSIZE) {
|
||||
gpu_regular_warp_reduce(reduce_data, shuflReduceFn);
|
||||
}
|
||||
//partial warp reduction
|
||||
if (thread_num < WARPSIZE) {
|
||||
gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, thread_num,
|
||||
lane_id);
|
||||
}
|
||||
//Gather all the reduced values from each warp
|
||||
//to the first warp
|
||||
//named_barrier inside this function to ensure
|
||||
//correctness. It is effectively a sync_thread
|
||||
//that won't deadlock.
|
||||
interWarpCpyFn(reduce_data, warp_needed);
|
||||
|
||||
//This is to reduce data gathered from each "warp master".
|
||||
if (wid==0) {
|
||||
gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, warp_needed,
|
||||
lane_id);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
```
|
||||
In this function, no ShuffleReduceFn is directly called as it makes calls
|
||||
to various versions of the warp-reduction functions. It first reduces
|
||||
ReduceData warp by warp; in the end, we end up with the number of
|
||||
ReduceData equal to the number of warps present in this thread
|
||||
block. We then proceed to gather all such ReduceData to the first warp.
|
||||
|
||||
As observed, in this algorithm we make use of the function InterWarpCpyFn,
|
||||
which copies data from each of the "warp master" (0th lane of each warp, where
|
||||
a warp-reduced ReduceData is held) to the 0th warp. This step reduces (in a
|
||||
mathematical sense) the problem of reduction across warp masters in a block to
|
||||
the problem of warp reduction which we already have solutions to.
|
||||
|
||||
We can thus completely avoid the use of atomics to reduce in a threadblock.
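
A possible shape for the compiler-generated interWarpCpyFn is sketched below
for a single double element; this is illustrative only: the real helper is
data-structure agnostic and synchronizes with a named barrier rather than
__syncthreads, which is used here only to keep the sketch self-contained.

```
// Illustrative gather of one warp-reduced double per warp into the 0th warp.
__shared__ double transfer_medium[32]; // one slot per warp (max 32 warps/block)

__device__ void interWarpCopy(double *elem, int warp_needed) {
  int lane_id = threadIdx.x % 32;
  int wid = threadIdx.x / 32;
  if (lane_id == 0)                   // each "warp master" publishes its value
    transfer_medium[wid] = *elem;
  __syncthreads();                    // stand-in for the named barrier
  if (wid == 0 && lane_id < warp_needed)
    *elem = transfer_medium[lane_id]; // 0th warp picks up one value per warp
  __syncthreads();
}
```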
|
||||
|
||||
**Efficient Cross Block Reduce**
|
||||
|
||||
The next challenge is to reduce values across threadblocks. We aim to do this
|
||||
without atomics or critical sections.
|
||||
|
||||
Let a kernel be started with TB threadblocks.
|
||||
Let the GPU have S SMs.
|
||||
There can be at most N active threadblocks per SM at any time.
|
||||
|
||||
Consider a threadblock tb (tb < TB) running on SM s (s < S). 'tb' is one of
|
||||
at most 'N' active threadblocks on SM s. Let each threadblock active on an SM
|
||||
be given an instance identifier id (0 <= id < N). Therefore, the tuple (s, id)
|
||||
uniquely identifies an active threadblock on the GPU.
|
||||
|
||||
To efficiently implement cross block reduce, we first allocate an array for
|
||||
each value to be reduced of size S*N (which is the maximum number of active
|
||||
threadblocks at any time on the device).
|
||||
|
||||
Each threadblock reduces its value to slot [s][id]. This can be done without
|
||||
locking since no other threadblock can write to the same slot concurrently.
|
||||
|
||||
As a final stage, we reduce the values in the array as follows:
|
||||
|
||||
```
|
||||
// Compiler generated wrapper function for each target region with a reduction
|
||||
clause.
|
||||
target_function_wrapper(map_args, reduction_array) <--- start with 1 team and 1
|
||||
thread.
|
||||
// Use dynamic parallelism to launch M teams, N threads as requested by the
|
||||
user to execute the target region.
|
||||
|
||||
target_function<<<M, N>>>(map_args)
|
||||
|
||||
Reduce values in reduction_array
|
||||
|
||||
```
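
One way the final stage could be realized is sketched below, under the
assumption that a single threadblock performs the clean-up pass over the S*N
slots after target_function completes; the kernel name is hypothetical,
blockDim.x is assumed to be a multiple of 32, and warpReduceSum is the helper
sketched earlier.

```
// Illustrative final-stage reduction over the S*N slots of one variable.
__global__ void reduce_slots(double *reduction_array, int num_slots,
                             double *result) {
  double sum = 0.0;
  for (int i = threadIdx.x; i < num_slots; i += blockDim.x)
    sum += reduction_array[i];          // strided per-thread accumulation
  sum = warpReduceSum(sum);             // reduce within each warp

  __shared__ double partial[32];
  int lane_id = threadIdx.x % 32;
  int wid = threadIdx.x / 32;
  if (lane_id == 0)
    partial[wid] = sum;
  __syncthreads();

  if (wid == 0) {
    int num_warps = blockDim.x / 32;
    sum = lane_id < num_warps ? partial[lane_id] : 0.0;
    sum = warpReduceSum(sum);
    if (lane_id == 0)
      *result = sum;                    // no atomics or critical sections
  }
}
```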
|
||||
|
||||
**Comparison with Last Version**
|
||||
|
||||
|
||||
The (simplified) pseudo code generated by LLVM on the host is as follows:
|
||||
|
||||
|
||||
```
|
||||
1. Create private copies of variables: foo_p, bar_p
|
||||
2. Each thread reduces the chunk of A and B assigned to it and writes
|
||||
to foo_p and bar_p respectively.
|
||||
3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock)
|
||||
where:
|
||||
struct ReduceData {
|
||||
double *foo;
|
||||
double *bar;
|
||||
} reduceData
|
||||
reduceData.foo = &foo_p
|
||||
reduceData.bar = &bar_p
|
||||
|
||||
reduceFn is a pointer to a function that takes in two inputs
|
||||
of type ReduceData, "reduces" them element wise, and places the
|
||||
result in the first input:
|
||||
reduceFn(ReduceData *a, ReduceData *b)
|
||||
a = a @ b
|
||||
|
||||
Every thread in the parallel region calls kmpc_reduce_nowait with
|
||||
its private copy of reduceData. The runtime reduces across the
|
||||
threads (using tree reduction on the operator 'reduceFn') and stores
|
||||
the final result in the master thread if successful.
|
||||
4. if ret == 1:
|
||||
The master thread stores the reduced result in the globals.
|
||||
foo += reduceData.foo; bar += reduceData.bar
|
||||
5. else if ret == 2:
|
||||
In this case kmpc_reduce_nowait() could not use tree reduction,
|
||||
so use atomics instead:
|
||||
each thread atomically writes to foo
|
||||
each thread atomically writes to bar
|
||||
```
|
||||
|
||||
On a GPU, a similar reduction may need to be performed across SIMT threads,
|
||||
warps, and threadblocks. The challenge is to do so efficiently in a fashion
|
||||
that is compatible with the LLVM OpenMP implementation.
|
||||
|
||||
In the previously released 0.1 version of the LLVM OpenMP compiler for GPUs,
|
||||
the salient steps of the code generated are as follows:
|
||||
|
||||
|
||||
```
|
||||
1. Create private copies of variables: foo_p, bar_p
|
||||
2. Each thread reduces the chunk of A and B assigned to it and writes
|
||||
to foo_p and bar_p respectively.
|
||||
3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock)
|
||||
status = can_block_reduce()
|
||||
if status == 1:
|
||||
reduce efficiently to thread 0 using shuffles and shared memory.
|
||||
return 1
|
||||
else
|
||||
cannot use efficient block reduction, fallback to atomics
|
||||
return 2
|
||||
4. if ret == 1:
|
||||
The master thread stores the reduced result in the globals.
|
||||
foo += reduceData.foo; bar += reduceData.bar
|
||||
5. else if ret == 2:
|
||||
In this case kmpc_reduce_nowait() could not use tree reduction,
|
||||
so use atomics instead:
|
||||
each thread atomically writes to foo
|
||||
each thread atomically writes to bar
|
||||
```
|
||||
|
||||
The function can_block_reduce() is defined as follows:
|
||||
|
||||
|
||||
```
|
||||
int32_t can_block_reduce() {
|
||||
int tid = GetThreadIdInTeam();
|
||||
int nt = GetNumberOfOmpThreads(tid);
|
||||
if (nt != blockDim.x)
|
||||
return 0;
|
||||
unsigned tnum = __ballot(1);
|
||||
if (tnum != (~0x0)) {
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
```
|
||||
|
||||
This function permits the use of the efficient block reduction algorithm
|
||||
using shuffles and shared memory (return 1) only if (a) all SIMT threads in
|
||||
a warp are active (i.e., number of threads in the parallel region is a
|
||||
multiple of 32) and (b) the number of threads in the parallel region
|
||||
(set by the num_threads clause) equals blockDim.x.
|
||||
|
||||
If either of these preconditions is not true, each thread in the threadblock
|
||||
updates the global value using atomics.
|
||||
|
||||
Atomics and compare-and-swap operations are expensive on many threaded
|
||||
architectures such as GPUs and we must avoid them completely.
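
For reference, the atomic fallback (ret == 2) has every thread issue one
atomic update per reduced variable; on devices without a native
double-precision atomicAdd this itself becomes a compare-and-swap loop, which
is exactly the cost the new design avoids. A sketch of that fallback:

```
// Classic CAS-loop atomic add for doubles, as used by the atomic fallback
// path on GPUs that lack a native double atomicAdd.
__device__ double atomicAddDouble(double *address, double val) {
  unsigned long long *address_as_ull = (unsigned long long *)address;
  unsigned long long old = *address_as_ull, assumed;
  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old); // retry if another thread updated the value first
  return __longlong_as_double(old);
}

// Fallback: each thread updates the globals directly.
//   atomicAddDouble(&foo, reduceData.foo);
//   atomicAddDouble(&bar, reduceData.bar);
```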
|
||||
|
||||
|
||||
**Appendix: Implementation Details**
|
||||
|
||||
|
||||
```
|
||||
// Compiler generated function.
|
||||
reduceFn(ReduceData *a, ReduceData *b)
|
||||
a->foo = a->foo + b->foo
|
||||
a->bar = a->bar + b->bar
|
||||
|
||||
// Compiler generated function.
|
||||
swapAndReduceFn(ReduceData *thread_private, int lane)
|
||||
ReduceData *remote = new ReduceData()
|
||||
remote->foo = shuffle_double(thread_private->foo, lane)
|
||||
remote->bar = shuffle_double(thread_private->bar, lane)
|
||||
reduceFn(thread_private, remote)
|
||||
|
||||
// OMP runtime function.
|
||||
warpReduce_regular(ReduceData *thread_private, Fn *swapAndReduceFn):
|
||||
offset = 16
|
||||
while (offset > 0)
|
||||
swapAndReduceFn(thread_private, offset)
|
||||
offset /= 2
|
||||
|
||||
// OMP runtime function.
|
||||
warpReduce_irregular():
|
||||
...
|
||||
|
||||
// OMP runtime function.
|
||||
kmpc_reduce_warp(reduceData, swapAndReduceFn)
|
||||
if all_lanes_active:
|
||||
warpReduce_regular(reduceData, swapAndReduceFn)
|
||||
else:
|
||||
warpReduce_irregular(reduceData, swapAndReduceFn)
|
||||
if in_simd_region:
|
||||
// all done, reduce to global in simd lane 0
|
||||
return 1
|
||||
else if in_parallel_region:
|
||||
// done reducing to one value per warp, now reduce across warps
|
||||
return 3
|
||||
|
||||
// OMP runtime function; one for each basic type.
|
||||
kmpc_reduce_block_double(double *a)
|
||||
if lane == 0:
|
||||
shared[wid] = *a
|
||||
named_barrier(1, num_threads)
|
||||
if wid == 0
|
||||
block_reduce(shared)
|
||||
if lane == 0
|
||||
*a = shared[0]
|
||||
named_barrier(1, num_threads)
|
||||
if wid == 0 and lane == 0
|
||||
return 1 // write back reduced result
|
||||
else
|
||||
return 0 // don't do anything
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
```
|
||||
// Compiler generated code.
|
||||
1. Create private copies of variables: foo_p, bar_p
|
||||
2. Each thread reduces the chunk of A and B assigned to it and writes
|
||||
to foo_p and bar_p respectively.
|
||||
3. ret = kmpc_reduce_warp(reduceData, swapAndReduceFn)
|
||||
4. if ret == 1:
|
||||
The master thread stores the reduced result in the globals.
|
||||
foo += reduceData.foo; bar += reduceData.bar
|
||||
5. else if ret == 3:
|
||||
ret = block_reduce_double(reduceData.foo)
|
||||
if ret == 1:
|
||||
foo += reduceData.foo
|
||||
ret = block_reduce_double(reduceData.bar)
|
||||
if ret == 1:
|
||||
bar += reduceData.bar
|
||||
```
|
||||
|
||||
**Notes**
|
||||
|
||||
1. This scheme requires that the CUDA OMP runtime can call llvm generated
|
||||
functions. This functionality now works.
|
||||
2. If the user inlines the CUDA OMP runtime bitcode, all of the machinery
|
||||
(including calls through function pointers) are optimized away.
|
||||
3. If we are reducing multiple variables in a parallel region,
|
||||
the reduce operations are all performed in warpReduce_[ir]regular(). This
|
||||
results in more instructions in the loop and should result in fewer
|
||||
stalls due to data dependencies. Unfortunately we cannot do the same in
|
||||
kmpc_reduce_block_double() without increasing shared memory usage.
|
28
openmp/libomptarget/deviceRTLs/nvptx/src/cancel.cu
Normal file
@ -0,0 +1,28 @@
|
||||
//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Interface to be used in the implementation of OpenMP cancel.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "omptarget-nvptx.h"
|
||||
|
||||
EXTERN int32_t __kmpc_cancellationpoint(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t cancelVal) {
|
||||
PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", cancelVal);
|
||||
// disabled
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
EXTERN int32_t __kmpc_cancel(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t cancelVal) {
|
||||
PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", cancelVal);
|
||||
// disabled
|
||||
return FALSE;
|
||||
}
|
51
openmp/libomptarget/deviceRTLs/nvptx/src/counter_group.h
Normal file
@ -0,0 +1,51 @@
|
||||
//===------ counter_group.h - NVPTX OpenMP loop scheduling ------- CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Interface to implement OpenMP loop scheduling
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef _OMPTARGET_NVPTX_COUNTER_GROUP_H_
|
||||
#define _OMPTARGET_NVPTX_COUNTER_GROUP_H_
|
||||
|
||||
#include "option.h"
|
||||
|
||||
// counter group type for synchronizations
|
||||
class omptarget_nvptx_CounterGroup {
|
||||
public:
|
||||
// getters and setters
|
||||
INLINE Counter &Event() { return v_event; }
|
||||
INLINE volatile Counter &Start() { return v_start; }
|
||||
INLINE Counter &Init() { return v_init; }
|
||||
|
||||
// Synchronization Interface
|
||||
|
||||
INLINE void Clear(); // first time start=event
|
||||
INLINE void Reset(); // init = first
|
||||
INLINE void Init(Counter &priv); // priv = init
|
||||
INLINE Counter Next(); // just counts number of events
|
||||
|
||||
// set priv to n, to be used in later waitOrRelease
|
||||
INLINE void Complete(Counter &priv, Counter n);
|
||||
|
||||
// check priv and decide if we have to wait or can free the other warps
|
||||
INLINE void Release(Counter priv, Counter current_event_value);
|
||||
INLINE void WaitOrRelease(Counter priv, Counter current_event_value);
|
||||
|
||||
private:
|
||||
Counter v_event; // counter of events (atomic)
|
||||
|
||||
// volatile is needed to force loads to read from global
|
||||
// memory or L2 cache and see the write by the last master
|
||||
volatile Counter v_start; // signal when events registered are finished
|
||||
|
||||
Counter v_init; // used to initialize local thread variables
|
||||
};
|
||||
|
||||
#endif /* _OMPTARGET_NVPTX_COUNTER_GROUP_H_ */
|
82
openmp/libomptarget/deviceRTLs/nvptx/src/counter_groupi.h
Normal file
@ -0,0 +1,82 @@
|
||||
//===----- counter_groupi.h - NVPTX OpenMP loop scheduling ------- CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Interface implementation for OpenMP loop scheduling
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "option.h"
|
||||
|
||||
INLINE void omptarget_nvptx_CounterGroup::Clear() {
|
||||
PRINT0(LD_SYNCD, "clear counters\n")
|
||||
v_event = 0;
|
||||
v_start = 0;
|
||||
// v_init does not need to be reset (its value is dead)
|
||||
}
|
||||
|
||||
INLINE void omptarget_nvptx_CounterGroup::Reset() {
|
||||
// done by master before entering parallel
|
||||
ASSERT(LT_FUSSY, v_event == v_start,
|
||||
"error, entry %lld !=start %lld at reset\n", P64(v_event),
|
||||
P64(v_start));
|
||||
v_init = v_start;
|
||||
}
|
||||
|
||||
INLINE void omptarget_nvptx_CounterGroup::Init(Counter &priv) {
|
||||
PRINT(LD_SYNCD, "init priv counter 0x%llx with val %lld\n", P64(&priv),
|
||||
P64(v_start));
|
||||
priv = v_start;
|
||||
}
|
||||
|
||||
// just counts number of events
|
||||
INLINE Counter omptarget_nvptx_CounterGroup::Next() {
|
||||
Counter oldVal = atomicAdd(&v_event, (Counter)1);
|
||||
PRINT(LD_SYNCD, "next event counter 0x%llx with val %lld->%lld\n",
|
||||
P64(&v_event), P64(oldVal), P64(oldVal + 1));
|
||||
|
||||
return oldVal;
|
||||
}
|
||||
|
||||
// set priv to n, to be used in later waitOrRelease
|
||||
INLINE void omptarget_nvptx_CounterGroup::Complete(Counter &priv, Counter n) {
|
||||
PRINT(LD_SYNCD, "complete priv counter 0x%llx with val %lld->%lld (+%d)\n",
|
||||
P64(&priv), P64(priv), P64(priv + n), n);
|
||||
priv += n;
|
||||
}
|
||||
|
||||
INLINE void omptarget_nvptx_CounterGroup::Release(Counter priv,
|
||||
Counter current_event_value) {
|
||||
if (priv - 1 == current_event_value) {
|
||||
PRINT(LD_SYNCD, "Release start counter 0x%llx with val %lld->%lld\n",
|
||||
P64(&v_start), P64(v_start), P64(priv));
|
||||
v_start = priv;
|
||||
}
|
||||
}
|
||||
|
||||
// check priv and decide if we have to wait or can free the other warps
|
||||
INLINE void
|
||||
omptarget_nvptx_CounterGroup::WaitOrRelease(Counter priv,
|
||||
Counter current_event_value) {
|
||||
if (priv - 1 == current_event_value) {
|
||||
PRINT(LD_SYNCD, "Release start counter 0x%llx with val %lld->%lld\n",
|
||||
P64(&v_start), P64(v_start), P64(priv));
|
||||
v_start = priv;
|
||||
} else {
|
||||
PRINT(LD_SYNCD,
|
||||
"Start waiting while start counter 0x%llx with val %lld < %lld\n",
|
||||
P64(&v_start), P64(v_start), P64(priv));
|
||||
while (priv > v_start) {
|
||||
// IDLE LOOP
|
||||
// start is volatile: it will be re-loaded at each while loop
|
||||
}
|
||||
PRINT(LD_SYNCD,
|
||||
"Done waiting as start counter 0x%llx with val %lld >= %lld\n",
|
||||
P64(&v_start), P64(v_start), P64(priv));
|
||||
}
|
||||
}
|
32
openmp/libomptarget/deviceRTLs/nvptx/src/critical.cu
Normal file
@ -0,0 +1,32 @@
|
||||
//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the implementation of critical with KMPC interface
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#include "omptarget-nvptx.h"
|
||||
|
||||
EXTERN
|
||||
void __kmpc_critical(kmp_Indent *loc, int32_t global_tid,
|
||||
kmp_CriticalName *lck) {
|
||||
PRINT0(LD_IO, "call to kmpc_critical()\n");
|
||||
omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
|
||||
omp_set_lock(teamDescr.CriticalLock());
|
||||
}
|
||||
|
||||
EXTERN
|
||||
void __kmpc_end_critical(kmp_Indent *loc, int32_t global_tid,
|
||||
kmp_CriticalName *lck) {
|
||||
PRINT0(LD_IO, "call to kmpc_end_critical()\n");
|
||||
omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
|
||||
omp_unset_lock(teamDescr.CriticalLock());
|
||||
}
|
324
openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
Normal file
@ -0,0 +1,324 @@
|
||||
//===----- data_sharing.cu - NVPTX OpenMP debug utilities -------- CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the implementation of data sharing environments.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#include "omptarget-nvptx.h"
|
||||
#include <stdio.h>
|
||||
|
||||
// Number of threads in the CUDA block.
|
||||
__device__ static unsigned getNumThreads() { return blockDim.x; }
|
||||
// Thread ID in the CUDA block
|
||||
__device__ static unsigned getThreadId() { return threadIdx.x; }
|
||||
// Warp ID in the CUDA block
|
||||
__device__ static unsigned getWarpId() { return threadIdx.x / WARPSIZE; }
|
||||
|
||||
// The CUDA thread ID of the master thread.
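// With this convention it is the first thread of the last warp in the block.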
|
||||
__device__ static unsigned getMasterThreadId() {
|
||||
unsigned Mask = WARPSIZE - 1;
|
||||
return (getNumThreads() - 1) & (~Mask);
|
||||
}
|
||||
|
||||
// Find the active threads in the warp - return a mask whose n-th bit is set if
|
||||
// the n-th thread in the warp is active.
|
||||
__device__ static unsigned getActiveThreadsMask() {
|
||||
return __BALLOT_SYNC(0xFFFFFFFF, true);
|
||||
}
|
||||
|
||||
// Return true if this is the first active thread in the warp.
|
||||
__device__ static bool IsWarpMasterActiveThread() {
|
||||
unsigned long long Mask = getActiveThreadsMask();
|
||||
unsigned long long ShNum = WARPSIZE - (getThreadId() % WARPSIZE);
|
||||
unsigned long long Sh = Mask << ShNum;
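// The low 32 bits of Sh now hold exactly the mask bits of the lanes with a
// lower id than this thread; they are all zero iff this thread is the first
// active lane in the warp.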
|
||||
return (unsigned)Sh == 0;
|
||||
}
|
||||
// Return true if this is the master thread.
|
||||
__device__ static bool IsMasterThread() {
|
||||
return getMasterThreadId() == getThreadId();
|
||||
}
|
||||
|
||||
/// Return the provided size aligned to the size of a pointer.
|
||||
__device__ static size_t AlignVal(size_t Val) {
|
||||
const size_t Align = (size_t)sizeof(void *);
|
||||
if (Val & (Align - 1)) {
|
||||
Val += Align;
|
||||
Val &= ~(Align - 1);
|
||||
}
|
||||
return Val;
|
||||
}
|
||||
|
||||
#define DSFLAG 0
|
||||
#define DSFLAG_INIT 0
|
||||
#define DSPRINT(_flag, _str, _args...) \
|
||||
{ \
|
||||
if (_flag) { \
|
||||
/*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/ \
|
||||
} \
|
||||
}
|
||||
#define DSPRINT0(_flag, _str) \
|
||||
{ \
|
||||
if (_flag) { \
|
||||
/*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/ \
|
||||
} \
|
||||
}
|
||||
|
||||
// Initialize the shared data structures. This is expected to be called for the
|
||||
// master thread and warp masters. \param RootS: A pointer to the root of the
|
||||
// data sharing stack. \param InitialDataSize: The initial size of the data in
|
||||
// the slot.
|
||||
EXTERN void
|
||||
__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS,
|
||||
size_t InitialDataSize) {
|
||||
|
||||
DSPRINT0(DSFLAG_INIT,
|
||||
"Entering __kmpc_initialize_data_sharing_environment\n");
|
||||
|
||||
unsigned WID = getWarpId();
|
||||
DSPRINT(DSFLAG_INIT, "Warp ID: %d\n", WID);
|
||||
|
||||
omptarget_nvptx_TeamDescr *teamDescr =
|
||||
&omptarget_nvptx_threadPrivateContext->TeamContext();
|
||||
__kmpc_data_sharing_slot *RootS = teamDescr->RootS(WID);
|
||||
|
||||
DataSharingState.SlotPtr[WID] = RootS;
|
||||
DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
|
||||
|
||||
// We don't need to initialize the frame and active threads.
|
||||
|
||||
DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", InitialDataSize);
|
||||
DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (long long)RootS);
|
||||
DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n",
|
||||
(long long)RootS->DataEnd);
|
||||
DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n", (long long)RootS->Next);
|
||||
DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n",
|
||||
(long long)DataSharingState.SlotPtr[WID]);
|
||||
DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n",
|
||||
(long long)DataSharingState.StackPtr[WID]);
|
||||
|
||||
DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n");
|
||||
}
|
||||
|
||||
EXTERN void *__kmpc_data_sharing_environment_begin(
|
||||
__kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
|
||||
void **SavedSharedFrame, int32_t *SavedActiveThreads,
|
||||
size_t SharingDataSize, size_t SharingDefaultDataSize,
|
||||
int16_t IsOMPRuntimeInitialized) {
|
||||
|
||||
DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n");
|
||||
|
||||
// If the runtime has been elided, use __shared__ memory for master-worker
|
||||
// data sharing.
|
||||
if (!IsOMPRuntimeInitialized)
|
||||
return (void *)&DataSharingState;
|
||||
|
||||
DSPRINT(DSFLAG, "Data Size %016llx\n", SharingDataSize);
|
||||
DSPRINT(DSFLAG, "Default Data Size %016llx\n", SharingDefaultDataSize);
|
||||
|
||||
unsigned WID = getWarpId();
|
||||
unsigned CurActiveThreads = getActiveThreadsMask();
|
||||
|
||||
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
|
||||
void *&StackP = DataSharingState.StackPtr[WID];
|
||||
void *&FrameP = DataSharingState.FramePtr[WID];
|
||||
int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
|
||||
|
||||
DSPRINT0(DSFLAG, "Save current slot/stack values.\n");
|
||||
// Save the current values.
|
||||
*SavedSharedSlot = SlotP;
|
||||
*SavedSharedStack = StackP;
|
||||
*SavedSharedFrame = FrameP;
|
||||
*SavedActiveThreads = ActiveT;
|
||||
|
||||
DSPRINT(DSFLAG, "Warp ID: %d\n", WID);
|
||||
DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (long long)SlotP);
|
||||
DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (long long)StackP);
|
||||
DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP);
|
||||
DSPRINT(DSFLAG, "Active threads: %08x \n", ActiveT);
|
||||
|
||||
// Only the warp active master needs to grow the stack.
|
||||
if (IsWarpMasterActiveThread()) {
|
||||
// Save the current active threads.
|
||||
ActiveT = CurActiveThreads;
|
||||
|
||||
// Make sure we use aligned sizes to avoid rematerialization of data.
|
||||
SharingDataSize = AlignVal(SharingDataSize);
|
||||
// FIXME: The default data size can be assumed to be aligned?
|
||||
SharingDefaultDataSize = AlignVal(SharingDefaultDataSize);
|
||||
|
||||
// Check if we have room for the data in the current slot.
|
||||
const uintptr_t CurrentStartAddress = (uintptr_t)StackP;
|
||||
const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd;
|
||||
const uintptr_t RequiredEndAddress =
|
||||
CurrentStartAddress + (uintptr_t)SharingDataSize;
|
||||
|
||||
DSPRINT(DSFLAG, "Data Size %016llx\n", SharingDataSize);
|
||||
DSPRINT(DSFLAG, "Default Data Size %016llx\n", SharingDefaultDataSize);
|
||||
DSPRINT(DSFLAG, "Current Start Address %016llx\n", CurrentStartAddress);
|
||||
DSPRINT(DSFLAG, "Current End Address %016llx\n", CurrentEndAddress);
|
||||
DSPRINT(DSFLAG, "Required End Address %016llx\n", RequiredEndAddress);
|
||||
DSPRINT(DSFLAG, "Active Threads %08x\n", ActiveT);
|
||||
|
||||
// If we require a new slot, allocate it and initialize it (or attempt to
|
||||
// reuse one). Also, set the shared stack and slot pointers to the new
|
||||
// place. If we do not need to grow the stack, just adapt the stack and
|
||||
// frame pointers.
|
||||
if (CurrentEndAddress < RequiredEndAddress) {
|
||||
size_t NewSize = (SharingDataSize > SharingDefaultDataSize)
|
||||
? SharingDataSize
|
||||
: SharingDefaultDataSize;
|
||||
__kmpc_data_sharing_slot *NewSlot = 0;
|
||||
|
||||
// Attempt to reuse an existing slot.
|
||||
if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) {
|
||||
uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd -
|
||||
(uintptr_t)(&ExistingSlot->Data[0]);
|
||||
if (ExistingSlotSize >= NewSize) {
|
||||
DSPRINT(DSFLAG, "Reusing stack slot %016llx\n",
|
||||
(long long)ExistingSlot);
|
||||
NewSlot = ExistingSlot;
|
||||
} else {
|
||||
DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n",
|
||||
(long long)SlotP->Next);
|
||||
free(ExistingSlot);
|
||||
}
|
||||
}
|
||||
|
||||
if (!NewSlot) {
|
||||
NewSlot = (__kmpc_data_sharing_slot *)malloc(
|
||||
sizeof(__kmpc_data_sharing_slot) + NewSize);
|
||||
DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n",
|
||||
(long long)NewSlot, NewSize);
|
||||
}
|
||||
|
||||
NewSlot->Next = 0;
|
||||
NewSlot->DataEnd = &NewSlot->Data[NewSize];
|
||||
|
||||
SlotP->Next = NewSlot;
|
||||
SlotP = NewSlot;
|
||||
StackP = &NewSlot->Data[SharingDataSize];
|
||||
FrameP = &NewSlot->Data[0];
|
||||
} else {
|
||||
|
||||
// Clean up any old slot that we may still have. The slot producers do
// not eliminate them because they may be used to return data.
|
||||
if (SlotP->Next) {
|
||||
DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n",
|
||||
(long long)SlotP->Next);
|
||||
free(SlotP->Next);
|
||||
SlotP->Next = 0;
|
||||
}
|
||||
|
||||
FrameP = StackP;
|
||||
StackP = (void *)RequiredEndAddress;
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: Need to see the impact of doing it here.
|
||||
__threadfence_block();
|
||||
|
||||
DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n");
|
||||
|
||||
// All the threads in this warp get the frame they should work with.
|
||||
return FrameP;
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_data_sharing_environment_end(
|
||||
__kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
|
||||
void **SavedSharedFrame, int32_t *SavedActiveThreads,
|
||||
int32_t IsEntryPoint) {
|
||||
|
||||
DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n");
|
||||
|
||||
unsigned WID = getWarpId();
|
||||
|
||||
if (IsEntryPoint) {
|
||||
if (IsWarpMasterActiveThread()) {
|
||||
DSPRINT0(DSFLAG, "Doing clean up\n");
|
||||
|
||||
// The master thread cleans the saved slot, because this is an environment
|
||||
// only for the master.
|
||||
__kmpc_data_sharing_slot *S =
|
||||
IsMasterThread() ? *SavedSharedSlot : DataSharingState.SlotPtr[WID];
|
||||
|
||||
if (S->Next) {
|
||||
free(S->Next);
|
||||
S->Next = 0;
|
||||
}
|
||||
}
|
||||
|
||||
DSPRINT0(DSFLAG, "Exiting Exiting __kmpc_data_sharing_environment_end\n");
|
||||
return;
|
||||
}
|
||||
|
||||
int32_t CurActive = getActiveThreadsMask();
|
||||
|
||||
// Only the warp master can restore the stack and frame information, and only
|
||||
// if there are no other threads left behind in this environment (i.e. the
|
||||
// warp diverged and returns in different places). This only works if we
|
||||
// assume that threads will converge right after the call site that started
|
||||
// the environment.
|
||||
if (IsWarpMasterActiveThread()) {
|
||||
int32_t &ActiveT = DataSharingState.ActiveThreads[WID];
|
||||
|
||||
DSPRINT0(DSFLAG, "Before restoring the stack\n");
|
||||
// Zero the bits in the mask. If it is still different from zero, then we
|
||||
// have other threads that will return after the current ones.
|
||||
ActiveT &= ~CurActive;
|
||||
|
||||
DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n", CurActive,
|
||||
ActiveT);
|
||||
|
||||
if (!ActiveT) {
|
||||
// No other active threads? Great, lets restore the stack.
|
||||
|
||||
__kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
|
||||
void *&StackP = DataSharingState.StackPtr[WID];
|
||||
void *&FrameP = DataSharingState.FramePtr[WID];
|
||||
|
||||
SlotP = *SavedSharedSlot;
|
||||
StackP = *SavedSharedStack;
|
||||
FrameP = *SavedSharedFrame;
|
||||
ActiveT = *SavedActiveThreads;
|
||||
|
||||
DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n", (long long)SlotP);
|
||||
DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n", (long long)StackP);
|
||||
DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n", (long long)FrameP);
|
||||
DSPRINT(DSFLAG, "Active threads: %08x \n", ActiveT);
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: Need to see the impact of doing it here.
|
||||
__threadfence_block();
|
||||
|
||||
DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n");
|
||||
return;
|
||||
}
|
||||
|
||||
EXTERN void *
|
||||
__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
|
||||
int16_t IsOMPRuntimeInitialized) {
|
||||
DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n");
|
||||
|
||||
// If the runtime has been elided, use __shared__ memory for master-worker
|
||||
// data sharing. We're reusing the statically allocated data structure
|
||||
// that is used for standard data sharing.
|
||||
if (!IsOMPRuntimeInitialized)
|
||||
return (void *)&DataSharingState;
|
||||
|
||||
// Get the frame used by the requested thread.
|
||||
|
||||
unsigned SourceWID = SourceThreadID / WARPSIZE;
|
||||
|
||||
DSPRINT(DSFLAG, "Source warp: %d\n", SourceWID);
|
||||
|
||||
void *P = DataSharingState.FramePtr[SourceWID];
|
||||
DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
|
||||
return P;
|
||||
}
|
276
openmp/libomptarget/deviceRTLs/nvptx/src/debug.h
Normal file
@ -0,0 +1,276 @@
|
||||
//===------------- debug.h - NVPTX OpenMP debug macros ----------- CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains debug macros to be used in the application.
|
||||
//
|
||||
// Usage guide
|
||||
//
|
||||
// PRINT0(flag, str) : if debug flag is on, print (no arguments)
|
||||
// PRINT(flag, str, args) : if debug flag is on, print (arguments)
|
||||
// DON(flag) : return true if debug flag is on
|
||||
//
|
||||
// ASSERT(flag, cond, str, args): if test flag is on, test the condition
|
||||
// if the condition is false, print str+args
|
||||
// and assert.
|
||||
// CAUTION: cond may be evaluated twice
|
||||
// TON(flag) : return true if test flag is on
|
||||
//
|
||||
// WARNING(flag, str, args) : if warning flag is on, print the warning
|
||||
// WON(flag) : return true if warning flag is on
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
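//
// Illustrative usage (hypothetical caller code, not part of this patch; it
// assumes the LD_LOOP, LT_FUSSY and LW_INPUT flags defined below are enabled):
//
//   PRINT0(LD_LOOP, "entering loop init\n");           // no varargs
//   PRINT(LD_LOOP, "lb %d, ub %d\n", lb, ub);          // printf-style varargs
//   ASSERT0(LT_FUSSY, descr != 0, "expected a valid task descriptor");
//   WARNING(LW_INPUT, "ignoring invalid chunk size %d\n", chunk);
//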
|
||||
|
||||
#ifndef _OMPTARGET_NVPTX_DEBUG_H_
|
||||
#define _OMPTARGET_NVPTX_DEBUG_H_
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// set desired level of debugging
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define LD_SET_NONE 0ULL /* none */
|
||||
#define LD_SET_ALL -1ULL /* all */
|
||||
|
||||
// pos 1
|
||||
#define LD_SET_LOOP 0x1ULL /* basic loop */
|
||||
#define LD_SET_LOOPD 0x2ULL /* basic loop */
|
||||
#define LD_SET_PAR 0x4ULL /* basic parallel */
|
||||
#define LD_SET_PARD 0x8ULL /* basic parallel */
|
||||
|
||||
// pos 2
|
||||
#define LD_SET_SYNC 0x10ULL /* sync info */
|
||||
#define LD_SET_SYNCD 0x20ULL /* sync info */
|
||||
#define LD_SET_WAIT 0x40ULL /* state when waiting */
|
||||
#define LD_SET_TASK 0x80ULL /* print task info (high level) */
|
||||
|
||||
// pos 3
|
||||
#define LD_SET_IO 0x100ULL /* big region io (excl atomic) */
|
||||
#define LD_SET_IOD 0x200ULL /* big region io (excl atomic) */
|
||||
#define LD_SET_ENV 0x400ULL /* env info */
|
||||
#define LD_SET_CANCEL 0x800ULL /* print cancel info */
|
||||
|
||||
// pos 4
|
||||
#define LD_SET_MEM 0x1000ULL /* malloc / free */
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// set the desired flags to print selected output.
|
||||
|
||||
// these are some examples of possible definitions that can be used for
|
||||
// debugging.
|
||||
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_ALL)
|
||||
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_LOOP) // limit to loop printfs to save
|
||||
// on cuda buffer
|
||||
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO)
|
||||
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO | LD_SET_ENV)
|
||||
//#define OMPTARGET_NVPTX_DEBUG (LD_SET_PAR)
|
||||
|
||||
#ifndef OMPTARGET_NVPTX_DEBUG
|
||||
#define OMPTARGET_NVPTX_DEBUG LD_SET_NONE
|
||||
#elif OMPTARGET_NVPTX_DEBUG
|
||||
#warning debug is used, not good for measurements
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// set desired level of asserts
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// available flags
|
||||
|
||||
#define LT_SET_NONE 0x0 /* unsafe */
|
||||
#define LT_SET_SAFETY \
|
||||
0x1 /* check malloc type of stuff, input at creation, cheap */
|
||||
#define LT_SET_INPUT 0x2 /* check also all runtime inputs */
|
||||
#define LT_SET_FUSSY 0x4 /* fussy checks, expensive */
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// set the desired flags
|
||||
|
||||
#ifndef OMPTARGET_NVPTX_TEST
|
||||
#if OMPTARGET_NVPTX_DEBUG
|
||||
#define OMPTARGET_NVPTX_TEST (LT_SET_FUSSY)
|
||||
#else
|
||||
#define OMPTARGET_NVPTX_TEST (LT_SET_SAFETY)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// set desired level of warnings
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// available flags
|
||||
|
||||
#define LW_SET_ALL -1
|
||||
#define LW_SET_NONE 0x0
|
||||
#define LW_SET_ENV 0x1
|
||||
#define LW_SET_INPUT 0x2
|
||||
#define LW_SET_FUSSY 0x4
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// set the desired flags
|
||||
|
||||
#if OMPTARGET_NVPTX_DEBUG
|
||||
#define OMPTARGET_NVPTX_WARNING (LW_SET_NONE)
|
||||
#else
|
||||
#define OMPTARGET_NVPTX_WARNING (LW_SET_FUSSY)
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// implementation for debug
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
#if OMPTARGET_NVPTX_TEST
|
||||
#include <assert.h>
|
||||
#endif
|
||||
|
||||
// set flags that are tested (inclusion properties)
|
||||
|
||||
#define LD_ALL (LD_SET_ALL)
|
||||
|
||||
#define LD_LOOP (LD_SET_LOOP | LD_SET_LOOPD)
|
||||
#define LD_LOOPD (LD_SET_LOOPD)
|
||||
#define LD_PAR (LD_SET_PAR | LD_SET_PARD)
|
||||
#define LD_PARD (LD_SET_PARD)
|
||||
|
||||
// pos 2
|
||||
#define LD_SYNC (LD_SET_SYNC | LD_SET_SYNCD)
|
||||
#define LD_SYNCD (LD_SET_SYNCD)
|
||||
#define LD_WAIT (LD_SET_WAIT)
|
||||
#define LD_TASK (LD_SET_TASK)
|
||||
|
||||
// pos 3
|
||||
#define LD_IO (LD_SET_IO | LD_SET_IOD)
|
||||
#define LD_IOD (LD_SET_IOD)
|
||||
#define LD_ENV (LD_SET_ENV)
|
||||
#define LD_CANCEL (LD_SET_CANCEL)
|
||||
|
||||
// pos 3
|
||||
#define LD_MEM (LD_SET_MEM)
|
||||
|
||||
// implement
|
||||
#if OMPTARGET_NVPTX_DEBUG
|
||||
|
||||
#define DON(_flag) ((OMPTARGET_NVPTX_DEBUG) & (_flag))
|
||||
|
||||
#define PRINT0(_flag, _str) \
|
||||
{ \
|
||||
if (DON(_flag)) { \
|
||||
printf("<b %2d, t %4d, w %2d, l %2d>: " _str, blockIdx.x, threadIdx.x, \
|
||||
threadIdx.x / WARPSIZE, threadIdx.x & 0x1F); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define PRINT(_flag, _str, _args...) \
|
||||
{ \
|
||||
if (DON(_flag)) { \
|
||||
printf("<b %2d, t %4d, w %2d, l %2d>: " _str, blockIdx.x, threadIdx.x, \
|
||||
threadIdx.x / WARPSIZE, threadIdx.x & 0x1F, _args); \
|
||||
} \
|
||||
}
|
||||
#else
|
||||
|
||||
#define DON(_flag) (FALSE)
|
||||
#define PRINT0(flag, str)
|
||||
#define PRINT(flag, str, _args...)
|
||||
|
||||
#endif
|
||||
|
||||
// for printing without worrying about precision, pointers...
|
||||
#define P64(_x) ((unsigned long long)(_x))
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// early defs for test
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define LT_SAFETY (LT_SET_SAFETY | LT_SET_INPUT | LT_SET_FUSSY)
|
||||
#define LT_INPUT (LT_SET_INPUT | LT_SET_FUSSY)
|
||||
#define LT_FUSSY (LT_SET_FUSSY)
|
||||
|
||||
#if OMPTARGET_NVPTX_TEST == LT_SET_SAFETY
|
||||
|
||||
#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag))
|
||||
#define ASSERT0(_flag, _cond, _str) \
|
||||
{ \
|
||||
if (TON(_flag)) { \
|
||||
assert(_cond); \
|
||||
} \
|
||||
}
|
||||
#define ASSERT(_flag, _cond, _str, _args...) \
|
||||
{ \
|
||||
if (TON(_flag)) { \
|
||||
assert(_cond); \
|
||||
} \
|
||||
}
|
||||
|
||||
#elif OMPTARGET_NVPTX_TEST >= LT_SET_INPUT
|
||||
|
||||
#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag))
|
||||
#define ASSERT0(_flag, _cond, _str) \
|
||||
{ \
|
||||
if (TON(_flag) && !(_cond)) { \
|
||||
printf("<b %3d, t %4d, w %2d, l %2d> ASSERT: " _str "\n", blockIdx.x, \
|
||||
threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F); \
|
||||
assert(_cond); \
|
||||
} \
|
||||
}
|
||||
#define ASSERT(_flag, _cond, _str, _args...) \
|
||||
{ \
|
||||
if (TON(_flag) && !(_cond)) { \
|
||||
printf("<b %3d, t %4d, w %2d, l %d2> ASSERT: " _str "\n", blockIdx.x, \
|
||||
threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F, _args); \
|
||||
assert(_cond); \
|
||||
} \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define TON(_flag) (FALSE)
|
||||
#define ASSERT0(_flag, _cond, _str)
|
||||
#define ASSERT(_flag, _cond, _str, _args...)
|
||||
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// early defs for warning
|
||||
|
||||
#define LW_ALL (LW_SET_ALL)
|
||||
#define LW_ENV (LW_SET_FUSSY | LW_SET_INPUT | LW_SET_ENV)
|
||||
#define LW_INPUT (LW_SET_FUSSY | LW_SET_INPUT)
|
||||
#define LW_FUSSY (LW_SET_FUSSY)
|
||||
|
||||
#if OMPTARGET_NVPTX_WARNING
|
||||
|
||||
#define WON(_flag) ((OMPTARGET_NVPTX_WARNING) & (_flag))
|
||||
#define WARNING0(_flag, _str) \
|
||||
{ \
|
||||
if (WON(_flag)) { \
|
||||
printf("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str, blockIdx.x, \
|
||||
threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F); \
|
||||
} \
|
||||
}
|
||||
#define WARNING(_flag, _str, _args...) \
|
||||
{ \
|
||||
if (WON(_flag)) { \
|
||||
printf("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str, blockIdx.x, \
|
||||
threadIdx.x, threadIdx.x / WARPSIZE, threadIdx.x & 0x1F, _args); \
|
||||
} \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define WON(_flag) (FALSE)
|
||||
#define WARNING0(_flag, _str)
|
||||
#define WARNING(_flag, _str, _args...)
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
509
openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
Normal file
@ -0,0 +1,509 @@
|
||||
//===------- interface.h - NVPTX OpenMP interface definitions ---- CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
|
||||
//
|
||||
// This file contains all the definitions that are relevant to
|
||||
// the interface. The first section contains the interface as
|
||||
// declared by OpenMP. A second section includes library private calls
|
||||
// (mostly debug, temporary?). The third section includes the compiler
|
||||
// specific interfaces.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef _INTERFACES_H_
|
||||
#define _INTERFACES_H_
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// OpenMP interface
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
typedef uint32_t omp_lock_t; /* arbitrary type of the right length */
|
||||
typedef uint64_t omp_nest_lock_t; /* arbitrary type of the right length */
|
||||
|
||||
typedef enum omp_sched_t {
|
||||
omp_sched_static = 1, /* chunkSize >0 */
|
||||
omp_sched_dynamic = 2, /* chunkSize >0 */
|
||||
omp_sched_guided = 3, /* chunkSize >0 */
|
||||
omp_sched_auto = 4, /* no chunkSize */
|
||||
} omp_sched_t;
|
||||
|
||||
typedef enum omp_proc_bind_t {
|
||||
omp_proc_bind_false = 0,
|
||||
omp_proc_bind_true = 1,
|
||||
omp_proc_bind_master = 2,
|
||||
omp_proc_bind_close = 3,
|
||||
omp_proc_bind_spread = 4
|
||||
} omp_proc_bind_t;
|
||||
|
||||
EXTERN double omp_get_wtick(void);
|
||||
EXTERN double omp_get_wtime(void);
|
||||
|
||||
EXTERN void omp_set_num_threads(int num);
|
||||
EXTERN int omp_get_num_threads(void);
|
||||
EXTERN int omp_get_max_threads(void);
|
||||
EXTERN int omp_get_thread_limit(void);
|
||||
EXTERN int omp_get_thread_num(void);
|
||||
EXTERN int omp_get_num_procs(void);
|
||||
EXTERN int omp_in_parallel(void);
|
||||
EXTERN int omp_in_final(void);
|
||||
EXTERN void omp_set_dynamic(int flag);
|
||||
EXTERN int omp_get_dynamic(void);
|
||||
EXTERN void omp_set_nested(int flag);
|
||||
EXTERN int omp_get_nested(void);
|
||||
EXTERN void omp_set_max_active_levels(int level);
|
||||
EXTERN int omp_get_max_active_levels(void);
|
||||
EXTERN int omp_get_level(void);
|
||||
EXTERN int omp_get_active_level(void);
|
||||
EXTERN int omp_get_ancestor_thread_num(int level);
|
||||
EXTERN int omp_get_team_size(int level);
|
||||
|
||||
EXTERN void omp_init_lock(omp_lock_t *lock);
|
||||
EXTERN void omp_init_nest_lock(omp_nest_lock_t *lock);
|
||||
EXTERN void omp_destroy_lock(omp_lock_t *lock);
|
||||
EXTERN void omp_destroy_nest_lock(omp_nest_lock_t *lock);
|
||||
EXTERN void omp_set_lock(omp_lock_t *lock);
|
||||
EXTERN void omp_set_nest_lock(omp_nest_lock_t *lock);
|
||||
EXTERN void omp_unset_lock(omp_lock_t *lock);
|
||||
EXTERN void omp_unset_nest_lock(omp_nest_lock_t *lock);
|
||||
EXTERN int omp_test_lock(omp_lock_t *lock);
|
||||
EXTERN int omp_test_nest_lock(omp_nest_lock_t *lock);
|
||||
|
||||
EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier);
|
||||
EXTERN void omp_set_schedule(omp_sched_t kind, int modifier);
|
||||
EXTERN omp_proc_bind_t omp_get_proc_bind(void);
|
||||
EXTERN int omp_get_cancellation(void);
|
||||
EXTERN void omp_set_default_device(int deviceId);
|
||||
EXTERN int omp_get_default_device(void);
|
||||
EXTERN int omp_get_num_devices(void);
|
||||
EXTERN int omp_get_num_teams(void);
|
||||
EXTERN int omp_get_team_num(void);
|
||||
EXTERN int omp_is_initial_device(void);
|
||||
EXTERN int omp_get_initial_device(void);
|
||||
EXTERN int omp_get_max_task_priority(void);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// OMPTARGET_NVPTX private (debug / temporary?) interface
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// for debug
|
||||
EXTERN void __kmpc_print_str(char *title);
|
||||
EXTERN void __kmpc_print_title_int(char *title, int data);
|
||||
EXTERN void __kmpc_print_index(char *title, int i);
|
||||
EXTERN void __kmpc_print_int(int data);
|
||||
EXTERN void __kmpc_print_double(double data);
|
||||
EXTERN void __kmpc_print_address_int64(int64_t data);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// the interface below is adapted from the kmpc host interface
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// kmp specific types
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
typedef enum kmp_sched_t {
|
||||
kmp_sched_static_chunk = 33,
|
||||
kmp_sched_static_nochunk = 34,
|
||||
kmp_sched_dynamic = 35,
|
||||
kmp_sched_guided = 36,
|
||||
kmp_sched_runtime = 37,
|
||||
kmp_sched_auto = 38,
|
||||
|
||||
kmp_sched_static_ordered = 65,
|
||||
kmp_sched_static_nochunk_ordered = 66,
|
||||
kmp_sched_dynamic_ordered = 67,
|
||||
kmp_sched_guided_ordered = 68,
|
||||
kmp_sched_runtime_ordered = 69,
|
||||
kmp_sched_auto_ordered = 70,
|
||||
|
||||
kmp_sched_distr_static_chunk = 91,
|
||||
kmp_sched_distr_static_nochunk = 92,
|
||||
kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
|
||||
|
||||
kmp_sched_default = kmp_sched_static_nochunk,
|
||||
kmp_sched_unordered_first = kmp_sched_static_chunk,
|
||||
kmp_sched_unordered_last = kmp_sched_auto,
|
||||
kmp_sched_ordered_first = kmp_sched_static_ordered,
|
||||
kmp_sched_ordered_last = kmp_sched_auto_ordered,
|
||||
kmp_sched_distribute_first = kmp_sched_distr_static_chunk,
|
||||
kmp_sched_distribute_last =
|
||||
kmp_sched_distr_static_chunk_sched_static_chunkone,
|
||||
|
||||
/* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers.
|
||||
* Since we need to distinguish the three possible cases (no modifier,
|
||||
* monotonic modifier, nonmonotonic modifier), we need separate bits for
|
||||
* each modifier. The absence of monotonic does not imply nonmonotonic,
|
||||
* especially since 4.5 says that the behaviour of the "no modifier" case
|
||||
* is implementation defined, but will become "nonmonotonic" in 5.0.
|
||||
*
|
||||
* Since we're passing a full 32 bit value, we can use a couple of high
|
||||
* bits for these flags; out of paranoia we avoid the sign bit.
|
||||
*
|
||||
* These modifiers can be or-ed into non-static schedules by the compiler
|
||||
* to pass the additional information. They will be stripped early in the
|
||||
* processing in __kmp_dispatch_init when setting up schedules, so
|
||||
* most of the code won't ever see schedules with these bits set.
|
||||
*/
|
||||
kmp_sched_modifier_monotonic = (1 << 29),
|
||||
/**< Set if the monotonic schedule modifier was present */
|
||||
kmp_sched_modifier_nonmonotonic = (1 << 30),
|
||||
/**< Set if the nonmonotonic schedule modifier was present */
|
||||
|
||||
#define SCHEDULE_WITHOUT_MODIFIERS(s) \
|
||||
(enum kmp_sched_t)( \
|
||||
(s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic))
|
||||
#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0)
|
||||
#define SCHEDULE_HAS_NONMONOTONIC(s) \
|
||||
(((s)&kmp_sched_modifier_nonmonotonic) != 0)
|
||||
#define SCHEDULE_HAS_NO_MODIFIERS(s) \
|
||||
(((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \
|
||||
0)
|
||||
|
||||
} kmp_sched_t;
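/* Illustrative example (hypothetical values, not part of this patch): if the
 * compiler passes `kmp_sched_dynamic | kmp_sched_modifier_nonmonotonic` as the
 * schedule, SCHEDULE_WITHOUT_MODIFIERS() recovers kmp_sched_dynamic,
 * SCHEDULE_HAS_NONMONOTONIC() evaluates to true, and
 * SCHEDULE_HAS_NO_MODIFIERS() evaluates to false. */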
|
||||
|
||||
// parallel defs
|
||||
typedef void kmp_Indent;
|
||||
typedef void (*kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...);
|
||||
typedef void (*kmp_ReductFctPtr)(void *lhsData, void *rhsData);
|
||||
typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num);
|
||||
typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id,
|
||||
int16_t lane_offset,
|
||||
int16_t shortCircuit);
|
||||
typedef void (*kmp_CopyToScratchpadFctPtr)(void *reduceData, void *scratchpad,
|
||||
int32_t index, int32_t width);
|
||||
typedef void (*kmp_LoadReduceFctPtr)(void *reduceData, void *scratchpad,
|
||||
int32_t index, int32_t width,
|
||||
int32_t reduce);
|
||||
|
||||
// task defs
|
||||
typedef struct kmp_TaskDescr kmp_TaskDescr;
|
||||
typedef int32_t (*kmp_TaskFctPtr)(int32_t global_tid, kmp_TaskDescr *taskDescr);
|
||||
typedef struct kmp_TaskDescr {
|
||||
void *sharedPointerTable; // ptr to a table of shared var ptrs
|
||||
kmp_TaskFctPtr sub; // task subroutine
|
||||
int32_t partId; // unused
|
||||
kmp_TaskFctPtr destructors; // destructor of c++ first private
|
||||
} kmp_TaskDescr;
|
||||
// task dep defs
|
||||
#define KMP_TASKDEP_IN 0x1u
|
||||
#define KMP_TASKDEP_OUT 0x2u
|
||||
typedef struct kmp_TaskDep_Public {
|
||||
void *addr;
|
||||
size_t len;
|
||||
uint8_t flags; // bit 0: in, bit 1: out
|
||||
} kmp_TaskDep_Public;
|
||||
|
||||
// flags that interpret the interface part of tasking flags
|
||||
#define KMP_TASK_IS_TIED 0x1
|
||||
#define KMP_TASK_FINAL 0x2
|
||||
#define KMP_TASK_MERGED_IF0 0x4 /* unused */
|
||||
#define KMP_TASK_DESTRUCTOR_THUNK 0x8
|
||||
|
||||
// flags for task setup return
|
||||
#define KMP_CURRENT_TASK_NOT_SUSPENDED 0
|
||||
#define KMP_CURRENT_TASK_SUSPENDED 1
|
||||
|
||||
// sync defs
|
||||
typedef int32_t kmp_CriticalName[8];
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// flags for kstate (all bits initially off)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// first 2 bits used by kmp_Reduction (defined in kmp_reduction.cpp)
|
||||
#define KMP_REDUCTION_MASK 0x3
|
||||
#define KMP_SKIP_NEXT_CALL 0x4
|
||||
#define KMP_SKIP_NEXT_CANCEL_BARRIER 0x8
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// data
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// external interface
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// query
|
||||
EXTERN int32_t __kmpc_global_num_threads(kmp_Indent *loc); // missing
|
||||
EXTERN int32_t __kmpc_bound_thread_num(kmp_Indent *loc); // missing
|
||||
EXTERN int32_t __kmpc_bound_num_threads(kmp_Indent *loc); // missing
|
||||
EXTERN int32_t __kmpc_in_parallel(kmp_Indent *loc); // missing
|
||||
|
||||
// parallel
|
||||
EXTERN int32_t __kmpc_global_thread_num(kmp_Indent *loc);
|
||||
EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t num_threads);
|
||||
// simd
|
||||
EXTERN void __kmpc_push_simd_limit(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t simd_limit);
|
||||
// aee ... not supported
|
||||
// EXTERN void __kmpc_fork_call(kmp_Indent *loc, int32_t argc, kmp_ParFctPtr
|
||||
// microtask, ...);
|
||||
EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid);
|
||||
EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc,
|
||||
uint32_t global_tid);
|
||||
EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid);
|
||||
|
||||
// proc bind
|
||||
EXTERN void __kmpc_push_proc_bind(kmp_Indent *loc, uint32_t global_tid,
|
||||
int proc_bind);
|
||||
EXTERN int omp_get_num_places(void);
|
||||
EXTERN int omp_get_place_num_procs(int place_num);
|
||||
EXTERN void omp_get_place_proc_ids(int place_num, int *ids);
|
||||
EXTERN int omp_get_place_num(void);
|
||||
EXTERN int omp_get_partition_num_places(void);
|
||||
EXTERN void omp_get_partition_place_nums(int *place_nums);
|
||||
|
||||
// for static (no chunk or chunk)
|
||||
EXTERN void __kmpc_for_static_init_4(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t sched, int32_t *plastiter,
|
||||
int32_t *plower, int32_t *pupper,
|
||||
int32_t *pstride, int32_t incr,
|
||||
int32_t chunk);
|
||||
EXTERN void __kmpc_for_static_init_4u(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t sched, int32_t *plastiter,
|
||||
uint32_t *plower, uint32_t *pupper,
|
||||
int32_t *pstride, int32_t incr,
|
||||
int32_t chunk);
|
||||
EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t sched, int32_t *plastiter,
|
||||
int64_t *plower, int64_t *pupper,
|
||||
int64_t *pstride, int64_t incr,
|
||||
int64_t chunk);
|
||||
EXTERN void __kmpc_for_static_init_8u(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t sched, int32_t *plastiter1,
|
||||
uint64_t *plower, uint64_t *pupper,
|
||||
int64_t *pstride, int64_t incr,
|
||||
int64_t chunk);
|
||||
EXTERN
|
||||
void __kmpc_for_static_init_4_simple_spmd(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t sched, int32_t *plastiter,
|
||||
int32_t *plower, int32_t *pupper,
|
||||
int32_t *pstride, int32_t incr,
|
||||
int32_t chunk);
|
||||
EXTERN
|
||||
void __kmpc_for_static_init_4u_simple_spmd(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t sched, int32_t *plastiter,
|
||||
uint32_t *plower, uint32_t *pupper,
|
||||
int32_t *pstride, int32_t incr,
|
||||
int32_t chunk);
|
||||
EXTERN
|
||||
void __kmpc_for_static_init_8_simple_spmd(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t sched, int32_t *plastiter,
|
||||
int64_t *plower, int64_t *pupper,
|
||||
int64_t *pstride, int64_t incr,
|
||||
int64_t chunk);
|
||||
EXTERN
|
||||
void __kmpc_for_static_init_8u_simple_spmd(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t sched, int32_t *plastiter1,
|
||||
uint64_t *plower, uint64_t *pupper,
|
||||
int64_t *pstride, int64_t incr,
|
||||
int64_t chunk);
|
||||
EXTERN
|
||||
void __kmpc_for_static_init_4_simple_generic(kmp_Indent *loc,
|
||||
int32_t global_tid, int32_t sched,
|
||||
int32_t *plastiter,
|
||||
int32_t *plower, int32_t *pupper,
|
||||
int32_t *pstride, int32_t incr,
|
||||
int32_t chunk);
|
||||
EXTERN
|
||||
void __kmpc_for_static_init_4u_simple_generic(
|
||||
kmp_Indent *loc, int32_t global_tid, int32_t sched, int32_t *plastiter,
|
||||
uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
|
||||
int32_t chunk);
|
||||
EXTERN
|
||||
void __kmpc_for_static_init_8_simple_generic(kmp_Indent *loc,
|
||||
int32_t global_tid, int32_t sched,
|
||||
int32_t *plastiter,
|
||||
int64_t *plower, int64_t *pupper,
|
||||
int64_t *pstride, int64_t incr,
|
||||
int64_t chunk);
|
||||
EXTERN
|
||||
void __kmpc_for_static_init_8u_simple_generic(
|
||||
kmp_Indent *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1,
|
||||
uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
|
||||
int64_t chunk);
|
||||
|
||||
EXTERN void __kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid);
|
||||
|
||||
// for dynamic
|
||||
EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t sched, int32_t lower, int32_t upper,
|
||||
int32_t incr, int32_t chunk);
|
||||
EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t sched, uint32_t lower,
|
||||
uint32_t upper, int32_t incr,
|
||||
int32_t chunk);
|
||||
EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t sched, int64_t lower, int64_t upper,
|
||||
int64_t incr, int64_t chunk);
|
||||
EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t sched, uint64_t lower,
|
||||
uint64_t upper, int64_t incr,
|
||||
int64_t chunk);
|
||||
|
||||
EXTERN int __kmpc_dispatch_next_4(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t *plastiter, int32_t *plower,
|
||||
int32_t *pupper, int32_t *pstride);
|
||||
EXTERN int __kmpc_dispatch_next_4u(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t *plastiter, uint32_t *plower,
|
||||
uint32_t *pupper, int32_t *pstride);
|
||||
EXTERN int __kmpc_dispatch_next_8(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t *plastiter, int64_t *plower,
|
||||
int64_t *pupper, int64_t *pstride);
|
||||
EXTERN int __kmpc_dispatch_next_8u(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t *plastiter, uint64_t *plower,
|
||||
uint64_t *pupper, int64_t *pstride);
|
||||
|
||||
EXTERN void __kmpc_dispatch_fini_4(kmp_Indent *loc, int32_t global_tid);
|
||||
EXTERN void __kmpc_dispatch_fini_4u(kmp_Indent *loc, int32_t global_tid);
|
||||
EXTERN void __kmpc_dispatch_fini_8(kmp_Indent *loc, int32_t global_tid);
|
||||
EXTERN void __kmpc_dispatch_fini_8u(kmp_Indent *loc, int32_t global_tid);
|
||||
|
||||
// Support for reducing conditional lastprivate variables
|
||||
EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Indent *loc,
|
||||
int32_t global_tid,
|
||||
int32_t varNum, void *array);
|
||||
|
||||
// reduction
|
||||
EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid);
|
||||
EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
|
||||
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait(
|
||||
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
|
||||
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
|
||||
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
|
||||
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
|
||||
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
|
||||
EXTERN int32_t __kmpc_nvptx_simd_reduce_nowait(
|
||||
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct);
|
||||
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait(
|
||||
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
|
||||
kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
|
||||
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
|
||||
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
|
||||
kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
|
||||
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
|
||||
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
|
||||
kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct);
|
||||
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
|
||||
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
|
||||
|
||||
// sync barrier
|
||||
EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid);
|
||||
EXTERN void __kmpc_barrier_simple_spmd(kmp_Indent *loc_ref, int32_t tid);
|
||||
EXTERN void __kmpc_barrier_simple_generic(kmp_Indent *loc_ref, int32_t tid);
|
||||
EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc, int32_t global_tid);
|
||||
|
||||
// single
|
||||
EXTERN int32_t __kmpc_single(kmp_Indent *loc, int32_t global_tid);
|
||||
EXTERN void __kmpc_end_single(kmp_Indent *loc, int32_t global_tid);
|
||||
|
||||
// sync
|
||||
EXTERN int32_t __kmpc_master(kmp_Indent *loc, int32_t global_tid);
|
||||
EXTERN void __kmpc_end_master(kmp_Indent *loc, int32_t global_tid);
|
||||
EXTERN void __kmpc_ordered(kmp_Indent *loc, int32_t global_tid);
|
||||
EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t global_tid);
|
||||
EXTERN void __kmpc_critical(kmp_Indent *loc, int32_t global_tid,
|
||||
kmp_CriticalName *crit);
|
||||
EXTERN void __kmpc_end_critical(kmp_Indent *loc, int32_t global_tid,
|
||||
kmp_CriticalName *crit);
|
||||
EXTERN void __kmpc_flush(kmp_Indent *loc);
|
||||
|
||||
// vote
|
||||
EXTERN int32_t __kmpc_warp_active_thread_mask();
|
||||
|
||||
// tasks
|
||||
EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Indent *loc,
|
||||
uint32_t global_tid, int32_t flag,
|
||||
size_t sizeOfTaskInclPrivate,
|
||||
size_t sizeOfSharedTable,
|
||||
kmp_TaskFctPtr sub);
|
||||
EXTERN int32_t __kmpc_omp_task(kmp_Indent *loc, uint32_t global_tid,
|
||||
kmp_TaskDescr *newLegacyTaskDescr);
|
||||
EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, uint32_t global_tid,
|
||||
kmp_TaskDescr *newLegacyTaskDescr,
|
||||
int32_t depNum, void *depList,
|
||||
int32_t noAliasDepNum,
|
||||
void *noAliasDepList);
|
||||
EXTERN void __kmpc_omp_task_begin_if0(kmp_Indent *loc, uint32_t global_tid,
|
||||
kmp_TaskDescr *newLegacyTaskDescr);
|
||||
EXTERN void __kmpc_omp_task_complete_if0(kmp_Indent *loc, uint32_t global_tid,
|
||||
kmp_TaskDescr *newLegacyTaskDescr);
|
||||
EXTERN void __kmpc_omp_wait_deps(kmp_Indent *loc, uint32_t global_tid,
|
||||
int32_t depNum, void *depList,
|
||||
int32_t noAliasDepNum, void *noAliasDepList);
|
||||
EXTERN void __kmpc_taskgroup(kmp_Indent *loc, uint32_t global_tid);
|
||||
EXTERN void __kmpc_end_taskgroup(kmp_Indent *loc, uint32_t global_tid);
|
||||
EXTERN int32_t __kmpc_omp_taskyield(kmp_Indent *loc, uint32_t global_tid,
|
||||
int end_part);
|
||||
EXTERN int32_t __kmpc_omp_taskwait(kmp_Indent *loc, uint32_t global_tid);
|
||||
EXTERN void __kmpc_taskloop(kmp_Indent *loc, uint32_t global_tid,
|
||||
kmp_TaskDescr *newKmpTaskDescr, int if_val,
|
||||
uint64_t *lb, uint64_t *ub, int64_t st, int nogroup,
|
||||
int32_t sched, uint64_t grainsize, void *task_dup);
|
||||
|
||||
// cancel
|
||||
EXTERN int32_t __kmpc_cancellationpoint(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t cancelVal);
|
||||
EXTERN int32_t __kmpc_cancel(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t cancelVal);
|
||||
|
||||
// non standard
|
||||
EXTERN void __kmpc_kernel_init_params(void *ReductionScratchpadPtr);
|
||||
EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime);
|
||||
EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
|
||||
EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
|
||||
int16_t RequiresDataSharing);
|
||||
EXTERN void __kmpc_spmd_kernel_deinit();
|
||||
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
|
||||
int16_t IsOMPRuntimeInitialized);
|
||||
EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
|
||||
int16_t IsOMPRuntimeInitialized);
|
||||
EXTERN void __kmpc_kernel_end_parallel();
|
||||
EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
|
||||
bool *IsFinal,
|
||||
int32_t *LaneSource);
|
||||
EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer);
|
||||
EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
|
||||
bool *IsFinal, int32_t *LaneSource,
|
||||
int32_t *LaneId, int32_t *NumLanes);
|
||||
EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer);
|
||||
|
||||
// The slot used for data sharing by the master and worker threads. We use a
|
||||
// complete (default size) version and an incomplete one so that we allow sizes
// greater than the default.
|
||||
struct __kmpc_data_sharing_slot {
|
||||
__kmpc_data_sharing_slot *Next;
|
||||
void *DataEnd;
|
||||
char Data[];
|
||||
};
|
||||
EXTERN void
|
||||
__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *RootS,
|
||||
size_t InitialDataSize);
|
||||
EXTERN void *__kmpc_data_sharing_environment_begin(
|
||||
__kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
|
||||
void **SavedSharedFrame, int32_t *SavedActiveThreads,
|
||||
size_t SharingDataSize, size_t SharingDefaultDataSize,
|
||||
int16_t IsOMPRuntimeInitialized);
|
||||
EXTERN void __kmpc_data_sharing_environment_end(
|
||||
__kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack,
|
||||
void **SavedSharedFrame, int32_t *SavedActiveThreads, int32_t IsEntryPoint);
|
||||
|
||||
EXTERN void *
|
||||
__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
|
||||
int16_t IsOMPRuntimeInitialized);
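// Rough sketch of how a caller might pair the data sharing entry points above
// (hypothetical, for illustration only; not part of this patch):
//
//   __kmpc_data_sharing_slot *SavedSlot; void *SavedStack, *SavedFrame;
//   int32_t SavedActive;
//   void *Frame = __kmpc_data_sharing_environment_begin(
//       &SavedSlot, &SavedStack, &SavedFrame, &SavedActive,
//       /*SharingDataSize=*/64, /*SharingDefaultDataSize=*/64,
//       /*IsOMPRuntimeInitialized=*/1);
//   // ... use Frame to share data with the workers ...
//   __kmpc_data_sharing_environment_end(&SavedSlot, &SavedStack, &SavedFrame,
//                                       &SavedActive, /*IsEntryPoint=*/1);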
|
||||
#endif
|
462
openmp/libomptarget/deviceRTLs/nvptx/src/libcall.cu
Normal file
@ -0,0 +1,462 @@
|
||||
//===------------ libcall.cu - NVPTX OpenMP user calls ----------- CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file implements the OpenMP runtime functions that can be
|
||||
// invoked by the user in an OpenMP region
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "omptarget-nvptx.h"
|
||||
|
||||
// Timer precision is 1ns
|
||||
#define TIMER_PRECISION ((double)1E-9)
|
||||
|
||||
EXTERN double omp_get_wtick(void) {
|
||||
PRINT(LD_IO, "omp_get_wtick() returns %g\n", TIMER_PRECISION);
|
||||
return TIMER_PRECISION;
|
||||
}
|
||||
|
||||
EXTERN double omp_get_wtime(void) {
|
||||
unsigned long long nsecs;
|
||||
asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
|
||||
double rc = (double)nsecs * TIMER_PRECISION;
|
||||
PRINT(LD_IO, "call omp_get_wtime() returns %g\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
EXTERN void omp_set_num_threads(int num) {
|
||||
PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num);
|
||||
if (num <= 0) {
|
||||
WARNING0(LW_INPUT, "expected positive num; ignore\n");
|
||||
} else {
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
|
||||
currTaskDescr->NThreads() = num;
|
||||
}
|
||||
}
|
||||
|
||||
EXTERN int omp_get_num_threads(void) {
|
||||
int tid = GetLogicalThreadIdInBlock();
|
||||
int rc = GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
|
||||
PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
EXTERN int omp_get_max_threads(void) {
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
|
||||
int rc = 1; // default is 1 thread avail
|
||||
if (!currTaskDescr->InParallelRegion()) {
|
||||
// not currently in a parallel region... all are available
|
||||
rc = GetNumberOfProcsInTeam();
|
||||
ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads");
|
||||
}
|
||||
PRINT(LD_IO, "call omp_get_max_threads() return %\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
EXTERN int omp_get_thread_limit(void) {
|
||||
// per contention group, i.e. threads in the current team
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
|
||||
int rc = currTaskDescr->ThreadLimit();
|
||||
PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
EXTERN int omp_get_thread_num() {
|
||||
int tid = GetLogicalThreadIdInBlock();
|
||||
int rc = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
|
||||
PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
EXTERN int omp_get_num_procs(void) {
|
||||
int rc = GetNumberOfProcsInDevice();
|
||||
PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
EXTERN int omp_in_parallel(void) {
|
||||
int rc = 0;
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
|
||||
if (currTaskDescr->InParallelRegion()) {
|
||||
rc = 1;
|
||||
}
|
||||
PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
EXTERN int omp_in_final(void) {
|
||||
// treat all tasks as final... Specs may expect runtime to keep
|
||||
// track more precisely if a task was actively set by users... This
|
||||
// is not explicitly specified; will treat as if runtime can
|
||||
// actively decide to put a non-final task into a final one.
|
||||
int rc = 1;
|
||||
PRINT(LD_IO, "call omp_in_final() returns %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
EXTERN void omp_set_dynamic(int flag) {
|
||||
PRINT(LD_IO, "call omp_set_dynamic(%d)\n", flag);
|
||||
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
|
||||
if (flag) {
|
||||
currTaskDescr->SetDynamic();
|
||||
} else {
|
||||
currTaskDescr->ClearDynamic();
|
||||
}
|
||||
}
|
||||
|
||||
EXTERN int omp_get_dynamic(void) {
|
||||
int rc = 0;
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
|
||||
if (currTaskDescr->IsDynamic()) {
|
||||
rc = 1;
|
||||
}
|
||||
PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
EXTERN void omp_set_nested(int flag) {
|
||||
PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n",
|
||||
flag);
|
||||
}
|
||||
|
||||
EXTERN int omp_get_nested(void) {
|
||||
int rc = 0;
|
||||
PRINT(LD_IO, "call omp_get_nested() returns %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
EXTERN void omp_set_max_active_levels(int level) {
|
||||
PRINT(LD_IO,
|
||||
"call omp_set_max_active_levels(%d) is ignored (no nested support)\n",
|
||||
level);
|
||||
}
|
||||
|
||||
EXTERN int omp_get_max_active_levels(void) {
|
||||
int rc = 1;
|
||||
PRINT(LD_IO, "call omp_get_max_active_levels() returns %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
EXTERN int omp_get_level(void) {
|
||||
int level = 0;
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
|
||||
ASSERT0(LT_FUSSY, currTaskDescr,
|
||||
"do not expect fct to be called in a non-active thread");
|
||||
do {
|
||||
if (currTaskDescr->IsParallelConstruct()) {
|
||||
level++;
|
||||
}
|
||||
currTaskDescr = currTaskDescr->GetPrevTaskDescr();
|
||||
} while (currTaskDescr);
|
||||
PRINT(LD_IO, "call omp_get_level() returns %d\n", level);
|
||||
return level;
|
||||
}
|
||||
|
||||
EXTERN int omp_get_active_level(void) {
|
||||
int level = 0; // no active level parallelism
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
|
||||
ASSERT0(LT_FUSSY, currTaskDescr,
|
||||
"do not expect fct to be called in a non-active thread");
|
||||
do {
|
||||
if (currTaskDescr->ThreadsInTeam() > 1) {
|
||||
// has a parallel with more than one thread in team
|
||||
level = 1;
|
||||
break;
|
||||
}
|
||||
currTaskDescr = currTaskDescr->GetPrevTaskDescr();
|
||||
} while (currTaskDescr);
|
||||
PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level)
|
||||
return level;
|
||||
}
|
||||
|
||||
EXTERN int omp_get_ancestor_thread_num(int level) {
|
||||
int rc = 0; // default at level 0
|
||||
if (level >= 0) {
|
||||
int totLevel = omp_get_level();
|
||||
if (level <= totLevel) {
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
|
||||
int steps = totLevel - level;
|
||||
PRINT(LD_IO, "backtrack %d steps\n", steps);
|
||||
ASSERT0(LT_FUSSY, currTaskDescr,
|
||||
"do not expect fct to be called in a non-active thread");
|
||||
do {
|
||||
if (DON(LD_IOD)) {
|
||||
// print current state
|
||||
omp_sched_t sched = currTaskDescr->GetRuntimeSched();
|
||||
PRINT(LD_ALL,
|
||||
"task descr %s %d: %s, in par %d, dyn %d, rt sched %d,"
|
||||
" chunk %lld; tid %d, tnum %d, nthreads %d\n",
|
||||
"ancestor", steps,
|
||||
(currTaskDescr->IsParallelConstruct() ? "par" : "task"),
|
||||
currTaskDescr->InParallelRegion(), currTaskDescr->IsDynamic(),
|
||||
sched, currTaskDescr->RuntimeChunkSize(),
|
||||
currTaskDescr->ThreadId(), currTaskDescr->ThreadsInTeam(),
|
||||
currTaskDescr->NThreads());
|
||||
}
|
||||
|
||||
if (currTaskDescr->IsParallelConstruct()) {
|
||||
// found the level
|
||||
if (!steps) {
|
||||
rc = currTaskDescr->ThreadId();
|
||||
break;
|
||||
}
|
||||
steps--;
|
||||
}
|
||||
currTaskDescr = currTaskDescr->GetPrevTaskDescr();
|
||||
} while (currTaskDescr);
|
||||
ASSERT0(LT_FUSSY, !steps, "expected to find all steps");
|
||||
}
|
||||
}
|
||||
PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level,
|
||||
rc)
|
||||
return rc;
|
||||
}
|
||||
|
||||
EXTERN int omp_get_team_size(int level) {
|
||||
int rc = 1; // default at level 0
|
||||
if (level >= 0) {
|
||||
int totLevel = omp_get_level();
|
||||
if (level <= totLevel) {
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
|
||||
int steps = totLevel - level;
|
||||
ASSERT0(LT_FUSSY, currTaskDescr,
|
||||
"do not expect fct to be called in a non-active thread");
|
||||
do {
|
||||
if (currTaskDescr->IsParallelConstruct()) {
|
||||
if (!steps) {
|
||||
// found the level
|
||||
rc = currTaskDescr->ThreadsInTeam();
|
||||
break;
|
||||
}
|
||||
steps--;
|
||||
}
|
||||
currTaskDescr = currTaskDescr->GetPrevTaskDescr();
|
||||
} while (currTaskDescr);
|
||||
ASSERT0(LT_FUSSY, !steps, "expected to find all steps");
|
||||
}
|
||||
}
|
||||
PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc)
|
||||
return rc;
|
||||
}
|
||||
|
||||
EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) {
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
|
||||
*kind = currTaskDescr->GetRuntimeSched();
|
||||
*modifier = currTaskDescr->RuntimeChunkSize();
|
||||
PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n",
|
||||
(int)*kind, *modifier);
|
||||
}
|
||||
|
||||
EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) {
|
||||
PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind,
|
||||
modifier);
|
||||
if (kind >= omp_sched_static && kind < omp_sched_auto) {
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor();
|
||||
currTaskDescr->SetRuntimeSched(kind);
|
||||
currTaskDescr->RuntimeChunkSize() = modifier;
|
||||
PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %d\n",
|
||||
(int)currTaskDescr->GetRuntimeSched(),
|
||||
currTaskDescr->RuntimeChunkSize());
|
||||
}
|
||||
}
|
||||
|
||||
EXTERN omp_proc_bind_t omp_get_proc_bind(void) {
|
||||
PRINT0(LD_IO, "call omp_get_proc_bin() is true, regardless on state\n");
|
||||
return omp_proc_bind_true;
|
||||
}
|
||||
|
||||
EXTERN int omp_get_num_places(void) {
|
||||
PRINT0(LD_IO, "call omp_get_num_places() returns 0\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
EXTERN int omp_get_place_num_procs(int place_num) {
|
||||
PRINT0(LD_IO, "call omp_get_place_num_procs() returns 0\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
EXTERN void omp_get_place_proc_ids(int place_num, int *ids) {
|
||||
PRINT0(LD_IO, "call to omp_get_place_proc_ids()\n");
|
||||
}
|
||||
|
||||
EXTERN int omp_get_place_num(void) {
|
||||
PRINT0(LD_IO, "call to omp_get_place_num() returns 0\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
EXTERN int omp_get_partition_num_places(void) {
|
||||
PRINT0(LD_IO, "call to omp_get_partition_num_places() returns 0\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
EXTERN void omp_get_partition_place_nums(int *place_nums) {
|
||||
PRINT0(LD_IO, "call to omp_get_partition_place_nums()\n");
|
||||
}
|
||||
|
||||
EXTERN int omp_get_cancellation(void) {
|
||||
int rc = FALSE; // currently false only
|
||||
PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
EXTERN void omp_set_default_device(int deviceId) {
|
||||
PRINT0(LD_IO, "call omp_get_default_device() is undef on device\n");
|
||||
}
|
||||
|
||||
EXTERN int omp_get_default_device(void) {
|
||||
PRINT0(LD_IO,
|
||||
"call omp_get_default_device() is undef on device, returns 0\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
EXTERN int omp_get_num_devices(void) {
|
||||
PRINT0(LD_IO, "call omp_get_num_devices() is undef on device, returns 0\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
EXTERN int omp_get_num_teams(void) {
|
||||
int rc = GetNumberOfOmpTeams();
|
||||
PRINT(LD_IO, "call omp_get_num_teams() returns %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
EXTERN int omp_get_team_num() {
|
||||
int rc = GetOmpTeamId();
|
||||
PRINT(LD_IO, "call omp_get_team_num() returns %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
||||
EXTERN int omp_is_initial_device(void) {
|
||||
PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n");
|
||||
return 0; // 0 by def on device
|
||||
}
|
||||
|
||||
// Unspecified on the device.
|
||||
EXTERN int omp_get_initial_device(void) {
|
||||
PRINT0(LD_IO, "call omp_get_initial_device() returns 0\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Unused for now.
|
||||
EXTERN int omp_get_max_task_priority(void) {
|
||||
PRINT0(LD_IO, "call omp_get_max_task_priority() returns 0\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// locks
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define __OMP_SPIN 1000
|
||||
#define UNSET 0
|
||||
#define SET 1
|
||||
|
||||
EXTERN void omp_init_lock(omp_lock_t *lock) {
|
||||
*lock = UNSET;
|
||||
PRINT0(LD_IO, "call omp_init_lock()\n");
|
||||
}
|
||||
|
||||
EXTERN void omp_destroy_lock(omp_lock_t *lock) {
|
||||
PRINT0(LD_IO, "call omp_destroy_lock()\n");
|
||||
}
|
||||
|
||||
EXTERN void omp_set_lock(omp_lock_t *lock) {
|
||||
// int atomicCAS(int* address, int compare, int val);
|
||||
// (old == compare ? val : old)
|
||||
int compare = UNSET;
|
||||
int val = SET;
|
||||
|
||||
// TODO: not sure spinning is a good idea here..
|
||||
while (atomicCAS(lock, compare, val) != UNSET) {
|
||||
|
||||
clock_t start = clock();
|
||||
clock_t now;
|
||||
for (;;) {
|
||||
now = clock();
|
||||
clock_t cycles = now > start ? now - start : now + (0xffffffff - start);
|
||||
if (cycles >= __OMP_SPIN * blockIdx.x) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} // wait for 0 to be the read value
|
||||
|
||||
PRINT0(LD_IO, "call omp_set_lock()\n");
|
||||
}
|
||||
|
||||
EXTERN void omp_unset_lock(omp_lock_t *lock) {
|
||||
int compare = SET;
|
||||
int val = UNSET;
|
||||
int old = atomicCAS(lock, compare, val);
|
||||
|
||||
PRINT0(LD_IO, "call omp_unset_lock()\n");
|
||||
}
|
||||
|
||||
EXTERN int omp_test_lock(omp_lock_t *lock) {
|
||||
// int atomicCAS(int* address, int compare, int val);
|
||||
// (old == compare ? val : old)
|
||||
int compare = UNSET;
|
||||
int val = SET;
|
||||
|
||||
int ret = atomicCAS(lock, compare, val);
|
||||
|
||||
PRINT(LD_IO, "call omp_test_lock() return %d\n", ret);
|
||||
|
||||
return ret;
|
||||
}
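// Illustrative device-side usage of the lock API above (hypothetical user
// code, not part of this patch):
//
//   omp_lock_t l;
//   omp_init_lock(&l);
//   omp_set_lock(&l);     // spins via atomicCAS until the lock is acquired
//   // ... critical section ...
//   omp_unset_lock(&l);
//   omp_destroy_lock(&l);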
|
||||
|
||||
// for xlf Fortran
|
||||
// In Fortran, the return is LOGICAL type
|
||||
|
||||
#define FLOGICAL long
|
||||
EXTERN FLOGICAL __xlf_omp_is_initial_device_i8() {
|
||||
int ret = omp_is_initial_device();
|
||||
if (ret == 0)
|
||||
return (FLOGICAL)0;
|
||||
else
|
||||
return (FLOGICAL)1;
|
||||
}
|
||||
|
||||
EXTERN int __xlf_omp_is_initial_device_i4() {
|
||||
int ret = omp_is_initial_device();
|
||||
if (ret == 0)
|
||||
return 0;
|
||||
else
|
||||
return 1;
|
||||
}
|
||||
|
||||
EXTERN long __xlf_omp_get_team_num_i4() {
|
||||
int ret = omp_get_team_num();
|
||||
return (long)ret;
|
||||
}
|
||||
|
||||
EXTERN long __xlf_omp_get_num_teams_i4() {
|
||||
int ret = omp_get_num_teams();
|
||||
return (long)ret;
|
||||
}
|
||||
|
||||
EXTERN void xlf_debug_print_int(int *p) {
|
||||
printf("xlf DEBUG %d): %p %d\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
|
||||
}
|
||||
|
||||
EXTERN void xlf_debug_print_long(long *p) {
|
||||
printf("xlf DEBUG %d): %p %ld\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
|
||||
}
|
||||
|
||||
EXTERN void xlf_debug_print_float(float *p) {
|
||||
printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
|
||||
}
|
||||
|
||||
EXTERN void xlf_debug_print_double(double *p) {
|
||||
printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p);
|
||||
}
|
||||
|
||||
EXTERN void xlf_debug_print_addr(void *p) {
|
||||
printf("xlf DEBUG %d): %p \n", omp_get_team_num(), p);
|
||||
}
|
772
openmp/libomptarget/deviceRTLs/nvptx/src/loop.cu
Normal file
@ -0,0 +1,772 @@
|
||||
//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the implementation of the KMPC interface
|
||||
// for the loop construct plus other worksharing constructs that use the same
|
||||
// interface as loops.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "omptarget-nvptx.h"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// template class that encapsulate all the helper functions
|
||||
//
|
||||
// T is loop iteration type (32 | 64) (unsigned | signed)
|
||||
// ST is the signed version of T
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T, typename ST> class omptarget_nvptx_LoopSupport {
|
||||
public:
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Loop with static scheduling with chunk
|
||||
|
||||
// Generic implementation of OMP loop scheduling with static policy
|
||||
/*! \brief Calculate initial bounds for static loop and stride
|
||||
* @param[in] loc location in code of the call (not used here)
|
||||
* @param[in] global_tid global thread id
|
||||
* @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
|
||||
* @param[in] plastiter pointer to last iteration
|
||||
* @param[in,out] pointer to loop lower bound. It will contain value of
|
||||
* lower bound of first chunk
|
||||
* @param[in,out] pointer to loop upper bound. It will contain value of
|
||||
* upper bound of first chunk
|
||||
* @param[in,out] pointer to loop stride. It will contain value of stride
|
||||
* between two successive chunks executed by the same thread
|
||||
* @param[in] loop increment bump
|
||||
* @param[in] chunk size
|
||||
*/
|
||||
|
||||
// helper function for static chunk
|
||||
INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride,
|
||||
ST chunk, T entityId, T numberOfEntities) {
|
||||
// each thread executes multiple chunks all of the same size, except
|
||||
// the last one
|
||||
|
||||
// distance between two successive chunks
|
||||
stride = numberOfEntities * chunk;
|
||||
lb = lb + entityId * chunk;
|
||||
T inputUb = ub;
|
||||
ub = lb + chunk - 1; // Clang uses i <= ub
|
||||
// Say ub' is the beginning of the last chunk. Then whoever has a
|
||||
// lower bound plus a multiple of the increment equal to ub' is
|
||||
// the last one.
|
||||
T beginingLastChunk = inputUb - (inputUb % chunk);
|
||||
last = ((beginingLastChunk - lb) % stride) == 0;
|
||||
}
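// Worked example (assumed values, for illustration only): with lb=0, ub=99,
// chunk=10, numberOfEntities=4 and entityId=1, the code above yields
// stride=40, lb=10, ub=19; the last chunk starts at 90 and is executed by
// entity 1 (10 + 2*40 == 90), so last==1 for that entity.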
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Loop with static scheduling without chunk
|
||||
|
||||
// helper function for static no chunk
|
||||
INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride,
|
||||
ST &chunk, T entityId,
|
||||
T numberOfEntities) {
|
||||
// No chunk size specified. Each thread or warp gets at most one
|
||||
// chunk; chunks are all of nearly equal size
|
||||
T loopSize = ub - lb + 1;
|
||||
|
||||
chunk = loopSize / numberOfEntities;
|
||||
T leftOver = loopSize - chunk * numberOfEntities;
|
||||
|
||||
if (entityId < leftOver) {
|
||||
chunk++;
|
||||
lb = lb + entityId * chunk;
|
||||
} else {
|
||||
lb = lb + entityId * chunk + leftOver;
|
||||
}
|
||||
|
||||
T inputUb = ub;
|
||||
ub = lb + chunk - 1; // Clang uses i <= ub
|
||||
last = ub == inputUb;
|
||||
stride = loopSize; // make sure we only do 1 chunk per warp
|
||||
}
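// Worked example (assumed values, for illustration only): with lb=0, ub=9 and
// numberOfEntities=4, loopSize=10, chunk=2 and leftOver=2; entities 0 and 1
// get 3 iterations each ([0,2] and [3,5]), entities 2 and 3 get 2 each
// ([6,7] and [8,9]), and only entity 3 reaches the original upper bound, so
// last==1 only for entity 3.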
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Support for Static Init
|
||||
|
||||
INLINE static void for_static_init(int32_t schedtype, int32_t *plastiter,
|
||||
T *plower, T *pupper, ST *pstride,
|
||||
ST chunk, bool IsSPMDExecutionMode,
|
||||
bool IsOMPRuntimeUnavailable = false) {
|
||||
// When IsOMPRuntimeUnavailable is true, we assume that the caller is
|
||||
// in an L0 parallel region and that all worker threads participate.
|
||||
|
||||
int tid = GetLogicalThreadIdInBlock();
|
||||
|
||||
// Assume we are in teams region or that we use a single block
|
||||
// per target region
|
||||
ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(
|
||||
tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable);
|
||||
|
||||
// All warps that are in excess of the maximum requested, do
|
||||
// not execute the loop
|
||||
PRINT(LD_LOOP,
|
||||
"OMP Thread %d: schedule type %d, chunk size = %lld, mytid "
|
||||
"%d, num tids %d\n",
|
||||
GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable),
|
||||
schedtype, P64(chunk),
|
||||
GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable),
|
||||
GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
|
||||
IsOMPRuntimeUnavailable));
|
||||
ASSERT0(
|
||||
LT_FUSSY,
|
||||
(GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable)) <
|
||||
(GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
|
||||
IsOMPRuntimeUnavailable)),
|
||||
"current thread is not needed here; error");
|
||||
|
||||
// copy
|
||||
int lastiter = 0;
|
||||
T lb = *plower;
|
||||
T ub = *pupper;
|
||||
ST stride = *pstride;
|
||||
T entityId, numberOfEntities;
|
||||
// init
|
||||
switch (schedtype) {
|
||||
case kmp_sched_static_chunk: {
|
||||
if (chunk > 0) {
|
||||
entityId =
|
||||
GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable);
|
||||
numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
|
||||
IsOMPRuntimeUnavailable);
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
|
||||
numberOfEntities);
|
||||
break;
|
||||
}
|
||||
} // note: if chunk <=0, use nochunk
|
||||
case kmp_sched_static_nochunk: {
|
||||
entityId =
|
||||
GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable);
|
||||
numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
|
||||
IsOMPRuntimeUnavailable);
|
||||
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, entityId,
|
||||
numberOfEntities);
|
||||
break;
|
||||
}
|
||||
case kmp_sched_distr_static_chunk: {
|
||||
if (chunk > 0) {
|
||||
entityId = GetOmpTeamId();
|
||||
numberOfEntities = GetNumberOfOmpTeams();
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
|
||||
numberOfEntities);
|
||||
break;
|
||||
} // note: if chunk <=0, use nochunk
|
||||
}
|
||||
case kmp_sched_distr_static_nochunk: {
|
||||
entityId = GetOmpTeamId();
|
||||
numberOfEntities = GetNumberOfOmpTeams();
|
||||
|
||||
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, entityId,
|
||||
numberOfEntities);
|
||||
break;
|
||||
}
|
||||
case kmp_sched_distr_static_chunk_sched_static_chunkone: {
|
||||
entityId =
|
||||
GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
|
||||
IsOMPRuntimeUnavailable) *
|
||||
GetOmpTeamId() +
|
||||
GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable);
|
||||
numberOfEntities = GetNumberOfOmpTeams() *
|
||||
GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
|
||||
IsOMPRuntimeUnavailable);
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
|
||||
numberOfEntities);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
ASSERT(LT_FUSSY, FALSE, "unknown schedtype %d", schedtype);
|
||||
PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n",
|
||||
schedtype);
|
||||
entityId =
|
||||
GetOmpThreadId(tid, IsSPMDExecutionMode, IsOMPRuntimeUnavailable);
|
||||
numberOfEntities = GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
|
||||
IsOMPRuntimeUnavailable);
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, entityId,
|
||||
numberOfEntities);
|
||||
}
|
||||
}
|
||||
// copy back
|
||||
*plastiter = lastiter;
|
||||
*plower = lb;
|
||||
*pupper = ub;
|
||||
*pstride = stride;
|
||||
PRINT(LD_LOOP,
|
||||
"Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld\n",
|
||||
GetNumberOfOmpThreads(tid, IsSPMDExecutionMode,
|
||||
IsOMPRuntimeUnavailable),
|
||||
GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper),
|
||||
P64(*pstride));
|
||||
}
|
||||
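As a rough illustration of the switch above, the sketch below (plain host C++, with made-up enum and struct names) shows which entity ends up owning chunks for each family of schedule kinds: the team's threads for the plain static schedules, teams for the distribute schedules, and every thread in the league for the combined distribute parallel for case.

#include <cstdio>

// Illustrative mapping from schedule kind to the entity used for
// partitioning; the enum names below are stand-ins for the kmp_sched_*
// cases handled in for_static_init.
struct Entity {
  int id;
  int count;
};

enum MockSched { StaticForThreads, DistributeForTeams, CombinedDistParFor };

static Entity pickEntity(MockSched sched, int tid, int nthreads, int teamId,
                         int nteams) {
  switch (sched) {
  case StaticForThreads:
    return {tid, nthreads};                              // kmp_sched_static_*
  case DistributeForTeams:
    return {teamId, nteams};                             // kmp_sched_distr_static_*
  case CombinedDistParFor:
  default:
    return {nthreads * teamId + tid, nteams * nthreads}; // ..._chunkone
  }
}

int main() {
  // Team 1, thread 2, in a league of 3 teams x 4 threads.
  Entity e = pickEntity(CombinedDistParFor, /*tid=*/2, /*nthreads=*/4,
                        /*teamId=*/1, /*nteams=*/3);
  std::printf("entity %d of %d\n", e.id, e.count); // entity 6 of 12
  return 0;
}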
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Support for dispatch Init
|
||||
|
||||
INLINE static int OrderedSchedule(kmp_sched_t schedule) {
  return schedule >= kmp_sched_ordered_first &&
         schedule <= kmp_sched_ordered_last;
}
|
||||
|
||||
INLINE static void dispatch_init(kmp_sched_t schedule, T lb, T ub, ST st,
|
||||
ST chunk) {
|
||||
int tid = GetLogicalThreadIdInBlock();
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
|
||||
T tnum = currTaskDescr->ThreadsInTeam();
|
||||
T tripCount = ub - lb + 1; // +1 because ub is inclusive
|
||||
ASSERT0(
|
||||
LT_FUSSY,
|
||||
GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()) <
|
||||
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
|
||||
"current thread is not needed here; error");
|
||||
|
||||
  /* Currently just ignore the monotonic and non-monotonic modifiers
   * (the compiler isn't producing them yet anyway).
   * When it does, we'll want to look at them somewhere here and use that
   * information to add to our schedule choice. We shouldn't need to pass
   * them on; they merely affect which schedule we can legally choose for
   * various dynamic cases (in particular, whether or not a stealing scheme
   * is legal).
   */
  schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
|
||||
|
||||
// Process schedule.
|
||||
if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
|
||||
PRINT(LD_LOOP,
|
||||
"go sequential as tnum=%d, trip count %lld, ordered sched=%d\n",
|
||||
tnum, P64(tripCount), schedule);
|
||||
schedule = kmp_sched_static_chunk;
|
||||
chunk = tripCount; // one thread gets the whole loop
|
||||
|
||||
} else if (schedule == kmp_sched_runtime) {
|
||||
// process runtime
|
||||
omp_sched_t rtSched = currTaskDescr->GetRuntimeSched();
|
||||
chunk = currTaskDescr->RuntimeChunkSize();
|
||||
switch (rtSched) {
|
||||
case omp_sched_static: {
|
||||
if (chunk > 0)
|
||||
schedule = kmp_sched_static_chunk;
|
||||
else
|
||||
schedule = kmp_sched_static_nochunk;
|
||||
break;
|
||||
}
|
||||
case omp_sched_auto: {
|
||||
schedule = kmp_sched_static_chunk;
|
||||
chunk = 1;
|
||||
break;
|
||||
}
|
||||
case omp_sched_dynamic:
|
||||
case omp_sched_guided: {
|
||||
schedule = kmp_sched_dynamic;
|
||||
break;
|
||||
}
|
||||
}
|
||||
PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", schedule,
|
||||
P64(chunk));
|
||||
} else if (schedule == kmp_sched_auto) {
|
||||
schedule = kmp_sched_static_chunk;
|
||||
chunk = 1;
|
||||
PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", schedule,
|
||||
P64(chunk));
|
||||
} else {
|
||||
PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", schedule, P64(chunk));
|
||||
ASSERT(LT_FUSSY,
|
||||
schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
|
||||
"unknown schedule %d & chunk %lld\n", schedule, P64(chunk));
|
||||
}
|
||||
|
||||
// save sched state
|
||||
omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
|
||||
|
||||
// init schedules
|
||||
if (schedule == kmp_sched_static_chunk) {
|
||||
ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
|
||||
// save ub
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
|
||||
// compute static chunk
|
||||
ST stride;
|
||||
T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
|
||||
int lastiter = 0;
|
||||
ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
|
||||
// save computed params
|
||||
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
|
||||
omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
|
||||
PRINT(LD_LOOP,
|
||||
"dispatch init (static chunk) : num threads = %d, ub = %lld,"
|
||||
"next lower bound = %lld, stride = %lld\n",
|
||||
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->Stride(tid));
|
||||
|
||||
} else if (schedule == kmp_sched_static_nochunk) {
|
||||
ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
|
||||
// save ub
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
|
||||
// compute static chunk
|
||||
ST stride;
|
||||
T threadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
|
||||
int lastiter = 0;
|
||||
ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
|
||||
// save computed params
|
||||
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
|
||||
omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
|
||||
PRINT(LD_LOOP,
|
||||
"dispatch init (static nochunk) : num threads = %d, ub = %lld,"
|
||||
"next lower bound = %lld, stride = %lld\n",
|
||||
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->Stride(tid));
|
||||
|
||||
} else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
|
||||
if (chunk < 1)
|
||||
chunk = 1;
|
||||
Counter eventNum = ((tripCount - 1) / chunk) + 1; // number of chunks
|
||||
// but each thread (but one) must discover that it is last
|
||||
eventNum += tnum;
|
||||
omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
|
||||
omptarget_nvptx_threadPrivateContext->EventsNumber(tid) = eventNum;
|
||||
PRINT(LD_LOOP,
|
||||
"dispatch init (dyn) : num threads = %d, ub = %lld, chunk %lld, "
|
||||
"events number = %lld\n",
|
||||
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
|
||||
omptarget_nvptx_threadPrivateContext->Chunk(tid),
|
||||
omptarget_nvptx_threadPrivateContext->EventsNumber(tid));
|
||||
}
|
||||
}
|
||||
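A small worked example of the event budget computed above for dynamic and guided schedules; the helper name is hypothetical, but the formula is the one used in dispatch_init: one event per chunk, plus one per thread, so that every thread can draw one extra (empty) chunk and observe completion.

#include <cstdint>
#include <cstdio>

// Number of counter "events" budgeted for a dynamic/guided loop.
static uint64_t eventBudget(uint64_t tripCount, uint64_t chunk,
                            uint64_t numThreads) {
  uint64_t numChunks = (tripCount - 1) / chunk + 1; // ceil(tripCount / chunk)
  return numChunks + numThreads;
}

int main() {
  // 100 iterations, chunk 8, 4 threads -> 13 chunks + 4 = 17 events.
  std::printf("events = %llu\n", (unsigned long long)eventBudget(100, 8, 4));
  return 0;
}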
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Support for dispatch next
|
||||
|
||||
INLINE static int DynamicNextChunk(omptarget_nvptx_CounterGroup &cg,
                                   Counter priv, T &lb, T &ub,
                                   Counter &chunkId, Counter &currentEvent,
                                   T chunkSize, T loopUpperBound) {
  // get next event atomically
  Counter nextEvent = cg.Next();
  // calculate chunk Id (priv was initialized upon entering the loop to
  // 'start' == 'event')
  chunkId = nextEvent - priv;
  // calculate lower bound for all lanes in the warp
  lb = chunkId * chunkSize; // this code assumes normalization of LB
  ub = lb + chunkSize - 1;  // Clang uses i <= ub

  // 3 result cases:
  //  a. lb and ub <= loopUpperBound --> NOT_FINISHED
  //  b. lb <= loopUpperBound and ub > loopUpperBound: last chunk -->
  //     LAST_CHUNK
  //  c. lb and ub > loopUpperBound: empty chunk --> FINISHED
  currentEvent = nextEvent;
  // a.
  if (ub <= loopUpperBound) {
    PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", P64(lb),
          P64(ub), P64(loopUpperBound));
    return NOT_FINISHED;
  }
  // b.
  if (lb <= loopUpperBound) {
    PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n",
          P64(lb), P64(ub), P64(loopUpperBound));
    ub = loopUpperBound;
    return LAST_CHUNK;
  }
  // c. if we are here, we are in case 'c'
  lb = loopUpperBound + 1;
  PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", P64(lb),
        P64(ub), P64(loopUpperBound));
  return FINISHED;
}
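The chunk classification can be checked with a host-side mirror of the same bounds logic. FINISHED, NOT_FINISHED and LAST_CHUNK are 0, 1 and 2 in this RTL; the mock below uses its own constant names for them and a made-up driver.

#include <cstdint>
#include <cstdio>

enum { FIN = 0, NOT_FIN = 1, LAST = 2 };

// A chunk entirely inside the loop is NOT_FINISHED, one straddling the upper
// bound is the LAST_CHUNK (and gets clipped), and one entirely past it means
// FINISHED.
static int classify(int64_t chunkId, int64_t chunkSize, int64_t loopUb,
                    int64_t &lb, int64_t &ub) {
  lb = chunkId * chunkSize; // loop is assumed normalized to start at 0
  ub = lb + chunkSize - 1;  // inclusive upper bound
  if (ub <= loopUb)
    return NOT_FIN;
  if (lb <= loopUb) {
    ub = loopUb; // clip the straddling chunk
    return LAST;
  }
  return FIN; // empty chunk past the end
}

int main() {
  int64_t lb, ub;
  // Loop upper bound 9, chunk size 4: chunks 0 and 1 are full, chunk 2 is last.
  for (int64_t id = 0; id < 4; ++id) {
    int r = classify(id, 4, 9, lb, ub);
    std::printf("chunk %lld -> [%lld, %lld] status %d\n", (long long)id,
                (long long)lb, (long long)ub, r);
  }
  return 0;
}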
|
||||
// On Pascal, with inlining of the runtime into the user application,
|
||||
// this code deadlocks. This is probably because different threads
|
||||
// in a warp cannot make independent progress.
|
||||
NOINLINE static int dispatch_next(int32_t *plast, T *plower, T *pupper,
|
||||
ST *pstride) {
|
||||
// ID of a thread in its own warp
|
||||
|
||||
// automatically selects thread or warp ID based on selected implementation
|
||||
int tid = GetLogicalThreadIdInBlock();
|
||||
ASSERT0(
|
||||
LT_FUSSY,
|
||||
GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized()) <
|
||||
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
|
||||
"current thread is not needed here; error");
|
||||
// retrieve schedule
|
||||
kmp_sched_t schedule =
|
||||
omptarget_nvptx_threadPrivateContext->ScheduleType(tid);
|
||||
|
||||
// xxx reduce to one
|
||||
if (schedule == kmp_sched_static_chunk ||
|
||||
schedule == kmp_sched_static_nochunk) {
|
||||
T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid);
|
||||
T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid);
|
||||
// finished?
|
||||
if (myLb > ub) {
|
||||
PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n",
|
||||
P64(myLb), P64(ub));
|
||||
return DISPATCH_FINISHED;
|
||||
}
|
||||
// not finished, save current bounds
|
||||
ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid);
|
||||
*plower = myLb;
|
||||
T myUb = myLb + chunk - 1; // Clang uses i <= ub
|
||||
if (myUb > ub)
|
||||
myUb = ub;
|
||||
*pupper = myUb;
|
||||
*plast = (int32_t)(myUb == ub);
|
||||
|
||||
// increment next lower bound by the stride
|
||||
ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid);
|
||||
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride;
|
||||
PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n",
|
||||
P64(*plower), P64(*pupper));
|
||||
return DISPATCH_NOTFINISHED;
|
||||
}
|
||||
ASSERT0(LT_FUSSY,
|
||||
schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
|
||||
"bad sched");
|
||||
omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
|
||||
T myLb, myUb;
|
||||
Counter chunkId;
|
||||
// xxx current event is now local
|
||||
omptarget_nvptx_CounterGroup &cg = teamDescr.WorkDescr().CounterGroup();
|
||||
int finished = DynamicNextChunk(
|
||||
cg, omptarget_nvptx_threadPrivateContext->Priv(tid), myLb, myUb,
|
||||
chunkId, omptarget_nvptx_threadPrivateContext->CurrentEvent(tid),
|
||||
omptarget_nvptx_threadPrivateContext->Chunk(tid),
|
||||
omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
|
||||
|
||||
if (finished == FINISHED) {
|
||||
cg.Complete(omptarget_nvptx_threadPrivateContext->Priv(tid),
|
||||
omptarget_nvptx_threadPrivateContext->EventsNumber(tid));
|
||||
cg.Release(omptarget_nvptx_threadPrivateContext->Priv(tid),
|
||||
omptarget_nvptx_threadPrivateContext->CurrentEvent(tid));
|
||||
|
||||
return DISPATCH_FINISHED;
|
||||
}
|
||||
|
||||
// not finished (either not finished or last chunk)
|
||||
*plast = (int32_t)(
|
||||
myUb == omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));
|
||||
*plower = myLb;
|
||||
*pupper = myUb;
|
||||
*pstride = 1;
|
||||
|
||||
PRINT(LD_LOOP,
|
||||
"Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld\n",
|
||||
GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized()),
|
||||
GetNumberOfWorkersInTeam(), P64(*plower), P64(*pupper),
|
||||
P64(*pstride));
|
||||
return DISPATCH_NOTFINISHED;
|
||||
}
|
||||
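A hedged, host-only mock of the caller protocol: this is not the RTL's counter-group machinery, just a std::atomic standing in for it, to show the init/next loop shape that compiler-generated code is expected to follow. dispatch_next keeps returning DISPATCH_NOTFINISHED (1) until the chunks run out, then DISPATCH_FINISHED (0), so the caller can simply loop on the return value.

#include <algorithm>
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

// The atomic counter plays the role of the team's counter group.
static std::atomic<int64_t> nextChunkId{0};

static bool mockDispatchNext(int64_t chunkSize, int64_t loopUb, int64_t &lb,
                             int64_t &ub) {
  int64_t id = nextChunkId.fetch_add(1);
  lb = id * chunkSize;
  ub = std::min(lb + chunkSize - 1, loopUb);
  return lb <= loopUb; // false once the chunks are exhausted
}

int main() {
  const int64_t loopUb = 19, chunkSize = 3;
  std::vector<std::thread> workers;
  for (int t = 0; t < 4; ++t)
    workers.emplace_back([&, t] {
      int64_t lb, ub;
      while (mockDispatchNext(chunkSize, loopUb, lb, ub)) // like __kmpc_dispatch_next_4
        std::printf("thread %d runs %lld..%lld\n", t, (long long)lb,
                    (long long)ub);
    });
  for (auto &w : workers)
    w.join();
  return 0;
}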
|
||||
INLINE static void dispatch_fini() {
|
||||
// nothing
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// end of template class that encapsulate all the helper functions
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// KMP interface implementation (dyn loops)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// init
|
||||
EXTERN void __kmpc_dispatch_init_4(kmp_Indent *loc, int32_t tid,
|
||||
int32_t schedule, int32_t lb, int32_t ub,
|
||||
int32_t st, int32_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
|
||||
omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
|
||||
(kmp_sched_t)schedule, lb, ub, st, chunk);
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_dispatch_init_4u(kmp_Indent *loc, int32_t tid,
|
||||
int32_t schedule, uint32_t lb, uint32_t ub,
|
||||
int32_t st, int32_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
|
||||
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
|
||||
(kmp_sched_t)schedule, lb, ub, st, chunk);
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_dispatch_init_8(kmp_Indent *loc, int32_t tid,
|
||||
int32_t schedule, int64_t lb, int64_t ub,
|
||||
int64_t st, int64_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
|
||||
omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
|
||||
(kmp_sched_t)schedule, lb, ub, st, chunk);
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_dispatch_init_8u(kmp_Indent *loc, int32_t tid,
|
||||
int32_t schedule, uint64_t lb, uint64_t ub,
|
||||
int64_t st, int64_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
|
||||
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
|
||||
(kmp_sched_t)schedule, lb, ub, st, chunk);
|
||||
}
|
||||
|
||||
// next
|
||||
EXTERN int __kmpc_dispatch_next_4(kmp_Indent *loc, int32_t tid, int32_t *p_last,
|
||||
int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
|
||||
PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
|
||||
return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
|
||||
p_last, p_lb, p_ub, p_st);
|
||||
}
|
||||
|
||||
EXTERN int __kmpc_dispatch_next_4u(kmp_Indent *loc, int32_t tid,
|
||||
int32_t *p_last, uint32_t *p_lb,
|
||||
uint32_t *p_ub, int32_t *p_st) {
|
||||
PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
|
||||
return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
|
||||
p_last, p_lb, p_ub, p_st);
|
||||
}
|
||||
|
||||
EXTERN int __kmpc_dispatch_next_8(kmp_Indent *loc, int32_t tid, int32_t *p_last,
|
||||
int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
|
||||
PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
|
||||
return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
|
||||
p_last, p_lb, p_ub, p_st);
|
||||
}
|
||||
|
||||
EXTERN int __kmpc_dispatch_next_8u(kmp_Indent *loc, int32_t tid,
|
||||
int32_t *p_last, uint64_t *p_lb,
|
||||
uint64_t *p_ub, int64_t *p_st) {
|
||||
PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
|
||||
return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
|
||||
p_last, p_lb, p_ub, p_st);
|
||||
}
|
||||
|
||||
// fini
|
||||
EXTERN void __kmpc_dispatch_fini_4(kmp_Indent *loc, int32_t tid) {
|
||||
PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
|
||||
omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_dispatch_fini_4u(kmp_Indent *loc, int32_t tid) {
|
||||
PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
|
||||
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_dispatch_fini_8(kmp_Indent *loc, int32_t tid) {
|
||||
PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
|
||||
omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_dispatch_fini_8u(kmp_Indent *loc, int32_t tid) {
|
||||
PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
|
||||
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// KMP interface implementation (static loops)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
EXTERN void __kmpc_for_static_init_4(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t schedtype, int32_t *plastiter,
|
||||
int32_t *plower, int32_t *pupper,
|
||||
int32_t *pstride, int32_t incr,
|
||||
int32_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
|
||||
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode());
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_for_static_init_4u(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t schedtype, int32_t *plastiter,
|
||||
uint32_t *plower, uint32_t *pupper,
|
||||
int32_t *pstride, int32_t incr,
|
||||
int32_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
|
||||
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode());
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_for_static_init_8(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t schedtype, int32_t *plastiter,
|
||||
int64_t *plower, int64_t *pupper,
|
||||
int64_t *pstride, int64_t incr,
|
||||
int64_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
|
||||
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode());
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_for_static_init_8u(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t schedtype, int32_t *plastiter,
|
||||
uint64_t *plower, uint64_t *pupper,
|
||||
int64_t *pstride, int64_t incr,
|
||||
int64_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
|
||||
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk, isSPMDMode());
|
||||
}
|
||||
|
||||
EXTERN
|
||||
void __kmpc_for_static_init_4_simple_spmd(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t schedtype, int32_t *plastiter,
|
||||
int32_t *plower, int32_t *pupper,
|
||||
int32_t *pstride, int32_t incr,
|
||||
int32_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
|
||||
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*isSPMDExecutionMode=*/true,
|
||||
/*IsOMPRuntimeUnavailable=*/true);
|
||||
}
|
||||
|
||||
EXTERN
|
||||
void __kmpc_for_static_init_4u_simple_spmd(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t schedtype,
|
||||
int32_t *plastiter, uint32_t *plower,
|
||||
uint32_t *pupper, int32_t *pstride,
|
||||
int32_t incr, int32_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
|
||||
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*isSPMDExecutionMode=*/true,
|
||||
/*IsOMPRuntimeUnavailable=*/true);
|
||||
}
|
||||
|
||||
EXTERN
|
||||
void __kmpc_for_static_init_8_simple_spmd(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t schedtype, int32_t *plastiter,
|
||||
int64_t *plower, int64_t *pupper,
|
||||
int64_t *pstride, int64_t incr,
|
||||
int64_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
|
||||
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*isSPMDExecutionMode=*/true,
|
||||
/*IsOMPRuntimeUnavailable=*/true);
|
||||
}
|
||||
|
||||
EXTERN
|
||||
void __kmpc_for_static_init_8u_simple_spmd(kmp_Indent *loc, int32_t global_tid,
|
||||
int32_t schedtype,
|
||||
int32_t *plastiter, uint64_t *plower,
|
||||
uint64_t *pupper, int64_t *pstride,
|
||||
int64_t incr, int64_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
|
||||
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*isSPMDExecutionMode=*/true,
|
||||
/*IsOMPRuntimeUnavailable=*/true);
|
||||
}
|
||||
|
||||
EXTERN
|
||||
void __kmpc_for_static_init_4_simple_generic(
|
||||
kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
|
||||
int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr,
|
||||
int32_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
|
||||
omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*isSPMDExecutionMode=*/false,
|
||||
/*IsOMPRuntimeUnavailable=*/true);
|
||||
}
|
||||
|
||||
EXTERN
|
||||
void __kmpc_for_static_init_4u_simple_generic(
|
||||
kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
|
||||
uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
|
||||
int32_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
|
||||
omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*isSPMDExecutionMode=*/false,
|
||||
/*IsOMPRuntimeUnavailable=*/true);
|
||||
}
|
||||
|
||||
EXTERN
|
||||
void __kmpc_for_static_init_8_simple_generic(
|
||||
kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
|
||||
int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr,
|
||||
int64_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
|
||||
omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*isSPMDExecutionMode=*/false,
|
||||
/*IsOMPRuntimeUnavailable=*/true);
|
||||
}
|
||||
|
||||
EXTERN
|
||||
void __kmpc_for_static_init_8u_simple_generic(
|
||||
kmp_Indent *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
|
||||
uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
|
||||
int64_t chunk) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
|
||||
omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
|
||||
schedtype, plastiter, plower, pupper, pstride, chunk,
|
||||
/*isSPMDExecutionMode=*/false,
|
||||
/*IsOMPRuntimeUnavailable=*/true);
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_for_static_fini(kmp_Indent *loc, int32_t global_tid) {
|
||||
PRINT0(LD_IO, "call kmpc_for_static_fini\n");
|
||||
}
|
||||
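For orientation, a host-side sketch of how a caller typically consumes the triple returned by these static-init entry points for a chunked static schedule: each thread owns every numThreads-th chunk and walks its chunks with the returned stride. The exact code Clang emits differs, so treat the driver below as illustrative only.

#include <cstdint>
#include <cstdio>

int main() {
  const int32_t globalLb = 0, globalUb = 19, chunk = 3, numThreads = 4;
  for (int32_t tid = 0; tid < numThreads; ++tid) {
    // What __kmpc_for_static_init_4 conceptually hands back for this thread.
    int32_t lb = globalLb + tid * chunk;
    int32_t ub = lb + chunk - 1;
    int32_t stride = numThreads * chunk;
    // The generated code keeps stepping by stride until its chunks pass the
    // global upper bound, clipping the last chunk.
    for (; lb <= globalUb; lb += stride, ub += stride) {
      int32_t chunkUb = ub < globalUb ? ub : globalUb;
      std::printf("thread %d runs %d..%d\n", tid, lb, chunkUb);
    }
  }
  return 0;
}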
|
||||
namespace {
INLINE void syncWorkersInGenericMode(uint32_t NumThreads) {
  int NumWarps = ((NumThreads + WARPSIZE - 1) / WARPSIZE);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  // On Volta and newer architectures we require that all lanes in
  // a warp (at least, all present for the kernel launch) participate in the
  // barrier. This is enforced when launching the parallel region. An
  // exception is when there are < WARPSIZE workers. In this case only 1 worker
  // is started, so we don't need a barrier.
  if (NumThreads > 1) {
#endif
    named_sync(L1_BARRIER, WARPSIZE * NumWarps);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  }
#endif
}
} // namespace
|
||||
|
||||
EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Indent *loc, int32_t gtid,
                                                  int32_t varNum, void *array) {
  PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n");

  omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
  int tid = GetOmpThreadId(GetLogicalThreadIdInBlock(), isSPMDMode(),
                           isRuntimeUninitialized());
  uint32_t NumThreads = GetNumberOfOmpThreads(
      GetLogicalThreadIdInBlock(), isSPMDMode(), isRuntimeUninitialized());
  uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
  for (unsigned i = 0; i < varNum; i++) {
    // Reset buffer.
    if (tid == 0)
      *Buffer = 0; // Reset to minimum loop iteration value.

    // Barrier.
    syncWorkersInGenericMode(NumThreads);

    // Atomic max of iterations.
    uint64_t *varArray = (uint64_t *)array;
    uint64_t elem = varArray[i];
    (void)atomicMax((unsigned long long int *)Buffer,
                    (unsigned long long int)elem);

    // Barrier.
    syncWorkersInGenericMode(NumThreads);

    // Read max value and update thread private array.
    varArray[i] = *Buffer;

    // Barrier.
    syncWorkersInGenericMode(NumThreads);
  }
}
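The same conditional-lastprivate idea in miniature, using std::atomic and std::thread in place of the team buffer, atomicMax and named barriers; the values and thread count below are made up.

#include <atomic>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <vector>

// Every thread publishes the highest iteration index at which it wrote the
// variable, the team takes an atomic max, and then everyone adopts the winner.
int main() {
  std::atomic<uint64_t> lastIter{0};
  std::vector<uint64_t> myLast = {7, 42, 13, 29}; // per-thread last write
  std::vector<std::thread> team;
  for (uint64_t v : myLast)
    team.emplace_back([&, v] {
      uint64_t cur = lastIter.load();
      while (cur < v && !lastIter.compare_exchange_weak(cur, v)) {
      } // atomic max
    });
  for (auto &t : team)
    t.join();
  std::printf("winning iteration: %llu\n",
              (unsigned long long)lastIter.load()); // 42
  return 0;
}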
|
48
openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
Normal file
@ -0,0 +1,48 @@
|
||||
//===------------ omp_data.cu - NVPTX OpenMP GPU objects --------- CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the data objects used on the GPU device.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "omptarget-nvptx.h"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// global data holding OpenMP state information
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__device__
|
||||
omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
|
||||
omptarget_nvptx_device_State[MAX_SM];
|
||||
|
||||
// Pointer to this team's OpenMP state object
|
||||
__device__ __shared__
|
||||
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// The team master sets the outlined parallel function in this variable to
|
||||
// communicate with the workers. Since it is in shared memory, there is one
|
||||
// copy of these variables for each kernel, instance, and team.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
volatile __device__ __shared__ omptarget_nvptx_WorkFn omptarget_nvptx_workFn;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// OpenMP kernel execution parameters
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
__device__ __shared__ uint32_t execution_param;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Data sharing state
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
__device__ __shared__ DataSharingStateTy DataSharingState;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Scratchpad for teams reduction.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
__device__ __shared__ void *ReductionScratchpadPtr;
|
188
openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
Normal file
@ -0,0 +1,188 @@
|
||||
//===--- omptarget-nvptx.cu - NVPTX OpenMP GPU initialization ---- CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the initialization code for the GPU
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "omptarget-nvptx.h"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// global data tables
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
extern __device__
|
||||
omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
|
||||
omptarget_nvptx_device_State[MAX_SM];
|
||||
|
||||
extern __device__ __shared__
|
||||
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
|
||||
|
||||
//
|
||||
// The team master sets the outlined function and its arguments in these
|
||||
// variables to communicate with the workers. Since they are in shared memory,
|
||||
// there is one copy of these variables for each kernel, instance, and team.
|
||||
//
|
||||
extern volatile __device__ __shared__ omptarget_nvptx_WorkFn
|
||||
omptarget_nvptx_workFn;
|
||||
extern __device__ __shared__ uint32_t execution_param;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// init entry points
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
INLINE unsigned smid() {
|
||||
unsigned id;
|
||||
asm("mov.u32 %0, %%smid;" : "=r"(id));
|
||||
return id;
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_kernel_init_params(void *Ptr) {
|
||||
PRINT(LD_IO, "call to __kmpc_kernel_init_params with version %f\n",
|
||||
OMPTARGET_NVPTX_VERSION);
|
||||
|
||||
SetTeamsReductionScratchpadPtr(Ptr);
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
|
||||
PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
|
||||
OMPTARGET_NVPTX_VERSION);
|
||||
|
||||
if (!RequiresOMPRuntime) {
|
||||
// If OMP runtime is not required don't initialize OMP state.
|
||||
setExecutionParameters(Generic, RuntimeUninitialized);
|
||||
return;
|
||||
}
|
||||
setExecutionParameters(Generic, RuntimeInitialized);
|
||||
|
||||
int threadIdInBlock = GetThreadIdInBlock();
|
||||
ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(),
|
||||
"__kmpc_kernel_init() must be called by team master warp only!");
|
||||
PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n");
|
||||
|
||||
// Get a state object from the queue.
|
||||
int slot = smid() % MAX_SM;
|
||||
omptarget_nvptx_threadPrivateContext =
|
||||
omptarget_nvptx_device_State[slot].Dequeue();
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
|
||||
omptarget_nvptx_threadPrivateContext->SetSourceQueue(slot);
|
||||
#endif
|
||||
|
||||
// init thread private
|
||||
int threadId = GetLogicalThreadIdInBlock();
|
||||
omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId);
|
||||
|
||||
// init team context
|
||||
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
|
||||
currTeamDescr.InitTeamDescr();
|
||||
// this thread will start execution... has to update its task ICV
|
||||
// to point to the level zero task ICV. That ICV was init in
|
||||
// InitTeamDescr()
|
||||
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
|
||||
threadId, currTeamDescr.LevelZeroTaskDescr());
|
||||
|
||||
// set number of threads and thread limit in team to started value
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr =
|
||||
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
|
||||
currTaskDescr->NThreads() = GetNumberOfWorkersInTeam();
|
||||
currTaskDescr->ThreadLimit() = ThreadLimit;
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {
|
||||
if (IsOMPRuntimeInitialized) {
|
||||
// Enqueue omp state object for use by another team.
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
|
||||
int slot = omptarget_nvptx_threadPrivateContext->GetSourceQueue();
|
||||
#else
|
||||
int slot = smid() % MAX_SM;
|
||||
#endif
|
||||
omptarget_nvptx_device_State[slot].Enqueue(
|
||||
omptarget_nvptx_threadPrivateContext);
|
||||
}
|
||||
// Done with work. Kill the workers.
|
||||
omptarget_nvptx_workFn = 0;
|
||||
}
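A toy host model of the state recycling performed by __kmpc_kernel_init and __kmpc_kernel_deinit: init dequeues a ThreadPrivateContext from the slot picked by smid() % MAX_SM, and deinit enqueues it again so a later team on that SM can reuse it. The std::queue, the integer payload, and the SM id below are stand-ins for the real omptarget_nvptx_Queue, state objects, and hardware counter.

#include <cstdio>
#include <queue>

int main() {
  const int MAX_SM = 4;
  std::queue<int> deviceState[MAX_SM]; // one queue of state objects per slot
  for (int slot = 0; slot < MAX_SM; ++slot)
    deviceState[slot].push(slot * 100); // pre-populated state objects

  int smid = 6;             // hypothetical SM id reported by the GPU
  int slot = smid % MAX_SM; // -> slot 2
  int state = deviceState[slot].front(); // __kmpc_kernel_init: Dequeue()
  deviceState[slot].pop();
  std::printf("team borrows state %d from slot %d\n", state, slot);

  deviceState[slot].push(state); // __kmpc_kernel_deinit: Enqueue()
  return 0;
}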
|
||||
|
||||
EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
|
||||
int16_t RequiresDataSharing) {
|
||||
PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n");
|
||||
|
||||
if (!RequiresOMPRuntime) {
|
||||
// If OMP runtime is not required don't initialize OMP state.
|
||||
setExecutionParameters(Spmd, RuntimeUninitialized);
|
||||
return;
|
||||
}
|
||||
setExecutionParameters(Spmd, RuntimeInitialized);
|
||||
|
||||
//
|
||||
// Team Context Initialization.
|
||||
//
|
||||
// In SPMD mode there is no master thread so use any cuda thread for team
|
||||
// context initialization.
|
||||
int threadId = GetThreadIdInBlock();
|
||||
if (threadId == 0) {
|
||||
// Get a state object from the queue.
|
||||
int slot = smid() % MAX_SM;
|
||||
omptarget_nvptx_threadPrivateContext =
|
||||
omptarget_nvptx_device_State[slot].Dequeue();
|
||||
|
||||
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
|
||||
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
|
||||
// init team context
|
||||
currTeamDescr.InitTeamDescr();
|
||||
// init counters (copy start to init)
|
||||
workDescr.CounterGroup().Reset();
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
|
||||
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
|
||||
|
||||
//
|
||||
// Initialize task descr for each thread.
|
||||
//
|
||||
omptarget_nvptx_TaskDescr *newTaskDescr =
|
||||
omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
|
||||
ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
|
||||
newTaskDescr->InitLevelOneTaskDescr(ThreadLimit,
|
||||
currTeamDescr.LevelZeroTaskDescr());
|
||||
newTaskDescr->ThreadLimit() = ThreadLimit;
|
||||
// install new top descriptor
|
||||
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
|
||||
newTaskDescr);
|
||||
|
||||
// init thread private from init value
|
||||
workDescr.CounterGroup().Init(
|
||||
omptarget_nvptx_threadPrivateContext->Priv(threadId));
|
||||
PRINT(LD_PAR,
|
||||
"thread will execute parallel region with id %d in a team of "
|
||||
"%d threads\n",
|
||||
newTaskDescr->ThreadId(), newTaskDescr->NThreads());
|
||||
|
||||
if (RequiresDataSharing && threadId % WARPSIZE == 0) {
|
||||
    // Warp master initializes data sharing environment.
|
||||
unsigned WID = threadId / WARPSIZE;
|
||||
__kmpc_data_sharing_slot *RootS = currTeamDescr.RootS(WID);
|
||||
DataSharingState.SlotPtr[WID] = RootS;
|
||||
DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
|
||||
}
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_spmd_kernel_deinit() {
|
||||
// We're not going to pop the task descr stack of each thread since
|
||||
// there are no more parallel regions in SPMD mode.
|
||||
__syncthreads();
|
||||
int threadId = GetThreadIdInBlock();
|
||||
if (threadId == 0) {
|
||||
// Enqueue omp state object for use by another team.
|
||||
int slot = smid() % MAX_SM;
|
||||
omptarget_nvptx_device_State[slot].Enqueue(
|
||||
omptarget_nvptx_threadPrivateContext);
|
||||
}
|
||||
}
|
362
openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
Normal file
@ -0,0 +1,362 @@
|
||||
//===---- omptarget-nvptx.h - NVPTX OpenMP GPU initialization ---- CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the declarations of all library macros, types,
|
||||
// and functions.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __OMPTARGET_NVPTX_H
|
||||
#define __OMPTARGET_NVPTX_H
|
||||
|
||||
// std includes
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// cuda includes
|
||||
#include <cuda.h>
|
||||
#include <math.h>
|
||||
|
||||
// local includes
|
||||
#include "counter_group.h"
|
||||
#include "debug.h" // debug
|
||||
#include "interface.h" // interfaces with omp, compiler, and user
|
||||
#include "option.h" // choices we have
|
||||
#include "state-queue.h"
|
||||
#include "support.h"
|
||||
|
||||
#define OMPTARGET_NVPTX_VERSION 1.1
|
||||
|
||||
// used by the library for the interface with the app
|
||||
#define DISPATCH_FINISHED 0
|
||||
#define DISPATCH_NOTFINISHED 1
|
||||
|
||||
// used by dynamic scheduling
|
||||
#define FINISHED 0
|
||||
#define NOT_FINISHED 1
|
||||
#define LAST_CHUNK 2
|
||||
|
||||
#define BARRIER_COUNTER 0
|
||||
#define ORDERED_COUNTER 1
|
||||
|
||||
// Macros for Cuda intrinsics
|
||||
// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
|
||||
// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask().
|
||||
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
|
||||
#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane))
|
||||
#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
|
||||
__shfl_down_sync((mask), (var), (delta), (width))
|
||||
#define __BALLOT_SYNC(mask, predicate) __ballot_sync((mask), (predicate))
|
||||
#define __ACTIVEMASK() __activemask()
|
||||
#else
|
||||
#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
|
||||
#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
|
||||
__shfl_down((var), (delta), (width))
|
||||
#define __BALLOT_SYNC(mask, predicate) __ballot((predicate))
|
||||
#define __ACTIVEMASK() __ballot(1)
|
||||
#endif
|
||||
|
||||
// Data sharing related quantities, need to match what is used in the compiler.
|
||||
enum DATA_SHARING_SIZES {
|
||||
// The maximum number of workers in a kernel.
|
||||
DS_Max_Worker_Threads = 992,
|
||||
// The size reserved for data in a shared memory slot.
|
||||
DS_Slot_Size = 256,
|
||||
// The slot size that should be reserved for a working warp.
|
||||
DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
|
||||
// The maximum number of warps in use
|
||||
DS_Max_Warp_Number = 32,
|
||||
};
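A quick numeric check of these sizes, assuming the usual NVPTX warp size of 32 for WARPSIZE (which is defined elsewhere in this RTL):

#include <cstdio>

int main() {
  const int WARPSIZE = 32; // assumption: standard NVPTX warp size
  const int DS_Slot_Size = 256;
  const int DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size;
  const int DS_Max_Warp_Number = 32;
  std::printf("per-warp slot: %d bytes, tracked warps: %d, total: %d bytes\n",
              DS_Worker_Warp_Slot_Size, DS_Max_Warp_Number,
              DS_Worker_Warp_Slot_Size * DS_Max_Warp_Number);
  return 0;
}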
|
||||
|
||||
// Data structure to keep in shared memory that traces the current slot, stack,
|
||||
// and frame pointer as well as the active threads that didn't exit the current
|
||||
// environment.
|
||||
struct DataSharingStateTy {
|
||||
__kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number];
|
||||
void *StackPtr[DS_Max_Warp_Number];
|
||||
void *FramePtr[DS_Max_Warp_Number];
|
||||
int32_t ActiveThreads[DS_Max_Warp_Number];
|
||||
};
|
||||
// Additional worker slot type which is initialized with the default worker slot
|
||||
// size of 4*32 bytes.
|
||||
struct __kmpc_data_sharing_worker_slot_static {
|
||||
__kmpc_data_sharing_slot *Next;
|
||||
void *DataEnd;
|
||||
char Data[DS_Worker_Warp_Slot_Size];
|
||||
};
|
||||
// Additional master slot type which is initialized with the default master slot
|
||||
// size of 4 bytes.
|
||||
struct __kmpc_data_sharing_master_slot_static {
|
||||
__kmpc_data_sharing_slot *Next;
|
||||
void *DataEnd;
|
||||
char Data[DS_Slot_Size];
|
||||
};
|
||||
extern __device__ __shared__ DataSharingStateTy DataSharingState;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// task ICV and (implicit & explicit) task state
|
||||
|
||||
class omptarget_nvptx_TaskDescr {
|
||||
public:
|
||||
// methods for flags
|
||||
INLINE omp_sched_t GetRuntimeSched();
|
||||
INLINE void SetRuntimeSched(omp_sched_t sched);
|
||||
INLINE int IsDynamic() { return data.items.flags & TaskDescr_IsDynamic; }
|
||||
INLINE void SetDynamic() {
|
||||
data.items.flags = data.items.flags | TaskDescr_IsDynamic;
|
||||
}
|
||||
INLINE void ClearDynamic() {
|
||||
data.items.flags = data.items.flags & (~TaskDescr_IsDynamic);
|
||||
}
|
||||
INLINE int InParallelRegion() { return data.items.flags & TaskDescr_InPar; }
|
||||
INLINE int InL2OrHigherParallelRegion() {
|
||||
return data.items.flags & TaskDescr_InParL2P;
|
||||
}
|
||||
INLINE int IsParallelConstruct() {
|
||||
return data.items.flags & TaskDescr_IsParConstr;
|
||||
}
|
||||
INLINE int IsTaskConstruct() { return !IsParallelConstruct(); }
|
||||
// methods for other fields
|
||||
INLINE uint16_t &NThreads() { return data.items.nthreads; }
|
||||
INLINE uint16_t &ThreadLimit() { return data.items.threadlimit; }
|
||||
INLINE uint16_t &ThreadId() { return data.items.threadId; }
|
||||
INLINE uint16_t &ThreadsInTeam() { return data.items.threadsInTeam; }
|
||||
INLINE uint64_t &RuntimeChunkSize() { return data.items.runtimeChunkSize; }
|
||||
INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() { return prev; }
|
||||
INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
|
||||
prev = taskDescr;
|
||||
}
|
||||
// init & copy
|
||||
INLINE void InitLevelZeroTaskDescr();
|
||||
INLINE void InitLevelOneTaskDescr(uint16_t tnum,
|
||||
omptarget_nvptx_TaskDescr *parentTaskDescr);
|
||||
INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
|
||||
INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
|
||||
INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
|
||||
INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
|
||||
INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr,
|
||||
uint16_t tnum);
|
||||
INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
|
||||
INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
|
||||
uint16_t tid, uint16_t tnum);
|
||||
|
||||
private:
|
||||
// bits for flags: (7 used, 1 free)
|
||||
// 3 bits (SchedMask) for runtime schedule
|
||||
// 1 bit (IsDynamic) for dynamic schedule (false = static)
|
||||
// 1 bit (InPar) if this thread has encountered one or more parallel region
|
||||
// 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task)
|
||||
// 1 bit (InParL2+) if this thread has encountered L2 or higher parallel
|
||||
// region
|
||||
static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4);
|
||||
static const uint8_t TaskDescr_IsDynamic = 0x8;
|
||||
static const uint8_t TaskDescr_InPar = 0x10;
|
||||
static const uint8_t TaskDescr_IsParConstr = 0x20;
|
||||
static const uint8_t TaskDescr_InParL2P = 0x40;
|
||||
|
||||
union { // both have same size
|
||||
uint64_t vect[2];
|
||||
struct TaskDescr_items {
|
||||
      uint8_t flags; // 7 bits used (see flags above)
|
||||
uint8_t unused;
|
||||
uint16_t nthreads; // thread num for subsequent parallel regions
|
||||
uint16_t threadlimit; // thread limit ICV
|
||||
uint16_t threadId; // thread id
|
||||
uint16_t threadsInTeam; // threads in current team
|
||||
uint64_t runtimeChunkSize; // runtime chunk size
|
||||
} items;
|
||||
} data;
|
||||
omptarget_nvptx_TaskDescr *prev;
|
||||
};
|
||||
|
||||
// build on kmp
|
||||
typedef struct omptarget_nvptx_ExplicitTaskDescr {
|
||||
omptarget_nvptx_TaskDescr
|
||||
taskDescr; // omptarget_nvptx task description (must be first)
|
||||
kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last)
|
||||
} omptarget_nvptx_ExplicitTaskDescr;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Descriptor of a parallel region (worksharing in general)
|
||||
|
||||
class omptarget_nvptx_WorkDescr {
|
||||
|
||||
public:
|
||||
// access to data
|
||||
INLINE omptarget_nvptx_CounterGroup &CounterGroup() { return cg; }
|
||||
INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; }
|
||||
// init
|
||||
INLINE void InitWorkDescr();
|
||||
|
||||
private:
|
||||
omptarget_nvptx_CounterGroup cg; // for barrier (no other needed)
|
||||
omptarget_nvptx_TaskDescr masterTaskICV;
|
||||
bool hasCancel;
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class omptarget_nvptx_TeamDescr {
|
||||
public:
|
||||
// access to data
|
||||
INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
|
||||
return &levelZeroTaskDescr;
|
||||
}
|
||||
INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
|
||||
return workDescrForActiveParallel;
|
||||
}
|
||||
INLINE omp_lock_t *CriticalLock() { return &criticalLock; }
|
||||
INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }
|
||||
|
||||
// init
|
||||
INLINE void InitTeamDescr();
|
||||
|
||||
INLINE __kmpc_data_sharing_slot *RootS(int wid) {
|
||||
    // If this is invoked by the master thread of the master warp then
    // initialize it with a smaller slot.
|
||||
if (wid == WARPSIZE - 1) {
|
||||
// Initialize the pointer to the end of the slot given the size of the
|
||||
// data section. DataEnd is non-inclusive.
|
||||
master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size;
|
||||
// We currently do not have a next slot.
|
||||
master_rootS[0].Next = 0;
|
||||
return (__kmpc_data_sharing_slot *)&master_rootS[0];
|
||||
}
|
||||
// Initialize the pointer to the end of the slot given the size of the data
|
||||
// section. DataEnd is non-inclusive.
|
||||
worker_rootS[wid].DataEnd =
|
||||
&worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
|
||||
// We currently do not have a next slot.
|
||||
worker_rootS[wid].Next = 0;
|
||||
return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
|
||||
}
|
||||
|
||||
private:
|
||||
omptarget_nvptx_TaskDescr
|
||||
levelZeroTaskDescr; // icv for team master initial thread
|
||||
omptarget_nvptx_WorkDescr
|
||||
workDescrForActiveParallel; // one, ONLY for the active par
|
||||
omp_lock_t criticalLock;
|
||||
uint64_t lastprivateIterBuffer;
|
||||
|
||||
__align__(16)
|
||||
__kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE - 1];
|
||||
__align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// thread private data (struct of arrays for better coalescing)
|
||||
// tid refers here to the global thread id
|
||||
// do not support multiple concurrent kernels at this time
|
||||
class omptarget_nvptx_ThreadPrivateContext {
|
||||
public:
|
||||
// task
|
||||
INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) {
|
||||
return &levelOneTaskDescr[tid];
|
||||
}
|
||||
INLINE void SetTopLevelTaskDescr(int tid,
|
||||
omptarget_nvptx_TaskDescr *taskICV) {
|
||||
topTaskDescr[tid] = taskICV;
|
||||
}
|
||||
INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid);
|
||||
// parallel
|
||||
INLINE uint16_t &NumThreadsForNextParallel(int tid) {
|
||||
return nextRegion.tnum[tid];
|
||||
}
|
||||
// simd
|
||||
INLINE uint16_t &SimdLimitForNextSimd(int tid) {
|
||||
return nextRegion.slim[tid];
|
||||
}
|
||||
// sync
|
||||
INLINE Counter &Priv(int tid) { return priv[tid]; }
|
||||
INLINE void IncrementPriv(int tid, Counter val) { priv[tid] += val; }
|
||||
// schedule (for dispatch)
|
||||
INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
|
||||
INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
|
||||
INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
|
||||
// state for dispatch with dyn/guided
|
||||
INLINE Counter &CurrentEvent(int tid) {
|
||||
return currEvent_or_nextLowerBound[tid];
|
||||
}
|
||||
INLINE Counter &EventsNumber(int tid) { return eventsNum_or_stride[tid]; }
|
||||
// state for dispatch with static
|
||||
INLINE Counter &NextLowerBound(int tid) {
|
||||
return currEvent_or_nextLowerBound[tid];
|
||||
}
|
||||
INLINE Counter &Stride(int tid) { return eventsNum_or_stride[tid]; }
|
||||
|
||||
INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }
|
||||
|
||||
INLINE void InitThreadPrivateContext(int tid);
|
||||
INLINE void SetSourceQueue(uint64_t Src) { SourceQueue = Src; }
|
||||
INLINE uint64_t GetSourceQueue() { return SourceQueue; }
|
||||
|
||||
private:
|
||||
// team context for this team
|
||||
omptarget_nvptx_TeamDescr teamContext;
|
||||
  // task ICV for implicit threads in the only parallel region
|
||||
omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
|
||||
// pointer where to find the current task ICV (top of the stack)
|
||||
omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
|
||||
union {
|
||||
// Only one of the two is live at the same time.
|
||||
// parallel
|
||||
uint16_t tnum[MAX_THREADS_PER_TEAM];
|
||||
// simd limit
|
||||
uint16_t slim[MAX_THREADS_PER_TEAM];
|
||||
} nextRegion;
|
||||
// sync
|
||||
Counter priv[MAX_THREADS_PER_TEAM];
|
||||
// schedule (for dispatch)
|
||||
kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
|
||||
int64_t chunk[MAX_THREADS_PER_TEAM];
|
||||
int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
|
||||
// state for dispatch with dyn/guided OR static (never use both at a time)
|
||||
Counter currEvent_or_nextLowerBound[MAX_THREADS_PER_TEAM];
|
||||
Counter eventsNum_or_stride[MAX_THREADS_PER_TEAM];
|
||||
// Queue to which this object must be returned.
|
||||
uint64_t SourceQueue;
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// global data tables
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
extern __device__ __shared__
|
||||
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
|
||||
extern __device__ __shared__ uint32_t execution_param;
|
||||
extern __device__ __shared__ void *ReductionScratchpadPtr;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// work function (outlined parallel/simd functions) and arguments.
|
||||
// needed for L1 parallelism only.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
typedef void *omptarget_nvptx_WorkFn;
|
||||
extern volatile __device__ __shared__ omptarget_nvptx_WorkFn
|
||||
omptarget_nvptx_workFn;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// get private data structures
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
|
||||
INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
|
||||
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor();
|
||||
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// inlined implementation
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include "counter_groupi.h"
|
||||
#include "omptarget-nvptxi.h"
|
||||
#include "supporti.h"
|
||||
|
||||
#endif
|
195
openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptxi.h
Normal file
@ -0,0 +1,195 @@
|
||||
//===---- omptarget-nvptxi.h - NVPTX OpenMP GPU initialization --- CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the declarations of all library macros, types,
|
||||
// and functions.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Task Descriptor
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() {
|
||||
// sched starts from 1..4; encode it as 0..3; so add 1 here
|
||||
uint8_t rc = (data.items.flags & TaskDescr_SchedMask) + 1;
|
||||
return (omp_sched_t)rc;
|
||||
}
|
||||
|
||||
INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
|
||||
// sched starts from 1..4; encode it as 0..3; so sub 1 here
|
||||
uint8_t val = ((uint8_t)sched) - 1;
|
||||
// clear current sched
|
||||
data.items.flags &= ~TaskDescr_SchedMask;
|
||||
// set new sched
|
||||
data.items.flags |= val;
|
||||
}
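A minimal host model of this flag encoding; SchedMask and the 0x10 flag value are taken from the class definition in omptarget-nvptx.h, while the helper names and driver are made up.

#include <cstdint>
#include <cstdio>

// omp_sched_t values 1..4 are stored in the low three bits as 0..3, alongside
// the other TaskDescr_* flag bits.
static const uint8_t SchedMask = 0x1 | 0x2 | 0x4;

static void setSched(uint8_t &flags, int sched /* 1..4 */) {
  flags = (flags & ~SchedMask) | (uint8_t)(sched - 1);
}

static int getSched(uint8_t flags) { return (flags & SchedMask) + 1; }

int main() {
  uint8_t flags = 0x10; // an unrelated flag bit already set (TaskDescr_InPar)
  setSched(flags, 3);   // omp_sched_guided == 3
  std::printf("flags=0x%02x sched=%d\n", flags, getSched(flags)); // 0x12, 3
  return 0;
}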
|
||||
|
||||
INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
|
||||
// slow method
|
||||
// flag:
|
||||
// default sched is static,
|
||||
// dyn is off (unused now anyway, but may need to sample from host ?)
|
||||
// not in parallel
|
||||
|
||||
data.items.flags = 0;
|
||||
  data.items.nthreads =
      GetNumberOfProcsInTeam(); // threads: whatever was allocated by the kernel
|
||||
data.items.threadId = 0; // is master
|
||||
data.items.threadsInTeam = 1; // sequential
|
||||
  data.items.runtimeChunkSize = 1; // preferred chunking: static with chunk 1
|
||||
}
|
||||
|
||||
// This is called when all threads are started together in SPMD mode.
|
||||
// OMP directives include target parallel, target distribute parallel for, etc.
|
||||
INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
|
||||
uint16_t tnum, omptarget_nvptx_TaskDescr *parentTaskDescr) {
|
||||
// slow method
|
||||
// flag:
|
||||
// default sched is static,
|
||||
// dyn is off (unused now anyway, but may need to sample from host ?)
|
||||
// in L1 parallel
|
||||
|
||||
data.items.flags =
|
||||
TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
|
||||
data.items.nthreads = 0; // # threads for subsequent parallel region
|
||||
data.items.threadId =
|
||||
GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
|
||||
data.items.threadsInTeam = tnum;
|
||||
  data.items.runtimeChunkSize = 1; // preferred chunking: static with chunk 1
|
||||
prev = parentTaskDescr;
|
||||
}
|
||||
|
||||
INLINE void omptarget_nvptx_TaskDescr::CopyData(
|
||||
omptarget_nvptx_TaskDescr *sourceTaskDescr) {
|
||||
data.vect[0] = sourceTaskDescr->data.vect[0];
|
||||
data.vect[1] = sourceTaskDescr->data.vect[1];
|
||||
}
|
||||
|
||||
INLINE void
|
||||
omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) {
|
||||
CopyData(sourceTaskDescr);
|
||||
prev = sourceTaskDescr->prev;
|
||||
}
|
||||
|
||||
INLINE void omptarget_nvptx_TaskDescr::CopyParent(
|
||||
omptarget_nvptx_TaskDescr *parentTaskDescr) {
|
||||
CopyData(parentTaskDescr);
|
||||
prev = parentTaskDescr;
|
||||
}
|
||||
|
||||
INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
|
||||
omptarget_nvptx_TaskDescr *parentTaskDescr) {
|
||||
CopyParent(parentTaskDescr);
|
||||
data.items.flags = data.items.flags & ~TaskDescr_IsParConstr;
|
||||
ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task");
|
||||
}
|
||||
|
||||
INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
|
||||
omptarget_nvptx_TaskDescr *masterTaskDescr, uint16_t tnum) {
|
||||
CopyParent(masterTaskDescr);
|
||||
  // overwrite specific items
|
||||
data.items.flags |=
|
||||
TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
|
||||
data.items.threadsInTeam = tnum; // set number of threads
|
||||
}
|
||||
|
||||
INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
|
||||
omptarget_nvptx_TaskDescr *workTaskDescr) {
|
||||
Copy(workTaskDescr);
|
||||
//
|
||||
// overwrite specific items
|
||||
//
|
||||
// The threadID should be GetThreadIdInBlock() % GetMasterThreadID().
|
||||
// This is so that the serial master (first lane in the master warp)
|
||||
// gets a threadId of 0.
|
||||
// However, we know that this function is always called in a parallel
|
||||
// region where only workers are active. The serial master thread
|
||||
// never enters this region. When a parallel region is executed serially,
|
||||
// the threadId is set to 0 elsewhere and the kmpc_serialized_* functions
|
||||
// are called, which never activate this region.
|
||||
data.items.threadId =
|
||||
GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
|
||||
}
|
||||
|
||||
INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
|
||||
omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
|
||||
CopyParent(parentTaskDescr);
|
||||
data.items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
|
||||
data.items.threadsInTeam = tnum; // set number of threads
|
||||
data.items.threadId = tid;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Thread Private Context
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
INLINE omptarget_nvptx_TaskDescr *
|
||||
omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) {
|
||||
ASSERT0(
|
||||
LT_FUSSY, tid < MAX_THREADS_PER_TEAM,
|
||||
"Getting top level, tid is larger than allocated data structure size");
|
||||
return topTaskDescr[tid];
|
||||
}
|
||||
|
||||
INLINE void
|
||||
omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
|
||||
// levelOneTaskDescr is init when starting the parallel region
|
||||
// top task descr is NULL (team master version will be fixed separately)
|
||||
topTaskDescr[tid] = NULL;
|
||||
// no num threads value has been pushed
|
||||
nextRegion.tnum[tid] = 0;
|
||||
// priv counter init to zero
|
||||
priv[tid] = 0;
|
||||
// the following don't need to be init here; they are init when using dyn
|
||||
// sched
|
||||
// current_Event, events_Number, chunk, num_Iterations, schedule
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Work Descriptor
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
INLINE void omptarget_nvptx_WorkDescr::InitWorkDescr() {
|
||||
cg.Clear(); // start and stop to zero too
|
||||
// threadsInParallelTeam does not need to be init (done in start parallel)
|
||||
hasCancel = FALSE;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Team Descriptor
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
|
||||
levelZeroTaskDescr.InitLevelZeroTaskDescr();
|
||||
workDescrForActiveParallel.InitWorkDescr();
|
||||
// omp_init_lock(criticalLock);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Get private data structure for thread
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Utility routines for CUDA threads
|
||||
INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() {
|
||||
return omptarget_nvptx_threadPrivateContext->TeamContext();
|
||||
}
|
||||
|
||||
INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() {
|
||||
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
|
||||
return currTeamDescr.WorkDescr();
|
||||
}
|
||||
|
||||
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
|
||||
return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
|
||||
}
|
||||
|
||||
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor() {
|
||||
return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
|
||||
}
|
66
openmp/libomptarget/deviceRTLs/nvptx/src/option.h
Normal file
@ -0,0 +1,66 @@
|
||||
//===------------ option.h - NVPTX OpenMP GPU options ------------ CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// GPU default options
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#ifndef _OPTION_H_
|
||||
#define _OPTION_H_
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Kernel options
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// The following def must match the absolute limit hardwired in the host RTL
|
||||
// max number of threads per team
|
||||
#define MAX_THREADS_PER_TEAM 1024
|
||||
|
||||
#define WARPSIZE 32
|
||||
|
||||
// The named barrier used by the active parallel threads of a team to
// synchronize with each other in an L1 parallel region.
|
||||
#define L1_BARRIER (1)
|
||||
|
||||
// Maximum number of omp state objects per SM allocated statically in global
|
||||
// memory.
|
||||
#if __CUDA_ARCH__ >= 600
|
||||
#define OMP_STATE_COUNT 32
|
||||
#define MAX_SM 56
|
||||
#else
|
||||
#define OMP_STATE_COUNT 16
|
||||
#define MAX_SM 16
|
||||
#endif
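// Illustrative note (an assumption about how these limits are consumed, not
// stated in this file): if the runtime keeps one statically allocated queue of
// OMP_STATE_COUNT state objects per SM, an sm_60+ build reserves
// 32 * 56 == 1792 objects in total, and older architectures 16 * 16 == 256.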
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// algo options
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// data options
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// decide if counters are 32 or 64 bit
|
||||
#define Counter unsigned long long
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// misc options (by default everything here is device code)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define EXTERN extern "C" __device__
|
||||
#define INLINE __inline__ __device__
|
||||
#define NOINLINE __noinline__ __device__
|
||||
#ifndef TRUE
|
||||
#define TRUE 1
|
||||
#endif
|
||||
#ifndef FALSE
|
||||
#define FALSE 0
|
||||
#endif
|
||||
|
||||
#endif
|
476
openmp/libomptarget/deviceRTLs/nvptx/src/parallel.cu
Normal file
@ -0,0 +1,476 @@
|
||||
//===---- parallel.cu - NVPTX OpenMP parallel implementation ----- CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Parallel implementation on the GPU. Here is the pattern:
|
||||
//
|
||||
// while (not finished) {
|
||||
//
|
||||
// if (master) {
|
||||
// sequential code, decide which par loop to do, or if finished
|
||||
// __kmpc_kernel_prepare_parallel() // exec by master only
|
||||
// }
|
||||
// syncthreads // A
|
||||
// __kmpc_kernel_parallel() // exec by all
|
||||
// if (this thread is included in the parallel) {
|
||||
// switch () for all parallel loops
|
||||
// __kmpc_kernel_end_parallel() // exec only by threads in parallel
|
||||
// }
|
||||
//
|
||||
//
|
||||
// The reason we don't exec end_parallel for the threads not included
|
||||
// in the parallel loop is that for each barrier in the parallel
|
||||
// region, these non-included threads will cycle through the
|
||||
// syncthread A. Thus they must preserve their current threadId that
|
||||
// is larger than the number of threads in the team.
|
||||
//
|
||||
// To make a long story short...
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
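// A minimal sketch (illustrative only) of the handshake above as a generated
// kernel might perform it; `outlined_fn` is a hypothetical outlined parallel
// region body, not something defined in this runtime:
//
//   if (GetThreadIdInBlock() == GetMasterThreadID()) {
//     // master decides what to run next and publishes the work function
//     __kmpc_kernel_prepare_parallel((void *)outlined_fn, /*RTInit*/ 1);
//   }
//   __syncthreads();                              // barrier A in the pattern
//   void *work_fn;
//   if (__kmpc_kernel_parallel(&work_fn, /*RTInit*/ 1)) {
//     ((void (*)())work_fn)();                    // only included threads run it
//     __kmpc_kernel_end_parallel();
//   }
//   __syncthreads();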
|
||||
|
||||
#include "omptarget-nvptx.h"
|
||||
|
||||
typedef struct ConvergentSimdJob {
|
||||
omptarget_nvptx_TaskDescr taskDescr;
|
||||
omptarget_nvptx_TaskDescr *convHeadTaskDescr;
|
||||
uint16_t slimForNextSimd;
|
||||
} ConvergentSimdJob;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// support for convergent simd (team of threads in a warp only)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
|
||||
bool *IsFinal, int32_t *LaneSource,
|
||||
int32_t *LaneId, int32_t *NumLanes) {
|
||||
PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
|
||||
uint32_t ConvergentMask = Mask;
|
||||
int32_t ConvergentSize = __popc(ConvergentMask);
|
||||
uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
|
||||
*LaneSource += __ffs(WorkRemaining);
|
||||
*IsFinal = __popc(WorkRemaining) == 1;
|
||||
uint32_t lanemask_lt;
|
||||
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
|
||||
*LaneId = __popc(ConvergentMask & lanemask_lt);
|
||||
|
||||
int threadId = GetLogicalThreadIdInBlock();
|
||||
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
|
||||
|
||||
ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
|
||||
int32_t SimdLimit =
|
||||
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
|
||||
job->slimForNextSimd = SimdLimit;
|
||||
|
||||
int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource);
|
||||
// reset simdlimit to avoid propagating to successive #simd
|
||||
if (SimdLimitSource > 0 && threadId == sourceThreadId)
|
||||
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;
|
||||
|
||||
// We cannot have more than the # of convergent threads.
|
||||
if (SimdLimitSource > 0)
|
||||
*NumLanes = min(ConvergentSize, SimdLimitSource);
|
||||
else
|
||||
*NumLanes = ConvergentSize;
|
||||
ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads",
|
||||
*NumLanes);
|
||||
|
||||
// Set to true for lanes participating in the simd region.
|
||||
bool isActive = false;
|
||||
// Initialize state for active threads.
|
||||
if (*LaneId < *NumLanes) {
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr =
|
||||
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
|
||||
omptarget_nvptx_TaskDescr *sourceTaskDescr =
|
||||
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
|
||||
sourceThreadId);
|
||||
job->convHeadTaskDescr = currTaskDescr;
|
||||
// install top descriptor from the thread for which the lanes are working.
|
||||
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
|
||||
sourceTaskDescr);
|
||||
isActive = true;
|
||||
}
|
||||
|
||||
// requires a memory fence between threads of a warp
|
||||
return isActive;
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) {
|
||||
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_simd\n");
|
||||
// pop stack
|
||||
int threadId = GetLogicalThreadIdInBlock();
|
||||
ConvergentSimdJob *job = (ConvergentSimdJob *)buffer;
|
||||
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) =
|
||||
job->slimForNextSimd;
|
||||
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
|
||||
threadId, job->convHeadTaskDescr);
|
||||
}
|
||||
|
||||
typedef struct ConvergentParallelJob {
|
||||
omptarget_nvptx_TaskDescr taskDescr;
|
||||
omptarget_nvptx_TaskDescr *convHeadTaskDescr;
|
||||
uint16_t tnumForNextPar;
|
||||
} ConvergentParallelJob;
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// support for convergent parallelism (team of threads in a warp only)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
|
||||
bool *IsFinal,
|
||||
int32_t *LaneSource) {
|
||||
PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
|
||||
uint32_t ConvergentMask = Mask;
|
||||
int32_t ConvergentSize = __popc(ConvergentMask);
|
||||
uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
|
||||
*LaneSource += __ffs(WorkRemaining);
|
||||
*IsFinal = __popc(WorkRemaining) == 1;
|
||||
uint32_t lanemask_lt;
|
||||
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
|
||||
uint32_t OmpId = __popc(ConvergentMask & lanemask_lt);
|
||||
|
||||
int threadId = GetLogicalThreadIdInBlock();
|
||||
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
|
||||
|
||||
ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
|
||||
int32_t NumThreadsClause =
|
||||
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
|
||||
job->tnumForNextPar = NumThreadsClause;
|
||||
|
||||
int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource);
|
||||
// reset numthreads to avoid propagating to successive #parallel
|
||||
if (NumThreadsSource > 0 && threadId == sourceThreadId)
|
||||
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
|
||||
0;
|
||||
|
||||
// We cannot have more than the # of convergent threads.
|
||||
uint16_t NumThreads;
|
||||
if (NumThreadsSource > 0)
|
||||
NumThreads = min(ConvergentSize, NumThreadsSource);
|
||||
else
|
||||
NumThreads = ConvergentSize;
|
||||
ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
|
||||
NumThreads);
|
||||
|
||||
// Set to true for workers participating in the parallel region.
|
||||
bool isActive = false;
|
||||
// Initialize state for active threads.
|
||||
if (OmpId < NumThreads) {
|
||||
// init L2 task descriptor and storage for the L1 parallel task descriptor.
|
||||
omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr;
|
||||
ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr =
|
||||
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
|
||||
omptarget_nvptx_TaskDescr *sourceTaskDescr =
|
||||
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(
|
||||
sourceThreadId);
|
||||
job->convHeadTaskDescr = currTaskDescr;
|
||||
newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads);
|
||||
// install new top descriptor
|
||||
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
|
||||
newTaskDescr);
|
||||
isActive = true;
|
||||
}
|
||||
|
||||
// requires a memory fence between threads of a warp
|
||||
return isActive;
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) {
|
||||
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n");
|
||||
// pop stack
|
||||
int threadId = GetLogicalThreadIdInBlock();
|
||||
ConvergentParallelJob *job = (ConvergentParallelJob *)buffer;
|
||||
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
|
||||
threadId, job->convHeadTaskDescr);
|
||||
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
|
||||
job->tnumForNextPar;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// support for parallel that goes parallel (1 static level only)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Return the number of CUDA threads that participate in the parallel region.
// The calculation has to account for the SIMD implementation on nvptx,
// i.e. (num omp threads * num lanes):
|
||||
//
|
||||
// cudathreads =
|
||||
// if(num_threads != 0) {
|
||||
// if(thread_limit > 0) {
|
||||
// min (num_threads*numLanes ; thread_limit*numLanes);
|
||||
// } else {
|
||||
// min (num_threads*numLanes; blockDim.x)
|
||||
// }
|
||||
// } else {
|
||||
// if (thread_limit != 0) {
|
||||
// min (thread_limit*numLanes; blockDim.x)
|
||||
// } else { // no thread_limit, no num_threads, use all cuda threads
|
||||
// blockDim.x;
|
||||
// }
|
||||
// }
|
||||
//
|
||||
// This routine is always called by the team master.
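// Worked example of the calculation above (illustrative values): with
// num_threads(48) on the parallel construct, thread_limit(64) on the target
// (so the kernel is launched with 64 + 32 == 96 threads and 64 workers), and
// NumLanes == 1, the first branch applies and min(48 * 1, 64 * 1) == 48 CUDA
// threads are reserved for the parallel region.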
|
||||
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
|
||||
int16_t IsOMPRuntimeInitialized) {
|
||||
PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
|
||||
omptarget_nvptx_workFn = WorkFn;
|
||||
|
||||
if (!IsOMPRuntimeInitialized)
|
||||
return;
|
||||
|
||||
// This routine is only called by the team master. The team master is
|
||||
// the first thread of the last warp. It always has the logical thread
|
||||
// id of 0 (since it is a shadow for the first worker thread).
|
||||
int threadId = 0;
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr =
|
||||
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
|
||||
ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
|
||||
ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
|
||||
"cannot be called in a parallel region.");
|
||||
if (currTaskDescr->InParallelRegion()) {
|
||||
PRINT0(LD_PAR, "already in parallel: go seq\n");
|
||||
return;
|
||||
}
|
||||
|
||||
uint16_t CudaThreadsForParallel = 0;
|
||||
uint16_t NumThreadsClause =
|
||||
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
|
||||
|
||||
// we cannot have more than block size
|
||||
uint16_t CudaThreadsAvail = GetNumberOfWorkersInTeam();
|
||||
|
||||
// currTaskDescr->ThreadLimit(): If non-zero, this is the limit as
|
||||
// specified by the thread_limit clause on the target directive.
|
||||
// GetNumberOfWorkersInTeam(): This is the number of workers available
|
||||
// in this kernel instance.
|
||||
//
|
||||
// E.g: If thread_limit is 33, the kernel is launched with 33+32=65
|
||||
// threads. The last warp is the master warp so in this case
|
||||
// GetNumberOfWorkersInTeam() returns 64.
|
||||
|
||||
// this is different from ThreadAvail of OpenMP because we may be
|
||||
// using some of the CUDA threads as SIMD lanes
|
||||
int NumLanes = 1;
|
||||
if (NumThreadsClause != 0) {
|
||||
// reset request to avoid propagating to successive #parallel
|
||||
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
|
||||
0;
|
||||
|
||||
// assume that thread_limit*numlanes is already <= CudaThreadsAvail
|
||||
// because that is already checked on the host side (CUDA offloading rtl)
|
||||
if (currTaskDescr->ThreadLimit() != 0)
|
||||
CudaThreadsForParallel =
|
||||
NumThreadsClause * NumLanes < currTaskDescr->ThreadLimit() * NumLanes
|
||||
? NumThreadsClause * NumLanes
|
||||
: currTaskDescr->ThreadLimit() * NumLanes;
|
||||
else {
|
||||
CudaThreadsForParallel = (NumThreadsClause * NumLanes > CudaThreadsAvail)
|
||||
? CudaThreadsAvail
|
||||
: NumThreadsClause * NumLanes;
|
||||
}
|
||||
} else {
|
||||
if (currTaskDescr->ThreadLimit() != 0) {
|
||||
CudaThreadsForParallel =
|
||||
(currTaskDescr->ThreadLimit() * NumLanes > CudaThreadsAvail)
|
||||
? CudaThreadsAvail
|
||||
: currTaskDescr->ThreadLimit() * NumLanes;
|
||||
} else
|
||||
CudaThreadsForParallel = CudaThreadsAvail;
|
||||
}
|
||||
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
|
||||
// On Volta and newer architectures we require that all lanes in
|
||||
// a warp participate in the parallel region. Round down to a
|
||||
// multiple of WARPSIZE since it is legal to do so in OpenMP.
|
||||
// CudaThreadsAvail is the number of workers available in this
|
||||
// kernel instance and is greater than or equal to
|
||||
// currTaskDescr->ThreadLimit().
|
||||
if (CudaThreadsForParallel < CudaThreadsAvail) {
|
||||
CudaThreadsForParallel =
|
||||
(CudaThreadsForParallel < WARPSIZE)
|
||||
? 1
|
||||
: CudaThreadsForParallel & ~((uint16_t)WARPSIZE - 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
ASSERT(LT_FUSSY, CudaThreadsForParallel > 0,
|
||||
"bad thread request of %d threads", CudaThreadsForParallel);
|
||||
ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
|
||||
"only team master can create parallel");
|
||||
|
||||
// set number of threads on work descriptor
|
||||
// this is different from the number of cuda threads required for the parallel
|
||||
// region
|
||||
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
|
||||
workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr,
|
||||
CudaThreadsForParallel / NumLanes);
|
||||
// init counters (copy start to init)
|
||||
workDescr.CounterGroup().Reset();
|
||||
}
|
||||
|
||||
// All workers call this function. Deactivate those not needed.
|
||||
// Fn - the outlined work function to execute.
|
||||
// returns True if this thread is active, else False.
|
||||
//
|
||||
// Only the worker threads call this routine.
|
||||
EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
|
||||
int16_t IsOMPRuntimeInitialized) {
|
||||
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");
|
||||
|
||||
// Work function and arguments for L1 parallel region.
|
||||
*WorkFn = omptarget_nvptx_workFn;
|
||||
|
||||
if (!IsOMPRuntimeInitialized)
|
||||
return true;
|
||||
|
||||
// If this is the termination signal from the master, quit early.
|
||||
if (!*WorkFn)
|
||||
return false;
|
||||
|
||||
// Only the worker threads call this routine and the master warp
|
||||
// never arrives here. Therefore, use the nvptx thread id.
|
||||
int threadId = GetThreadIdInBlock();
|
||||
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
|
||||
// Set to true for workers participating in the parallel region.
|
||||
bool isActive = false;
|
||||
// Initialize state for active threads.
|
||||
if (threadId < workDescr.WorkTaskDescr()->ThreadsInTeam()) {
|
||||
// init the thread's task descriptor from the work descriptor
|
||||
omptarget_nvptx_TaskDescr *newTaskDescr =
|
||||
omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
|
||||
ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
|
||||
newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr());
|
||||
// install new top descriptor
|
||||
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
|
||||
newTaskDescr);
|
||||
// init private from int value
|
||||
workDescr.CounterGroup().Init(
|
||||
omptarget_nvptx_threadPrivateContext->Priv(threadId));
|
||||
PRINT(LD_PAR,
|
||||
"thread will execute parallel region with id %d in a team of "
|
||||
"%d threads\n",
|
||||
newTaskDescr->ThreadId(), newTaskDescr->NThreads());
|
||||
|
||||
isActive = true;
|
||||
}
|
||||
|
||||
return isActive;
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_kernel_end_parallel() {
|
||||
// pop stack
|
||||
PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
|
||||
// Only the worker threads call this routine and the master warp
|
||||
// never arrives here. Therefore, use the nvptx thread id.
|
||||
int threadId = GetThreadIdInBlock();
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
|
||||
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
|
||||
threadId, currTaskDescr->GetPrevTaskDescr());
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// support for parallel that goes sequential
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
EXTERN void __kmpc_serialized_parallel(kmp_Indent *loc, uint32_t global_tid) {
|
||||
PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n");
|
||||
|
||||
// assume this is only called for nested parallel
|
||||
int threadId = GetLogicalThreadIdInBlock();
|
||||
|
||||
// unlike actual parallel, threads in the same team do not share
|
||||
// the workTaskDescr in this case and num threads is fixed to 1
|
||||
|
||||
// get current task
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
|
||||
|
||||
// allocate new task descriptor and copy value from current one, set prev to
|
||||
// it
|
||||
omptarget_nvptx_TaskDescr *newTaskDescr =
|
||||
(omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr),
|
||||
(char *)"new seq parallel task");
|
||||
newTaskDescr->CopyParent(currTaskDescr);
|
||||
|
||||
// tweak values for serialized parallel case:
|
||||
// - each thread becomes ID 0 in its serialized parallel, and
|
||||
// - there is only one thread per team
|
||||
newTaskDescr->ThreadId() = 0;
|
||||
newTaskDescr->ThreadsInTeam() = 1;
|
||||
|
||||
// set new task descriptor as top
|
||||
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
|
||||
newTaskDescr);
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_end_serialized_parallel(kmp_Indent *loc,
|
||||
uint32_t global_tid) {
|
||||
PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n");
|
||||
|
||||
// pop stack
|
||||
int threadId = GetLogicalThreadIdInBlock();
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
|
||||
// set new top
|
||||
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
|
||||
threadId, currTaskDescr->GetPrevTaskDescr());
|
||||
// free
|
||||
SafeFree(currTaskDescr, (char *)"new seq parallel task");
|
||||
}
|
||||
|
||||
EXTERN uint16_t __kmpc_parallel_level(kmp_Indent *loc, uint32_t global_tid) {
|
||||
PRINT0(LD_IO, "call to __kmpc_parallel_level\n");
|
||||
|
||||
int threadId = GetLogicalThreadIdInBlock();
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr =
|
||||
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
|
||||
if (currTaskDescr->InL2OrHigherParallelRegion())
|
||||
return 2;
|
||||
else if (currTaskDescr->InParallelRegion())
|
||||
return 1;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
// This kmpc call returns the thread id across all teams. Its value is
|
||||
// cached by the compiler and used when calling the runtime. On nvptx
|
||||
// it's cheap to recalculate this value so we never use the result
|
||||
// of this call.
|
||||
EXTERN int32_t __kmpc_global_thread_num(kmp_Indent *loc) {
|
||||
return GetLogicalThreadIdInBlock();
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// push params
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
EXTERN void __kmpc_push_num_threads(kmp_Indent *loc, int32_t tid,
|
||||
int32_t num_threads) {
|
||||
PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads);
|
||||
tid = GetLogicalThreadIdInBlock();
|
||||
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) =
|
||||
num_threads;
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_push_simd_limit(kmp_Indent *loc, int32_t tid,
|
||||
int32_t simd_limit) {
|
||||
PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", simd_limit);
|
||||
tid = GetLogicalThreadIdInBlock();
|
||||
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit;
|
||||
}
|
||||
|
||||
// Do nothing. The host guarantees we started the requested number of
|
||||
// teams and we only need inspection of gridDim.
|
||||
|
||||
EXTERN void __kmpc_push_num_teams(kmp_Indent *loc, int32_t tid,
|
||||
int32_t num_teams, int32_t thread_limit) {
|
||||
PRINT(LD_IO, "call kmpc_push_num_teams %d\n", num_teams);
|
||||
ASSERT0(LT_FUSSY, FALSE,
|
||||
"should never have anything with new teams on device");
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_push_proc_bind(kmp_Indent *loc, uint32_t tid,
|
||||
int proc_bind) {
|
||||
PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", proc_bind);
|
||||
}
|
443
openmp/libomptarget/deviceRTLs/nvptx/src/reduction.cu
Normal file
@ -0,0 +1,443 @@
|
||||
//===---- reduction.cu - NVPTX OpenMP reduction implementation --- CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the implementation of reduction with KMPC interface.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include <complex.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "omptarget-nvptx.h"
|
||||
|
||||
// may eventually remove this
|
||||
EXTERN
|
||||
int32_t __gpu_block_reduce() {
|
||||
int tid = GetLogicalThreadIdInBlock();
|
||||
int nt = GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
|
||||
if (nt != blockDim.x)
|
||||
return 0;
|
||||
unsigned tnum = __ACTIVEMASK();
|
||||
if (tnum != (~0x0)) { // assume warp size is 32
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
EXTERN
|
||||
int32_t __kmpc_reduce_gpu(kmp_Indent *loc, int32_t global_tid, int32_t num_vars,
|
||||
size_t reduce_size, void *reduce_data,
|
||||
void *reduce_array_size, kmp_ReductFctPtr *reductFct,
|
||||
kmp_CriticalName *lck) {
|
||||
int threadId = GetLogicalThreadIdInBlock();
|
||||
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
|
||||
int numthread;
|
||||
if (currTaskDescr->IsParallelConstruct()) {
|
||||
numthread =
|
||||
GetNumberOfOmpThreads(threadId, isSPMDMode(), isRuntimeUninitialized());
|
||||
} else {
|
||||
numthread = GetNumberOfOmpTeams();
|
||||
}
|
||||
|
||||
if (numthread == 1)
|
||||
return 1;
|
||||
else if (!__gpu_block_reduce())
|
||||
return 2;
|
||||
else {
|
||||
if (threadIdx.x == 0)
|
||||
return 1;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
EXTERN
|
||||
int32_t __kmpc_reduce_combined(kmp_Indent *loc) {
|
||||
if (threadIdx.x == 0) {
|
||||
return 2;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
EXTERN
|
||||
int32_t __kmpc_reduce_simd(kmp_Indent *loc) {
|
||||
if (threadIdx.x % 32 == 0) {
|
||||
return 1;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
EXTERN
|
||||
void __kmpc_nvptx_end_reduce(int32_t global_tid) {}
|
||||
|
||||
EXTERN
|
||||
void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}
|
||||
|
||||
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
|
||||
return __SHFL_DOWN_SYNC(0xFFFFFFFF, val, delta, size);
|
||||
}
|
||||
|
||||
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
|
||||
int lo, hi;
|
||||
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
|
||||
hi = __SHFL_DOWN_SYNC(0xFFFFFFFF, hi, delta, size);
|
||||
lo = __SHFL_DOWN_SYNC(0xFFFFFFFF, lo, delta, size);
|
||||
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
|
||||
return val;
|
||||
}
|
||||
|
||||
static INLINE void gpu_regular_warp_reduce(void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct) {
|
||||
for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
|
||||
shflFct(reduce_data, /*LaneId - not used= */ 0,
|
||||
/*Offset = */ mask, /*AlgoVersion=*/0);
|
||||
}
|
||||
}
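// Illustrative sketch (not part of this patch): one possible shuffle callback
// for a 32-bit integer sum, showing what each loop iteration above performs.
// The exact kmp_ShuffleReductFctPtr parameter types are an assumption here.
//
//   static __device__ void sum_shfl(void *data, int16_t /*lane_id*/,
//                                   int16_t offset, int16_t /*algo*/) {
//     int32_t *v = (int32_t *)data;
//     *v += __kmpc_shuffle_int32(*v, offset, WARPSIZE);
//   }
//
// After the five iterations (offsets 16, 8, 4, 2, 1), lane 0 holds the sum of
// the full warp.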
|
||||
|
||||
static INLINE void gpu_irregular_warp_reduce(void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct,
|
||||
uint32_t size, uint32_t tid) {
|
||||
uint32_t curr_size;
|
||||
uint32_t mask;
|
||||
curr_size = size;
|
||||
mask = curr_size / 2;
|
||||
while (mask > 0) {
|
||||
shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
|
||||
curr_size = (curr_size + 1) / 2;
|
||||
mask = curr_size / 2;
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE uint32_t
|
||||
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
|
||||
uint32_t lanemask_lt;
|
||||
uint32_t lanemask_gt;
|
||||
uint32_t size, remote_id, physical_lane_id;
|
||||
physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
|
||||
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
|
||||
uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
|
||||
uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2;
|
||||
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt));
|
||||
do {
|
||||
Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
|
||||
remote_id = __ffs(Liveness & lanemask_gt);
|
||||
size = __popc(Liveness);
|
||||
logical_lane_id /= 2;
|
||||
shflFct(reduce_data, /*LaneId =*/logical_lane_id,
|
||||
/*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
|
||||
} while (logical_lane_id % 2 == 0 && size > 1);
|
||||
return (logical_lane_id == 0);
|
||||
}
|
||||
|
||||
EXTERN
|
||||
int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars,
|
||||
size_t reduce_size, void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct,
|
||||
kmp_InterWarpCopyFctPtr cpyFct) {
|
||||
uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
|
||||
if (Liveness == 0xffffffff) {
|
||||
gpu_regular_warp_reduce(reduce_data, shflFct);
|
||||
return GetThreadIdInBlock() % WARPSIZE ==
|
||||
0; // Result on lane 0 of the simd warp.
|
||||
} else {
|
||||
return gpu_irregular_simd_reduce(
|
||||
reduce_data, shflFct); // Result on the first active lane.
|
||||
}
|
||||
}
|
||||
|
||||
INLINE
|
||||
int32_t nvptx_parallel_reduce_nowait(int32_t global_tid, int32_t num_vars,
|
||||
size_t reduce_size, void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct,
|
||||
kmp_InterWarpCopyFctPtr cpyFct,
|
||||
bool isSPMDExecutionMode,
|
||||
bool isRuntimeUninitialized = false) {
|
||||
/*
|
||||
* This reduce function handles reduction within a team. It handles
|
||||
* parallel regions in both L1 and L2 parallelism levels. It also
|
||||
* supports Generic, SPMD, and NoOMP modes.
|
||||
*
|
||||
* 1. Reduce within a warp.
|
||||
* 2. Warp master copies value to warp 0 via shared memory.
|
||||
* 3. Warp 0 reduces to a single value.
|
||||
* 4. The reduced value is available in the thread that returns 1.
|
||||
*/
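  // Worked example (illustrative): with 128 threads in an SPMD parallel
  // region, step 1 reduces each of the 4 full warps via shflFct, step 2
  // copies the 4 partial results to warp 0 via cpyFct, step 3 reduces those
  // 4 values inside warp 0, and step 4 leaves the combined value with
  // thread 0, the only caller that returns 1.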
|
||||
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
|
||||
uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
|
||||
uint32_t NumThreads = GetNumberOfOmpThreads(
|
||||
BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
|
||||
uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
|
||||
uint32_t WarpId = BlockThreadId / WARPSIZE;
|
||||
|
||||
// Volta execution model:
|
||||
// For the Generic execution mode a parallel region either has 1 thread and
|
||||
// beyond that, always a multiple of 32. For the SPMD execution mode we may
|
||||
// have any number of threads.
|
||||
if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
|
||||
gpu_regular_warp_reduce(reduce_data, shflFct);
|
||||
else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
|
||||
gpu_irregular_warp_reduce(reduce_data, shflFct,
|
||||
/*LaneCount=*/NumThreads % WARPSIZE,
|
||||
/*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
|
||||
|
||||
// When we have more than [warpsize] number of threads
|
||||
// a block reduction is performed here.
|
||||
//
|
||||
// Only L1 parallel region can enter this if condition.
|
||||
if (NumThreads > WARPSIZE) {
|
||||
// Gather all the reduced values from each warp
|
||||
// to the first warp.
|
||||
cpyFct(reduce_data, WarpsNeeded);
|
||||
|
||||
if (WarpId == 0)
|
||||
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
|
||||
BlockThreadId);
|
||||
|
||||
return BlockThreadId == 0;
|
||||
}
|
||||
return BlockThreadId == 0;
|
||||
#else
|
||||
uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
|
||||
if (Liveness == 0xffffffff) // Full warp
|
||||
gpu_regular_warp_reduce(reduce_data, shflFct);
|
||||
else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
|
||||
gpu_irregular_warp_reduce(reduce_data, shflFct,
|
||||
/*LaneCount=*/__popc(Liveness),
|
||||
/*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
|
||||
else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
|
||||
// parallel region may enter here; return
|
||||
// early.
|
||||
return gpu_irregular_simd_reduce(reduce_data, shflFct);
|
||||
|
||||
uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
|
||||
uint32_t NumThreads = GetNumberOfOmpThreads(
|
||||
BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
|
||||
|
||||
// When we have more than [warpsize] number of threads
|
||||
// a block reduction is performed here.
|
||||
//
|
||||
// Only L1 parallel region can enter this if condition.
|
||||
if (NumThreads > WARPSIZE) {
|
||||
uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
|
||||
// Gather all the reduced values from each warp
|
||||
// to the first warp.
|
||||
cpyFct(reduce_data, WarpsNeeded);
|
||||
|
||||
uint32_t WarpId = BlockThreadId / WARPSIZE;
|
||||
if (WarpId == 0)
|
||||
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
|
||||
BlockThreadId);
|
||||
|
||||
return BlockThreadId == 0;
|
||||
} else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) {
|
||||
return BlockThreadId == 0;
|
||||
}
|
||||
|
||||
// Get the OMP thread Id. This is different from BlockThreadId in the case of
|
||||
// an L2 parallel region.
|
||||
return GetOmpThreadId(BlockThreadId, isSPMDExecutionMode,
|
||||
isRuntimeUninitialized) == 0;
|
||||
#endif // __CUDA_ARCH__ >= 700
|
||||
}
|
||||
|
||||
EXTERN
|
||||
int32_t __kmpc_nvptx_parallel_reduce_nowait(
|
||||
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
|
||||
return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
|
||||
reduce_data, shflFct, cpyFct,
|
||||
/*isSPMDExecutionMode=*/isSPMDMode());
|
||||
}
|
||||
|
||||
EXTERN
|
||||
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
|
||||
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
|
||||
return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
|
||||
reduce_data, shflFct, cpyFct,
|
||||
/*isSPMDExecutionMode=*/true,
|
||||
/*isRuntimeUninitialized=*/true);
|
||||
}
|
||||
|
||||
EXTERN
|
||||
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
|
||||
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
|
||||
return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
|
||||
reduce_data, shflFct, cpyFct,
|
||||
/*isSPMDExecutionMode=*/false,
|
||||
/*isRuntimeUninitialized=*/true);
|
||||
}
|
||||
|
||||
INLINE
|
||||
int32_t nvptx_teams_reduce_nowait(
|
||||
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
|
||||
kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct,
|
||||
bool isSPMDExecutionMode, bool isRuntimeUninitialized = false) {
|
||||
uint32_t ThreadId = GetLogicalThreadIdInBlock();
|
||||
// In non-generic mode all workers participate in the teams reduction.
|
||||
// In generic mode only the team master participates in the teams
|
||||
// reduction because the workers are waiting for parallel work.
|
||||
uint32_t NumThreads =
|
||||
isSPMDExecutionMode
|
||||
? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true,
|
||||
isRuntimeUninitialized)
|
||||
: /*Master thread only*/ 1;
|
||||
uint32_t TeamId = GetBlockIdInKernel();
|
||||
uint32_t NumTeams = GetNumberOfBlocksInKernel();
|
||||
__shared__ volatile bool IsLastTeam;
|
||||
|
||||
// Team masters of all teams write to the scratchpad.
|
||||
if (ThreadId == 0) {
|
||||
unsigned int *timestamp = GetTeamsReductionTimestamp();
|
||||
char *scratchpad = GetTeamsReductionScratchpad();
|
||||
|
||||
scratchFct(reduce_data, scratchpad, TeamId, NumTeams);
|
||||
__threadfence();
|
||||
|
||||
// atomicInc increments 'timestamp' and has a range [0, NumTeams-1].
|
||||
// It resets 'timestamp' back to 0 once the last team increments
|
||||
// this counter.
|
||||
unsigned val = atomicInc(timestamp, NumTeams - 1);
|
||||
IsLastTeam = val == NumTeams - 1;
|
||||
}
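  // Worked example (illustrative): with NumTeams == 4 the four team masters
  // see atomicInc return 0, 1, 2 and 3 in arrival order, and the counter
  // wraps back to 0 after the last one; only the master that observed
  // NumTeams - 1 == 3 sets IsLastTeam and continues with the final reduce.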
|
||||
|
||||
// We have to wait on L1 barrier because in GENERIC mode the workers
|
||||
// are waiting on barrier 0 for work.
|
||||
//
|
||||
// If we guard this barrier as follows it leads to deadlock, probably
|
||||
// because of a compiler bug: if (!IsGenericMode()) __syncthreads();
|
||||
uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE;
|
||||
named_sync(L1_BARRIER, SyncWarps * WARPSIZE);
|
||||
|
||||
// If this team is not the last, quit.
|
||||
if (/* Volatile read by all threads */ !IsLastTeam)
|
||||
return 0;
|
||||
|
||||
//
|
||||
// Last team processing.
|
||||
//
|
||||
|
||||
// Threads in excess of #teams do not participate in reduction of the
|
||||
// scratchpad values.
|
||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
|
||||
uint32_t ActiveThreads = NumThreads;
|
||||
if (NumTeams < NumThreads) {
|
||||
ActiveThreads =
|
||||
(NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1);
|
||||
}
|
||||
if (ThreadId >= ActiveThreads)
|
||||
return 0;
|
||||
|
||||
// Load from scratchpad and reduce.
|
||||
char *scratchpad = GetTeamsReductionScratchpad();
|
||||
ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
|
||||
for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads)
|
||||
ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
|
||||
|
||||
uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
|
||||
uint32_t WarpId = ThreadId / WARPSIZE;
|
||||
|
||||
// Reduce across warps to the warp master.
|
||||
if ((ActiveThreads % WARPSIZE == 0) ||
|
||||
(WarpId < WarpsNeeded - 1)) // Full warp
|
||||
gpu_regular_warp_reduce(reduce_data, shflFct);
|
||||
else if (ActiveThreads > 1) // Partial warp but contiguous lanes
|
||||
// Only SPMD execution mode comes thru this case.
|
||||
gpu_irregular_warp_reduce(reduce_data, shflFct,
|
||||
/*LaneCount=*/ActiveThreads % WARPSIZE,
|
||||
/*LaneId=*/ThreadId % WARPSIZE);
|
||||
|
||||
// When we have more than [warpsize] number of threads
|
||||
// a block reduction is performed here.
|
||||
if (ActiveThreads > WARPSIZE) {
|
||||
// Gather all the reduced values from each warp
|
||||
// to the first warp.
|
||||
cpyFct(reduce_data, WarpsNeeded);
|
||||
|
||||
if (WarpId == 0)
|
||||
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
|
||||
}
|
||||
#else
|
||||
if (ThreadId >= NumTeams)
|
||||
return 0;
|
||||
|
||||
// Load from scratchpad and reduce.
|
||||
char *scratchpad = GetTeamsReductionScratchpad();
|
||||
ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
|
||||
for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
|
||||
ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);
|
||||
|
||||
// Reduce across warps to the warp master.
|
||||
uint32_t Liveness = __BALLOT_SYNC(0xFFFFFFFF, true);
|
||||
if (Liveness == 0xffffffff) // Full warp
|
||||
gpu_regular_warp_reduce(reduce_data, shflFct);
|
||||
else // Partial warp but contiguous lanes
|
||||
gpu_irregular_warp_reduce(reduce_data, shflFct,
|
||||
/*LaneCount=*/__popc(Liveness),
|
||||
/*LaneId=*/ThreadId % WARPSIZE);
|
||||
|
||||
// When we have more than [warpsize] number of threads
|
||||
// a block reduction is performed here.
|
||||
uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads;
|
||||
if (ActiveThreads > WARPSIZE) {
|
||||
uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
|
||||
// Gather all the reduced values from each warp
|
||||
// to the first warp.
|
||||
cpyFct(reduce_data, WarpsNeeded);
|
||||
|
||||
uint32_t WarpId = ThreadId / WARPSIZE;
|
||||
if (WarpId == 0)
|
||||
gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
|
||||
}
|
||||
#endif // __CUDA_ARCH__ >= 700
|
||||
|
||||
return ThreadId == 0;
|
||||
}
|
||||
|
||||
EXTERN
|
||||
int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
|
||||
size_t reduce_size, void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct,
|
||||
kmp_InterWarpCopyFctPtr cpyFct,
|
||||
kmp_CopyToScratchpadFctPtr scratchFct,
|
||||
kmp_LoadReduceFctPtr ldFct) {
|
||||
return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
|
||||
reduce_data, shflFct, cpyFct, scratchFct,
|
||||
ldFct, /*isSPMDExecutionMode=*/isSPMDMode());
|
||||
}
|
||||
|
||||
EXTERN
|
||||
int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
|
||||
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
|
||||
kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
|
||||
return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
|
||||
reduce_data, shflFct, cpyFct, scratchFct,
|
||||
ldFct,
|
||||
/*isSPMDExecutionMode=*/true,
|
||||
/*isRuntimeUninitialized=*/true);
|
||||
}
|
||||
|
||||
EXTERN
|
||||
int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
|
||||
int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
|
||||
kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
|
||||
return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
|
||||
reduce_data, shflFct, cpyFct, scratchFct,
|
||||
ldFct,
|
||||
/*isSPMDExecutionMode=*/false,
|
||||
/*isRuntimeUninitialized=*/true);
|
||||
}
|
52
openmp/libomptarget/deviceRTLs/nvptx/src/state-queue.h
Normal file
@ -0,0 +1,52 @@
|
||||
//===-------- state-queue.h - NVPTX OpenMP GPU State Queue ------- CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains a queue to hand out OpenMP state objects to teams of
|
||||
// one or more kernels.
|
||||
//
|
||||
// Reference:
|
||||
// Thomas R.W. Scogland and Wu-chun Feng. 2015.
|
||||
// Design and Evaluation of Scalable Concurrent Queues for Many-Core
|
||||
// Architectures. International Conference on Performance Engineering.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef __STATE_QUEUE_H
|
||||
#define __STATE_QUEUE_H
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#include "option.h" // choices we have
|
||||
|
||||
template <typename ElementType, uint32_t SIZE> class omptarget_nvptx_Queue {
|
||||
private:
|
||||
ElementType elements[SIZE];
|
||||
volatile ElementType *elementQueue[SIZE];
|
||||
volatile uint32_t head;
|
||||
volatile uint32_t ids[SIZE];
|
||||
volatile uint32_t tail;
|
||||
|
||||
static const uint32_t MAX_ID = (1u << 31) / SIZE / 2;
|
||||
INLINE uint32_t ENQUEUE_TICKET();
|
||||
INLINE uint32_t DEQUEUE_TICKET();
|
||||
INLINE uint32_t ID(uint32_t ticket);
|
||||
INLINE bool IsServing(uint32_t slot, uint32_t id);
|
||||
INLINE void PushElement(uint32_t slot, ElementType *element);
|
||||
INLINE ElementType *PopElement(uint32_t slot);
|
||||
INLINE void DoneServing(uint32_t slot, uint32_t id);
|
||||
|
||||
public:
|
||||
INLINE omptarget_nvptx_Queue() {}
|
||||
INLINE void Enqueue(ElementType *element);
|
||||
INLINE ElementType *Dequeue();
|
||||
};
|
||||
|
||||
#include "state-queuei.h"
|
||||
|
||||
#endif
|
89
openmp/libomptarget/deviceRTLs/nvptx/src/state-queuei.h
Normal file
@ -0,0 +1,89 @@
|
||||
//===------ state-queuei.h - NVPTX OpenMP GPU State Queue -------- CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the implementation of a queue to hand out OpenMP state
|
||||
// objects to teams of one or more kernels.
|
||||
//
|
||||
// Reference:
|
||||
// Thomas R.W. Scogland and Wu-chun Feng. 2015.
|
||||
// Design and Evaluation of Scalable Concurrent Queues for Many-Core
|
||||
// Architectures. International Conference on Performance Engineering.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "state-queue.h"
|
||||
|
||||
template <typename ElementType, uint32_t SIZE>
|
||||
INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ENQUEUE_TICKET() {
|
||||
return atomicAdd((unsigned int *)&tail, 1);
|
||||
}
|
||||
|
||||
template <typename ElementType, uint32_t SIZE>
|
||||
INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::DEQUEUE_TICKET() {
|
||||
return atomicAdd((unsigned int *)&head, 1);
|
||||
}
|
||||
|
||||
template <typename ElementType, uint32_t SIZE>
|
||||
INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
|
||||
return (ticket / SIZE) * 2;
|
||||
}
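// Illustrative walk-through (example values, not from this file): with
// SIZE == 8, ticket 11 maps to slot 11 % 8 == 3 and generation id
// (11 / 8) * 2 == 2. A dequeuer with that ticket waits until ids[3] == 2,
// the matching enqueuer waits until ids[3] == 3, and DoneServing advances
// ids[3] by one each time, so each slot serves a dequeue and then an enqueue
// of one generation before moving on to the next.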
|
||||
|
||||
template <typename ElementType, uint32_t SIZE>
|
||||
INLINE bool omptarget_nvptx_Queue<ElementType, SIZE>::IsServing(uint32_t slot,
|
||||
uint32_t id) {
|
||||
return atomicAdd((unsigned int *)&ids[slot], 0) == id;
|
||||
}
|
||||
|
||||
template <typename ElementType, uint32_t SIZE>
|
||||
INLINE void
|
||||
omptarget_nvptx_Queue<ElementType, SIZE>::PushElement(uint32_t slot,
|
||||
ElementType *element) {
|
||||
atomicExch((unsigned long long *)&elementQueue[slot],
|
||||
(unsigned long long)element);
|
||||
}
|
||||
|
||||
template <typename ElementType, uint32_t SIZE>
|
||||
INLINE ElementType *
|
||||
omptarget_nvptx_Queue<ElementType, SIZE>::PopElement(uint32_t slot) {
|
||||
return (ElementType *)atomicAdd((unsigned long long *)&elementQueue[slot],
|
||||
(unsigned long long)0);
|
||||
}
|
||||
|
||||
template <typename ElementType, uint32_t SIZE>
|
||||
INLINE void omptarget_nvptx_Queue<ElementType, SIZE>::DoneServing(uint32_t slot,
|
||||
uint32_t id) {
|
||||
atomicExch((unsigned int *)&ids[slot], (id + 1) % MAX_ID);
|
||||
}
|
||||
|
||||
template <typename ElementType, uint32_t SIZE>
|
||||
INLINE void
|
||||
omptarget_nvptx_Queue<ElementType, SIZE>::Enqueue(ElementType *element) {
|
||||
uint32_t ticket = ENQUEUE_TICKET();
|
||||
uint32_t slot = ticket % SIZE;
|
||||
uint32_t id = ID(ticket) + 1;
|
||||
while (!IsServing(slot, id))
|
||||
;
|
||||
PushElement(slot, element);
|
||||
DoneServing(slot, id);
|
||||
}
|
||||
|
||||
template <typename ElementType, uint32_t SIZE>
|
||||
INLINE ElementType *omptarget_nvptx_Queue<ElementType, SIZE>::Dequeue() {
|
||||
uint32_t ticket = DEQUEUE_TICKET();
|
||||
uint32_t slot = ticket % SIZE;
|
||||
uint32_t id = ID(ticket);
|
||||
while (!IsServing(slot, id))
|
||||
;
|
||||
ElementType *element = PopElement(slot);
|
||||
// This is to populate the queue because of the lack of GPU constructors.
|
||||
if (element == 0)
|
||||
element = &elements[slot];
|
||||
DoneServing(slot, id);
|
||||
return element;
|
||||
}
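// Minimal usage sketch (illustrative; the element type and queue instance are
// assumptions, they are not defined in this header):
//
//   typedef omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext,
//                                 OMP_STATE_COUNT> StateQueue;
//   __device__ StateQueue stateQueue;   // zero-initialized device global
//
//   // at kernel start, the team master grabs a state object:
//   omptarget_nvptx_ThreadPrivateContext *ctx = stateQueue.Dequeue();
//   // ... execute the target region using ctx ...
//   stateQueue.Enqueue(ctx);            // hand it back at kernel end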
|
92
openmp/libomptarget/deviceRTLs/nvptx/src/support.h
Normal file
@ -0,0 +1,92 @@
|
||||
//===--------- support.h - NVPTX OpenMP support functions -------- CUDA -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Wrapper to some functions natively supported by the GPU.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Execution Parameters
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
enum ExecutionMode {
|
||||
Generic = 0x00u,
|
||||
Spmd = 0x01u,
|
||||
ModeMask = 0x01u,
|
||||
};
|
||||
|
||||
enum RuntimeMode {
|
||||
RuntimeInitialized = 0x00u,
|
||||
RuntimeUninitialized = 0x02u,
|
||||
RuntimeMask = 0x02u,
|
||||
};
|
||||
|
||||
INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode);
|
||||
INLINE bool isGenericMode();
|
||||
INLINE bool isSPMDMode();
|
||||
INLINE bool isRuntimeUninitialized();
|
||||
INLINE bool isRuntimeInitialized();
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// get info from machine
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// get low level ids of resources
|
||||
INLINE int GetThreadIdInBlock();
|
||||
INLINE int GetBlockIdInKernel();
|
||||
INLINE int GetNumberOfBlocksInKernel();
|
||||
INLINE int GetNumberOfThreadsInBlock();
|
||||
|
||||
// get global ids to locate thread/team info (constant regardless of OMP)
|
||||
INLINE int GetLogicalThreadIdInBlock();
|
||||
INLINE int GetMasterThreadID();
|
||||
INLINE int GetNumberOfWorkersInTeam();
|
||||
|
||||
// get OpenMP thread and team ids
|
||||
INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode,
|
||||
bool isRuntimeUninitialized); // omp_thread_num
|
||||
INLINE int GetOmpTeamId(); // omp_team_num
|
||||
|
||||
// get OpenMP number of threads and team
|
||||
INLINE int
|
||||
GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode,
|
||||
bool isRuntimeUninitialized); // omp_num_threads
|
||||
INLINE int GetNumberOfOmpTeams(); // omp_num_teams
|
||||
|
||||
// get OpenMP number of procs
|
||||
INLINE int GetNumberOfProcsInTeam();
|
||||
INLINE int GetNumberOfProcsInDevice();
|
||||
|
||||
// masters
|
||||
INLINE int IsTeamMaster(int ompThreadId);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Memory
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// safe alloc and free
|
||||
INLINE void *SafeMalloc(size_t size, const char *msg); // check if success
|
||||
INLINE void *SafeFree(void *ptr, const char *msg);
|
||||
// pad to an alignment (power of 2 only)
|
||||
INLINE unsigned long PadBytes(unsigned long size, unsigned long alignment);
|
||||
#define ADD_BYTES(_addr, _bytes) \
|
||||
((void *)((char *)((void *)(_addr)) + (_bytes)))
|
||||
#define SUB_BYTES(_addr, _bytes) \
|
||||
((void *)((char *)((void *)(_addr)) - (_bytes)))
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Named Barrier Routines
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
INLINE void named_sync(const int barrier, const int num_threads);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Teams Reduction Scratchpad Helpers
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
INLINE unsigned int *GetTeamsReductionTimestamp();
|
||||
INLINE char *GetTeamsReductionScratchpad();
|
||||
INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);
|
216
openmp/libomptarget/deviceRTLs/nvptx/src/supporti.h
Normal file
@ -0,0 +1,216 @@
//===--------- supporti.h - NVPTX OpenMP support functions ------- CUDA -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// Wrapper implementations of some functions natively supported by the GPU.
//
//===----------------------------------------------------------------------===//

////////////////////////////////////////////////////////////////////////////////
// Execution Parameters
////////////////////////////////////////////////////////////////////////////////

INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) {
  execution_param = EMode;
  execution_param |= RMode;
}

INLINE bool isGenericMode() { return (execution_param & ModeMask) == Generic; }

INLINE bool isSPMDMode() { return (execution_param & ModeMask) == Spmd; }

INLINE bool isRuntimeUninitialized() {
  return (execution_param & RuntimeMask) == RuntimeUninitialized;
}

INLINE bool isRuntimeInitialized() {
  return (execution_param & RuntimeMask) == RuntimeInitialized;
}
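setExecutionParameters() packs the execution mode and the runtime mode into one flag word that the predicates above test through ModeMask and RuntimeMask. A sketch of one possible bit layout; the concrete enumerator values live elsewhere in the device RTL headers and the numbers below are illustrative assumptions only:

// Illustrative encoding only -- the real values are defined in the
// omptarget-nvptx headers, not here.
//   Generic              = 0x00,  Spmd                 = 0x01,  ModeMask    = 0x01
//   RuntimeInitialized   = 0x00,  RuntimeUninitialized = 0x02,  RuntimeMask = 0x02
//
// With that layout, setExecutionParameters(Spmd, RuntimeUninitialized) leaves
// execution_param == 0x03, so isSPMDMode() and isRuntimeUninitialized() both
// return true.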

////////////////////////////////////////////////////////////////////////////////
// support: get info from machine
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
//
// Calls to the NVPTX layer (assuming 1D layout)
//
////////////////////////////////////////////////////////////////////////////////

INLINE int GetThreadIdInBlock() { return threadIdx.x; }

INLINE int GetBlockIdInKernel() { return blockIdx.x; }

INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; }

INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; }

////////////////////////////////////////////////////////////////////////////////
//
// Calls to the Generic Scheme Implementation Layer (assuming 1D layout)
//
////////////////////////////////////////////////////////////////////////////////

// The master thread id is the first thread (lane) of the last warp.
// Thread id is 0 indexed.
// E.g.: If NumThreads is 33, master id is 32.
//       If NumThreads is 64, master id is 32.
//       If NumThreads is 97, master id is 96.
//       If NumThreads is 1024, master id is 992.
//
// Called in Generic Execution Mode only.
INLINE int GetMasterThreadID() { return (blockDim.x - 1) & ~(WARPSIZE - 1); }

// The last warp is reserved for the master; other warps are workers.
// Called in Generic Execution Mode only.
INLINE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }
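The expression (blockDim.x - 1) & ~(WARPSIZE - 1) rounds the id of the last thread down to a warp boundary, which is where the examples in the comment come from. A small worked check, assuming WARPSIZE == 32:

// blockDim.x = 33   -> (33 - 1)   & ~31 = 32   & ~31 = 32
// blockDim.x = 64   -> (64 - 1)   & ~31 = 63   & ~31 = 32
// blockDim.x = 97   -> (97 - 1)   & ~31 = 96   & ~31 = 96
// blockDim.x = 1024 -> (1024 - 1) & ~31 = 1023 & ~31 = 992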

////////////////////////////////////////////////////////////////////////////////
// get thread id in team

// This function may be called in a parallel region by the workers
// or a serial region by the master. If the master (whose CUDA thread
// id is GetMasterThreadID()) calls this routine, we return 0 because
// it is a shadow for the first worker.
INLINE int GetLogicalThreadIdInBlock() {
  // return GetThreadIdInBlock() % GetMasterThreadID();

  // Implemented using control flow (predication) instead of with a modulo
  // operation.
  int tid = GetThreadIdInBlock();
  if (isGenericMode() && tid >= GetMasterThreadID())
    return 0;
  else
    return tid;
}

////////////////////////////////////////////////////////////////////////////////
//
// OpenMP Thread Support Layer
//
////////////////////////////////////////////////////////////////////////////////

INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode,
                          bool isRuntimeUninitialized) {
  // omp_thread_num
  int rc;

  if (isRuntimeUninitialized) {
    rc = GetThreadIdInBlock();
    if (!isSPMDExecutionMode && rc >= GetMasterThreadID())
      rc = 0;
  } else {
    omptarget_nvptx_TaskDescr *currTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
    rc = currTaskDescr->ThreadId();
  }
  return rc;
}

INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode,
                                 bool isRuntimeUninitialized) {
  // omp_num_threads
  int rc;

  if (isRuntimeUninitialized) {
    rc = isSPMDExecutionMode ? GetNumberOfThreadsInBlock()
                             : GetNumberOfThreadsInBlock() - WARPSIZE;
  } else {
    omptarget_nvptx_TaskDescr *currTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
    ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
    rc = currTaskDescr->ThreadsInTeam();
  }

  return rc;
}

////////////////////////////////////////////////////////////////////////////////
// Team id linked to OpenMP

INLINE int GetOmpTeamId() {
  // omp_team_num
  return GetBlockIdInKernel(); // assume 1 block per team
}

INLINE int GetNumberOfOmpTeams() {
  // omp_num_teams
  return GetNumberOfBlocksInKernel(); // assume 1 block per team
}

////////////////////////////////////////////////////////////////////////////////
// Masters

INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }

////////////////////////////////////////////////////////////////////////////////
// get OpenMP number of procs

// Get the number of processors in the device.
INLINE int GetNumberOfProcsInDevice() {
  if (isGenericMode())
    return GetNumberOfWorkersInTeam();
  else
    return GetNumberOfThreadsInBlock();
}

INLINE int GetNumberOfProcsInTeam() { return GetNumberOfProcsInDevice(); }

////////////////////////////////////////////////////////////////////////////////
// Memory
////////////////////////////////////////////////////////////////////////////////

INLINE unsigned long PadBytes(unsigned long size,
                              unsigned long alignment) // must be a power of 2
{
  // compute the necessary padding to satisfy alignment constraint
  ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0,
         "alignment %ld is not a power of 2\n", alignment);
  return (~(unsigned long)size + 1) & (alignment - 1);
}
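PadBytes() uses the two's-complement identity (~size + 1) == -size, so the expression evaluates (-size) mod alignment, i.e. exactly the number of bytes needed to reach the next aligned boundary. A small worked check with illustrative values:

// size = 20, alignment = 8:
//   ~20 + 1 = -20;  -20 & 7 = 4   -> 20 + 4 = 24, which is 8-byte aligned
// size = 24, alignment = 8:
//   ~24 + 1 = -24;  -24 & 7 = 0   -> already aligned, no padding added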

INLINE void *SafeMalloc(size_t size, const char *msg) // check if success
{
  void *ptr = malloc(size);
  PRINT(LD_MEM, "malloc data of size %d for %s: 0x%llx\n", size, msg, P64(ptr));
  ASSERT(LT_SAFETY, ptr, "failed to allocate %d bytes for %s\n", size, msg);
  return ptr;
}

INLINE void *SafeFree(void *ptr, const char *msg) {
  PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", P64(ptr), msg);
  free(ptr);
  return NULL;
}

////////////////////////////////////////////////////////////////////////////////
// Named Barrier Routines
////////////////////////////////////////////////////////////////////////////////

INLINE void named_sync(const int barrier, const int num_threads) {
  asm volatile("bar.sync %0, %1;"
               :
               : "r"(barrier), "r"(num_threads)
               : "memory");
}

////////////////////////////////////////////////////////////////////////////////
// Teams Reduction Scratchpad Helpers
////////////////////////////////////////////////////////////////////////////////

INLINE unsigned int *GetTeamsReductionTimestamp() {
  return static_cast<unsigned int *>(ReductionScratchpadPtr);
}

INLINE char *GetTeamsReductionScratchpad() {
  return static_cast<char *>(ReductionScratchpadPtr) + 256;
}

INLINE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr) {
  ReductionScratchpadPtr = ScratchpadPtr;
}
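Taken together, the last three helpers imply a simple layout for the buffer registered through SetTeamsReductionScratchpadPtr(): timestamp/counter words at the start and the reduction data area 256 bytes in. A sketch of that reading; the 256-byte split is what the accessors above encode, while the field spellings are illustrative only:

// Assumed view of the scratchpad buffer, based on the accessors above:
//
//   +0     unsigned int timestamps[...]   <- GetTeamsReductionTimestamp()
//   +256   char         data[...]         <- GetTeamsReductionScratchpad()
//
// SetTeamsReductionScratchpadPtr() installs the base pointer used by both.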
153
openmp/libomptarget/deviceRTLs/nvptx/src/sync.cu
Normal file
@ -0,0 +1,153 @@
//===------------ sync.cu - NVPTX OpenMP synchronizations -------- CUDA -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// Include all synchronization.
//
//===----------------------------------------------------------------------===//

#include "omptarget-nvptx.h"

////////////////////////////////////////////////////////////////////////////////
// KMP Ordered calls
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_ordered(kmp_Indent *loc, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_ordered\n");
}

EXTERN void __kmpc_end_ordered(kmp_Indent *loc, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_end_ordered\n");
}

////////////////////////////////////////////////////////////////////////////////
// KMP Barriers
////////////////////////////////////////////////////////////////////////////////

// a team is a block: we can use CUDA native synchronization mechanism
// FIXME: what if not all threads (warps) participate in the barrier?
// We may need to implement it differently

EXTERN int32_t __kmpc_cancel_barrier(kmp_Indent *loc_ref, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_cancel_barrier\n");
  __syncthreads();
  PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n");
  return 0;
}

EXTERN void __kmpc_barrier(kmp_Indent *loc_ref, int32_t tid) {
  tid = GetLogicalThreadIdInBlock();
  omptarget_nvptx_TaskDescr *currTaskDescr =
      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
  if (!currTaskDescr->InL2OrHigherParallelRegion()) {
    int numberOfActiveOMPThreads =
        GetNumberOfOmpThreads(tid, isSPMDMode(), isRuntimeUninitialized());
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
    // On Volta and newer architectures we require that all lanes in
    // a warp (at least, all present for the kernel launch) participate in the
    // barrier. This is enforced when launching the parallel region. An
    // exception is when there are < WARPSIZE workers. In this case only 1
    // worker is started, so we don't need a barrier.
    if (numberOfActiveOMPThreads > 1) {
#endif
      // The #threads parameter must be rounded up to the WARPSIZE.
      int threads =
          WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);

      PRINT(LD_SYNC,
            "call kmpc_barrier with %d omp threads, sync parameter %d\n",
            numberOfActiveOMPThreads, threads);
      // Barrier #1 is for synchronization among active threads.
      named_sync(L1_BARRIER, threads);
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
    } // numberOfActiveOMPThreads > 1
#endif
  }
  PRINT0(LD_SYNC, "completed kmpc_barrier\n");
}
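As the comment above notes, the thread-count parameter handed to named_sync() must be rounded up to a multiple of WARPSIZE. A quick check of that rounding, assuming WARPSIZE == 32:

// numberOfActiveOMPThreads = 1   -> threads = 32 * ((1 + 31) / 32)   = 32
// numberOfActiveOMPThreads = 33  -> threads = 32 * ((33 + 31) / 32)  = 64
// numberOfActiveOMPThreads = 992 -> threads = 32 * ((992 + 31) / 32) = 992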

// Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0
// parallel region and that all worker threads participate.
EXTERN void __kmpc_barrier_simple_spmd(kmp_Indent *loc_ref, int32_t tid) {
  PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n");
  __syncthreads();
  PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n");
}

// Emit a simple barrier call in Generic mode. Assumes the caller is in an L0
// parallel region and that all worker threads participate.
EXTERN void __kmpc_barrier_simple_generic(kmp_Indent *loc_ref, int32_t tid) {
  int numberOfActiveOMPThreads = GetNumberOfThreadsInBlock() - WARPSIZE;
  // The #threads parameter must be rounded up to the WARPSIZE.
  int threads =
      WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);

  PRINT(LD_SYNC,
        "call kmpc_barrier_simple_generic with %d omp threads, sync parameter "
        "%d\n",
        numberOfActiveOMPThreads, threads);
  // Barrier #1 is for synchronization among active threads.
  named_sync(L1_BARRIER, threads);
  PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n");
}

////////////////////////////////////////////////////////////////////////////////
// KMP MASTER
////////////////////////////////////////////////////////////////////////////////

INLINE int32_t IsMaster() {
  // only the team master updates the state
  int tid = GetLogicalThreadIdInBlock();
  int ompThreadId = GetOmpThreadId(tid, isSPMDMode(), isRuntimeUninitialized());
  return IsTeamMaster(ompThreadId);
}

EXTERN int32_t __kmpc_master(kmp_Indent *loc, int32_t global_tid) {
  PRINT0(LD_IO, "call kmpc_master\n");
  return IsMaster();
}

EXTERN void __kmpc_end_master(kmp_Indent *loc, int32_t global_tid) {
  PRINT0(LD_IO, "call kmpc_end_master\n");
  ASSERT0(LT_FUSSY, IsMaster(), "expected only master here");
}

////////////////////////////////////////////////////////////////////////////////
// KMP SINGLE
////////////////////////////////////////////////////////////////////////////////

EXTERN int32_t __kmpc_single(kmp_Indent *loc, int32_t global_tid) {
  PRINT0(LD_IO, "call kmpc_single\n");
  // decide to implement single with master; master gets the single
  return IsMaster();
}

EXTERN void __kmpc_end_single(kmp_Indent *loc, int32_t global_tid) {
  PRINT0(LD_IO, "call kmpc_end_single\n");
  // decide to implement single with master: master gets the single
  ASSERT0(LT_FUSSY, IsMaster(), "expected only master here");
  // sync barrier is explicitly called... so that is not a problem
}

////////////////////////////////////////////////////////////////////////////////
// Flush
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_flush(kmp_Indent *loc) {
  PRINT0(LD_IO, "call kmpc_flush\n");
  __threadfence_block();
}

////////////////////////////////////////////////////////////////////////////////
// Vote
////////////////////////////////////////////////////////////////////////////////

EXTERN int32_t __kmpc_warp_active_thread_mask() {
  PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");
  return __ACTIVEMASK();
}
208
openmp/libomptarget/deviceRTLs/nvptx/src/task.cu
Normal file
@ -0,0 +1,208 @@
//===------------- task.cu - NVPTX OpenMP tasks support ---------- CUDA -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// Task implementation support.
//
// explicit task structure uses
//   omptarget_nvptx task
//   kmp_task
//
// where kmp_task is
//   - klegacy_TaskDescr  <- task pointer
//       shared -> X
//       routine
//       part_id
//       descr
//   - private (of size given by task_alloc call). Accessed by
//     task+sizeof(klegacy_TaskDescr)
//       * private data *
//   - shared: X. Accessed by shared ptr in klegacy_TaskDescr
//       * pointer table to shared variables *
//   - end
//
//===----------------------------------------------------------------------===//
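The layout described above is what __kmpc_omp_task_alloc() below carves out of a single SafeMalloc() block: the device-side descriptor first, then the kmp_TaskDescr plus its private data (padded to pointer size), then the shared-variable pointer table. A sketch of the resulting block, with offsets expressed in terms of the sizes used by the allocator; the field spellings are illustrative:

// One SafeMalloc'd block, as laid out by __kmpc_omp_task_alloc():
//
//   offset 0
//     omptarget_nvptx_TaskDescr taskDescr;   // device-side descriptor
//   offset sizeof(omptarget_nvptx_TaskDescr)
//     kmp_TaskDescr kmpTaskDescr;            // pointer returned to the caller
//     ... private data ...                   // up to sizeOfTaskInclPrivate,
//                                            // padded to sizeof(void *)
//   offset sizeof(omptarget_nvptx_TaskDescr) + sizeOfTaskInclPrivate
//     void *sharedPointerTable[...];         // sizeOfSharedTable bytes
//
// SUB_BYTES(newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)) recovers the
// enclosing omptarget_nvptx_ExplicitTaskDescr from the kmp_TaskDescr pointer.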

#include "omptarget-nvptx.h"

EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(
    kmp_Indent *loc,     // unused
    uint32_t global_tid, // unused
    int32_t flag,        // unused (because in our impl, all are immediately executed)
    size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable,
    kmp_TaskFctPtr taskSub) {
  PRINT(LD_IO,
        "call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, "
        "fct 0x%llx)\n",
        P64(sizeOfTaskInclPrivate), P64(sizeOfSharedTable), P64(taskSub));
  // want task+priv to be a multiple of 8 bytes
  size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void *));
  sizeOfTaskInclPrivate += padForTaskInclPriv;
  size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable;
  ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0,
         "need task descr of size %d to be a multiple of %d\n",
         sizeof(omptarget_nvptx_TaskDescr), sizeof(void *));
  size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize;
  omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
      (omptarget_nvptx_ExplicitTaskDescr *)SafeMalloc(
          totSize, "explicit task descriptor");
  kmp_TaskDescr *newKmpTaskDescr = &newExplicitTaskDescr->kmpTaskDescr;
  ASSERT0(LT_FUSSY,
          (uint64_t)newKmpTaskDescr ==
              (uint64_t)ADD_BYTES(newExplicitTaskDescr,
                                  sizeof(omptarget_nvptx_TaskDescr)),
          "bad size assumptions");
  // init kmp_TaskDescr
  newKmpTaskDescr->sharedPointerTable =
      (void *)((char *)newKmpTaskDescr + sizeOfTaskInclPrivate);
  newKmpTaskDescr->sub = taskSub;
  newKmpTaskDescr->destructors = NULL;
  PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n",
        P64(newKmpTaskDescr), P64(newExplicitTaskDescr));

  return newKmpTaskDescr;
}

EXTERN int32_t __kmpc_omp_task(kmp_Indent *loc, uint32_t global_tid,
                               kmp_TaskDescr *newKmpTaskDescr) {
  return __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0,
                                   0);
}

EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Indent *loc, uint32_t global_tid,
                                         kmp_TaskDescr *newKmpTaskDescr,
                                         int32_t depNum, void *depList,
                                         int32_t noAliasDepNum,
                                         void *noAliasDepList) {
  PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n",
        P64(newKmpTaskDescr));
  // 1. get explicit task descr from kmp task descr
  omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
      (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
          newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
  ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
          "bad assumptions");
  omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
  ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
          "bad assumptions");

  // 2. push new context: update new task descriptor
  int tid = GetLogicalThreadIdInBlock();
  omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
  newTaskDescr->CopyForExplicitTask(parentTaskDescr);
  // set new task descriptor as top
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr);

  // 3. call sub
  PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n",
        P64(newKmpTaskDescr->sub), P64(newKmpTaskDescr));
  newKmpTaskDescr->sub(0, newKmpTaskDescr);
  PRINT(LD_TASK, "return from call task sub 0x%llx()\n",
        P64(newKmpTaskDescr->sub));

  // 4. pop context
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
                                                             parentTaskDescr);
  // 5. free
  SafeFree(newExplicitTaskDescr, "explicit task descriptor");
  return 0;
}

EXTERN void __kmpc_omp_task_begin_if0(kmp_Indent *loc, uint32_t global_tid,
                                      kmp_TaskDescr *newKmpTaskDescr) {
  PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n",
        P64(newKmpTaskDescr));
  // 1. get explicit task descr from kmp task descr
  omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
      (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
          newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
  ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
          "bad assumptions");
  omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
  ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
          "bad assumptions");

  // 2. push new context: update new task descriptor
  int tid = GetLogicalThreadIdInBlock();
  omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
  newTaskDescr->CopyForExplicitTask(parentTaskDescr);
  // set new task descriptor as top
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr);
  // 3... nothing to call... is inline
  // 4 & 5 ... done in complete
}

EXTERN void __kmpc_omp_task_complete_if0(kmp_Indent *loc, uint32_t global_tid,
                                         kmp_TaskDescr *newKmpTaskDescr) {
  PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n",
        P64(newKmpTaskDescr));
  // 1. get explicit task descr from kmp task descr
  omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
      (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
          newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
  ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
          "bad assumptions");
  omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
  ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
          "bad assumptions");
  // 2. get parent
  omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr();
  // 3... nothing to call... is inline
  // 4. pop context
  int tid = GetLogicalThreadIdInBlock();
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
                                                             parentTaskDescr);
  // 5. free
  SafeFree(newExplicitTaskDescr, "explicit task descriptor");
}

EXTERN void __kmpc_omp_wait_deps(kmp_Indent *loc, uint32_t global_tid,
                                 int32_t depNum, void *depList,
                                 int32_t noAliasDepNum, void *noAliasDepList) {
  PRINT0(LD_IO, "call to __kmpc_omp_wait_deps(..)\n");
  // nothing to do as all our tasks are executed as final
}

EXTERN void __kmpc_taskgroup(kmp_Indent *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_taskgroup(..)\n");
  // nothing to do as all our tasks are executed as final
}

EXTERN void __kmpc_end_taskgroup(kmp_Indent *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_end_taskgroup(..)\n");
  // nothing to do as all our tasks are executed as final
}

EXTERN int32_t __kmpc_omp_taskyield(kmp_Indent *loc, uint32_t global_tid,
                                    int end_part) {
  PRINT0(LD_IO, "call to __kmpc_taskyield()\n");
  // do nothing: tasks are executed immediately, no yielding allowed
  return 0;
}

EXTERN int32_t __kmpc_omp_taskwait(kmp_Indent *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_taskwait()\n");
  // nothing to do as all our tasks are executed as final
  return 0;
}

EXTERN void __kmpc_taskloop(kmp_Indent *loc, uint32_t global_tid,
                            kmp_TaskDescr *newKmpTaskDescr, int if_val,
                            uint64_t *lb, uint64_t *ub, int64_t st, int nogroup,
                            int32_t sched, uint64_t grainsize, void *task_dup) {

  // skip task entirely if empty iteration space
  if (*lb > *ub)
    return;

  // the compiler has already stored lb and ub in the kmp_TaskDescr structure
  // as we are using a single task to execute the entire loop, we can leave
  // the initial task_t untouched

  __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0);
}
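For orientation, a compiler-generated caller walks these entry points in a fixed order: allocate, fill in private and shared data, then submit (which, in this RTL, runs the task body immediately). A hedged sketch of that call pattern; the outlined function name, the sizes, and the exact kmp_TaskFctPtr signature are illustrative assumptions, not taken from generated code:

// Hypothetical outlined task body; kmp_TaskFctPtr is assumed to take a
// thread id and the task descriptor, matching how task.cu invokes sub(0, task).
static int32_t outlinedTaskBody(int32_t global_tid, kmp_TaskDescr *task) {
  // ... compiler-generated work using the private data and shared table ...
  return 0;
}

static void deviceTaskExample(kmp_Indent *loc, uint32_t gtid) {
  // room for the kmp descriptor plus 64 bytes of privates and 2 shared pointers
  kmp_TaskDescr *task = __kmpc_omp_task_alloc(
      loc, gtid, /*flag=*/0,
      /*sizeOfTaskInclPrivate=*/sizeof(kmp_TaskDescr) + 64,
      /*sizeOfSharedTable=*/2 * sizeof(void *), outlinedTaskBody);
  // ... initialize the private area and the shared pointer table here ...
  // the task body executes inside this call; no deferral happens on device
  __kmpc_omp_task(loc, gtid, task);
}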