mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2024-12-01 09:41:08 +00:00
[OpenMP] Changes in the plugin interface
This patch changes the plugin interface so that: 1) future plugins can take advantage of systems with shared CPU/device storage; 2) instead of using base addresses, target regions are launched by providing target addresses and base offsets explicitly. Differential revision: https://reviews.llvm.org/D33028 llvm-svn: 302663
This commit is contained in:
parent
dc1ed12015
commit
1546d31924
@ -19,7 +19,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "omptarget.h"
|
||||
#include "omptargetplugin.h"
|
||||
|
||||
#ifndef TARGET_NAME
|
||||
#define TARGET_NAME CUDA
|
||||
@ -473,7 +473,7 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
|
||||
return DeviceInfo.getOffloadEntriesTable(device_id);
|
||||
}
|
||||
|
||||
void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size) {
|
||||
void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
|
||||
if (size == 0) {
|
||||
return NULL;
|
||||
}
|
||||
@ -559,8 +559,8 @@ int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
|
||||
}
|
||||
|
||||
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
|
||||
void **tgt_args, int32_t arg_num, int32_t team_num, int32_t thread_limit,
|
||||
uint64_t loop_tripcount) {
|
||||
void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
|
||||
int32_t thread_limit, uint64_t loop_tripcount) {
|
||||
// Set the context we are using.
|
||||
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]);
|
||||
if (err != CUDA_SUCCESS) {
|
||||
@ -571,9 +571,12 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
|
||||
|
||||
// All args are references.
|
||||
std::vector<void *> args(arg_num);
|
||||
std::vector<void *> ptrs(arg_num);
|
||||
|
||||
for (int32_t i = 0; i < arg_num; ++i)
|
||||
args[i] = &tgt_args[i];
|
||||
for (int32_t i = 0; i < arg_num; ++i) {
|
||||
ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
|
||||
args[i] = &ptrs[i];
|
||||
}
|
||||
|
||||
KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr;
|
||||
|
||||
@ -678,12 +681,12 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
|
||||
}
|
||||
|
||||
int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
|
||||
void **tgt_args, int32_t arg_num) {
|
||||
void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
|
||||
// use one team and the default number of threads.
|
||||
const int32_t team_num = 1;
|
||||
const int32_t thread_limit = 0;
|
||||
return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
|
||||
arg_num, team_num, thread_limit, 0);
|
||||
tgt_offsets, arg_num, team_num, thread_limit, 0);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@ -22,7 +22,7 @@
|
||||
#include <list>
|
||||
#include <vector>
|
||||
|
||||
#include "omptarget.h"
|
||||
#include "omptargetplugin.h"
|
||||
|
||||
#ifndef TARGET_NAME
|
||||
#define TARGET_NAME Generic ELF - 64bit
|
||||
@ -251,7 +251,7 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t device_id,
|
||||
return DeviceInfo.getOffloadEntriesTable(device_id);
|
||||
}
|
||||
|
||||
void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size) {
|
||||
void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) {
|
||||
void *ptr = malloc(size);
|
||||
return ptr;
|
||||
}
|
||||
@ -274,8 +274,8 @@ int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
|
||||
}
|
||||
|
||||
int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
|
||||
void **tgt_args, int32_t arg_num, int32_t team_num, int32_t thread_limit,
|
||||
uint64_t loop_tripcount /*not used*/) {
|
||||
void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
|
||||
int32_t thread_limit, uint64_t loop_tripcount /*not used*/) {
|
||||
// ignore team num and thread limit.
|
||||
|
||||
// Use libffi to launch execution.
|
||||
@ -284,9 +284,12 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
|
||||
// All args are references.
|
||||
std::vector<ffi_type *> args_types(arg_num, &ffi_type_pointer);
|
||||
std::vector<void *> args(arg_num);
|
||||
std::vector<void *> ptrs(arg_num);
|
||||
|
||||
for (int32_t i = 0; i < arg_num; ++i)
|
||||
args[i] = &tgt_args[i];
|
||||
for (int32_t i = 0; i < arg_num; ++i) {
|
||||
ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]);
|
||||
args[i] = &ptrs[i];
|
||||
}
|
||||
|
||||
ffi_status status = ffi_prep_cif(&cif, FFI_DEFAULT_ABI, arg_num,
|
||||
&ffi_type_void, &args_types[0]);
|
||||
@ -303,10 +306,10 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
|
||||
}
|
||||
|
||||
int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
|
||||
void **tgt_args, int32_t arg_num) {
|
||||
void **tgt_args, ptrdiff_t *tgt_offsets, int32_t arg_num) {
|
||||
// use one team and one thread.
|
||||
return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args,
|
||||
arg_num, 1, 1, 0);
|
||||
tgt_offsets, arg_num, 1, 1, 0);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@ -162,10 +162,11 @@ struct DeviceTy {
|
||||
int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size);
|
||||
int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
|
||||
|
||||
int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, int32_t TgtVarsSize);
|
||||
int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr,
|
||||
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize);
|
||||
int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
|
||||
int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit,
|
||||
uint64_t LoopTripCount);
|
||||
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams,
|
||||
int32_t ThreadLimit, uint64_t LoopTripCount);
|
||||
|
||||
private:
|
||||
// Call to RTL
|
||||
@ -181,13 +182,14 @@ struct RTLInfoTy {
|
||||
typedef int32_t(number_of_devices_ty)();
|
||||
typedef int32_t(init_device_ty)(int32_t);
|
||||
typedef __tgt_target_table *(load_binary_ty)(int32_t, void *);
|
||||
typedef void *(data_alloc_ty)(int32_t, int64_t);
|
||||
typedef void *(data_alloc_ty)(int32_t, int64_t, void *);
|
||||
typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t);
|
||||
typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t);
|
||||
typedef int32_t(data_delete_ty)(int32_t, void *);
|
||||
typedef int32_t(run_region_ty)(int32_t, void *, void **, int32_t);
|
||||
typedef int32_t(run_team_region_ty)(int32_t, void *, void **, int32_t,
|
||||
int32_t, int32_t, uint64_t);
|
||||
typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *,
|
||||
int32_t);
|
||||
typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *,
|
||||
int32_t, int32_t, int32_t, uint64_t);
|
||||
|
||||
int32_t Idx; // RTL index, index is the number of devices
|
||||
// of other RTLs that were registered before,
|
||||
@ -471,7 +473,7 @@ EXTERN void *omp_target_alloc(size_t size, int device_num) {
|
||||
}
|
||||
|
||||
DeviceTy &Device = Devices[device_num];
|
||||
rc = Device.RTL->data_alloc(Device.RTLDeviceID, size);
|
||||
rc = Device.RTL->data_alloc(Device.RTLDeviceID, size, NULL);
|
||||
DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc));
|
||||
return rc;
|
||||
}
|
||||
@ -861,7 +863,7 @@ void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase,
|
||||
} else if (Size) {
|
||||
// If it is not contained and Size > 0 we should create a new entry for it.
|
||||
IsNew = true;
|
||||
uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size);
|
||||
uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin);
|
||||
DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", "
|
||||
"HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase),
|
||||
DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp));
|
||||
@ -995,16 +997,17 @@ int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin,
|
||||
|
||||
// Run region on device
|
||||
int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr,
|
||||
int32_t TgtVarsSize) {
|
||||
return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtVarsSize);
|
||||
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize) {
|
||||
return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
|
||||
TgtVarsSize);
|
||||
}
|
||||
|
||||
// Run team region on device.
|
||||
int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
|
||||
int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit,
|
||||
uint64_t LoopTripCount) {
|
||||
return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtVarsSize,
|
||||
NumTeams, ThreadLimit, LoopTripCount);
|
||||
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams,
|
||||
int32_t ThreadLimit, uint64_t LoopTripCount) {
|
||||
return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
|
||||
TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
@ -2108,6 +2111,7 @@ static int target(int32_t device_id, void *host_ptr, int32_t arg_num,
|
||||
}
|
||||
|
||||
std::vector<void *> tgt_args;
|
||||
std::vector<ptrdiff_t> tgt_offsets;
|
||||
|
||||
// List of (first-)private arrays allocated for this target region
|
||||
std::vector<void *> fpArrays;
|
||||
@ -2119,16 +2123,18 @@ static int target(int32_t device_id, void *host_ptr, int32_t arg_num,
|
||||
}
|
||||
void *HstPtrBegin = args[i];
|
||||
void *HstPtrBase = args_base[i];
|
||||
void *TgtPtrBase;
|
||||
void *TgtPtrBegin;
|
||||
ptrdiff_t TgtBaseOffset;
|
||||
bool IsLast; // unused.
|
||||
if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) {
|
||||
DP("Forwarding first-private value " DPxMOD " to the target construct\n",
|
||||
DPxPTR(HstPtrBase));
|
||||
TgtPtrBase = HstPtrBase;
|
||||
TgtPtrBegin = HstPtrBase;
|
||||
TgtBaseOffset = 0;
|
||||
} else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) {
|
||||
// Allocate memory for (first-)private array
|
||||
void *TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID,
|
||||
arg_sizes[i]);
|
||||
TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID,
|
||||
arg_sizes[i], HstPtrBegin);
|
||||
if (!TgtPtrBegin) {
|
||||
DP ("Data allocation for %sprivate array " DPxMOD " failed\n",
|
||||
(arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
|
||||
@ -2137,8 +2143,8 @@ static int target(int32_t device_id, void *host_ptr, int32_t arg_num,
|
||||
break;
|
||||
} else {
|
||||
fpArrays.push_back(TgtPtrBegin);
|
||||
uint64_t PtrDelta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
|
||||
TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - PtrDelta);
|
||||
TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
|
||||
void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
|
||||
DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for "
|
||||
"%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n",
|
||||
arg_sizes[i], DPxPTR(TgtPtrBegin),
|
||||
@ -2155,24 +2161,29 @@ static int target(int32_t device_id, void *host_ptr, int32_t arg_num,
|
||||
}
|
||||
}
|
||||
} else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
|
||||
void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *),
|
||||
IsLast, false);
|
||||
TgtPtrBase = TgtPtrBegin; // no offset for ptrs.
|
||||
TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast,
|
||||
false);
|
||||
TgtBaseOffset = 0; // no offset for ptrs.
|
||||
DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to "
|
||||
"object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase),
|
||||
DPxPTR(HstPtrBase));
|
||||
} else {
|
||||
void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i],
|
||||
IsLast, false);
|
||||
uint64_t PtrDelta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
|
||||
TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - PtrDelta);
|
||||
TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast,
|
||||
false);
|
||||
TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
|
||||
void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
|
||||
DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
|
||||
DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
|
||||
}
|
||||
tgt_args.push_back(TgtPtrBase);
|
||||
tgt_args.push_back(TgtPtrBegin);
|
||||
tgt_offsets.push_back(TgtBaseOffset);
|
||||
}
|
||||
// Push omp handle.
|
||||
tgt_args.push_back((void *)0);
|
||||
tgt_offsets.push_back(0);
|
||||
|
||||
assert(tgt_args.size() == tgt_offsets.size() &&
|
||||
"Size mismatch in arguments and offsets");
|
||||
|
||||
// Pop loop trip count
|
||||
uint64_t ltc = Device.loopTripCnt;
|
||||
@ -2185,10 +2196,11 @@ static int target(int32_t device_id, void *host_ptr, int32_t arg_num,
|
||||
DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index);
|
||||
if (IsTeamConstruct) {
|
||||
rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr,
|
||||
&tgt_args[0], tgt_args.size(), team_num, thread_limit, ltc);
|
||||
&tgt_args[0], &tgt_offsets[0], tgt_args.size(), team_num,
|
||||
thread_limit, ltc);
|
||||
} else {
|
||||
rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr,
|
||||
&tgt_args[0], tgt_args.size());
|
||||
&tgt_args[0], &tgt_offsets[0], tgt_args.size());
|
||||
}
|
||||
} else {
|
||||
DP("Errors occurred while obtaining target arguments, skipping kernel "
|
||||
|
@ -16,6 +16,7 @@
|
||||
#define _OMPTARGET_H_
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#define OFFLOAD_SUCCESS (0)
|
||||
#define OFFLOAD_FAIL (~0)
|
||||
|
92
openmp/libomptarget/src/omptargetplugin.h
Normal file
92
openmp/libomptarget/src/omptargetplugin.h
Normal file
@ -0,0 +1,92 @@
|
||||
//===-- omptargetplugin.h - Target dependent OpenMP Plugin API --*- C++ -*-===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is dual licensed under the MIT and the University of Illinois Open
|
||||
// Source Licenses. See LICENSE.txt for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file defines an interface between target independent OpenMP offload
|
||||
// runtime library libomptarget and target dependent plugin.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef _OMPTARGETPLUGIN_H_
|
||||
#define _OMPTARGETPLUGIN_H_
|
||||
|
||||
#include <omptarget.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Return the number of available devices of the type supported by the
|
||||
// target RTL.
|
||||
int32_t __tgt_rtl_number_of_devices(void);
|
||||
|
||||
// Return an integer different from zero if the provided device image can be
|
||||
// supported by the runtime. The functionality is similar to comparing the
|
||||
// result of __tgt_rtl_load_binary to NULL. However, this is meant to be a
|
||||
// lightweight query to determine if the RTL is suitable for an image without
|
||||
// having to load the library, which can be expensive.
|
||||
int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image);
|
||||
|
||||
// Initialize the specified device. In case of success return 0; otherwise
|
||||
// return an error code.
|
||||
int32_t __tgt_rtl_init_device(int32_t ID);
|
||||
|
||||
// Pass an executable image section described by image to the specified
|
||||
// device and prepare an address table of target entities. In case of error,
|
||||
// return NULL. Otherwise, return a pointer to the built address table.
|
||||
// Individual entries in the table may also be NULL, when the corresponding
|
||||
// offload region is not supported on the target device.
|
||||
__tgt_target_table *__tgt_rtl_load_binary(int32_t ID,
|
||||
__tgt_device_image *Image);
|
||||
|
||||
// Allocate data on the particular target device, of the specified size.
|
||||
// HostPtr is the address of the host data the allocated target data
|
||||
// will be associated with (HostPtr may be NULL if it is not known at
|
||||
// allocation time, like for example it would be for target data that
|
||||
// is allocated by omp_target_alloc() API). Return address of the
|
||||
// allocated data on the target that will be used by libomptarget.so to
|
||||
// initialize the target data mapping structures. These addresses are
|
||||
// used to generate a table of target variables to pass to
|
||||
// __tgt_rtl_run_target_region(). The __tgt_rtl_data_alloc() returns NULL in
|
||||
// case an error occurred on the target device.
|
||||
void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr);
|
||||
|
||||
// Pass the data content to the target device using the target address.
|
||||
// In case of success, return zero. Otherwise, return an error code.
|
||||
int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr,
|
||||
int64_t Size);
|
||||
|
||||
// Retrieve the data content from the target device using its address.
|
||||
// In case of success, return zero. Otherwise, return an error code.
|
||||
int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr,
|
||||
int64_t Size);
|
||||
|
||||
// De-allocate the data referenced by target ptr on the device. In case of
|
||||
// success, return zero. Otherwise, return an error code.
|
||||
int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr);
|
||||
|
||||
// Transfer control to the offloaded entry Entry on the target device.
|
||||
// Args and Offsets are arrays of NumArgs size of target addresses and
|
||||
// offsets. An offset should be added to the target address before passing it
|
||||
// to the outlined function on device side. In case of success, return zero.
|
||||
// Otherwise, return an error code.
|
||||
int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args,
|
||||
ptrdiff_t *Offsets, int32_t NumArgs);
|
||||
|
||||
// Similar to __tgt_rtl_run_target_region, but additionally specify the
|
||||
// number of teams to be created and a number of threads in each team.
|
||||
int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args,
|
||||
ptrdiff_t *Offsets, int32_t NumArgs,
|
||||
int32_t NumTeams, int32_t ThreadLimit,
|
||||
uint64_t loop_tripcount);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // _OMPTARGETPLUGIN_H_
|
Loading…
Reference in New Issue
Block a user