[libomptarget] Build cuda plugin without cuda installed locally

[libomptarget] Build cuda plugin without cuda installed locally

Compiles a new file, `plugins/cuda/dynamic_cuda/cuda.cpp`, to an object file that exposes the same symbols that the plugin presently uses from libcuda. The object file contains dlopen of libcuda and cached dlsym calls. Also provides a cuda.h containing the subset that is used.

This lets the cmake file choose between the system cuda and a dlopen shim, with no changes to rtl.cpp.

The corresponding change to amdgpu is postponed until after a refactor of the plugin that will reduce the size of the hsa.h stub required.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D95155
This commit is contained in:
Jon Chesterfield 2021-01-23 00:15:03 +00:00
parent ad25bdcb8e
commit 47e95e87a3
4 changed files with 495 additions and 9 deletions

View File

@ -0,0 +1,277 @@
//===------- dlwrap.h - Convenience wrapper around dlopen/dlsym -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The openmp plugins depend on extern libraries. These can be used via:
// - bitcode file statically linked
// - (relocatable) object file statically linked
// - static library
// - dynamic library, linked at build time
// - dynamic library, loaded at application run time by dlopen
//
// This file factors out most boilerplate for using a dlopened library.
// - Function symbols are generated that are statically linked against
// - The dlopen can be done implicitly when initializing the library
// - dlsym lookups are done once and cached
// - The abstraction is very thin to permit varied uses of the library
//
// Given int foo(char, double, void*);, writing DLWRAP(foo, 3) will expand to
// (roughly):
//   int foo(char x0, double x1, void* x2) {
//     constexpr size_t index = id();
//     void *dlsymResult = *pointer(index);
//     return ((int (*)(char, double, void*))dlsymResult)(x0, x1, x2);
//   }
//
// Multiple calls to DLWRAP(symbol_name, arity) with bespoke
// initialization code that can use the thin abstraction:
// namespace dlwrap {
// static size_t size();
// static const char *symbol(size_t);
// static void **pointer(size_t);
// }
// will compile to an object file that only exposes the symbols that the
// dynamic library would do, with the right function types.
//
//===----------------------------------------------------------------------===//
#ifndef DLWRAP_H_INCLUDED
#define DLWRAP_H_INCLUDED
#include <array>
#include <cstddef>
#include <tuple>
#include <type_traits>
// Where symbol is a function, these expand to some book keeping and an
// implementation of that function
// ARITY is the number of parameters of SYMBOL, written as a single integer
// token, because it is token-pasted to select DLWRAP_INSTANTIATE_<ARITY>.
// Both macros only forward to an _IMPL macro. That extra level means the
// arguments are fully macro-expanded before the _IMPL macros stringify or
// token-paste them, so a SYMBOL that is itself a macro for another name is
// wrapped under the name it expands to.
#define DLWRAP(SYMBOL, ARITY) DLWRAP_IMPL(SYMBOL, ARITY)
#define DLWRAP_INTERNAL(SYMBOL, ARITY) DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY)
// For example, given a prototype:
// int foo(char, double);
//
// DLWRAP(foo, 2) expands to:
//
// namespace dlwrap {
// struct foo_Trait : public dlwrap::trait<decltype(&foo)> {
// using T = dlwrap::trait<decltype(&foo)>;
// static T::FunctionType get() {
// constexpr size_t Index = getIndex();
// void *P = *dlwrap::pointer(Index);
// return reinterpret_cast<T::FunctionType>(P);
// }
// };
// }
// int foo(char x0, double x1) { return dlwrap::foo_Trait::get()(x0, x1); }
//
// DLWRAP_INTERNAL is similar, except the function it expands to is:
// static int dlwrap_foo(char x0, double x1) { ... }
// so that the function pointer call can be wrapped in library-specific code
// DLWRAP_FINALIZE() expands to definitions of:
#define DLWRAP_FINALIZE() DLWRAP_FINALIZE_IMPL()
// Accessors over the table of wrapped symbols. They are only declared here;
// the definitions are emitted by DLWRAP_FINALIZE(), which therefore has to be
// expanded in every translation unit that uses DLWRAP. All three are static,
// so each translation unit gets its own table.
namespace dlwrap {
// Number of symbols registered by DLWRAP / DLWRAP_INTERNAL
static size_t size();
static const char *symbol(size_t); // get symbol name in [0, size())
static void **pointer(size_t); // get pointer to function pointer in [0, size())
} // namespace dlwrap
// Implementation details follow.
namespace dlwrap {
// Extract return / argument types from address of function symbol
// The primary template is declared but not defined, so only plain function
// pointer types (matched by the partial specialization) can be used.
template <typename F> struct trait;
template <typename R, typename... Ts> struct trait<R (*)(Ts...)> {
// Number of parameters of the function type
constexpr static const size_t nargs = sizeof...(Ts);
typedef R ReturnType;
// arg<i>::type is the type of the i'th parameter, counting from zero
template <size_t i> struct arg {
typedef typename std::tuple_element<i, std::tuple<Ts...>>::type type;
};
typedef R (*FunctionType)(Ts...);
};
namespace type {
// Book keeping is by type specialization
// count<S>::N is the number of DLWRAP_INC() expansions at line numbers no
// greater than S. The generic case copies the value of the previous line,
// count<0> ends the recursion at zero, and DLWRAP_INC() specializes the line
// it is expanded on to be one more than the line before it.
template <size_t S> struct count {
static constexpr size_t N = count<S - 1>::N;
};
template <> struct count<0> { static constexpr size_t N = 0; };
// Get a constexpr size_t ID, starts at zero
// __LINE__ is substituted where the macro is expanded, so the result counts
// the DLWRAP_INC() expansions up to and including the current line.
#define DLWRAP_ID() (dlwrap::type::count<__LINE__>::N)
// Increment value returned by DLWRAP_ID
// This defines the explicit specialization count<__LINE__>, so it can be
// expanded at most once on any given line.
#define DLWRAP_INC() \
template <> struct dlwrap::type::count<__LINE__> { \
static constexpr size_t N = 1 + dlwrap::type::count<__LINE__ - 1>::N; \
}
// symbol<N>::call() yields the name of the N'th wrapped symbol. Only the
// specializations emitted by DLWRAP_SYMBOL are defined.
template <size_t N> struct symbol;
// Associate ID with the stringified SYMBOL
#define DLWRAP_SYMBOL(SYMBOL, ID) \
template <> struct dlwrap::type::symbol<ID> { \
static constexpr const char *call() { return #SYMBOL; } \
}
} // namespace type
// Build an array of N names, one symbol<I>::call() for each index I in the
// sequence passed as the (unnamed) argument.
template <size_t N, size_t... Is>
constexpr std::array<const char *, N> static getSymbolArray(
std::index_sequence<Is...>) {
return {{dlwrap::type::symbol<Is>::call()...}};
}
} // namespace dlwrap
// Select the DLWRAP_INSTANTIATE_<ARITY> macro by token pasting.
// SYM_USE is the declared function whose type is inspected and whose
// SYM_USE_Trait provides the function pointer; SYM_DEF is the name of the
// function that gets defined.
#define DLWRAP_INSTANTIATE(SYM_USE, SYM_DEF, ARITY) \
DLWRAP_INSTANTIATE_##ARITY(SYM_USE, SYM_DEF, \
dlwrap::trait<decltype(&SYM_USE)>)
// Define the three dlwrap accessors. DLWRAP_ID() is evaluated on the line
// where this is expanded, so it has to come after every DLWRAP and
// DLWRAP_INTERNAL in the translation unit for all of them to be counted.
// - size() returns the number of wrapped symbols
// - symbol(i) returns the i'th name from a constexpr array of the names
// - pointer(i) returns the address of the i'th slot of a function-local
//   static array of void*; nothing in this header stores into those slots,
//   the code that resolves the symbols is expected to do so
// Neither symbol() nor pointer() checks that i is in range.
#define DLWRAP_FINALIZE_IMPL() \
static size_t dlwrap::size() { return DLWRAP_ID(); } \
static const char *dlwrap::symbol(size_t i) { \
static constexpr const std::array<const char *, DLWRAP_ID()> \
dlwrap_symbols = getSymbolArray<DLWRAP_ID()>( \
std::make_index_sequence<DLWRAP_ID()>()); \
return dlwrap_symbols[i]; \
} \
static void **dlwrap::pointer(size_t i) { \
static std::array<void *, DLWRAP_ID()> dlwrap_pointers; \
return &dlwrap_pointers.data()[i]; \
}
// Book keeping shared by DLWRAP and DLWRAP_INTERNAL for one wrapped function:
// - DLWRAP_INC() bumps the symbol count, giving this symbol the zero based
//   index DLWRAP_ID() - 1
// - DLWRAP_SYMBOL records the symbol name under that index
// - dlwrap::SYMBOL_Trait::get() reads the pointer slot for that index and
//   casts it to the function pointer type of SYMBOL
// ARITY is checked against the number of parameters in the declaration of
// SYMBOL, so a wrong count is reported directly instead of as a template
// error from inside DLWRAP_INSTANTIATE_<ARITY>.
#define DLWRAP_COMMON(SYMBOL, ARITY)                                           \
  DLWRAP_INC();                                                                \
  DLWRAP_SYMBOL(SYMBOL, DLWRAP_ID() - 1);                                      \
  namespace dlwrap {                                                           \
  struct SYMBOL##_Trait : public dlwrap::trait<decltype(&SYMBOL)> {            \
    using T = dlwrap::trait<decltype(&SYMBOL)>;                                \
    static_assert((ARITY) == T::nargs,                                         \
                  "DLWRAP arity does not match the declaration of " #SYMBOL);  \
    static T::FunctionType get() {                                             \
      constexpr size_t Index = DLWRAP_ID() - 1;                                \
      void *P = *dlwrap::pointer(Index);                                       \
      return reinterpret_cast<T::FunctionType>(P);                             \
    }                                                                          \
  };                                                                           \
  }
// Emit the book keeping for SYMBOL followed by a definition of SYMBOL itself
// that forwards to the looked-up function pointer.
#define DLWRAP_IMPL(SYMBOL, ARITY) \
DLWRAP_COMMON(SYMBOL, ARITY); \
DLWRAP_INSTANTIATE(SYMBOL, SYMBOL, ARITY)
// As DLWRAP_IMPL, except the forwarding function is static and is named
// dlwrap_<SYMBOL>, leaving SYMBOL itself to be defined by the user.
#define DLWRAP_INTERNAL_IMPL(SYMBOL, ARITY) \
DLWRAP_COMMON(SYMBOL, ARITY); \
static DLWRAP_INSTANTIATE(SYMBOL, dlwrap_##SYMBOL, ARITY)
// DLWRAP_INSTANTIATE_<N>(SYM_USE, SYM_DEF, T) defines a function named SYM_DEF
// taking N parameters x0 .. x<N-1>. The return type and the parameter types
// are read from T, which DLWRAP_INSTANTIATE passes as the dlwrap::trait of
// SYM_USE. The body forwards every argument, by value and in order, to the
// function pointer returned by dlwrap::SYM_USE_Trait::get().
// One macro is written out per arity; 0 through 11 are provided, so wrapping
// a function with more parameters needs a further macro added here.
#define DLWRAP_INSTANTIATE_0(SYM_USE, SYM_DEF, T) \
T::ReturnType SYM_DEF() { return dlwrap::SYM_USE##_Trait::get()(); }
#define DLWRAP_INSTANTIATE_1(SYM_USE, SYM_DEF, T) \
T::ReturnType SYM_DEF(typename T::template arg<0>::type x0) { \
return dlwrap::SYM_USE##_Trait::get()(x0); \
}
#define DLWRAP_INSTANTIATE_2(SYM_USE, SYM_DEF, T) \
T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
typename T::template arg<1>::type x1) { \
return dlwrap::SYM_USE##_Trait::get()(x0, x1); \
}
#define DLWRAP_INSTANTIATE_3(SYM_USE, SYM_DEF, T) \
T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
typename T::template arg<1>::type x1, \
typename T::template arg<2>::type x2) { \
return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2); \
}
#define DLWRAP_INSTANTIATE_4(SYM_USE, SYM_DEF, T) \
T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
typename T::template arg<1>::type x1, \
typename T::template arg<2>::type x2, \
typename T::template arg<3>::type x3) { \
return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3); \
}
#define DLWRAP_INSTANTIATE_5(SYM_USE, SYM_DEF, T) \
T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
typename T::template arg<1>::type x1, \
typename T::template arg<2>::type x2, \
typename T::template arg<3>::type x3, \
typename T::template arg<4>::type x4) { \
return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4); \
}
#define DLWRAP_INSTANTIATE_6(SYM_USE, SYM_DEF, T) \
T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
typename T::template arg<1>::type x1, \
typename T::template arg<2>::type x2, \
typename T::template arg<3>::type x3, \
typename T::template arg<4>::type x4, \
typename T::template arg<5>::type x5) { \
return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5); \
}
#define DLWRAP_INSTANTIATE_7(SYM_USE, SYM_DEF, T) \
T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
typename T::template arg<1>::type x1, \
typename T::template arg<2>::type x2, \
typename T::template arg<3>::type x3, \
typename T::template arg<4>::type x4, \
typename T::template arg<5>::type x5, \
typename T::template arg<6>::type x6) { \
return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6); \
}
#define DLWRAP_INSTANTIATE_8(SYM_USE, SYM_DEF, T) \
T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
typename T::template arg<1>::type x1, \
typename T::template arg<2>::type x2, \
typename T::template arg<3>::type x3, \
typename T::template arg<4>::type x4, \
typename T::template arg<5>::type x5, \
typename T::template arg<6>::type x6, \
typename T::template arg<7>::type x7) { \
return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7); \
}
#define DLWRAP_INSTANTIATE_9(SYM_USE, SYM_DEF, T) \
T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
typename T::template arg<1>::type x1, \
typename T::template arg<2>::type x2, \
typename T::template arg<3>::type x3, \
typename T::template arg<4>::type x4, \
typename T::template arg<5>::type x5, \
typename T::template arg<6>::type x6, \
typename T::template arg<7>::type x7, \
typename T::template arg<8>::type x8) { \
return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8); \
}
#define DLWRAP_INSTANTIATE_10(SYM_USE, SYM_DEF, T) \
T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
typename T::template arg<1>::type x1, \
typename T::template arg<2>::type x2, \
typename T::template arg<3>::type x3, \
typename T::template arg<4>::type x4, \
typename T::template arg<5>::type x5, \
typename T::template arg<6>::type x6, \
typename T::template arg<7>::type x7, \
typename T::template arg<8>::type x8, \
typename T::template arg<9>::type x9) { \
return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \
x9); \
}
#define DLWRAP_INSTANTIATE_11(SYM_USE, SYM_DEF, T) \
T::ReturnType SYM_DEF(typename T::template arg<0>::type x0, \
typename T::template arg<1>::type x1, \
typename T::template arg<2>::type x2, \
typename T::template arg<3>::type x3, \
typename T::template arg<4>::type x4, \
typename T::template arg<5>::type x5, \
typename T::template arg<6>::type x6, \
typename T::template arg<7>::type x7, \
typename T::template arg<8>::type x8, \
typename T::template arg<9>::type x9, \
typename T::template arg<10>::type x10) { \
return dlwrap::SYM_USE##_Trait::get()(x0, x1, x2, x3, x4, x5, x6, x7, x8, \
x9, x10); \
}
#endif

View File

@ -15,12 +15,6 @@ if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE
elseif (NOT LIBOMPTARGET_DEP_LIBELF_FOUND)
libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.")
return()
elseif(NOT LIBOMPTARGET_DEP_CUDA_FOUND)
libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.")
return()
elseif(NOT LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND)
libomptarget_say("Not building CUDA offloading plugin: CUDA Driver API not found in system.")
return()
endif()
libomptarget_say("Building CUDA offloading plugin.")
@ -28,10 +22,22 @@ libomptarget_say("Building CUDA offloading plugin.")
# Define the suffix for the runtime messaging dumps.
add_definitions(-DTARGET_NAME=CUDA)
include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS})
add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
option(LIBOMPTARGET_DLOPEN_LIBCUDA "Build with dlopened libcuda" OFF)
if (LIBOMPTARGET_DEP_CUDA_FOUND AND LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND
AND NOT LIBOMPTARGET_DLOPEN_LIBCUDA)
libomptarget_say("Building CUDA plugin linked against libcuda")
include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS})
add_library(omptarget.rtl.cuda SHARED src/rtl.cpp)
set (LIBOMPTARGET_DEP_LIBRARIES ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES})
else()
libomptarget_say("Building CUDA plugin for dlopened libcuda")
include_directories(dynamic_cuda)
add_library(omptarget.rtl.cuda SHARED src/rtl.cpp dynamic_cuda/cuda.cpp)
set (LIBOMPTARGET_DEP_LIBRARIES ${CMAKE_DL_LIBS})
endif()
# Install plugin under the lib destination folder.
install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}")
@ -39,7 +45,7 @@ install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}
target_link_libraries(omptarget.rtl.cuda
elf_common
MemoryManager
${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES}
${LIBOMPTARGET_DEP_LIBRARIES}
${LIBOMPTARGET_DEP_LIBELF_LIBRARIES}
"-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports"
"-Wl,-z,defs")

View File

@ -0,0 +1,99 @@
//===--- cuda/dynamic_cuda/cuda.cpp ------------------------------ C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implement subset of cuda api by calling into cuda library via dlopen
// Does the dlopen/dlsym calls as part of the call to cuInit
//
//===----------------------------------------------------------------------===//
#include "cuda.h"
#include "Debug.h"
#include "dlwrap.h"
#include <dlfcn.h>
// One wrapper per driver entry point. The second argument is the number of
// parameters of the function and selects the matching DLWRAP_INSTANTIATE_<N>.
// Each DLWRAP sits on its own line because the symbol counter in dlwrap.h is
// keyed on __LINE__.
// cuInit uses DLWRAP_INTERNAL, which generates a static dlwrap_cuInit instead
// of cuInit, so that this file can define cuInit itself and load the library
// before forwarding.
DLWRAP_INTERNAL(cuInit, 1);
DLWRAP(cuCtxGetDevice, 1);
DLWRAP(cuDeviceGet, 2);
DLWRAP(cuDeviceGetAttribute, 3);
DLWRAP(cuDeviceGetCount, 1);
DLWRAP(cuFuncGetAttribute, 3);
DLWRAP(cuGetErrorString, 2);
DLWRAP(cuLaunchKernel, 11);
DLWRAP(cuMemAlloc, 2);
DLWRAP(cuMemcpyDtoDAsync, 4);
DLWRAP(cuMemcpyDtoH, 3);
DLWRAP(cuMemcpyDtoHAsync, 4);
DLWRAP(cuMemcpyHtoD, 3);
DLWRAP(cuMemcpyHtoDAsync, 4);
DLWRAP(cuMemFree, 1);
DLWRAP(cuModuleGetFunction, 3);
DLWRAP(cuModuleGetGlobal, 4);
DLWRAP(cuModuleUnload, 1);
DLWRAP(cuStreamCreate, 2);
DLWRAP(cuStreamDestroy, 1);
DLWRAP(cuStreamSynchronize, 1);
DLWRAP(cuCtxSetCurrent, 1);
DLWRAP(cuDevicePrimaryCtxRelease, 1);
DLWRAP(cuDevicePrimaryCtxGetState, 3);
DLWRAP(cuDevicePrimaryCtxSetFlags, 2);
DLWRAP(cuDevicePrimaryCtxRetain, 2);
DLWRAP(cuModuleLoadDataEx, 5);
DLWRAP(cuDeviceCanAccessPeer, 3);
DLWRAP(cuCtxEnablePeerAccess, 2);
DLWRAP(cuMemcpyPeerAsync, 6);
// Defines dlwrap::size / symbol / pointer. This has to follow the last DLWRAP
// so that every symbol listed here is counted.
DLWRAP_FINALIZE();
// Name handed to dlopen. A build can predefine DYNAMIC_CUDA_PATH to use a
// different name or path; otherwise the unversioned library name is used.
#ifndef DYNAMIC_CUDA_PATH
#define DYNAMIC_CUDA_PATH "libcuda.so"
#endif
// Prefix configuration for the DP() debug messages in this file.
// NOTE(review): GETNAME and DP are not defined in this file; presumably they
// come from Debug.h - confirm.
#define TARGET_NAME CUDA
#define DEBUG_PREFIX "Target " GETNAME(TARGET_NAME) " RTL"
// Open the CUDA driver library and resolve every symbol registered with
// DLWRAP, storing each address in its dlwrap pointer slot.
// Returns true only when the library was opened and all symbols were found.
// Slots filled before a failed lookup keep the addresses already stored.
// NOTE(review): the handle returned by dlopen is never closed, including on
// the missing-symbol path - confirm that is intended.
static bool checkForCUDA() {
  const char *LibPath = DYNAMIC_CUDA_PATH;

  void *Handle = dlopen(LibPath, RTLD_NOW);
  if (Handle == nullptr) {
    DP("Unable to load library '%s': %s!\n", LibPath, dlerror());
    return false;
  }

  const size_t NumSymbols = dlwrap::size();
  size_t Idx = 0;
  while (Idx < NumSymbols) {
    const char *Name = dlwrap::symbol(Idx);
    void *Addr = dlsym(Handle, Name);
    if (!Addr) {
      DP("Unable to find '%s' in '%s'!\n", Name, LibPath);
      return false;
    }
    *dlwrap::pointer(Idx) = Addr;
    ++Idx;
  }

  return true;
}
// Public cuInit: loads the driver library and resolves the wrapped symbols,
// then forwards X to the real cuInit through dlwrap_cuInit.
// Returns CUDA_ERROR_INVALID_VALUE without calling into the driver when the
// library or any of its symbols could not be found.
CUresult cuInit(unsigned X) {
  // Note: Called exactly once from cuda rtl.cpp in a global constructor so
  // does not need to handle being called repeatedly or concurrently
  const bool DriverLoaded = checkForCUDA();
  return DriverLoaded ? dlwrap_cuInit(X) : CUDA_ERROR_INVALID_VALUE;
}

View File

@ -0,0 +1,104 @@
//===--- cuda/dynamic_cuda/cuda.h --------------------------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// The parts of the cuda api that are presently in use by the openmp cuda plugin
//
//===----------------------------------------------------------------------===//
#ifndef DYNAMIC_CUDA_CUDA_H_INCLUDED
#define DYNAMIC_CUDA_CUDA_H_INCLUDED
#include <cstddef>
#include <cstdint>
// Stand-ins for the driver API types used by the plugin. Only the enumerators
// that the plugin refers to are listed.
// NOTE(review): the underlying types and enumerator values are presumably
// chosen to match the real CUDA driver header, since calls are forwarded to
// the real library - confirm against the CUDA documentation when adding more.
typedef int CUdevice;
typedef uintptr_t CUdeviceptr;
// Opaque handles: the structs are only forward declared here
typedef struct CUmod_st *CUmodule;
typedef struct CUctx_st *CUcontext;
typedef struct CUfunc_st *CUfunction;
typedef struct CUstream_st *CUstream;
// Result code returned by every function declared in this header
typedef enum cudaError_enum {
CUDA_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE = 1,
} CUresult;
typedef enum CUstream_flags_enum {
CU_STREAM_DEFAULT = 0x0,
CU_STREAM_NON_BLOCKING = 0x1,
} CUstream_flags;
typedef enum CUdevice_attribute_enum {
CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,
CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,
} CUdevice_attribute;
typedef enum CUfunction_attribute_enum {
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
} CUfunction_attribute;
typedef enum CUctx_flags_enum {
CU_CTX_SCHED_BLOCKING_SYNC = 0x04,
CU_CTX_SCHED_MASK = 0x07,
} CUctx_flags;
// Redirect the unsuffixed names to their _v2 spellings. Every later use of
// one of these names, including the prototypes in this header, therefore
// refers to the _v2 identifier.
// NOTE(review): presumably this mirrors the renaming done by the real CUDA
// header so that the _v2 symbols are the ones looked up in libcuda - confirm.
#define cuMemFree cuMemFree_v2
#define cuMemAlloc cuMemAlloc_v2
#define cuMemcpyDtoH cuMemcpyDtoH_v2
#define cuMemcpyHtoD cuMemcpyHtoD_v2
#define cuStreamDestroy cuStreamDestroy_v2
#define cuModuleGetGlobal cuModuleGetGlobal_v2
#define cuMemcpyDtoHAsync cuMemcpyDtoHAsync_v2
#define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2
#define cuMemcpyHtoDAsync cuMemcpyHtoDAsync_v2
#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2
#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2
// Prototypes for the driver entry points used by the plugin. Names covered by
// a #define earlier in this header are declared under their _v2 spelling.
// The declarations are not wrapped in extern "C", so they have C++ linkage.
// Parameters are unnamed; see the CUDA driver API documentation for their
// meaning.
CUresult cuCtxGetDevice(CUdevice *);
CUresult cuDeviceGet(CUdevice *, int);
CUresult cuDeviceGetAttribute(int *, CUdevice_attribute, CUdevice);
CUresult cuDeviceGetCount(int *);
CUresult cuFuncGetAttribute(int *, CUfunction_attribute, CUfunction);
CUresult cuGetErrorString(CUresult, const char **);
CUresult cuInit(unsigned);
CUresult cuLaunchKernel(CUfunction, unsigned, unsigned, unsigned, unsigned,
unsigned, unsigned, unsigned, CUstream, void **,
void **);
CUresult cuMemAlloc(CUdeviceptr *, size_t);
CUresult cuMemcpyDtoDAsync(CUdeviceptr, CUdeviceptr, size_t, CUstream);
CUresult cuMemcpyDtoH(void *, CUdeviceptr, size_t);
CUresult cuMemcpyDtoHAsync(void *, CUdeviceptr, size_t, CUstream);
CUresult cuMemcpyHtoD(CUdeviceptr, const void *, size_t);
CUresult cuMemcpyHtoDAsync(CUdeviceptr, const void *, size_t, CUstream);
CUresult cuMemFree(CUdeviceptr);
CUresult cuModuleGetFunction(CUfunction *, CUmodule, const char *);
CUresult cuModuleGetGlobal(CUdeviceptr *, size_t *, CUmodule, const char *);
CUresult cuModuleUnload(CUmodule);
CUresult cuStreamCreate(CUstream *, unsigned);
CUresult cuStreamDestroy(CUstream);
CUresult cuStreamSynchronize(CUstream);
CUresult cuCtxSetCurrent(CUcontext);
CUresult cuDevicePrimaryCtxRelease(CUdevice);
CUresult cuDevicePrimaryCtxGetState(CUdevice, unsigned *, int *);
CUresult cuDevicePrimaryCtxSetFlags(CUdevice, unsigned);
CUresult cuDevicePrimaryCtxRetain(CUcontext *, CUdevice);
CUresult cuModuleLoadDataEx(CUmodule *, const void *, unsigned, void *,
void **);
CUresult cuDeviceCanAccessPeer(int *, CUdevice, CUdevice);
CUresult cuCtxEnablePeerAccess(CUcontext, unsigned);
CUresult cuMemcpyPeerAsync(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext,
size_t, CUstream);
#endif