CUDAToolkit: No targets now depend on the CUDA runtime

Shared/static linkage is not required to be consistent between your
CUDA toolkit libraries (e.g. curand, nppc) and your CUDA runtime library.
It is entirely allowable to use a static nppc and a shared runtime.
This commit is contained in:
Robert Maynard 2020-01-03 16:43:26 -05:00
parent 907bb7df57
commit 6e474364d1
16 changed files with 480 additions and 5 deletions

View File

@ -122,7 +122,6 @@ CUDA Runtime Library
The CUDA Runtime library (cudart) is what most applications will typically
need to link against to make any calls such as `cudaMalloc` and `cudaFree`.
It is an explicit dependency of almost every library.
Targets Created:
@ -729,10 +728,8 @@ if(CUDAToolkit_FOUND)
# For each CUDA math library, create imported targets for both the shared
# and the static flavor; add_cuda_link_dependency ties each flavor to the
# matching CUDA runtime (cudart / cudart_static).
foreach (cuda_lib cublas cufft cufftw curand cusolver cusparse nvgraph nvjpeg)
find_and_add_cuda_import_lib(${cuda_lib})
add_cuda_link_dependency(${cuda_lib} cudart)
find_and_add_cuda_import_lib(${cuda_lib}_static)
add_cuda_link_dependency(${cuda_lib}_static cudart_static)
endforeach()
# cuSOLVER depends on cuBLAS, and cuSPARSE
@ -772,8 +769,6 @@ if(CUDAToolkit_FOUND)
endif()
# nvToolsExt: extra arguments look like alternate library names to probe
# (64-bit Windows builds ship nvToolsExt64) -- TODO confirm against
# find_and_add_cuda_import_lib's signature.
find_and_add_cuda_import_lib(nvToolsExt nvToolsExt nvToolsExt64)
add_cuda_link_dependency(nvToolsExt cudart)
# OpenCL and culibos imported targets; no runtime dependency added here.
find_and_add_cuda_import_lib(OpenCL)
find_and_add_cuda_import_lib(culibos)

View File

@ -13,4 +13,12 @@ ADD_TEST_MACRO(Cuda.Toolkit Toolkit)
ADD_TEST_MACRO(Cuda.IncludePathNoToolkit IncludePathNoToolkit)
ADD_TEST_MACRO(Cuda.ProperDeviceLibraries ProperDeviceLibraries)
ADD_TEST_MACRO(Cuda.ProperLinkFlags ProperLinkFlags)
ADD_TEST_MACRO(Cuda.SharedRuntimePlusToolkit SharedRuntimePlusToolkit)
# The CUDA toolkit only ships shared versions of its libraries on
# Windows, so the static-runtime variant of this test cannot build there.
if(NOT WIN32)
ADD_TEST_MACRO(Cuda.StaticRuntimePlusToolkit StaticRuntimePlusToolkit)
endif()
ADD_TEST_MACRO(Cuda.WithC CudaWithC)

View File

@ -0,0 +1,35 @@
cmake_minimum_required(VERSION 3.15)
project(SharedRuntimePlusToolkit CXX)

# Goal for this example:
# Validate that from C++ we can use components of the CUDA toolkit while
# explicitly selecting the *shared* CUDA runtime library.
find_package(CUDAToolkit REQUIRED)

# Common smoke-test sources (curand + nppif); built position-independent
# so the object files can be folded into the shared libraries below.
add_library(Common OBJECT curand.cpp nppif.cpp)
target_link_libraries(Common PRIVATE CUDA::toolkit)
set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON)

# Shared runtime with shared toolkit libraries.
add_library(SharedToolkit SHARED shared.cpp)
target_link_libraries(SharedToolkit PRIVATE Common PUBLIC CUDA::curand CUDA::nppif)
target_link_libraries(SharedToolkit PUBLIC CUDA::cudart)

# The CUDA toolkit only ships shared versions of its libraries on Windows,
# so the static/mixed variants are only built elsewhere.
if(NOT WIN32)
# Shared runtime with static toolkit libraries.
add_library(StaticToolkit SHARED static.cpp)
target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static)
target_link_libraries(StaticToolkit PUBLIC CUDA::cudart)

# Shared runtime with mixed (static curand, shared nppif) toolkit libraries.
add_library(MixedToolkit SHARED mixed.cpp)
target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand_static CUDA::nppif)
target_link_libraries(MixedToolkit PUBLIC CUDA::cudart)
endif()

# Driver links whichever variants were built for this platform.
add_executable(SharedRuntimePlusToolkit main.cpp)
target_link_libraries(SharedRuntimePlusToolkit PRIVATE SharedToolkit
$<TARGET_NAME_IF_EXISTS:StaticToolkit>
$<TARGET_NAME_IF_EXISTS:MixedToolkit>)

View File

@ -0,0 +1,65 @@
// Comes from:
// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example
#ifdef _WIN32
# define EXPORT __declspec(dllexport)
#else
# define EXPORT
#endif
/*
* This program uses the host CURAND API to generate 100
* pseudorandom floats.
*/
#include <cuda.h>
#include <curand.h>
#include <stdio.h>
#include <stdlib.h>
/* Evaluate a CUDA runtime call; on error, print the source location and
 * make the enclosing function return EXIT_FAILURE.
 * NOTE(review): the early return skips any cleanup in the caller --
 * acceptable for this smoke-test program. */
#define CUDA_CALL(x) \
do { \
if ((x) != cudaSuccess) { \
printf("Error at %s:%d\n", __FILE__, __LINE__); \
return EXIT_FAILURE; \
} \
} while (0)
/* Same pattern for cuRAND calls, which return CURAND_STATUS_* codes. */
#define CURAND_CALL(x) \
do { \
if ((x) != CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n", __FILE__, __LINE__); \
return EXIT_FAILURE; \
} \
} while (0)
/*
 * Host cuRAND smoke test (adapted from the cuRAND host API example).
 * Generates 100 pseudorandom floats on the device with the default
 * generator and copies them back to the host.
 * Returns EXIT_SUCCESS on success; EXIT_FAILURE on any CUDA/cuRAND error
 * (location reported by the CUDA_CALL/CURAND_CALL macros).
 * Exported so the enclosing shared library can expose it to the driver.
 */
EXPORT int curand_main()
{
  size_t n = 100;
  curandGenerator_t gen;
  float *devData, *hostData;

  /* Allocate n floats on host.
   * NOTE(review): the macro error paths leak this allocation; fine for a
   * short-lived test process. */
  hostData = (float*)calloc(n, sizeof(float));

  /* Allocate n floats on device */
  CUDA_CALL(cudaMalloc((void**)&devData, n * sizeof(float)));

  /* Create pseudo-random number generator */
  CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));

  /* Fixed seed for reproducibility */
  CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));

  /* Generate n floats on device */
  CURAND_CALL(curandGenerateUniform(gen, devData, n));

  /* Copy device memory to host */
  CUDA_CALL(
    cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost));

  /* Cleanup */
  CURAND_CALL(curandDestroyGenerator(gen));
  CUDA_CALL(cudaFree(devData));
  free(hostData);
  return EXIT_SUCCESS;
}

View File

@ -0,0 +1,23 @@
#ifdef _WIN32
#  define IMPORT __declspec(dllimport)
// Only shared toolkit libraries ship on Windows, so the static and mixed
// variants are not built there; stub them as trivially passing (1).
IMPORT int shared_version();
int static_version()
{
  return 1;
}
int mixed_version()
{
  return 1;
}
#else
int shared_version();
int static_version();
int mixed_version();
#endif

// Each *_version() returns 1 on success.  Exit 0 only when every variant
// succeeds; the previous expression (`x == 0 && ...`) could exit 0 when
// only some variants failed.
int main()
{
  return (shared_version() && static_version() && mixed_version()) ? 0 : 1;
}

View File

@ -0,0 +1,16 @@
#ifdef _WIN32
#  define IMPORT __declspec(dllimport)
#  define EXPORT __declspec(dllexport)
#else
#  define IMPORT
#  define EXPORT
#endif

// Implemented in curand.cpp and nppif.cpp; each returns 0 on success.
IMPORT int curand_main();
IMPORT int nppif_main();

// Reports 1 when both toolkit smoke tests pass, 0 as soon as one fails.
EXPORT int mixed_version()
{
  if (curand_main() != 0) {
    return 0;
  }
  return nppif_main() == 0 ? 1 : 0;
}

View File

@ -0,0 +1,92 @@
// Comes from
// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
#ifdef _WIN32
# define EXPORT __declspec(dllexport)
#else
# define EXPORT
#endif
#include <cstdio>
#include <iostream>
#include <assert.h>
#include <cuda_runtime_api.h>
#include <nppi_filtering_functions.h>
EXPORT int nppif_main()
{
/**
* 8-bit unsigned single-channel 1D row convolution.
*/
const int simgrows = 32;
const int simgcols = 32;
Npp8u *d_pSrc, *d_pDst;
const int nMaskSize = 3;
NppiSize oROI;
oROI.width = simgcols - nMaskSize;
oROI.height = simgrows;
const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
const int simgpix = simgrows * simgcols;
const int dimgpix = oROI.width * oROI.height;
const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
const int nDstStep = oROI.width * sizeof(d_pDst[0]);
const int pixval = 1;
const int nDivisor = 1;
const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
Npp32s* d_pKernel;
const Npp32s nAnchor = 2;
cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
err = cudaMalloc((void**)&d_pDst, dimgsize);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0]));
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
// set image to pixval initially
err = cudaMemset(d_pSrc, pixval, simgsize);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
err = cudaMemset(d_pDst, 0, dimgsize);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]),
cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
// copy src to dst
NppStatus ret =
nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel,
nMaskSize, nAnchor, nDivisor);
assert(ret == NPP_NO_ERROR);
Npp8u* h_imgres = new Npp8u[dimgpix];
err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
// test for filtering
for (int i = 0; i < dimgpix; i++) {
if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
fprintf(stderr, "h_imgres at index %d failed to match\n", i);
return 1;
}
}
return 0;
}

View File

@ -0,0 +1,16 @@
#ifdef _WIN32
# define IMPORT __declspec(dllimport)
# define EXPORT __declspec(dllexport)
#else
# define IMPORT
# define EXPORT
#endif
// Provided by curand.cpp and nppif.cpp, which are linked into this same
// library; each returns 0 on success.
int curand_main();
int nppif_main();

// Reports 1 when both toolkit smoke tests pass, 0 as soon as one fails.
EXPORT int shared_version()
{
  if (curand_main() != 0) {
    return 0;
  }
  return nppif_main() == 0 ? 1 : 0;
}

View File

@ -0,0 +1,16 @@
#ifdef _WIN32
# define IMPORT __declspec(dllimport)
# define EXPORT __declspec(dllexport)
#else
# define IMPORT
# define EXPORT
#endif
// Declared IMPORT to match the other variants; each returns 0 on success.
IMPORT int curand_main();
IMPORT int nppif_main();

// Reports 1 when both toolkit smoke tests pass, 0 as soon as one fails.
EXPORT int static_version()
{
  if (curand_main() != 0) {
    return 0;
  }
  return nppif_main() == 0 ? 1 : 0;
}

View File

@ -0,0 +1,29 @@
cmake_minimum_required(VERSION 3.15)
project(StaticRuntimePlusToolkit CXX)

# Goal for this example:
# Validate that from C++ we can use components of the CUDA toolkit while
# explicitly selecting the *static* CUDA runtime library.
find_package(CUDAToolkit REQUIRED)

# Common smoke-test sources (curand + nppif); built position-independent
# so the object files can be folded into the shared libraries below.
add_library(Common OBJECT curand.cpp nppif.cpp)
target_link_libraries(Common PRIVATE CUDA::toolkit)
set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON)

# Static runtime with shared toolkit libraries.
add_library(SharedToolkit SHARED shared.cpp)
target_link_libraries(SharedToolkit PRIVATE Common PUBLIC CUDA::curand CUDA::nppif)
target_link_libraries(SharedToolkit PUBLIC CUDA::cudart_static)

# Static runtime with static toolkit libraries.
add_library(StaticToolkit SHARED static.cpp)
target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static)
target_link_libraries(StaticToolkit PUBLIC CUDA::cudart_static)

# Static runtime with mixed (shared curand, static nppif) toolkit libraries.
add_library(MixedToolkit SHARED mixed.cpp)
target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand CUDA::nppif_static)
target_link_libraries(MixedToolkit PUBLIC CUDA::cudart_static)

# Driver exercises all three variants (this test only runs on non-Windows;
# see the Cuda tests CMakeLists).
add_executable(StaticRuntimePlusToolkit main.cpp)
target_link_libraries(StaticRuntimePlusToolkit PRIVATE SharedToolkit StaticToolkit MixedToolkit)

View File

@ -0,0 +1,59 @@
// Comes from:
// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example
/*
* This program uses the host CURAND API to generate 100
* pseudorandom floats.
*/
#include <cuda.h>
#include <curand.h>
#include <stdio.h>
#include <stdlib.h>
/* Evaluate a CUDA runtime call; on error, print the source location and
 * make the enclosing function return EXIT_FAILURE.
 * NOTE(review): the early return skips any cleanup in the caller --
 * acceptable for this smoke-test program. */
#define CUDA_CALL(x) \
do { \
if ((x) != cudaSuccess) { \
printf("Error at %s:%d\n", __FILE__, __LINE__); \
return EXIT_FAILURE; \
} \
} while (0)
/* Same pattern for cuRAND calls, which return CURAND_STATUS_* codes. */
#define CURAND_CALL(x) \
do { \
if ((x) != CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n", __FILE__, __LINE__); \
return EXIT_FAILURE; \
} \
} while (0)
/*
 * Host cuRAND smoke test (adapted from the cuRAND host API example).
 * Generates 100 pseudorandom floats on the device with the default
 * generator and copies them back to the host.
 * Returns EXIT_SUCCESS on success; EXIT_FAILURE on any CUDA/cuRAND error
 * (location reported by the CUDA_CALL/CURAND_CALL macros).
 */
int curand_main()
{
  size_t n = 100;
  curandGenerator_t gen;
  float *devData, *hostData;

  /* Allocate n floats on host.
   * NOTE(review): the macro error paths leak this allocation; fine for a
   * short-lived test process. */
  hostData = (float*)calloc(n, sizeof(float));

  /* Allocate n floats on device */
  CUDA_CALL(cudaMalloc((void**)&devData, n * sizeof(float)));

  /* Create pseudo-random number generator */
  CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));

  /* Fixed seed for reproducibility */
  CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));

  /* Generate n floats on device */
  CURAND_CALL(curandGenerateUniform(gen, devData, n));

  /* Copy device memory to host */
  CUDA_CALL(
    cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost));

  /* Cleanup */
  CURAND_CALL(curandDestroyGenerator(gen));
  CUDA_CALL(cudaFree(devData));
  free(hostData);
  return EXIT_SUCCESS;
}

View File

@ -0,0 +1,11 @@
// Each *_version() (defined in the Shared/Static/Mixed toolkit libraries)
// returns 1 when its CUDA toolkit smoke tests succeed, 0 otherwise.
int shared_version();
int static_version();
int mixed_version();

// Exit 0 only when every variant succeeds; the previous expression
// (`x == 0 && ...`) could exit 0 when only some variants failed.
int main()
{
  return (shared_version() && static_version() && mixed_version()) ? 0 : 1;
}

View File

@ -0,0 +1,8 @@
// Implemented in curand.cpp and nppif.cpp; each returns 0 on success.
int curand_main();
int nppif_main();

// Reports 1 when both toolkit smoke tests pass, 0 as soon as one fails.
int mixed_version()
{
  if (curand_main() != 0) {
    return 0;
  }
  return nppif_main() == 0 ? 1 : 0;
}

View File

@ -0,0 +1,86 @@
// Comes from
// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
#include <cstdio>
#include <iostream>
#include <assert.h>
#include <cuda_runtime_api.h>
#include <nppi_filtering_functions.h>
int nppif_main()
{
/**
* 8-bit unsigned single-channel 1D row convolution.
*/
const int simgrows = 32;
const int simgcols = 32;
Npp8u *d_pSrc, *d_pDst;
const int nMaskSize = 3;
NppiSize oROI;
oROI.width = simgcols - nMaskSize;
oROI.height = simgrows;
const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
const int simgpix = simgrows * simgcols;
const int dimgpix = oROI.width * oROI.height;
const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
const int nDstStep = oROI.width * sizeof(d_pDst[0]);
const int pixval = 1;
const int nDivisor = 1;
const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
Npp32s* d_pKernel;
const Npp32s nAnchor = 2;
cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
err = cudaMalloc((void**)&d_pDst, dimgsize);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0]));
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
// set image to pixval initially
err = cudaMemset(d_pSrc, pixval, simgsize);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
err = cudaMemset(d_pDst, 0, dimgsize);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]),
cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
// copy src to dst
NppStatus ret =
nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel,
nMaskSize, nAnchor, nDivisor);
assert(ret == NPP_NO_ERROR);
Npp8u* h_imgres = new Npp8u[dimgpix];
err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
if (err != cudaSuccess) {
fprintf(stderr, "Cuda error %d\n", __LINE__);
return 1;
}
// test for filtering
for (int i = 0; i < dimgpix; i++) {
if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
fprintf(stderr, "h_imgres at index %d failed to match\n", i);
return 1;
}
}
return 0;
}

View File

@ -0,0 +1,8 @@
// Implemented in curand.cpp and nppif.cpp; each returns 0 on success.
int curand_main();
int nppif_main();

// Reports 1 when both toolkit smoke tests pass, 0 as soon as one fails.
int shared_version()
{
  if (curand_main() != 0) {
    return 0;
  }
  return nppif_main() == 0 ? 1 : 0;
}

View File

@ -0,0 +1,8 @@
// Implemented in curand.cpp and nppif.cpp; each returns 0 on success.
int curand_main();
int nppif_main();

// Reports 1 when both toolkit smoke tests pass, 0 as soon as one fails.
int static_version()
{
  if (curand_main() != 0) {
    return 0;
  }
  return nppif_main() == 0 ? 1 : 0;
}