mirror of
https://github.com/reactos/CMake.git
synced 2025-02-26 06:55:34 +00:00
CUDAToolkit: No targets now depend on the CUDA runtime
It is not a requirement to have shared|static consistent across your CUDA libraries (e.g. curand, nppc) and your CUDA runtime library. It is entirely allowable to use a static nppc and a shared runtime.
This commit is contained in:
parent
907bb7df57
commit
6e474364d1
@ -122,7 +122,6 @@ CUDA Runtime Library
|
||||
|
||||
The CUDA Runtime library (cudart) is what most applications will typically
|
||||
need to link against to make any calls such as `cudaMalloc` and `cudaFree`.
|
||||
They are an explicit dependency of almost every library.
|
||||
|
||||
Targets Created:
|
||||
|
||||
@ -729,10 +728,8 @@ if(CUDAToolkit_FOUND)
|
||||
|
||||
foreach (cuda_lib cublas cufft cufftw curand cusolver cusparse nvgraph nvjpeg)
|
||||
find_and_add_cuda_import_lib(${cuda_lib})
|
||||
add_cuda_link_dependency(${cuda_lib} cudart)
|
||||
|
||||
find_and_add_cuda_import_lib(${cuda_lib}_static)
|
||||
add_cuda_link_dependency(${cuda_lib}_static cudart_static)
|
||||
endforeach()
|
||||
|
||||
# cuSOLVER depends on cuBLAS, and cuSPARSE
|
||||
@ -772,8 +769,6 @@ if(CUDAToolkit_FOUND)
|
||||
endif()
|
||||
find_and_add_cuda_import_lib(nvToolsExt nvToolsExt nvToolsExt64)
|
||||
|
||||
add_cuda_link_dependency(nvToolsExt cudart)
|
||||
|
||||
find_and_add_cuda_import_lib(OpenCL)
|
||||
|
||||
find_and_add_cuda_import_lib(culibos)
|
||||
|
@ -13,4 +13,12 @@ ADD_TEST_MACRO(Cuda.Toolkit Toolkit)
|
||||
ADD_TEST_MACRO(Cuda.IncludePathNoToolkit IncludePathNoToolkit)
|
||||
ADD_TEST_MACRO(Cuda.ProperDeviceLibraries ProperDeviceLibraries)
|
||||
ADD_TEST_MACRO(Cuda.ProperLinkFlags ProperLinkFlags)
|
||||
ADD_TEST_MACRO(Cuda.SharedRuntimePlusToolkit SharedRuntimePlusToolkit)
|
||||
|
||||
# CUDA only ships the shared version of the toolkit libraries
|
||||
# on windows
|
||||
if(NOT WIN32)
|
||||
ADD_TEST_MACRO(Cuda.StaticRuntimePlusToolkit StaticRuntimePlusToolkit)
|
||||
endif()
|
||||
|
||||
ADD_TEST_MACRO(Cuda.WithC CudaWithC)
|
||||
|
35
Tests/Cuda/SharedRuntimePlusToolkit/CMakeLists.txt
Normal file
35
Tests/Cuda/SharedRuntimePlusToolkit/CMakeLists.txt
Normal file
@ -0,0 +1,35 @@
|
||||
cmake_minimum_required(VERSION 3.15)
|
||||
project(SharedRuntimePlusToolkit CXX)
|
||||
|
||||
#Goal for this example:
|
||||
# Validate that with c++ we can use some components of the CUDA toolkit, and
|
||||
# specify the cuda runtime
|
||||
find_package(CUDAToolkit REQUIRED)
|
||||
|
||||
add_library(Common OBJECT curand.cpp nppif.cpp)
|
||||
target_link_libraries(Common PRIVATE CUDA::toolkit)
|
||||
set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
#shared runtime with shared toolkit libraries
|
||||
add_library(SharedToolkit SHARED shared.cpp)
|
||||
target_link_libraries(SharedToolkit PRIVATE Common PUBLIC CUDA::curand CUDA::nppif)
|
||||
target_link_libraries(SharedToolkit PUBLIC CUDA::cudart)
|
||||
|
||||
# CUDA only ships the shared version of the toolkit libraries
|
||||
# on windows
|
||||
if(NOT WIN32)
|
||||
#shared runtime with static toolkit libraries
|
||||
add_library(StaticToolkit SHARED static.cpp)
|
||||
target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static)
|
||||
target_link_libraries(StaticToolkit PUBLIC CUDA::cudart)
|
||||
|
||||
#shared runtime with mixed toolkit libraries
|
||||
add_library(MixedToolkit SHARED mixed.cpp)
|
||||
target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand_static CUDA::nppif)
|
||||
target_link_libraries(MixedToolkit PUBLIC CUDA::cudart)
|
||||
endif()
|
||||
|
||||
add_executable(SharedRuntimePlusToolkit main.cpp)
|
||||
target_link_libraries(SharedRuntimePlusToolkit PRIVATE SharedToolkit
|
||||
$<TARGET_NAME_IF_EXISTS:StaticToolkit>
|
||||
$<TARGET_NAME_IF_EXISTS:MixedToolkit>)
|
65
Tests/Cuda/SharedRuntimePlusToolkit/curand.cpp
Normal file
65
Tests/Cuda/SharedRuntimePlusToolkit/curand.cpp
Normal file
@ -0,0 +1,65 @@
|
||||
// Comes from:
|
||||
// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example
|
||||
|
||||
#ifdef _WIN32
|
||||
# define EXPORT __declspec(dllexport)
|
||||
#else
|
||||
# define EXPORT
|
||||
#endif
|
||||
|
||||
/*
|
||||
* This program uses the host CURAND API to generate 100
|
||||
* pseudorandom floats.
|
||||
*/
|
||||
#include <cuda.h>
|
||||
#include <curand.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define CUDA_CALL(x) \
|
||||
do { \
|
||||
if ((x) != cudaSuccess) { \
|
||||
printf("Error at %s:%d\n", __FILE__, __LINE__); \
|
||||
return EXIT_FAILURE; \
|
||||
} \
|
||||
} while (0)
|
||||
#define CURAND_CALL(x) \
|
||||
do { \
|
||||
if ((x) != CURAND_STATUS_SUCCESS) { \
|
||||
printf("Error at %s:%d\n", __FILE__, __LINE__); \
|
||||
return EXIT_FAILURE; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
EXPORT int curand_main()
|
||||
{
|
||||
size_t n = 100;
|
||||
size_t i;
|
||||
curandGenerator_t gen;
|
||||
float *devData, *hostData;
|
||||
|
||||
/* Allocate n floats on host */
|
||||
hostData = (float*)calloc(n, sizeof(float));
|
||||
|
||||
/* Allocate n floats on device */
|
||||
CUDA_CALL(cudaMalloc((void**)&devData, n * sizeof(float)));
|
||||
|
||||
/* Create pseudo-random number generator */
|
||||
CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
|
||||
|
||||
/* Set seed */
|
||||
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
|
||||
|
||||
/* Generate n floats on device */
|
||||
CURAND_CALL(curandGenerateUniform(gen, devData, n));
|
||||
|
||||
/* Copy device memory to host */
|
||||
CUDA_CALL(
|
||||
cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost));
|
||||
|
||||
/* Cleanup */
|
||||
CURAND_CALL(curandDestroyGenerator(gen));
|
||||
CUDA_CALL(cudaFree(devData));
|
||||
free(hostData);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
23
Tests/Cuda/SharedRuntimePlusToolkit/main.cpp
Normal file
23
Tests/Cuda/SharedRuntimePlusToolkit/main.cpp
Normal file
@ -0,0 +1,23 @@
|
||||
|
||||
#ifdef _WIN32
#  define IMPORT __declspec(dllimport)
IMPORT int shared_version();
// The static and mixed toolkit libraries are only built when NOT WIN32, so
// provide local stand-ins here. They return 1 to match the convention of the
// real *_version() functions, which return 1 when their checks passed.
int static_version()
{
  return 1;
}
int mixed_version()
{
  return 1;
}
#else
int shared_version();
int static_version();
int mixed_version();
#endif

// Runs every toolkit-linkage variant and reports the combined outcome.
// Each *_version() returns nonzero when its cuRAND and NPP checks passed.
// Returns 0 (test success) only when all variants passed; returns 1 as soon
// as any single variant reports a failure. All three variants are always
// executed so that each linked library is actually exercised.
int main()
{
  const bool mixedOk = mixed_version() != 0;
  const bool sharedOk = shared_version() != 0;
  const bool staticOk = static_version() != 0;
  return (mixedOk && sharedOk && staticOk) ? 0 : 1;
}
|
16
Tests/Cuda/SharedRuntimePlusToolkit/mixed.cpp
Normal file
16
Tests/Cuda/SharedRuntimePlusToolkit/mixed.cpp
Normal file
@ -0,0 +1,16 @@
|
||||
|
||||
#ifdef _WIN32
|
||||
# define IMPORT __declspec(dllimport)
|
||||
# define EXPORT __declspec(dllexport)
|
||||
#else
|
||||
# define IMPORT
|
||||
# define EXPORT
|
||||
#endif
|
||||
|
||||
IMPORT int curand_main();
|
||||
IMPORT int nppif_main();
|
||||
|
||||
// Runs the cuRAND check and then the NPP check.
// Yields 1 only when both returned 0; nppif_main() is not invoked once
// curand_main() has returned a nonzero status.
EXPORT int mixed_version()
{
  if (curand_main() != 0) {
    return 0;
  }
  return (nppif_main() == 0) ? 1 : 0;
}
|
92
Tests/Cuda/SharedRuntimePlusToolkit/nppif.cpp
Normal file
92
Tests/Cuda/SharedRuntimePlusToolkit/nppif.cpp
Normal file
@ -0,0 +1,92 @@
|
||||
// Comes from
|
||||
// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
|
||||
|
||||
#ifdef _WIN32
|
||||
# define EXPORT __declspec(dllexport)
|
||||
#else
|
||||
# define EXPORT
|
||||
#endif
|
||||
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
|
||||
#include <assert.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <nppi_filtering_functions.h>
|
||||
|
||||
EXPORT int nppif_main()
|
||||
{
|
||||
/**
|
||||
* 8-bit unsigned single-channel 1D row convolution.
|
||||
*/
|
||||
const int simgrows = 32;
|
||||
const int simgcols = 32;
|
||||
Npp8u *d_pSrc, *d_pDst;
|
||||
const int nMaskSize = 3;
|
||||
NppiSize oROI;
|
||||
oROI.width = simgcols - nMaskSize;
|
||||
oROI.height = simgrows;
|
||||
const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
|
||||
const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
|
||||
const int simgpix = simgrows * simgcols;
|
||||
const int dimgpix = oROI.width * oROI.height;
|
||||
const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
|
||||
const int nDstStep = oROI.width * sizeof(d_pDst[0]);
|
||||
const int pixval = 1;
|
||||
const int nDivisor = 1;
|
||||
const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
|
||||
Npp32s* d_pKernel;
|
||||
const Npp32s nAnchor = 2;
|
||||
cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMalloc((void**)&d_pDst, dimgsize);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0]));
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
// set image to pixval initially
|
||||
err = cudaMemset(d_pSrc, pixval, simgsize);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMemset(d_pDst, 0, dimgsize);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]),
|
||||
cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
// copy src to dst
|
||||
NppStatus ret =
|
||||
nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel,
|
||||
nMaskSize, nAnchor, nDivisor);
|
||||
assert(ret == NPP_NO_ERROR);
|
||||
Npp8u* h_imgres = new Npp8u[dimgpix];
|
||||
err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
// test for filtering
|
||||
for (int i = 0; i < dimgpix; i++) {
|
||||
if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
|
||||
fprintf(stderr, "h_imgres at index %d failed to match\n", i);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
16
Tests/Cuda/SharedRuntimePlusToolkit/shared.cpp
Normal file
16
Tests/Cuda/SharedRuntimePlusToolkit/shared.cpp
Normal file
@ -0,0 +1,16 @@
|
||||
|
||||
#ifdef _WIN32
|
||||
# define IMPORT __declspec(dllimport)
|
||||
# define EXPORT __declspec(dllexport)
|
||||
#else
|
||||
# define IMPORT
|
||||
# define EXPORT
|
||||
#endif
|
||||
|
||||
int curand_main();
|
||||
int nppif_main();
|
||||
|
||||
// Exercises cuRAND first and NPP second.
// The result is 1 when both checks returned 0 and 0 otherwise; the NPP check
// is skipped when the cuRAND check did not return 0.
EXPORT int shared_version()
{
  const bool curandPassed = (curand_main() == 0);
  const bool bothPassed = curandPassed && (nppif_main() == 0);
  return bothPassed ? 1 : 0;
}
|
16
Tests/Cuda/SharedRuntimePlusToolkit/static.cpp
Normal file
16
Tests/Cuda/SharedRuntimePlusToolkit/static.cpp
Normal file
@ -0,0 +1,16 @@
|
||||
|
||||
#ifdef _WIN32
|
||||
# define IMPORT __declspec(dllimport)
|
||||
# define EXPORT __declspec(dllexport)
|
||||
#else
|
||||
# define IMPORT
|
||||
# define EXPORT
|
||||
#endif
|
||||
|
||||
IMPORT int curand_main();
|
||||
IMPORT int nppif_main();
|
||||
|
||||
// Calls curand_main() and, only if that returned 0, nppif_main().
// Returns 1 when both returned 0; returns 0 in every other case.
EXPORT int static_version()
{
  int passed = 0;
  if (curand_main() == 0) {
    if (nppif_main() == 0) {
      passed = 1;
    }
  }
  return passed;
}
|
29
Tests/Cuda/StaticRuntimePlusToolkit/CMakeLists.txt
Normal file
29
Tests/Cuda/StaticRuntimePlusToolkit/CMakeLists.txt
Normal file
@ -0,0 +1,29 @@
|
||||
cmake_minimum_required(VERSION 3.15)
|
||||
project(StaticRuntimePlusToolkit CXX)
|
||||
|
||||
#Goal for this example:
|
||||
# Validate that with c++ we can use some components of the CUDA toolkit, and
|
||||
# specify the cuda runtime
|
||||
find_package(CUDAToolkit REQUIRED)
|
||||
|
||||
add_library(Common OBJECT curand.cpp nppif.cpp)
|
||||
target_link_libraries(Common PRIVATE CUDA::toolkit)
|
||||
set_target_properties(Common PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
#static runtime with shared toolkit libraries
|
||||
add_library(SharedToolkit SHARED shared.cpp)
|
||||
target_link_libraries(SharedToolkit PRIVATE Common PUBLIC CUDA::curand CUDA::nppif)
|
||||
target_link_libraries(SharedToolkit PUBLIC CUDA::cudart_static)
|
||||
|
||||
#static runtime with static toolkit libraries
|
||||
add_library(StaticToolkit SHARED static.cpp)
|
||||
target_link_libraries(StaticToolkit PRIVATE Common CUDA::curand_static CUDA::nppif_static)
|
||||
target_link_libraries(StaticToolkit PUBLIC CUDA::cudart_static)
|
||||
|
||||
#static runtime with mixed toolkit libraries
|
||||
add_library(MixedToolkit SHARED mixed.cpp)
|
||||
target_link_libraries(MixedToolkit PRIVATE Common CUDA::curand CUDA::nppif_static)
|
||||
target_link_libraries(MixedToolkit PUBLIC CUDA::cudart_static)
|
||||
|
||||
add_executable(StaticRuntimePlusToolkit main.cpp)
|
||||
target_link_libraries(StaticRuntimePlusToolkit PRIVATE SharedToolkit StaticToolkit MixedToolkit)
|
59
Tests/Cuda/StaticRuntimePlusToolkit/curand.cpp
Normal file
59
Tests/Cuda/StaticRuntimePlusToolkit/curand.cpp
Normal file
@ -0,0 +1,59 @@
|
||||
// Comes from:
|
||||
// https://docs.nvidia.com/cuda/curand/host-api-overview.html#host-api-example
|
||||
|
||||
/*
|
||||
* This program uses the host CURAND API to generate 100
|
||||
* pseudorandom floats.
|
||||
*/
|
||||
#include <cuda.h>
|
||||
#include <curand.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#define CUDA_CALL(x) \
|
||||
do { \
|
||||
if ((x) != cudaSuccess) { \
|
||||
printf("Error at %s:%d\n", __FILE__, __LINE__); \
|
||||
return EXIT_FAILURE; \
|
||||
} \
|
||||
} while (0)
|
||||
#define CURAND_CALL(x) \
|
||||
do { \
|
||||
if ((x) != CURAND_STATUS_SUCCESS) { \
|
||||
printf("Error at %s:%d\n", __FILE__, __LINE__); \
|
||||
return EXIT_FAILURE; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
int curand_main()
|
||||
{
|
||||
size_t n = 100;
|
||||
size_t i;
|
||||
curandGenerator_t gen;
|
||||
float *devData, *hostData;
|
||||
|
||||
/* Allocate n floats on host */
|
||||
hostData = (float*)calloc(n, sizeof(float));
|
||||
|
||||
/* Allocate n floats on device */
|
||||
CUDA_CALL(cudaMalloc((void**)&devData, n * sizeof(float)));
|
||||
|
||||
/* Create pseudo-random number generator */
|
||||
CURAND_CALL(curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT));
|
||||
|
||||
/* Set seed */
|
||||
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(gen, 1234ULL));
|
||||
|
||||
/* Generate n floats on device */
|
||||
CURAND_CALL(curandGenerateUniform(gen, devData, n));
|
||||
|
||||
/* Copy device memory to host */
|
||||
CUDA_CALL(
|
||||
cudaMemcpy(hostData, devData, n * sizeof(float), cudaMemcpyDeviceToHost));
|
||||
|
||||
/* Cleanup */
|
||||
CURAND_CALL(curandDestroyGenerator(gen));
|
||||
CUDA_CALL(cudaFree(devData));
|
||||
free(hostData);
|
||||
return EXIT_SUCCESS;
|
||||
}
|
11
Tests/Cuda/StaticRuntimePlusToolkit/main.cpp
Normal file
11
Tests/Cuda/StaticRuntimePlusToolkit/main.cpp
Normal file
@ -0,0 +1,11 @@
|
||||
|
||||
|
||||
int shared_version();
|
||||
int static_version();
|
||||
int mixed_version();
|
||||
|
||||
int main()
|
||||
{
|
||||
return mixed_version() == 0 && shared_version() == 0 &&
|
||||
static_version() == 0;
|
||||
}
|
8
Tests/Cuda/StaticRuntimePlusToolkit/mixed.cpp
Normal file
8
Tests/Cuda/StaticRuntimePlusToolkit/mixed.cpp
Normal file
@ -0,0 +1,8 @@
|
||||
|
||||
int curand_main();
|
||||
int nppif_main();
|
||||
|
||||
int mixed_version()
|
||||
{
|
||||
return curand_main() == 0 && nppif_main() == 0;
|
||||
}
|
86
Tests/Cuda/StaticRuntimePlusToolkit/nppif.cpp
Normal file
86
Tests/Cuda/StaticRuntimePlusToolkit/nppif.cpp
Normal file
@ -0,0 +1,86 @@
|
||||
// Comes from
|
||||
// https://devtalk.nvidia.com/default/topic/1037482/gpu-accelerated-libraries/help-me-help-you-with-modern-cmake-and-cuda-mwe-for-npp/post/5271066/#5271066
|
||||
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
|
||||
#include <assert.h>
|
||||
#include <cuda_runtime_api.h>
|
||||
#include <nppi_filtering_functions.h>
|
||||
|
||||
int nppif_main()
|
||||
{
|
||||
/**
|
||||
* 8-bit unsigned single-channel 1D row convolution.
|
||||
*/
|
||||
const int simgrows = 32;
|
||||
const int simgcols = 32;
|
||||
Npp8u *d_pSrc, *d_pDst;
|
||||
const int nMaskSize = 3;
|
||||
NppiSize oROI;
|
||||
oROI.width = simgcols - nMaskSize;
|
||||
oROI.height = simgrows;
|
||||
const int simgsize = simgrows * simgcols * sizeof(d_pSrc[0]);
|
||||
const int dimgsize = oROI.width * oROI.height * sizeof(d_pSrc[0]);
|
||||
const int simgpix = simgrows * simgcols;
|
||||
const int dimgpix = oROI.width * oROI.height;
|
||||
const int nSrcStep = simgcols * sizeof(d_pSrc[0]);
|
||||
const int nDstStep = oROI.width * sizeof(d_pDst[0]);
|
||||
const int pixval = 1;
|
||||
const int nDivisor = 1;
|
||||
const Npp32s h_pKernel[nMaskSize] = { pixval, pixval, pixval };
|
||||
Npp32s* d_pKernel;
|
||||
const Npp32s nAnchor = 2;
|
||||
cudaError_t err = cudaMalloc((void**)&d_pSrc, simgsize);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMalloc((void**)&d_pDst, dimgsize);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMalloc((void**)&d_pKernel, nMaskSize * sizeof(d_pKernel[0]));
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
// set image to pixval initially
|
||||
err = cudaMemset(d_pSrc, pixval, simgsize);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMemset(d_pDst, 0, dimgsize);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
err = cudaMemcpy(d_pKernel, h_pKernel, nMaskSize * sizeof(d_pKernel[0]),
|
||||
cudaMemcpyHostToDevice);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
// copy src to dst
|
||||
NppStatus ret =
|
||||
nppiFilterRow_8u_C1R(d_pSrc, nSrcStep, d_pDst, nDstStep, oROI, d_pKernel,
|
||||
nMaskSize, nAnchor, nDivisor);
|
||||
assert(ret == NPP_NO_ERROR);
|
||||
Npp8u* h_imgres = new Npp8u[dimgpix];
|
||||
err = cudaMemcpy(h_imgres, d_pDst, dimgsize, cudaMemcpyDeviceToHost);
|
||||
if (err != cudaSuccess) {
|
||||
fprintf(stderr, "Cuda error %d\n", __LINE__);
|
||||
return 1;
|
||||
}
|
||||
// test for filtering
|
||||
for (int i = 0; i < dimgpix; i++) {
|
||||
if (h_imgres[i] != (pixval * pixval * nMaskSize)) {
|
||||
fprintf(stderr, "h_imgres at index %d failed to match\n", i);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
8
Tests/Cuda/StaticRuntimePlusToolkit/shared.cpp
Normal file
8
Tests/Cuda/StaticRuntimePlusToolkit/shared.cpp
Normal file
@ -0,0 +1,8 @@
|
||||
|
||||
int curand_main();
|
||||
int nppif_main();
|
||||
|
||||
int shared_version()
|
||||
{
|
||||
return curand_main() == 0 && nppif_main() == 0;
|
||||
}
|
8
Tests/Cuda/StaticRuntimePlusToolkit/static.cpp
Normal file
8
Tests/Cuda/StaticRuntimePlusToolkit/static.cpp
Normal file
@ -0,0 +1,8 @@
|
||||
|
||||
int curand_main();
|
||||
int nppif_main();
|
||||
|
||||
int static_version()
|
||||
{
|
||||
return curand_main() == 0 && nppif_main() == 0;
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user