CUDA: Static libraries can now explicitly resolve device symbols

If a static library has the property CUDA_RESOLVE_DEVICE_SYMBOLS enabled
it will now perform the device link step. The normal behavior is
to delay calling device link until the static library is consumed by
a shared library or an executable.
This commit is contained in:
Robert Maynard 2017-04-25 16:09:56 -04:00
parent 8fb85c68bb
commit 493671a521
14 changed files with 276 additions and 4 deletions

View File

@ -154,6 +154,7 @@ Properties on Targets
/prop_tgt/CROSSCOMPILING_EMULATOR
/prop_tgt/CUDA_PTX_COMPILATION
/prop_tgt/CUDA_SEPARABLE_COMPILATION
/prop_tgt/CUDA_RESOLVE_DEVICE_SYMBOLS
/prop_tgt/CUDA_EXTENSIONS
/prop_tgt/CUDA_STANDARD
/prop_tgt/CUDA_STANDARD_REQUIRED

View File

@ -0,0 +1,15 @@
CUDA_RESOLVE_DEVICE_SYMBOLS
---------------------------
CUDA only: Enables device linking for the specific static library target
If set this will enable device linking on this static library target. Normally
device linking is deferred until a shared library or executable is generated,
allowing for multiple static libraries to resolve device symbols at the same
time.
For instance:
.. code-block:: cmake
set_property(TARGET mystaticlib PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)

View File

@ -39,9 +39,24 @@ std::string cmLinkLineDeviceComputer::ComputeLinkLibraries(
continue;
}
if (li->Target->GetType() == cmStateEnums::INTERFACE_LIBRARY ||
li->Target->GetType() == cmStateEnums::SHARED_LIBRARY ||
li->Target->GetType() == cmStateEnums::MODULE_LIBRARY) {
bool skippable = false;
switch (li->Target->GetType()) {
case cmStateEnums::SHARED_LIBRARY:
case cmStateEnums::MODULE_LIBRARY:
case cmStateEnums::INTERFACE_LIBRARY:
skippable = true;
break;
case cmStateEnums::STATIC_LIBRARY:
// If a static library is resolving its device linking, it should
// be removed for other device linking
skippable =
li->Target->GetPropertyAsBool("CUDA_RESOLVE_DEVICE_SYMBOLS");
break;
default:
break;
}
if (skippable) {
continue;
}

View File

@ -127,6 +127,24 @@ void cmMakefileLibraryTargetGenerator::WriteObjectLibraryRules()
void cmMakefileLibraryTargetGenerator::WriteStaticLibraryRules()
{
const std::string cuda_lang("CUDA");
cmGeneratorTarget::LinkClosure const* closure =
this->GeneratorTarget->GetLinkClosure(this->ConfigName);
const bool hasCUDA =
(std::find(closure->Languages.begin(), closure->Languages.end(),
cuda_lang) != closure->Languages.end());
const bool resolveDeviceSymbols =
this->GeneratorTarget->GetPropertyAsBool("CUDA_RESOLVE_DEVICE_SYMBOLS");
if (hasCUDA && resolveDeviceSymbols) {
std::string linkRuleVar = "CMAKE_CUDA_DEVICE_LINK_LIBRARY";
std::string extraFlags;
this->LocalGenerator->AppendFlags(
extraFlags, this->GeneratorTarget->GetProperty("LINK_FLAGS"));
this->WriteDeviceLibraryRules(linkRuleVar, extraFlags, false);
}
std::string linkLanguage =
this->GeneratorTarget->GetLinkerLanguage(this->ConfigName);
@ -861,6 +879,16 @@ void cmMakefileLibraryTargetGenerator::WriteLibraryRules(
std::vector<std::string> object_strings;
this->WriteObjectsStrings(object_strings, archiveCommandLimit);
// Add the cuda device object to the list of archive files. This will
// only occur on archives which have CUDA_RESOLVE_DEVICE_SYMBOLS enabled
if (!this->DeviceLinkObject.empty()) {
object_strings.push_back(this->LocalGenerator->ConvertToOutputFormat(
this->LocalGenerator->MaybeConvertToRelativePath(
this->LocalGenerator->GetCurrentBinaryDirectory(),
this->DeviceLinkObject),
cmOutputConverter::SHELL));
}
// Create the archive with the first set of objects.
std::vector<std::string>::iterator osi = object_strings.begin();
{

View File

@ -447,6 +447,7 @@ std::vector<std::string> cmNinjaNormalTargetGenerator::ComputeDeviceLinkCmd()
// an executable or a dynamic library.
std::string linkCmd;
switch (this->GetGeneratorTarget()->GetType()) {
case cmStateEnums::STATIC_LIBRARY:
case cmStateEnums::SHARED_LIBRARY:
case cmStateEnums::MODULE_LIBRARY: {
const std::string cudaLinkCmd(
@ -559,11 +560,15 @@ void cmNinjaNormalTargetGenerator::WriteDeviceLinkStatement()
case cmStateEnums::EXECUTABLE:
shouldHaveDeviceLinking = true;
break;
case cmStateEnums::STATIC_LIBRARY:
shouldHaveDeviceLinking =
genTarget.GetPropertyAsBool("CUDA_RESOLVE_DEVICE_SYMBOLS");
break;
default:
break;
}
if (!shouldHaveDeviceLinking || !hasCUDA) {
if (!(shouldHaveDeviceLinking && hasCUDA)) {
return;
}

View File

@ -2563,6 +2563,10 @@ bool cmVisualStudio10TargetGenerator::ComputeCudaLinkOptions(
case cmStateEnums::EXECUTABLE:
doDeviceLinking = true;
break;
case cmStateEnums::STATIC_LIBRARY:
doDeviceLinking = this->GeneratorTarget->GetPropertyAsBool(
"CUDA_RESOLVE_DEVICE_SYMBOLS");
break;
default:
break;
}

View File

@ -3,3 +3,4 @@ ADD_TEST_MACRO(CudaOnly.EnableStandard CudaOnlyEnableStandard)
ADD_TEST_MACRO(CudaOnly.ExportPTX CudaOnlyExportPTX)
ADD_TEST_MACRO(CudaOnly.SeparateCompilation CudaOnlySeparateCompilation)
ADD_TEST_MACRO(CudaOnly.WithDefs CudaOnlyWithDefs)
ADD_TEST_MACRO(CudaOnly.ResolveDeviceSymbols CudaOnlyResolveDeviceSymbols)

View File

@ -0,0 +1,52 @@
cmake_minimum_required(VERSION 3.7)
project (CudaOnlyResolveDeviceSymbols CUDA)
# Find nm and dumpbin
if(CMAKE_NM)
set(dump_command ${CMAKE_NM})
set(dump_args -g)
else()
include(GetPrerequisites)
message(STATUS "calling list_prerequisites to find dumpbin")
list_prerequisites("${CMAKE_COMMAND}" 0 0 0)
if(gp_dumpbin)
set(dump_command ${gp_dumpbin})
set(dump_args /ARCHIVEMEMBERS)
endif()
endif()
#Goal for this example:
#Build a static library that defines multiple methods and kernels that
#use each other.
#Use a custom command to build an executable that uses this static library
#We do these together to verify that we can get a static library to do
#device symbol linking, and not have it done when the executable is made
string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_30,code=compute_30")
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CUDA_STANDARD 11)
add_library(CUDAResolveDeviceLib STATIC file1.cu file2.cu)
set_target_properties(CUDAResolveDeviceLib
PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
CUDA_RESOLVE_DEVICE_SYMBOLS ON
POSITION_INDEPENDENT_CODE ON)
if(dump_command)
add_custom_command(TARGET CUDAResolveDeviceLib POST_BUILD
COMMAND ${CMAKE_COMMAND}
-DDUMP_COMMAND=${dump_command}
-DDUMP_ARGS=${dump_args}
-DTEST_LIBRARY_PATH=$<TARGET_FILE:CUDAResolveDeviceLib>
-P ${CMAKE_CURRENT_SOURCE_DIR}/verify.cmake
)
endif()
add_executable(CudaOnlyResolveDeviceSymbols main.cu)
target_link_libraries(CudaOnlyResolveDeviceSymbols PRIVATE CUDAResolveDeviceLib)
if(APPLE)
# We need to add the default path to the driver (libcuda.dylib) as an rpath, so that
# the static cuda runtime can find it at runtime.
target_link_libraries(CudaOnlyResolveDeviceSymbols PRIVATE -Wl,-rpath,/usr/local/cuda/lib)
endif()

View File

@ -0,0 +1,10 @@
#include "file1.h"
result_type __device__ file1_func(int x)
{
result_type r;
r.input = x;
r.sum = x * x;
return r;
}

View File

@ -0,0 +1,7 @@
#pragma once
struct result_type
{
int input;
int sum;
};

View File

@ -0,0 +1,25 @@
#include "file2.h"
result_type __device__ file1_func(int x);
result_type_dynamic __device__ file2_func(int x)
{
const result_type r = file1_func(x);
const result_type_dynamic rd{ r.input, r.sum, true };
return rd;
}
static __global__ void file2_kernel(result_type_dynamic& r, int x)
{
// call static_func which is a method that is defined in the
// static library that is always out of date
r = file2_func(x);
}
int file2_launch_kernel(int x)
{
result_type_dynamic r;
file2_kernel<<<1, 1>>>(r, x);
return r.sum;
}

View File

@ -0,0 +1,10 @@
#pragma once
#include "file1.h"
struct result_type_dynamic
{
int input;
int sum;
bool from_static;
};

View File

@ -0,0 +1,85 @@
#include <iostream>
#include "file1.h"
#include "file2.h"
int file2_launch_kernel(int x);
result_type_dynamic __device__ file2_func(int x);
static __global__ void main_kernel(result_type_dynamic& r, int x)
{
// call function that was not device linked to us, this will cause
// a runtime failure of "invalid device function"
r = file2_func(x);
}
int main_launch_kernel(int x)
{
result_type_dynamic r;
main_kernel<<<1, 1>>>(r, x);
return r.sum;
}
int choose_cuda_device()
{
int nDevices = 0;
cudaError_t err = cudaGetDeviceCount(&nDevices);
if (err != cudaSuccess) {
std::cerr << "Failed to retrieve the number of CUDA enabled devices"
<< std::endl;
return 1;
}
for (int i = 0; i < nDevices; ++i) {
cudaDeviceProp prop;
cudaError_t err = cudaGetDeviceProperties(&prop, i);
if (err != cudaSuccess) {
std::cerr << "Could not retrieve properties from CUDA device " << i
<< std::endl;
return 1;
}
std::cout << "prop.major: " << prop.major << std::endl;
if (prop.major >= 3) {
err = cudaSetDevice(i);
if (err != cudaSuccess) {
std::cout << "Could not select CUDA device " << i << std::endl;
} else {
return 0;
}
}
}
std::cout << "Could not find a CUDA enabled card supporting compute >=3.0"
<< std::endl;
return 1;
}
int main(int argc, char** argv)
{
int ret = choose_cuda_device();
if (ret) {
return 0;
}
cudaError_t err;
file2_launch_kernel(42);
err = cudaGetLastError();
if (err != cudaSuccess) {
std::cerr << "file2_launch_kernel: kernel launch failed: "
<< cudaGetErrorString(err) << std::endl;
return 1;
}
main_launch_kernel(1);
err = cudaGetLastError();
if (err == cudaSuccess) {
// This kernel launch should fail as the file2_func was device linked
// into the static library and is not usable by the executable
std::cerr << "main_launch_kernel: kernel launch should have failed"
<< std::endl;
return 1;
}
return 0;
}

View File

@ -0,0 +1,14 @@
execute_process(COMMAND ${DUMP_COMMAND} ${DUMP_ARGS} ${TEST_LIBRARY_PATH}
RESULT_VARIABLE RESULT
OUTPUT_VARIABLE OUTPUT
ERROR_VARIABLE ERROR
)
if(NOT "${RESULT}" STREQUAL "0")
message(FATAL_ERROR "${DUMP_COMMAND} failed [${RESULT}] [${OUTPUT}] [${ERROR}]")
endif()
if(NOT "${OUTPUT}" MATCHES "(cmake_device_link|device-link)")
message(FATAL_ERROR
"No cuda device objects found, device linking did not occur")
endif()