[libc] Implement the RPC client / server for NVPTX

This patch adds the necessary code to implement the existing RPC client
/ server interface when targeting NVPTX GPUs. It closely follows the
implementation in the AMDGPU version. This does not yet enable unit
testing because the `nvlink` linker does not support static libraries, so
that will need to be worked around.

I am ignoring the RPC duplication between the AMDGPU and NVPTX loaders. This
will be changed completely later so there's no point unifying the code at this
stage. The implementation was tested manually with the following file and
compilation flags.

```
namespace __llvm_libc {
void write_to_stderr(const char *msg);
void quick_exit(int);
} // namespace __llvm_libc

using namespace __llvm_libc;

int main(int argc, char **argv, char **envp) {
  for (int i = 0; i < argc; ++i) {
    write_to_stderr(argv[i]);
    write_to_stderr("\n");
  }
  quick_exit(255);
}
```

```
$ clang++ crt1.o rpc_client.o quick_exit.o io.o main.cpp --target=nvptx64-nvidia-cuda -march=sm_70 -o image
$ ./nvptx_loader image 1 2 3
image
1
2
3
$ echo $?
255
```

Depends on D146681

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D146846
This commit is contained in:
Joseph Huber 2023-03-24 15:53:05 -05:00
parent 2bef46d2ad
commit 58f5e5e6b0
3 changed files with 48 additions and 4 deletions

View File

@ -2,6 +2,8 @@ add_startup_object(
crt1
SRC
start.cpp
DEPENDS
libc.src.__support.RPC.rpc_client
COMPILE_OPTIONS
-ffreestanding # To avoid compiler warnings about calling the main function.
-fno-builtin

View File

@ -1,4 +1,4 @@
//===-- Implementation of crt for amdgpu ----------------------------------===//
//===-- Implementation of crt for nvptx -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@ -6,10 +6,14 @@
//
//===----------------------------------------------------------------------===//
#include "src/__support/RPC/rpc_client.h"
extern "C" int main(int argc, char **argv, char **envp);
/// Kernel entry point for the NVPTX startup object.
///
/// \param argc, argv, envp  Forwarded unchanged to `main`.
/// \param ret               Location that receives the value returned by
///                          `main`; the value is OR-ed into `*ret`.
/// \param in, out, buffer   Pointers handed to the RPC client via `reset`.
extern "C" [[gnu::visibility("protected")]] __attribute__((nvptx_kernel)) void
_start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
       void *buffer) {
  // The RPC client is pointed at its mailboxes and buffer before `main` runs.
  __llvm_libc::rpc::client.reset(in, out, buffer);

  // Run the user program, then fold its result into the caller-provided slot
  // using a relaxed atomic OR.
  const int exit_status = main(argc, argv, envp);
  __atomic_fetch_or(ret, exit_status, __ATOMIC_RELAXED);
}

View File

@ -15,6 +15,8 @@
#include "Loader.h"
#include "src/__support/RPC/rpc.h"
#include "cuda.h"
#include <cstddef>
#include <cstdio>
@ -32,6 +34,30 @@ struct kernel_args_t {
void *buffer;
};
static __llvm_libc::rpc::Server server;
/// Queries the RPC client at least once and performs server-side work if there
/// are any active requests.
void handle_server() {
while (server.handle(
[&](__llvm_libc::rpc::Buffer *buffer) {
switch (static_cast<__llvm_libc::rpc::Opcode>(buffer->data[0])) {
case __llvm_libc::rpc::Opcode::PRINT_TO_STDERR: {
fputs(reinterpret_cast<const char *>(&buffer->data[1]), stderr);
break;
}
case __llvm_libc::rpc::Opcode::EXIT: {
exit(buffer->data[1]);
break;
}
default:
return;
};
},
[](__llvm_libc::rpc::Buffer *buffer) {}))
;
}
static void handle_error(CUresult err) {
if (err == CUDA_SUCCESS)
return;
@ -106,8 +132,13 @@ int load(int argc, char **argv, char **envp, void *image, size_t size) {
if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
handle_error(err);
void *server_inbox = allocator(sizeof(__llvm_libc::cpp::Atomic<int>));
void *server_outbox = allocator(sizeof(__llvm_libc::cpp::Atomic<int>));
void *buffer = allocator(sizeof(__llvm_libc::rpc::Buffer));
if (!server_inbox || !server_outbox || !buffer)
handle_error("Failed to allocate memory the RPC client / server.");
// Set up the arguments to the '_start' kernel on the GPU.
// TODO: Setup RPC server implementation;
uint64_t args_size = sizeof(kernel_args_t);
kernel_args_t args;
std::memset(&args, 0, args_size);
@ -115,10 +146,16 @@ int load(int argc, char **argv, char **envp, void *image, size_t size) {
args.argv = dev_argv;
args.envp = dev_envp;
args.ret = reinterpret_cast<void *>(dev_ret);
args.inbox = server_outbox;
args.outbox = server_inbox;
args.buffer = buffer;
void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
CU_LAUNCH_PARAM_END};
// Initialize the RPC server's buffer for host-device communication.
server.reset(server_inbox, server_outbox, buffer);
// Call the kernel with the given arguments.
if (CUresult err =
cuLaunchKernel(function, /*gridDimX=*/1, /*gridDimY=*/1,
@ -126,9 +163,10 @@ int load(int argc, char **argv, char **envp, void *image, size_t size) {
/*bloackDimZ=*/1, 0, stream, nullptr, args_config))
handle_error(err);
// TODO: Query the RPC server periodically while the kernel is running.
// Wait until the kernel has completed execution on the device. Periodically
// check the RPC client for work to be performed on the server.
while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
;
handle_server();
// Copy the return value back from the kernel and wait.
int host_ret = 0;