[libc] Implement the RPC client / server for NVPTX

This patch adds the necessary code to implement the existing RPC client / server interface when targeting NVPTX GPUs. This closely follows the implementation in the AMDGPU version. This does not yet enable unit testing as the `nvlink` linker does not support static libraries. So that will need to be worked around. I am ignoring the RPC duplication between the AMDGPU and NVPTX loaders. This will be changed completely later so there's no point unifying the code at this stage. The implementation was tested manually with the following file and compilation flags. ``` namespace __llvm_libc { void write_to_stderr(const char *msg); void quick_exit(int); } // namespace __llvm_libc using namespace __llvm_libc; int main(int argc, char **argv, char **envp) { for (int i = 0; i < argc; ++i) { write_to_stderr(argv[i]); write_to_stderr("\n"); } quick_exit(255); } ``` ``` $ clang++ crt1.o rpc_client.o quick_exit.o io.o main.cpp --target=nvptx64-nvidia-cuda -march=sm_70 -o image $ ./nvptx_loader image 1 2 3 image 1 2 3 $ echo $? 255 ``` Depends on D146681 Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D146846
2025-04-01 12:43:47 +00:00 · 2023-03-24 15:53:05 -05:00 · 2023-03-24 15:53:05 -05:00 · 58f5e5e6b0
commit 58f5e5e6b0
parent 2bef46d2ad
3 changed files with 48 additions and 4 deletions
--- a/libc/startup/gpu/nvptx/CMakeLists.txt
+++ b/libc/startup/gpu/nvptx/CMakeLists.txt
@ -2,6 +2,8 @@ add_startup_object(
  crt1
  SRC
    start.cpp
+  DEPENDS
+    libc.src.__support.RPC.rpc_client
  COMPILE_OPTIONS
    -ffreestanding # To avoid compiler warnings about calling the main function.
    -fno-builtin
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@ -1,4 +1,4 @@
-//===-- Implementation of crt for amdgpu ----------------------------------===//
+//===-- Implementation of crt for nvptx -----------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@ -6,10 +6,14 @@
 //
 //===----------------------------------------------------------------------===//

+#include "src/__support/RPC/rpc_client.h"
+
 extern "C" int main(int argc, char **argv, char **envp);

 extern "C" [[gnu::visibility("protected")]] __attribute__((nvptx_kernel)) void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
+  __llvm_libc::rpc::client.reset(in, out, buffer);
+  // Run main() and OR its return value into *ret using a relaxed atomic.
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
 }
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@ -15,6 +15,8 @@

 #include "Loader.h"

+#include "src/__support/RPC/rpc.h"
+
 #include "cuda.h"
 #include <cstddef>
 #include <cstdio>
@ -32,6 +34,30 @@ struct kernel_args_t {
  void *buffer;
 };

+static __llvm_libc::rpc::Server server;
+
+/// Calls server.handle() repeatedly until it returns false, dispatching each
+/// request on the opcode in buffer->data[0]; the second callback is a no-op.
+void handle_server() {
+  while (server.handle( // NOTE(review): assumed true while work remains
+      [&](__llvm_libc::rpc::Buffer *buffer) {
+        switch (static_cast<__llvm_libc::rpc::Opcode>(buffer->data[0])) {
+        case __llvm_libc::rpc::Opcode::PRINT_TO_STDERR: { // text at data[1]
+          fputs(reinterpret_cast<const char *>(&buffer->data[1]), stderr);
+          break; // NOTE(review): relies on the client NUL-terminating the text
+        }
+        case __llvm_libc::rpc::Opcode::EXIT: {
+          exit(buffer->data[1]); // ends the loader process with this status
+          break; // not reached: exit() does not return
+        }
+        default:
+          return; // any other opcode: do nothing
+        };
+      },
+      [](__llvm_libc::rpc::Buffer *buffer) {}))
+    ; // empty loop body: all work happens inside server.handle()
+}
+
 static void handle_error(CUresult err) {
  if (err == CUDA_SUCCESS)
    return;
@ -106,8 +132,13 @@ int load(int argc, char **argv, char **envp, void *image, size_t size) {
  if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
    handle_error(err);

+  void *server_inbox = allocator(sizeof(__llvm_libc::cpp::Atomic<int>));
+  void *server_outbox = allocator(sizeof(__llvm_libc::cpp::Atomic<int>));
+  void *buffer = allocator(sizeof(__llvm_libc::rpc::Buffer));
+  if (!server_inbox || !server_outbox || !buffer)
+    handle_error("Failed to allocate memory the RPC client / server.");
+
  // Set up the arguments to the '_start' kernel on the GPU.
-  // TODO: Setup RPC server implementation;
  uint64_t args_size = sizeof(kernel_args_t);
  kernel_args_t args;
  std::memset(&args, 0, args_size);
@ -115,10 +146,16 @@ int load(int argc, char **argv, char **envp, void *image, size_t size) {
  args.argv = dev_argv;
  args.envp = dev_envp;
  args.ret = reinterpret_cast<void *>(dev_ret);
+  args.inbox = server_outbox;
+  args.outbox = server_inbox;
+  args.buffer = buffer;
  void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
                         CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                         CU_LAUNCH_PARAM_END};

+  // Initialize the RPC server's buffer for host-device communication.
+  server.reset(server_inbox, server_outbox, buffer);
+
  // Call the kernel with the given arguments.
  if (CUresult err =
          cuLaunchKernel(function, /*gridDimX=*/1, /*gridDimY=*/1,
@ -126,9 +163,10 @@ int load(int argc, char **argv, char **envp, void *image, size_t size) {
                         /*bloackDimZ=*/1, 0, stream, nullptr, args_config))
    handle_error(err);

-  // TODO: Query the RPC server periodically while the kernel is running.
+  // Wait until the kernel has completed execution on the device. Periodically
+  // check the RPC client for work to be performed on the server.
  while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
-    ;
+    handle_server();

  // Copy the return value back from the kernel and wait.
  int host_ret = 0;