[libc] More efficiently send bytes via send_n and recv_n

Currently we have the `send_n` and `recv_n` routines to stream data,
such as a string to print, to the other side. The first operation is to
send the size so the other side knows the number of bytes to receive.
However, this wasted 56 bytes that could've been sent. This meant that
small values, like the arguments to a function to call on the host for
example, needed to perform an extra send. This patch sends the first 56
bytes in the first packet and continues if necessary.

Depends on D150992

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D151041
This commit is contained in:
Joseph Huber 2023-05-19 11:17:42 -05:00
parent 29d3da3b86
commit e826762a08
4 changed files with 45 additions and 34 deletions

View File

@ -17,6 +17,7 @@ namespace __llvm_libc {
/// Streams the bytes of \p msg to the other side of the RPC channel through a
/// port opened with the PRINT_TO_STDERR opcode.
void write_to_stderr(cpp::string_view msg) {
rpc::Client::Port port = rpc::client.open<rpc::PRINT_TO_STDERR>();
// Hand over the message pointer and its length in bytes.
port.send_n(msg.data(), msg.size());
// Receive one packet back and ignore its contents before closing the port.
port.recv([](rpc::Buffer *) { /* void */ });
port.close();
}

View File

@ -417,34 +417,6 @@ LIBC_INLINE void Port<T>::recv_and_send(W work) {
send([](Buffer *) { /* no-op */ });
}
/// Sends an arbitrarily sized data buffer \p src across the shared channel in
/// multiples of the packet length.
///
/// The first packet carries only the byte count; the payload follows in
/// subsequent packets of at most `sizeof(Buffer::data)` bytes each.
///
/// \param src  Source pointers; the one used for a given \c id is selected
///             with `lane_value(src, id)`.
/// \param size Byte counts; the one used for a given \c id is selected with
///             `lane_value(size, id)`.
template <bool T>
LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
// TODO: We could send the first bytes in this call and potentially save an
// extra send operation.
// Despite its name this holds a byte count, not a number of packets: the
// size itself on the GPU, otherwise the largest size seen by the callback.
uint64_t num_sends = 0;
send([&](Buffer *buffer, uint32_t id) {
// The first 64-bit word of the packet announces the size to the receiver.
reinterpret_cast<uint64_t *>(buffer->data)[0] = lane_value(size, id);
num_sends = is_process_gpu() ? lane_value(size, id)
: max(lane_value(size, id), num_sends);
});
// Byte offset into the source buffer of the next chunk to send.
uint64_t idx = 0;
uint64_t mask = process.get_packet(index).header.mask;
// Keep sending while the ballot over `mask` reports data left to send
// (presumably true if any participating lane still has idx < num_sends).
while (gpu::ballot(mask, idx < num_sends)) {
send([=](Buffer *buffer, uint32_t id) {
// Remaining bytes capped at one packet. The subtraction can wrap once idx
// has passed this size, but len is only used under the guard below.
const uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
? sizeof(Buffer::data)
: lane_value(size, id) - idx;
// Nothing is copied once idx has reached this size; the send still happens.
if (idx < lane_value(size, id))
inline_memcpy(
buffer->data,
reinterpret_cast<const uint8_t *>(lane_value(src, id)) + idx, len);
});
idx += sizeof(Buffer::data);
}
}
/// Helper routine to simplify the interface when sending from the GPU using
/// thread private pointers to the underlying value.
template <bool T>
@ -455,6 +427,34 @@ LIBC_INLINE void Port<T>::send_n(const void *src, uint64_t size) {
send_n(src_ptr, size_ptr);
}
/// Sends an arbitrarily sized data buffer \p src across the shared channel.
///
/// The first packet holds the byte count in its first 64-bit word followed by
/// up to `sizeof(Buffer::data) - sizeof(uint64_t)` bytes of payload. Any
/// remaining bytes follow in packets of at most `sizeof(Buffer::data)` bytes.
///
/// \param src  Source pointers; the one used for a given \c id is selected
///             with `lane_value(src, id)`.
/// \param size Byte counts; the one used for a given \c id is selected with
///             `lane_value(size, id)`.
template <bool T>
LIBC_INLINE void Port<T>::send_n(const void *const *src, uint64_t *size) {
// Despite its name this holds a byte count, not a number of packets: the
// size itself on the GPU, otherwise the largest size seen by the callback.
uint64_t num_sends = 0;
send([&](Buffer *buffer, uint32_t id) {
// The first 64-bit word of the packet announces the size to the receiver.
reinterpret_cast<uint64_t *>(buffer->data)[0] = lane_value(size, id);
num_sends = is_process_gpu() ? lane_value(size, id)
: max(lane_value(size, id), num_sends);
// Fill the rest of the first packet with the leading payload bytes, capped
// at the space left after the size word.
uint64_t len =
lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t)
? sizeof(Buffer::data) - sizeof(uint64_t)
: lane_value(size, id);
// NOTE(review): &buffer->data[1] lands right after the size word only if
// the elements of Buffer::data are 8 bytes wide -- confirm against Buffer.
inline_memcpy(&buffer->data[1], lane_value(src, id), len);
});
// Byte offset into the source of the next chunk; the first packet already
// covered everything before it.
uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
// Keep sending while the ballot over the packet's mask reports data left
// (presumably true if any participating lane still has idx < num_sends).
while (gpu::ballot(process.get_packet(index).header.mask, idx < num_sends)) {
send([=](Buffer *buffer, uint32_t id) {
// Remaining bytes capped at one packet. The subtraction can wrap once idx
// has passed this size, but len is only used under the guard below.
uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data)
? sizeof(Buffer::data)
: lane_value(size, id) - idx;
// Nothing is copied once idx has reached this size; the send still happens.
if (idx < lane_value(size, id))
inline_memcpy(buffer->data, advance(lane_value(src, id), idx), len);
});
idx += sizeof(Buffer::data);
}
}
/// Receives an arbitrarily sized data buffer across the shared channel in
/// multiples of the packet length. The \p alloc function is called with the
/// size of the data so that we can initialize the size of the \p dst buffer.
@ -468,8 +468,13 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
reinterpret_cast<uint8_t *>(alloc(lane_value(size, id)));
num_recvs = is_process_gpu() ? lane_value(size, id)
: max(lane_value(size, id), num_recvs);
uint64_t len =
lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t)
? sizeof(Buffer::data) - sizeof(uint64_t)
: lane_value(size, id);
inline_memcpy(lane_value(dst, id), &buffer->data[1], len);
});
uint64_t idx = 0;
uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t);
uint64_t mask = process.get_packet(index).header.mask;
while (gpu::ballot(mask, idx < num_recvs)) {
recv([=](Buffer *buffer, uint32_t id) {
@ -477,8 +482,7 @@ LIBC_INLINE void Port<T>::recv_n(void **dst, uint64_t *size, A &&alloc) {
? sizeof(Buffer::data)
: lane_value(size, id) - idx;
if (idx < lane_value(size, id))
inline_memcpy(reinterpret_cast<uint8_t *>(lane_value(dst, id)) + idx,
buffer->data, len);
inline_memcpy(advance(lane_value(dst, id), idx), buffer->data, len);
});
idx += sizeof(Buffer::data);
}

View File

@ -9,6 +9,7 @@
#ifndef LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H
#define LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H
#include "src/__support/CPP/type_traits.h"
#include "src/__support/GPU/utils.h"
#include "src/__support/macros/attributes.h"
#include "src/__support/macros/properties/architectures.h"
@ -69,9 +70,13 @@ template <typename T> LIBC_INLINE const T &max(const T &x, const T &y) {
return x < y ? y : x;
}
/// Advance the \p ptr by \p bytes.
template <typename T, typename U> LIBC_INLINE T *advance(T ptr, U bytes) {
return reinterpret_cast<T *>(reinterpret_cast<uint8_t *>(ptr) + bytes);
/// Returns \p ptr offset by \p bytes bytes, keeping the pointee type \p T.
/// The arithmetic is done on a `uint8_t` pointer whose constness matches
/// \p T, so a pointer to const is never cast to a pointer to non-const.
template <typename T, typename U> LIBC_INLINE T *advance(T *ptr, U bytes) {
  if constexpr (!cpp::is_const_v<T>) {
    uint8_t *raw = reinterpret_cast<uint8_t *>(ptr);
    return reinterpret_cast<T *>(raw + bytes);
  } else {
    const uint8_t *raw = reinterpret_cast<const uint8_t *>(ptr);
    return reinterpret_cast<T *>(raw + bytes);
  }
}
} // namespace rpc

View File

@ -35,6 +35,7 @@ void handle_server() {
uint64_t sizes[rpc::MAX_LANE_SIZE] = {0};
void *strs[rpc::MAX_LANE_SIZE] = {nullptr};
port->recv_n(strs, sizes, [&](uint64_t size) { return new char[size]; });
port->send([](rpc::Buffer *) { /* void */ });
for (uint64_t i = 0; i < rpc::MAX_LANE_SIZE; ++i) {
if (strs[i]) {
fwrite(strs[i], sizes[i], 1, stderr);