From e826762a0826c11dc62696e46068c61c57a00aa9 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 19 May 2023 11:17:42 -0500 Subject: [PATCH] [libc] More efficiently send bytes via `send_n` and `recv_n` Currently we have the `send_n` and `recv_n` routines to stream data, such as a string to print, to the other side. The first operation is to send the size so the other side knows the number of bytes to receive. However, this wasted 56 bytes that could've been sent. This meant that small values, like the arguments to a function to call on the host for example, needed to perform an extra send. This patch sends the first 56 bytes in the first packet and continues if necessary. Depends on D150992 Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D151041 --- libc/src/__support/OSUtil/gpu/io.cpp | 1 + libc/src/__support/RPC/rpc.h | 52 +++++++++++++++++++----------------- libc/src/__support/RPC/rpc_util.h | 11 +++++--- libc/utils/gpu/loader/Server.h | 1 + 4 files changed, 38 insertions(+), 27 deletions(-) diff --git a/libc/src/__support/OSUtil/gpu/io.cpp b/libc/src/__support/OSUtil/gpu/io.cpp index 995a973..0d8c9ac 100644 --- a/libc/src/__support/OSUtil/gpu/io.cpp +++ b/libc/src/__support/OSUtil/gpu/io.cpp @@ -17,6 +17,7 @@ namespace __llvm_libc { void write_to_stderr(cpp::string_view msg) { rpc::Client::Port port = rpc::client.open(); port.send_n(msg.data(), msg.size()); + port.recv([](rpc::Buffer *) { /* void */ }); port.close(); } diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h index bc8c05b..836cdbe 100644 --- a/libc/src/__support/RPC/rpc.h +++ b/libc/src/__support/RPC/rpc.h @@ -417,44 +417,44 @@ LIBC_INLINE void Port::recv_and_send(W work) { send([](Buffer *) { /* no-op */ }); } +/// Helper routine to simplify the interface when sending from the GPU using +/// thread private pointers to the underlying value. 
+template +LIBC_INLINE void Port::send_n(const void *src, uint64_t size) { + static_assert(is_process_gpu(), "Only valid when running on the GPU"); + const void **src_ptr = &src; + uint64_t *size_ptr = &size; + send_n(src_ptr, size_ptr); +} + /// Sends an arbitrarily sized data buffer \p src across the shared channel in /// multiples of the packet length. template LIBC_INLINE void Port::send_n(const void *const *src, uint64_t *size) { - // TODO: We could send the first bytes in this call and potentially save an - // extra send operation. uint64_t num_sends = 0; send([&](Buffer *buffer, uint32_t id) { reinterpret_cast(buffer->data)[0] = lane_value(size, id); num_sends = is_process_gpu() ? lane_value(size, id) : max(lane_value(size, id), num_sends); + uint64_t len = + lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t) + ? sizeof(Buffer::data) - sizeof(uint64_t) + : lane_value(size, id); + inline_memcpy(&buffer->data[1], lane_value(src, id), len); }); - uint64_t idx = 0; - uint64_t mask = process.get_packet(index).header.mask; - while (gpu::ballot(mask, idx < num_sends)) { + uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t); + while (gpu::ballot(process.get_packet(index).header.mask, idx < num_sends)) { send([=](Buffer *buffer, uint32_t id) { - const uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data) - ? sizeof(Buffer::data) - : lane_value(size, id) - idx; + uint64_t len = lane_value(size, id) - idx > sizeof(Buffer::data) + ? sizeof(Buffer::data) + : lane_value(size, id) - idx; if (idx < lane_value(size, id)) - inline_memcpy( - buffer->data, - reinterpret_cast(lane_value(src, id)) + idx, len); + inline_memcpy(buffer->data, advance(lane_value(src, id), idx), len); }); idx += sizeof(Buffer::data); } } -/// Helper routine to simplify the interface when sending from the GPU using -/// thread private pointers to the underlying value. 
-template -LIBC_INLINE void Port::send_n(const void *src, uint64_t size) { - static_assert(is_process_gpu(), "Only valid when running on the GPU"); - const void **src_ptr = &src; - uint64_t *size_ptr = &size; - send_n(src_ptr, size_ptr); -} - /// Receives an arbitrarily sized data buffer across the shared channel in /// multiples of the packet length. The \p alloc function is called with the /// size of the data so that we can initialize the size of the \p dst buffer. @@ -468,8 +468,13 @@ LIBC_INLINE void Port::recv_n(void **dst, uint64_t *size, A &&alloc) { reinterpret_cast(alloc(lane_value(size, id))); num_recvs = is_process_gpu() ? lane_value(size, id) : max(lane_value(size, id), num_recvs); + uint64_t len = + lane_value(size, id) > sizeof(Buffer::data) - sizeof(uint64_t) + ? sizeof(Buffer::data) - sizeof(uint64_t) + : lane_value(size, id); + inline_memcpy(lane_value(dst, id), &buffer->data[1], len); }); - uint64_t idx = 0; + uint64_t idx = sizeof(Buffer::data) - sizeof(uint64_t); uint64_t mask = process.get_packet(index).header.mask; while (gpu::ballot(mask, idx < num_recvs)) { recv([=](Buffer *buffer, uint32_t id) { @@ -477,8 +482,7 @@ LIBC_INLINE void Port::recv_n(void **dst, uint64_t *size, A &&alloc) { ? 
sizeof(Buffer::data) : lane_value(size, id) - idx; if (idx < lane_value(size, id)) - inline_memcpy(reinterpret_cast(lane_value(dst, id)) + idx, - buffer->data, len); + inline_memcpy(advance(lane_value(dst, id), idx), buffer->data, len); }); idx += sizeof(Buffer::data); } diff --git a/libc/src/__support/RPC/rpc_util.h b/libc/src/__support/RPC/rpc_util.h index b9ffdaa..67a509c 100644 --- a/libc/src/__support/RPC/rpc_util.h +++ b/libc/src/__support/RPC/rpc_util.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H #define LLVM_LIBC_SRC_SUPPORT_RPC_RPC_UTILS_H +#include "src/__support/CPP/type_traits.h" #include "src/__support/GPU/utils.h" #include "src/__support/macros/attributes.h" #include "src/__support/macros/properties/architectures.h" @@ -69,9 +70,13 @@ template LIBC_INLINE const T &max(const T &x, const T &y) { return x < y ? y : x; } -/// Advance the \p ptr by \p bytes. -template LIBC_INLINE T *advance(T ptr, U bytes) { - return reinterpret_cast(reinterpret_cast(ptr) + bytes); +/// Advance the \p p by \p bytes. +template LIBC_INLINE T *advance(T *ptr, U bytes) { + if constexpr (cpp::is_const_v) + return reinterpret_cast(reinterpret_cast(ptr) + + bytes); + else + return reinterpret_cast(reinterpret_cast(ptr) + bytes); } } // namespace rpc diff --git a/libc/utils/gpu/loader/Server.h b/libc/utils/gpu/loader/Server.h index 2e9fdfd..f4e39af 100644 --- a/libc/utils/gpu/loader/Server.h +++ b/libc/utils/gpu/loader/Server.h @@ -35,6 +35,7 @@ void handle_server() { uint64_t sizes[rpc::MAX_LANE_SIZE] = {0}; void *strs[rpc::MAX_LANE_SIZE] = {nullptr}; port->recv_n(strs, sizes, [&](uint64_t size) { return new char[size]; }); + port->send([](rpc::Buffer *) { /* void */ }); for (uint64_t i = 0; i < rpc::MAX_LANE_SIZE; ++i) { if (strs[i]) { fwrite(strs[i], sizes[i], 1, stderr); -- 2.7.4