From 493108d95f1464ccd101d4e5cfa7e93f1fc64d47 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 21 Sep 2022 12:17:54 +0100 Subject: [PATCH] io_uring/net: zerocopy sendmsg Add a zerocopy version of sendmsg. Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/6aabc4bdfc0ec78df6ec9328137e394af9d4e7ef.1663668091.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- include/uapi/linux/io_uring.h | 1 + io_uring/net.c | 91 ++++++++++++++++++++++++++++++++++++++++--- io_uring/net.h | 1 + io_uring/opdef.c | 19 +++++++++ 4 files changed, 107 insertions(+), 5 deletions(-) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 972b179..92f29d9 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -213,6 +213,7 @@ enum io_uring_op { IORING_OP_SOCKET, IORING_OP_URING_CMD, IORING_OP_SEND_ZC, + IORING_OP_SENDMSG_ZC, /* this goes last, obviously */ IORING_OP_LAST, diff --git a/io_uring/net.c b/io_uring/net.c index 209bc69..757a300 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -909,7 +909,12 @@ out_free: void io_send_zc_cleanup(struct io_kiocb *req) { struct io_sr_msg *zc = io_kiocb_to_cmd(req, struct io_sr_msg); + struct io_async_msghdr *io; + if (req_has_async_data(req)) { + io = req->async_data; + kfree(io->free_iov); + } zc->notif->flags |= REQ_F_CQE_SKIP; io_notif_flush(zc->notif); zc->notif = NULL; @@ -921,8 +926,7 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *notif; - if (READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3) || - READ_ONCE(sqe->__pad3[0])) + if (unlikely(READ_ONCE(sqe->__pad2[0]) || READ_ONCE(sqe->addr3))) return -EINVAL; /* we don't support IOSQE_CQE_SKIP_SUCCESS just yet */ if (req->flags & REQ_F_CQE_SKIP) @@ -949,14 +953,24 @@ int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) io_req_set_rsrc_node(notif, ctx, 0); } + if (req->opcode == IORING_OP_SEND_ZC) { + if (READ_ONCE(sqe->__pad3[0])) + return -EINVAL; + zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + zc->addr_len = READ_ONCE(sqe->addr_len); + } else { + if (unlikely(sqe->addr2 || sqe->file_index)) + return -EINVAL; + if (unlikely(zc->flags & IORING_RECVSEND_FIXED_BUF)) + return -EINVAL; + } + zc->buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); zc->len = READ_ONCE(sqe->len); zc->msg_flags = READ_ONCE(sqe->msg_flags) | MSG_NOSIGNAL; if (zc->msg_flags & MSG_DONTWAIT) req->flags |= REQ_F_NOWAIT; - zc->addr = u64_to_user_ptr(READ_ONCE(sqe->addr2)); - zc->addr_len = READ_ONCE(sqe->addr_len); zc->done_io = 0; #ifdef CONFIG_COMPAT @@ -1118,6 +1132,73 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) return IOU_OK; } +int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags) +{ + struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); + struct io_async_msghdr iomsg, *kmsg; + struct socket *sock; + unsigned flags, cflags; + int ret, min_ret = 0; + + sock = sock_from_file(req->file); + if (unlikely(!sock)) + return -ENOTSOCK; + + if (req_has_async_data(req)) { + kmsg = req->async_data; + } else { + ret = io_sendmsg_copy_hdr(req, &iomsg); + if (ret) + return ret; + kmsg = &iomsg; + } + + if (!(req->flags & REQ_F_POLLED) && + (sr->flags & IORING_RECVSEND_POLL_FIRST)) + return io_setup_async_msg(req, kmsg, issue_flags); + + flags = sr->msg_flags | MSG_ZEROCOPY; + if (issue_flags & IO_URING_F_NONBLOCK) + flags |= MSG_DONTWAIT; + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&kmsg->msg.msg_iter); + + kmsg->msg.msg_ubuf = &io_notif_to_data(sr->notif)->uarg; + kmsg->msg.sg_from_iter = io_sg_from_iter_iovec; + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); + + if (unlikely(ret < min_ret)) { + if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) + return io_setup_async_msg(req, kmsg, issue_flags); + + if (ret > 0 && io_net_retry(sock, flags)) { + sr->done_io += ret; + req->flags |= REQ_F_PARTIAL_IO; + return io_setup_async_msg(req, kmsg, issue_flags); + } + if (ret < 0 && !sr->done_io) + sr->notif->flags |= REQ_F_CQE_SKIP; + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail(req); + } + /* fast path, check for non-NULL to avoid function call */ + if (kmsg->free_iov) + kfree(kmsg->free_iov); + + io_netmsg_recycle(req, issue_flags); + if (ret >= 0) + ret += sr->done_io; + else if (sr->done_io) + ret = sr->done_io; + + io_notif_flush(sr->notif); + req->flags &= ~REQ_F_NEED_CLEANUP; + cflags = ret >= 0 ? IORING_CQE_F_MORE : 0; + io_req_set_res(req, ret, cflags); + return IOU_OK; +} + void io_sendrecv_fail(struct io_kiocb *req) { struct io_sr_msg *sr = io_kiocb_to_cmd(req, struct io_sr_msg); @@ -1127,7 +1208,7 @@ void io_sendrecv_fail(struct io_kiocb *req) if (req->flags & REQ_F_PARTIAL_IO) res = sr->done_io; if ((req->flags & REQ_F_NEED_CLEANUP) && - req->opcode == IORING_OP_SEND_ZC) { + (req->opcode == IORING_OP_SEND_ZC || req->opcode == IORING_OP_SENDMSG_ZC)) { /* preserve notification for partial I/O */ if (res < 0) sr->notif->flags |= REQ_F_CQE_SKIP; diff --git a/io_uring/net.h b/io_uring/net.h index 45558e2..5ffa11b 100644 --- a/io_uring/net.h +++ b/io_uring/net.h @@ -57,6 +57,7 @@ int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); int io_connect(struct io_kiocb *req, unsigned int issue_flags); int io_send_zc(struct io_kiocb *req, unsigned int issue_flags); +int io_sendmsg_zc(struct io_kiocb *req, unsigned int issue_flags); int io_send_zc_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe); void io_send_zc_cleanup(struct io_kiocb *req); diff --git a/io_uring/opdef.c b/io_uring/opdef.c index 0fdeb1b..2330f6d 100644 --- a/io_uring/opdef.c +++ b/io_uring/opdef.c @@ -505,6 +505,25 @@ const struct io_op_def io_op_defs[] = { .prep = io_eopnotsupp_prep, #endif }, + [IORING_OP_SENDMSG_ZC] = { + .name = "SENDMSG_ZC", + .needs_file = 1, + .unbound_nonreg_file = 1, + .pollout = 1, + .audit_skip = 1, + .ioprio = 1, + .manual_alloc = 1, +#if defined(CONFIG_NET) + .async_size = sizeof(struct io_async_msghdr), + .prep = io_send_zc_prep, + .issue = io_sendmsg_zc, + .prep_async = io_sendmsg_prep_async, + .cleanup = io_send_zc_cleanup, + .fail = io_sendrecv_fail, +#else + .prep = io_eopnotsupp_prep, +#endif + }, }; const char *io_uring_get_opcode(u8 opcode) -- 2.7.4