From 583bbf0624dfd8fc45f1049be1d4980be59451ff Mon Sep 17 00:00:00 2001 From: Luke Hsiao Date: Fri, 21 Aug 2020 21:41:04 -0700 Subject: [PATCH] io_uring: allow tcp ancillary data for __sys_recvmsg_sock() For TCP tx zero-copy, the kernel notifies the process of completions by queuing completion notifications on the socket error queue. This patch allows reading these notifications via recvmsg to support TCP tx zero-copy. Ancillary data was originally disallowed due to privilege escalation via io_uring's offloading of sendmsg() onto a kernel thread with kernel credentials (https://crbug.com/project-zero/1975). So, we must ensure that the socket type is one where the ancillary data types that are delivered on recvmsg are plain data (no file descriptors or values that are translated based on the identity of the calling process). This was tested by using io_uring to call recvmsg on the MSG_ERRQUEUE with tx zero-copy enabled. Before this patch, we received -EINVAL from this specific code path. After this patch, we could read TCP tx zero-copy completion notifications from the MSG_ERRQUEUE. Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Arjun Roy Acked-by: Eric Dumazet Reviewed-by: Jann Horn Reviewed-by: Jens Axboe Signed-off-by: Luke Hsiao Signed-off-by: David S. 
Miller --- include/linux/net.h | 3 +++ net/ipv4/af_inet.c | 1 + net/ipv6/af_inet6.c | 1 + net/socket.c | 8 +++++--- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/include/linux/net.h b/include/linux/net.h index d48ff11..7657c64 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -41,6 +41,8 @@ struct net; #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 +#define PROTO_CMSG_DATA_ONLY 0x0001 + #ifndef ARCH_HAS_SOCKET_TYPES /** * enum sock_type - Socket types @@ -135,6 +137,7 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, struct proto_ops { int family; + unsigned int flags; struct module *owner; int (*release) (struct socket *sock); int (*bind) (struct socket *sock, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4307503..b7260c8 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1017,6 +1017,7 @@ static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon const struct proto_ops inet_stream_ops = { .family = PF_INET, + .flags = PROTO_CMSG_DATA_ONLY, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 0306509..d9a1493 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -661,6 +661,7 @@ int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, const struct proto_ops inet6_stream_ops = { .family = PF_INET6, + .flags = PROTO_CMSG_DATA_ONLY, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, diff --git a/net/socket.c b/net/socket.c index dbbe8ea..e84a8e2 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2628,9 +2628,11 @@ long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, struct user_msghdr __user *umsg, struct sockaddr __user *uaddr, unsigned int flags) { - /* disallow ancillary data requests from this path */ - if (msg->msg_control || msg->msg_controllen) - return -EINVAL; + if (msg->msg_control || msg->msg_controllen) { + /* disallow ancillary data reqs 
unless cmsg is plain data */ + if (!(sock->ops->flags & PROTO_CMSG_DATA_ONLY)) + return -EINVAL; + } return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0); } -- 2.7.4