tcp: Support MSG_SPLICE_PAGES
Author:     David Howells <dhowells@redhat.com>
AuthorDate: Mon, 22 May 2023 12:11:13 +0000 (13:11 +0100)
Commit:     Jakub Kicinski <kuba@kernel.org>
CommitDate: Wed, 24 May 2023 03:48:27 +0000 (20:48 -0700)
Make TCP's sendmsg() support MSG_SPLICE_PAGES.  This causes pages to be
spliced from the source iterator, or copied if they cannot be spliced.
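
As a minimal sketch of how an in-kernel caller might drive this
(illustrative only, not part of this patch; example_splice_page() is a
made-up name, and the caller is assumed to hold a reference on the page
across the call):

    #include <linux/bvec.h>
    #include <linux/net.h>
    #include <linux/socket.h>
    #include <linux/uio.h>

    /* Sketch: send one page over a connected TCP socket, asking the
     * stack to splice the page into the write queue rather than copy
     * its contents.
     */
    static int example_splice_page(struct socket *sock, struct page *page,
                                   unsigned int offset, unsigned int len)
    {
            struct bio_vec bvec;
            struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES };

            bvec_set_page(&bvec, page, len, offset);
            iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);

            /* The page is spliced if the route supports scatter/gather
             * (NETIF_F_SG) and the data is copied otherwise.
             */
            return sock_sendmsg(sock, &msg);
    }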

This allows ->sendpage() to be replaced by something that can handle
multiple multipage folios in a single transaction.
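
For instance, a caller that previously looped calling kernel_sendpage()
once per page could describe a run of pages, including whole multipage
folios, as a bio_vec array and submit them in one call (again a
hypothetical caller; bvecs, nr and total come from wherever the data
lives):

    /* Sketch: what used to be N sendpage() calls becomes a single
     * sendmsg() transaction over a bio_vec array.
     */
    static int example_splice_folios(struct socket *sock,
                                     struct bio_vec *bvecs,
                                     unsigned int nr, size_t total)
    {
            struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES };

            iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bvecs, nr, total);
            return sock_sendmsg(sock, &msg);
    }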

Signed-off-by: David Howells <dhowells@redhat.com>
cc: David Ahern <dsahern@kernel.org>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3d18e29..2d61150 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1223,7 +1223,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
        int flags, err, copied = 0;
        int mss_now = 0, size_goal, copied_syn = 0;
        int process_backlog = 0;
-       bool zc = false;
+       int zc = 0;
        long timeo;
 
        flags = msg->msg_flags;
@@ -1231,7 +1231,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
        if ((flags & MSG_ZEROCOPY) && size) {
                if (msg->msg_ubuf) {
                        uarg = msg->msg_ubuf;
-                       zc = sk->sk_route_caps & NETIF_F_SG;
+                       if (sk->sk_route_caps & NETIF_F_SG)
+                               zc = MSG_ZEROCOPY;
                } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
                        skb = tcp_write_queue_tail(sk);
                        uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb));
@@ -1239,10 +1240,14 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
                                err = -ENOBUFS;
                                goto out_err;
                        }
-                       zc = sk->sk_route_caps & NETIF_F_SG;
-                       if (!zc)
+                       if (sk->sk_route_caps & NETIF_F_SG)
+                               zc = MSG_ZEROCOPY;
+                       else
                                uarg_to_msgzc(uarg)->zerocopy = 0;
                }
+       } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) {
+               if (sk->sk_route_caps & NETIF_F_SG)
+                       zc = MSG_SPLICE_PAGES;
        }
 
        if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect) &&
@@ -1305,7 +1310,7 @@ restart:
                goto do_error;
 
        while (msg_data_left(msg)) {
-               int copy = 0;
+               ssize_t copy = 0;
 
                skb = tcp_write_queue_tail(sk);
                if (skb)
@@ -1346,7 +1351,7 @@ new_segment:
                if (copy > msg_data_left(msg))
                        copy = msg_data_left(msg);
 
-               if (!zc) {
+               if (zc == 0) {
                        bool merge = true;
                        int i = skb_shinfo(skb)->nr_frags;
                        struct page_frag *pfrag = sk_page_frag(sk);
@@ -1391,7 +1396,7 @@ new_segment:
                                page_ref_inc(pfrag->page);
                        }
                        pfrag->offset += copy;
-               } else {
+               } else if (zc == MSG_ZEROCOPY) {
                        /* First append to a fragless skb builds initial
                         * pure zerocopy skb
                         */
@@ -1412,6 +1417,30 @@ new_segment:
                        if (err < 0)
                                goto do_error;
                        copy = err;
+               } else if (zc == MSG_SPLICE_PAGES) {
+                       /* Splice in data if we can; copy if we can't. */
+                       if (tcp_downgrade_zcopy_pure(sk, skb))
+                               goto wait_for_space;
+                       copy = tcp_wmem_schedule(sk, copy);
+                       if (!copy)
+                               goto wait_for_space;
+
+                       err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
+                                                  sk->sk_allocation);
+                       if (err < 0) {
+                               if (err == -EMSGSIZE) {
+                                       tcp_mark_push(tp, skb);
+                                       goto new_segment;
+                               }
+                               goto do_error;
+                       }
+                       copy = err;
+
+                       if (!(flags & MSG_NO_SHARED_FRAGS))
+                               skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
+
+                       sk_wmem_queued_add(sk, copy);
+                       sk_mem_charge(sk, copy);
                }
 
                if (!copied)
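
A note on the SKBFL_SHARED_FRAG handling in the hunk above: spliced pages
can still be written to by their owner while the data sits in the write
queue, so the frags are marked shared by default; paths that must read the
payload in software (e.g. the checksum fallback) then copy or linearize
first.  A caller that guarantees the page contents will not change while
the skb is in flight can opt out by also passing MSG_NO_SHARED_FRAGS
(fragment only, for illustration):

    struct msghdr msg = {
            .msg_flags = MSG_SPLICE_PAGES | MSG_NO_SHARED_FRAGS,
    };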