tcp: Convert do_tcp_sendpages() to use MSG_SPLICE_PAGES
author David Howells <dhowells@redhat.com>
Mon, 22 May 2023 12:11:14 +0000 (13:11 +0100)
committer Jakub Kicinski <kuba@kernel.org>
Wed, 24 May 2023 03:48:27 +0000 (20:48 -0700)
Convert do_tcp_sendpages() to use sendmsg() with MSG_SPLICE_PAGES rather
than directly splicing in the pages itself.  do_tcp_sendpages() can then be
inlined in subsequent patches into its callers.

This allows ->sendpage() to be replaced by something that can handle
multiple multipage folios in a single transaction.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: David Ahern <dsahern@kernel.org>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
net/ipv4/tcp.c

index 2d61150..f3a0c02 100644 (file)
@@ -974,163 +974,19 @@ static int tcp_wmem_schedule(struct sock *sk, int copy)
        return min(copy, sk->sk_forward_alloc);
 }
 
-static struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
-                                     struct page *page, int offset, size_t *size)
-{
-       struct sk_buff *skb = tcp_write_queue_tail(sk);
-       struct tcp_sock *tp = tcp_sk(sk);
-       bool can_coalesce;
-       int copy, i;
-
-       if (!skb || (copy = size_goal - skb->len) <= 0 ||
-           !tcp_skb_can_collapse_to(skb)) {
-new_segment:
-               if (!sk_stream_memory_free(sk))
-                       return NULL;
-
-               skb = tcp_stream_alloc_skb(sk, 0, sk->sk_allocation,
-                                          tcp_rtx_and_write_queues_empty(sk));
-               if (!skb)
-                       return NULL;
-
-#ifdef CONFIG_TLS_DEVICE
-               skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
-#endif
-               tcp_skb_entail(sk, skb);
-               copy = size_goal;
-       }
-
-       if (copy > *size)
-               copy = *size;
-
-       i = skb_shinfo(skb)->nr_frags;
-       can_coalesce = skb_can_coalesce(skb, i, page, offset);
-       if (!can_coalesce && i >= READ_ONCE(sysctl_max_skb_frags)) {
-               tcp_mark_push(tp, skb);
-               goto new_segment;
-       }
-       if (tcp_downgrade_zcopy_pure(sk, skb))
-               return NULL;
-
-       copy = tcp_wmem_schedule(sk, copy);
-       if (!copy)
-               return NULL;
-
-       if (can_coalesce) {
-               skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
-       } else {
-               get_page(page);
-               skb_fill_page_desc_noacc(skb, i, page, offset, copy);
-       }
-
-       if (!(flags & MSG_NO_SHARED_FRAGS))
-               skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
-
-       skb->len += copy;
-       skb->data_len += copy;
-       skb->truesize += copy;
-       sk_wmem_queued_add(sk, copy);
-       sk_mem_charge(sk, copy);
-       WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
-       TCP_SKB_CB(skb)->end_seq += copy;
-       tcp_skb_pcount_set(skb, 0);
-
-       *size = copy;
-       return skb;
-}
-
 ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
                         size_t size, int flags)
 {
-       struct tcp_sock *tp = tcp_sk(sk);
-       int mss_now, size_goal;
-       int err;
-       ssize_t copied;
-       long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
-
-       if (IS_ENABLED(CONFIG_DEBUG_VM) &&
-           WARN_ONCE(!sendpage_ok(page),
-                     "page must not be a Slab one and have page_count > 0"))
-               return -EINVAL;
-
-       /* Wait for a connection to finish. One exception is TCP Fast Open
-        * (passive side) where data is allowed to be sent before a connection
-        * is fully established.
-        */
-       if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
-           !tcp_passive_fastopen(sk)) {
-               err = sk_stream_wait_connect(sk, &timeo);
-               if (err != 0)
-                       goto out_err;
-       }
+       struct bio_vec bvec;
+       struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES, };
 
-       sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+       bvec_set_page(&bvec, page, size, offset);
+       iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
 
-       mss_now = tcp_send_mss(sk, &size_goal, flags);
-       copied = 0;
+       if (flags & MSG_SENDPAGE_NOTLAST)
+               msg.msg_flags |= MSG_MORE;
 
-       err = -EPIPE;
-       if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
-               goto out_err;
-
-       while (size > 0) {
-               struct sk_buff *skb;
-               size_t copy = size;
-
-               skb = tcp_build_frag(sk, size_goal, flags, page, offset, &copy);
-               if (!skb)
-                       goto wait_for_space;
-
-               if (!copied)
-                       TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
-
-               copied += copy;
-               offset += copy;
-               size -= copy;
-               if (!size)
-                       goto out;
-
-               if (skb->len < size_goal || (flags & MSG_OOB))
-                       continue;
-
-               if (forced_push(tp)) {
-                       tcp_mark_push(tp, skb);
-                       __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH);
-               } else if (skb == tcp_send_head(sk))
-                       tcp_push_one(sk, mss_now);
-               continue;
-
-wait_for_space:
-               set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-               tcp_push(sk, flags & ~MSG_MORE, mss_now,
-                        TCP_NAGLE_PUSH, size_goal);
-
-               err = sk_stream_wait_memory(sk, &timeo);
-               if (err != 0)
-                       goto do_error;
-
-               mss_now = tcp_send_mss(sk, &size_goal, flags);
-       }
-
-out:
-       if (copied) {
-               tcp_tx_timestamp(sk, sk->sk_tsflags);
-               if (!(flags & MSG_SENDPAGE_NOTLAST))
-                       tcp_push(sk, flags, mss_now, tp->nonagle, size_goal);
-       }
-       return copied;
-
-do_error:
-       tcp_remove_empty_skb(sk);
-       if (copied)
-               goto out;
-out_err:
-       /* make sure we wake any epoll edge trigger waiter */
-       if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
-               sk->sk_write_space(sk);
-               tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
-       }
-       return sk_stream_error(sk, flags, err);
+       return tcp_sendmsg_locked(sk, &msg, size);
 }
 EXPORT_SYMBOL_GPL(do_tcp_sendpages);