tcp: Migrate TCP_ESTABLISHED/TCP_SYN_RECV sockets in accept queues.
author Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Sat, 12 Jun 2021 12:32:18 +0000 (21:32 +0900)
committer Daniel Borkmann <daniel@iogearbox.net>
Tue, 15 Jun 2021 16:01:05 +0000 (18:01 +0200)
When we call close() or shutdown() on a listening socket, each child socket
in its accept queue is freed in inet_csk_listen_stop(). If we can get a
new listener with reuseport_migrate_sock() and clone the request with
inet_reqsk_clone(), we try to add the clone to the new listener's accept
queue with inet_csk_reqsk_queue_add(). If that fails, we have to call
__reqsk_free() to call sock_put() for its listener and free the cloned request.

After putting the full socket into ehash, tcp_v[46]_syn_recv_sock() sets
ireq_opt/pktopts in struct inet_request_sock to NULL, but ipv6_opt can still
be non-NULL. So, we have to set ipv6_opt of the old request to NULL to avoid
a double free.

Note that we do not update req->rsk_listener and instead clone the req to
migrate because another path may reference the original request. If we
protected it by RCU, we would need to add rcu_read_lock() in many places.

Suggested-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/netdev/20201209030903.hhow5r53l6fmozjn@kafai-mbp.dhcp.thefacebook.com/
Link: https://lore.kernel.org/bpf/20210612123224.12525-6-kuniyu@amazon.co.jp
net/ipv4/inet_connection_sock.c

index fa806e9167ecacf22dde76c3e4eb3c19196765b9..08878ef1bc7037f5d7e8aa3d4bb0d406777c863e 100644 (file)
@@ -695,6 +695,52 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
 }
 EXPORT_SYMBOL(inet_rtx_syn_ack);
 
+static struct request_sock *inet_reqsk_clone(struct request_sock *req,
+                                            struct sock *sk)
+{
+       struct sock *req_sk, *nreq_sk;
+       struct request_sock *nreq;
+
+       nreq = kmem_cache_alloc(req->rsk_ops->slab, GFP_ATOMIC | __GFP_NOWARN);
+       if (!nreq) {
+               /* paired with refcount_inc_not_zero() in reuseport_migrate_sock() */
+               sock_put(sk);
+               return NULL;
+       }
+
+       req_sk = req_to_sk(req);
+       nreq_sk = req_to_sk(nreq);
+
+       memcpy(nreq_sk, req_sk,
+              offsetof(struct sock, sk_dontcopy_begin));
+       memcpy(&nreq_sk->sk_dontcopy_end, &req_sk->sk_dontcopy_end,
+              req->rsk_ops->obj_size - offsetof(struct sock, sk_dontcopy_end));
+
+       sk_node_init(&nreq_sk->sk_node);
+       nreq_sk->sk_tx_queue_mapping = req_sk->sk_tx_queue_mapping;
+#ifdef CONFIG_XPS
+       nreq_sk->sk_rx_queue_mapping = req_sk->sk_rx_queue_mapping;
+#endif
+       nreq_sk->sk_incoming_cpu = req_sk->sk_incoming_cpu;
+
+       nreq->rsk_listener = sk;
+
+       /* We need not acquire fastopenq->lock
+        * because the child socket is locked in inet_csk_listen_stop().
+        */
+       if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(nreq)->tfo_listener)
+               rcu_assign_pointer(tcp_sk(nreq->sk)->fastopen_rsk, nreq);
+
+       return nreq;
+}
+
+static void reqsk_migrate_reset(struct request_sock *req)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+       inet_rsk(req)->ipv6_opt = NULL;
+#endif
+}
+
 /* return true if req was found in the ehash table */
 static bool reqsk_queue_unlink(struct request_sock *req)
 {
@@ -1036,14 +1082,36 @@ void inet_csk_listen_stop(struct sock *sk)
         * of the variants now.                 --ANK
         */
        while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
-               struct sock *child = req->sk;
+               struct sock *child = req->sk, *nsk;
+               struct request_sock *nreq;
 
                local_bh_disable();
                bh_lock_sock(child);
                WARN_ON(sock_owned_by_user(child));
                sock_hold(child);
 
+               nsk = reuseport_migrate_sock(sk, child, NULL);
+               if (nsk) {
+                       nreq = inet_reqsk_clone(req, nsk);
+                       if (nreq) {
+                               refcount_set(&nreq->rsk_refcnt, 1);
+
+                               if (inet_csk_reqsk_queue_add(nsk, nreq, child)) {
+                                       reqsk_migrate_reset(req);
+                               } else {
+                                       reqsk_migrate_reset(nreq);
+                                       __reqsk_free(nreq);
+                               }
+
+                               /* inet_csk_reqsk_queue_add() has already
+                                * called inet_child_forget() on failure case.
+                                */
+                               goto skip_child_forget;
+                       }
+               }
+
                inet_child_forget(sk, req, child);
+skip_child_forget:
                reqsk_put(req);
                bh_unlock_sock(child);
                local_bh_enable();