net/mptcp/subflow.c
1 // SPDX-License-Identifier: GPL-2.0
2 /* Multipath TCP
3  *
4  * Copyright (c) 2017 - 2019, Intel Corporation.
5  */
6
7 #define pr_fmt(fmt) "MPTCP: " fmt
8
9 #include <linux/kernel.h>
10 #include <linux/module.h>
11 #include <linux/netdevice.h>
12 #include <net/sock.h>
13 #include <net/inet_common.h>
14 #include <net/inet_hashtables.h>
15 #include <net/protocol.h>
16 #include <net/tcp.h>
17 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
18 #include <net/ip6_route.h>
19 #endif
20 #include <net/mptcp.h>
21 #include "protocol.h"
22
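/* Hooked in place of icsk_af_ops->rebuild_header: allocate the local MPTCP
 * token for an outgoing MP_CAPABLE connection the first time the header is
 * rebuilt, then defer to the original af_ops.
 */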
23 static int subflow_rebuild_header(struct sock *sk)
24 {
25         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
26         int err = 0;
27
28         if (subflow->request_mptcp && !subflow->token) {
29                 pr_debug("subflow=%p", sk);
30                 err = mptcp_token_new_connect(sk);
31         }
32
33         if (err)
34                 return err;
35
36         return subflow->icsk_af_ops->rebuild_header(sk);
37 }
38
39 static void subflow_req_destructor(struct request_sock *req)
40 {
41         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
42
43         pr_debug("subflow_req=%p", subflow_req);
44
45         if (subflow_req->mp_capable)
46                 mptcp_token_destroy_request(subflow_req->token);
47         tcp_request_sock_ops.destructor(req);
48 }
49
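/* Shared request_sock init for v4/v6: parse the MPTCP options carried by the
 * SYN and, if both the listener and the peer are MP_CAPABLE, allocate a local
 * token for the request and record the initial subflow sequence offset. On
 * failure the request silently falls back to plain TCP.
 */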
50 static void subflow_init_req(struct request_sock *req,
51                              const struct sock *sk_listener,
52                              struct sk_buff *skb)
53 {
54         struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
55         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
56         struct tcp_options_received rx_opt;
57
58         pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);
59
60         memset(&rx_opt.mptcp, 0, sizeof(rx_opt.mptcp));
61         mptcp_get_options(skb, &rx_opt);
62
63         subflow_req->mp_capable = 0;
64         subflow_req->remote_key_valid = 0;
65
66 #ifdef CONFIG_TCP_MD5SIG
67         /* no MPTCP if MD5SIG is enabled on this socket, as we may otherwise
68          * run out of TCP option space.
69          */
70         if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
71                 return;
72 #endif
73
74         if (rx_opt.mptcp.mp_capable && listener->request_mptcp) {
75                 int err;
76
77                 err = mptcp_token_new_request(req);
78                 if (err == 0)
79                         subflow_req->mp_capable = 1;
80
81                 subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
82         }
83 }
84
85 static void subflow_v4_init_req(struct request_sock *req,
86                                 const struct sock *sk_listener,
87                                 struct sk_buff *skb)
88 {
89         tcp_rsk(req)->is_mptcp = 1;
90
91         tcp_request_sock_ipv4_ops.init_req(req, sk_listener, skb);
92
93         subflow_init_req(req, sk_listener, skb);
94 }
95
96 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
97 static void subflow_v6_init_req(struct request_sock *req,
98                                 const struct sock *sk_listener,
99                                 struct sk_buff *skb)
100 {
101         tcp_rsk(req)->is_mptcp = 1;
102
103         tcp_request_sock_ipv6_ops.init_req(req, sk_listener, skb);
104
105         subflow_init_req(req, sk_listener, skb);
106 }
107 #endif
108
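/* Installed as sk_rx_dst_set: set the rx dst via the original af_ops and, the
 * first time it runs on a connecting subflow, complete the MPTCP-level
 * connect and record the SYN-ACK sequence number as the ssn_offset.
 */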
109 static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
110 {
111         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
112
113         subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);
114
115         if (subflow->conn && !subflow->conn_finished) {
116                 pr_debug("subflow=%p, remote_key=%llu", mptcp_subflow_ctx(sk),
117                          subflow->remote_key);
118                 mptcp_finish_connect(sk);
119                 subflow->conn_finished = 1;
120
121                 if (skb) {
122                         pr_debug("synack seq=%u", TCP_SKB_CB(skb)->seq);
123                         subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
124                 }
125         }
126 }
127
128 static struct request_sock_ops subflow_request_sock_ops;
129 static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops;
130
131 static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
132 {
133         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
134
135         pr_debug("subflow=%p", subflow);
136
137         /* Never answer SYNs sent to broadcast or multicast addresses */
138         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
139                 goto drop;
140
141         return tcp_conn_request(&subflow_request_sock_ops,
142                                 &subflow_request_sock_ipv4_ops,
143                                 sk, skb);
144 drop:
145         tcp_listendrop(sk);
146         return 0;
147 }
148
149 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
150 static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops;
151 static struct inet_connection_sock_af_ops subflow_v6_specific;
152 static struct inet_connection_sock_af_ops subflow_v6m_specific;
153
154 static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
155 {
156         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
157
158         pr_debug("subflow=%p", subflow);
159
160         if (skb->protocol == htons(ETH_P_IP))
161                 return subflow_v4_conn_request(sk, skb);
162
163         if (!ipv6_unicast_destination(skb))
164                 goto drop;
165
166         return tcp_conn_request(&subflow_request_sock_ops,
167                                 &subflow_request_sock_ipv6_ops, sk, skb);
168
169 drop:
170         tcp_listendrop(sk);
171         return 0; /* don't send reset */
172 }
173 #endif
174
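/* Typically called when the third ACK is received on a listener: fetch the
 * peer key from the MP_CAPABLE option before creating the child socket. If
 * token allocation for an accepted MP_CAPABLE child fails, the child is
 * reset and closed.
 */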
175 static struct sock *subflow_syn_recv_sock(const struct sock *sk,
176                                           struct sk_buff *skb,
177                                           struct request_sock *req,
178                                           struct dst_entry *dst,
179                                           struct request_sock *req_unhash,
180                                           bool *own_req)
181 {
182         struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
183         struct mptcp_subflow_request_sock *subflow_req;
184         struct tcp_options_received opt_rx;
185         struct sock *child;
186
187         pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);
188
189         /* if the request is MP_CAPABLE, we try to fetch the client key */
190         subflow_req = mptcp_subflow_rsk(req);
191         if (subflow_req->mp_capable) {
192                 if (TCP_SKB_CB(skb)->seq != subflow_req->ssn_offset + 1) {
193                         /* here we can receive and accept an in-window,
194                          * out-of-order pkt, which will not carry the MP_CAPABLE
195                          * opt even on mptcp enabled paths
196                          */
197                         goto create_child;
198                 }
199
200                 opt_rx.mptcp.mp_capable = 0;
201                 mptcp_get_options(skb, &opt_rx);
202                 if (opt_rx.mptcp.mp_capable) {
203                         subflow_req->remote_key = opt_rx.mptcp.sndr_key;
204                         subflow_req->remote_key_valid = 1;
205                 } else {
206                         subflow_req->mp_capable = 0;
207                 }
208         }
209
210 create_child:
211         child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
212                                                      req_unhash, own_req);
213
214         if (child && *own_req) {
215                 struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);
216
217                 /* ctx is NULL on TCP fallback; that is not fatal for the
218                  * MPC handshake
219                  */
220                 if (!ctx)
221                         return child;
222
223                 if (ctx->mp_capable) {
224                         if (mptcp_token_new_accept(ctx->token))
225                                 goto close_child;
226                 }
227         }
228
229         return child;
230
231 close_child:
232         pr_debug("closing child socket");
233         tcp_send_active_reset(child, GFP_ATOMIC);
234         inet_csk_prepare_forced_close(child);
235         tcp_done(child);
236         return NULL;
237 }
238
239 static struct inet_connection_sock_af_ops subflow_specific;
240
241 enum mapping_status {
242         MAPPING_OK,
243         MAPPING_INVALID,
244         MAPPING_EMPTY,
245         MAPPING_DATA_FIN
246 };
247
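/* Expand a 32-bit DSS sequence number to 64 bits, assuming the new mapping
 * does not cover data older than the previous one. Example (made-up values):
 * old_seq=0x00000001fffffff0, old_data_len=0x20, seq=0x00000010 expands to
 * 0x0000000200000010, i.e. the carry into the upper 32 bits is recovered
 * from the end of the previous mapping.
 */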
248 static u64 expand_seq(u64 old_seq, u16 old_data_len, u64 seq)
249 {
250         if ((u32)seq == (u32)old_seq)
251                 return old_seq;
252
253         /* Assume map covers data not mapped yet. */
254         return seq | ((old_seq + old_data_len + 1) & GENMASK_ULL(63, 32));
255 }
256
257 static void warn_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
258 {
259         WARN_ONCE(1, "Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
260                   ssn, subflow->map_subflow_seq, subflow->map_data_len);
261 }
262
263 static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
264 {
265         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
266         unsigned int skb_consumed;
267
268         skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
269         if (WARN_ON_ONCE(skb_consumed >= skb->len))
270                 return true;
271
272         return skb->len - skb_consumed <= subflow->map_data_len -
273                                           mptcp_subflow_get_map_offset(subflow);
274 }
275
276 static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
277 {
278         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
279         u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
280
281         if (unlikely(before(ssn, subflow->map_subflow_seq))) {
282                 /* Mapping covers data later in the subflow stream,
283                  * currently unsupported.
284                  */
285                 warn_bad_map(subflow, ssn);
286                 return false;
287         }
288         if (unlikely(!before(ssn, subflow->map_subflow_seq +
289                                   subflow->map_data_len))) {
290                 /* Mapping only covers past subflow data, invalid */
291                 warn_bad_map(subflow, ssn + skb->len);
292                 return false;
293         }
294         return true;
295 }
296
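/* Inspect the skb at the head of the subflow receive queue and track the DSS
 * mapping it carries: tolerate bare 0-len FINs, handle DATA_FIN, expand
 * 32-bit sequence numbers and install a new mapping when none is in
 * progress. While a mapping is being consumed only an identical map can
 * replace it; a different one is either deferred until the current map is
 * consumed or, if it would need caching, rejected as invalid.
 */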
297 static enum mapping_status get_mapping_status(struct sock *ssk)
298 {
299         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
300         struct mptcp_ext *mpext;
301         struct sk_buff *skb;
302         u16 data_len;
303         u64 map_seq;
304
305         skb = skb_peek(&ssk->sk_receive_queue);
306         if (!skb)
307                 return MAPPING_EMPTY;
308
309         mpext = mptcp_get_ext(skb);
310         if (!mpext || !mpext->use_map) {
311                 if (!subflow->map_valid && !skb->len) {
312                         /* the TCP stack delivers 0-len FIN pkts to the receive
313                          * queue; those are the only 0-len pkts ever expected
314                          * here, and a missing mapping is admitted only for them
315                          */
316                         if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
317                                 WARN_ONCE(1, "0len seq %d:%d flags %x",
318                                           TCP_SKB_CB(skb)->seq,
319                                           TCP_SKB_CB(skb)->end_seq,
320                                           TCP_SKB_CB(skb)->tcp_flags);
321                         sk_eat_skb(ssk, skb);
322                         return MAPPING_EMPTY;
323                 }
324
325                 if (!subflow->map_valid)
326                         return MAPPING_INVALID;
327
328                 goto validate_seq;
329         }
330
331         pr_debug("seq=%llu is64=%d ssn=%u data_len=%u data_fin=%d",
332                  mpext->data_seq, mpext->dsn64, mpext->subflow_seq,
333                  mpext->data_len, mpext->data_fin);
334
335         data_len = mpext->data_len;
336         if (data_len == 0) {
337                 pr_err("Infinite mapping not handled");
338                 return MAPPING_INVALID;
339         }
340
341         if (mpext->data_fin == 1) {
342                 if (data_len == 1) {
343                         pr_debug("DATA_FIN with no payload");
344                         if (subflow->map_valid) {
345                                 /* A DATA_FIN might arrive in a DSS
346                                  * option before the previous mapping
347                                  * has been fully consumed. Continue
348                                  * handling the existing mapping.
349                                  */
350                                 skb_ext_del(skb, SKB_EXT_MPTCP);
351                                 return MAPPING_OK;
352                         } else {
353                                 return MAPPING_DATA_FIN;
354                         }
355                 }
356
357                 /* Adjust for DATA_FIN using 1 byte of sequence space */
358                 data_len--;
359         }
360
361         if (!mpext->dsn64) {
362                 map_seq = expand_seq(subflow->map_seq, subflow->map_data_len,
363                                      mpext->data_seq);
364                 pr_debug("expanded seq=%llu", subflow->map_seq);
365         } else {
366                 map_seq = mpext->data_seq;
367         }
368
369         if (subflow->map_valid) {
370                 /* Allow replacing only with an identical map */
371                 if (subflow->map_seq == map_seq &&
372                     subflow->map_subflow_seq == mpext->subflow_seq &&
373                     subflow->map_data_len == data_len) {
374                         skb_ext_del(skb, SKB_EXT_MPTCP);
375                         return MAPPING_OK;
376                 }
377
378                 /* If this skb's data is fully covered by the current mapping,
379                  * the new map would need caching, which is not supported
380                  */
381                 if (skb_is_fully_mapped(ssk, skb))
382                         return MAPPING_INVALID;
383
384                 /* will validate the next map after consuming the current one */
385                 return MAPPING_OK;
386         }
387
388         subflow->map_seq = map_seq;
389         subflow->map_subflow_seq = mpext->subflow_seq;
390         subflow->map_data_len = data_len;
391         subflow->map_valid = 1;
392         subflow->mpc_map = mpext->mpc_map;
393         pr_debug("new map seq=%llu subflow_seq=%u data_len=%u",
394                  subflow->map_seq, subflow->map_subflow_seq,
395                  subflow->map_data_len);
396
397 validate_seq:
398         /* we revalidate the existing mapping on each new skb, because we must
399          * ensure the current skb is completely covered by it
400          */
401         if (!validate_mapping(ssk, skb))
402                 return MAPPING_INVALID;
403
404         skb_ext_del(skb, SKB_EXT_MPTCP);
405         return MAPPING_OK;
406 }
407
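/* Return true when the skb at the head of the receive queue carries data
 * that is in sequence at the MPTCP level. Data mapped at an unexpected DSN
 * (old retransmissions, or "future" data after an active-backup subflow
 * switch) is discarded through tcp_read_sock(); a broken mapping is a fatal
 * protocol error and resets the subflow.
 */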
408 static bool subflow_check_data_avail(struct sock *ssk)
409 {
410         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
411         enum mapping_status status;
412         struct mptcp_sock *msk;
413         struct sk_buff *skb;
414
415         pr_debug("msk=%p ssk=%p data_avail=%d skb=%p", subflow->conn, ssk,
416                  subflow->data_avail, skb_peek(&ssk->sk_receive_queue));
417         if (subflow->data_avail)
418                 return true;
419
420         if (!subflow->conn)
421                 return false;
422
423         msk = mptcp_sk(subflow->conn);
424         for (;;) {
425                 u32 map_remaining;
426                 size_t delta;
427                 u64 ack_seq;
428                 u64 old_ack;
429
430                 status = get_mapping_status(ssk);
431                 pr_debug("msk=%p ssk=%p status=%d", msk, ssk, status);
432                 if (status == MAPPING_INVALID) {
433                         ssk->sk_err = EBADMSG;
434                         goto fatal;
435                 }
436
437                 if (status != MAPPING_OK)
438                         return false;
439
440                 skb = skb_peek(&ssk->sk_receive_queue);
441                 if (WARN_ON_ONCE(!skb))
442                         return false;
443
444                 /* if msk lacks the remote key, this subflow must provide an
445                  * MP_CAPABLE-based mapping
446                  */
447                 if (unlikely(!READ_ONCE(msk->can_ack))) {
448                         if (!subflow->mpc_map) {
449                                 ssk->sk_err = EBADMSG;
450                                 goto fatal;
451                         }
452                         WRITE_ONCE(msk->remote_key, subflow->remote_key);
453                         WRITE_ONCE(msk->ack_seq, subflow->map_seq);
454                         WRITE_ONCE(msk->can_ack, true);
455                 }
456
457                 old_ack = READ_ONCE(msk->ack_seq);
458                 ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
459                 pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
460                          ack_seq);
461                 if (ack_seq == old_ack)
462                         break;
463
464                 /* only accept in-sequence mappings. Old values are spurious
465                  * retransmissions; we can hit "future" values on an active
466                  * backup subflow switch, and we rely on retransmissions to
467                  * get in-sequence data.
468                  * Concurrent subflow support will require subflow data
469                  * reordering
470                  */
471                 map_remaining = subflow->map_data_len -
472                                 mptcp_subflow_get_map_offset(subflow);
473                 if (before64(ack_seq, old_ack))
474                         delta = min_t(size_t, old_ack - ack_seq, map_remaining);
475                 else
476                         delta = min_t(size_t, ack_seq - old_ack, map_remaining);
477
478                 /* discard mapped data */
479                 pr_debug("discarding %zu bytes, current map len=%d", delta,
480                          map_remaining);
481                 if (delta) {
482                         struct mptcp_read_arg arg = {
483                                 .msg = NULL,
484                         };
485                         read_descriptor_t desc = {
486                                 .count = delta,
487                                 .arg.data = &arg,
488                         };
489                         int ret;
490
491                         ret = tcp_read_sock(ssk, &desc, mptcp_read_actor);
492                         if (ret < 0) {
493                                 ssk->sk_err = -ret;
494                                 goto fatal;
495                         }
496                         if (ret < delta)
497                                 return false;
498                         if (delta == map_remaining)
499                                 subflow->map_valid = 0;
500                 }
501         }
502         return true;
503
504 fatal:
505         /* fatal protocol error, close the socket */
506         /* This barrier is coupled with smp_rmb() in tcp_poll() */
507         smp_wmb();
508         ssk->sk_error_report(ssk);
509         tcp_set_state(ssk, TCP_CLOSE);
510         tcp_send_active_reset(ssk, GFP_ATOMIC);
511         return false;
512 }
513
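/* Entry point for the MPTCP-level receive path: retire a fully consumed
 * mapping, refresh data_avail and report whether this subflow has
 * in-sequence data ready to be read.
 */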
514 bool mptcp_subflow_data_available(struct sock *sk)
515 {
516         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
517         struct sk_buff *skb;
518
519         /* check if current mapping is still valid */
520         if (subflow->map_valid &&
521             mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
522                 subflow->map_valid = 0;
523                 subflow->data_avail = 0;
524
525                 pr_debug("Done with mapping: seq=%u data_len=%u",
526                          subflow->map_subflow_seq,
527                          subflow->map_data_len);
528         }
529
530         if (!subflow_check_data_avail(sk)) {
531                 subflow->data_avail = 0;
532                 return false;
533         }
534
535         skb = skb_peek(&sk->sk_receive_queue);
536         subflow->data_avail = skb &&
537                        before(tcp_sk(sk)->copied_seq, TCP_SKB_CB(skb)->end_seq);
538         return subflow->data_avail;
539 }
540
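/* sk_data_ready replacement: on TCP fallback chain to the original callback
 * and wake the parent, otherwise flag the MPTCP socket as readable only when
 * in-sequence data is actually available.
 */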
541 static void subflow_data_ready(struct sock *sk)
542 {
543         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
544         struct sock *parent = subflow->conn;
545
546         if (!parent || !subflow->mp_capable) {
547                 subflow->tcp_data_ready(sk);
548
549                 if (parent)
550                         parent->sk_data_ready(parent);
551                 return;
552         }
553
554         if (mptcp_subflow_data_available(sk)) {
555                 set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags);
556
557                 parent->sk_data_ready(parent);
558         }
559 }
560
561 static void subflow_write_space(struct sock *sk)
562 {
563         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
564         struct sock *parent = subflow->conn;
565
566         sk_stream_write_space(sk);
567         if (parent && sk_stream_is_writeable(sk)) {
568                 set_bit(MPTCP_SEND_SPACE, &mptcp_sk(parent)->flags);
569                 smp_mb__after_atomic();
570                 /* set SEND_SPACE before sk_stream_write_space clears NOSPACE */
571                 sk_stream_write_space(parent);
572         }
573 }
574
575 static struct inet_connection_sock_af_ops *
576 subflow_default_af_ops(struct sock *sk)
577 {
578 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
579         if (sk->sk_family == AF_INET6)
580                 return &subflow_v6_specific;
581 #endif
582         return &subflow_specific;
583 }
584
585 void mptcp_handle_ipv6_mapped(struct sock *sk, bool mapped)
586 {
587 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
588         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
589         struct inet_connection_sock *icsk = inet_csk(sk);
590         struct inet_connection_sock_af_ops *target;
591
592         target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);
593
594         pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
595                  subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);
596
597         if (likely(icsk->icsk_af_ops == target))
598                 return;
599
600         subflow->icsk_af_ops = icsk->icsk_af_ops;
601         icsk->icsk_af_ops = target;
602 #endif
603 }
604
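/* Create a kernel TCP socket to be used as a subflow: make it hold a
 * reference on the owning netns, attach the "mptcp" ULP and link the
 * resulting subflow context back to the MPTCP (parent) socket.
 */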
605 int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock)
606 {
607         struct mptcp_subflow_context *subflow;
608         struct net *net = sock_net(sk);
609         struct socket *sf;
610         int err;
611
612         err = sock_create_kern(net, sk->sk_family, SOCK_STREAM, IPPROTO_TCP,
613                                &sf);
614         if (err)
615                 return err;
616
617         lock_sock(sf->sk);
618
619         /* kernel sockets do not acquire a net ref by default, but the TCP
620          * timer needs it.
621          */
622         sf->sk->sk_net_refcnt = 1;
623         get_net(net);
624         this_cpu_add(*net->core.sock_inuse, 1);
625         err = tcp_set_ulp(sf->sk, "mptcp");
626         release_sock(sf->sk);
627
628         if (err)
629                 return err;
630
631         subflow = mptcp_subflow_ctx(sf->sk);
632         pr_debug("subflow=%p", subflow);
633
634         *new_sock = sf;
635         sock_hold(sk);
636         subflow->conn = sk;
637
638         return 0;
639 }
640
641 static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
642                                                         gfp_t priority)
643 {
644         struct inet_connection_sock *icsk = inet_csk(sk);
645         struct mptcp_subflow_context *ctx;
646
647         ctx = kzalloc(sizeof(*ctx), priority);
648         if (!ctx)
649                 return NULL;
650
651         rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
652         INIT_LIST_HEAD(&ctx->node);
653
654         pr_debug("subflow=%p", ctx);
655
656         ctx->tcp_sock = sk;
657
658         return ctx;
659 }
660
661 static void __subflow_state_change(struct sock *sk)
662 {
663         struct socket_wq *wq;
664
665         rcu_read_lock();
666         wq = rcu_dereference(sk->sk_wq);
667         if (skwq_has_sleeper(wq))
668                 wake_up_interruptible_all(&wq->wait);
669         rcu_read_unlock();
670 }
671
672 static bool subflow_is_done(const struct sock *sk)
673 {
674         return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
675 }
676
677 static void subflow_state_change(struct sock *sk)
678 {
679         struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
680         struct sock *parent = READ_ONCE(subflow->conn);
681
682         __subflow_state_change(sk);
683
684         /* as recvmsg() does not acquire the subflow socket for ssk selection,
685          * a FIN packet carrying a DSS can go unnoticed if we don't trigger
686          * the data-available machinery here.
687          */
688         if (parent && subflow->mp_capable && mptcp_subflow_data_available(sk)) {
689                 set_bit(MPTCP_DATA_READY, &mptcp_sk(parent)->flags);
690
691                 parent->sk_data_ready(parent);
692         }
693
694         if (parent && !(parent->sk_shutdown & RCV_SHUTDOWN) &&
695             !subflow->rx_eof && subflow_is_done(sk)) {
696                 subflow->rx_eof = 1;
697                 parent->sk_shutdown |= RCV_SHUTDOWN;
698                 __subflow_state_change(parent);
699         }
700 }
701
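/* ULP init hook, reachable only for kernel sockets: allocate the subflow
 * context, stash the original af_ops and socket callbacks and divert
 * data_ready/write_space/state_change to the MPTCP-aware handlers.
 */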
702 static int subflow_ulp_init(struct sock *sk)
703 {
704         struct inet_connection_sock *icsk = inet_csk(sk);
705         struct mptcp_subflow_context *ctx;
706         struct tcp_sock *tp = tcp_sk(sk);
707         int err = 0;
708
709         /* disallow attaching ULP to a socket unless it has been
710          * created with sock_create_kern()
711          */
712         if (!sk->sk_kern_sock) {
713                 err = -EOPNOTSUPP;
714                 goto out;
715         }
716
717         ctx = subflow_create_ctx(sk, GFP_KERNEL);
718         if (!ctx) {
719                 err = -ENOMEM;
720                 goto out;
721         }
722
723         pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);
724
725         tp->is_mptcp = 1;
726         ctx->icsk_af_ops = icsk->icsk_af_ops;
727         icsk->icsk_af_ops = subflow_default_af_ops(sk);
728         ctx->tcp_data_ready = sk->sk_data_ready;
729         ctx->tcp_state_change = sk->sk_state_change;
730         ctx->tcp_write_space = sk->sk_write_space;
731         sk->sk_data_ready = subflow_data_ready;
732         sk->sk_write_space = subflow_write_space;
733         sk->sk_state_change = subflow_state_change;
734 out:
735         return err;
736 }
737
738 static void subflow_ulp_release(struct sock *sk)
739 {
740         struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(sk);
741
742         if (!ctx)
743                 return;
744
745         if (ctx->conn)
746                 sock_put(ctx->conn);
747
748         kfree_rcu(ctx, rcu);
749 }
750
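/* Detach the MPTCP ULP from a freshly cloned child socket so that it
 * continues its life as a plain TCP socket.
 */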
751 static void subflow_ulp_fallback(struct sock *sk,
752                                  struct mptcp_subflow_context *old_ctx)
753 {
754         struct inet_connection_sock *icsk = inet_csk(sk);
755
756         mptcp_subflow_tcp_fallback(sk, old_ctx);
757         icsk->icsk_ulp_ops = NULL;
758         rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
759         tcp_sk(sk)->is_mptcp = 0;
760 }
761
762 static void subflow_ulp_clone(const struct request_sock *req,
763                               struct sock *newsk,
764                               const gfp_t priority)
765 {
766         struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
767         struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
768         struct mptcp_subflow_context *new_ctx;
769
770         if (!subflow_req->mp_capable) {
771                 subflow_ulp_fallback(newsk, old_ctx);
772                 return;
773         }
774
775         new_ctx = subflow_create_ctx(newsk, priority);
776         if (!new_ctx) {
777                 subflow_ulp_fallback(newsk, old_ctx);
778                 return;
779         }
780
781         /* see comments in subflow_syn_recv_sock(): the MPTCP connection is
782          * fully established only after we receive the remote key
783          */
784         new_ctx->conn_finished = 1;
785         new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
786         new_ctx->tcp_data_ready = old_ctx->tcp_data_ready;
787         new_ctx->tcp_state_change = old_ctx->tcp_state_change;
788         new_ctx->tcp_write_space = old_ctx->tcp_write_space;
789         new_ctx->mp_capable = 1;
790         new_ctx->fourth_ack = subflow_req->remote_key_valid;
791         new_ctx->can_ack = subflow_req->remote_key_valid;
792         new_ctx->remote_key = subflow_req->remote_key;
793         new_ctx->local_key = subflow_req->local_key;
794         new_ctx->token = subflow_req->token;
795         new_ctx->ssn_offset = subflow_req->ssn_offset;
796         new_ctx->idsn = subflow_req->idsn;
797 }
798
799 static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
800         .name           = "mptcp",
801         .owner          = THIS_MODULE,
802         .init           = subflow_ulp_init,
803         .release        = subflow_ulp_release,
804         .clone          = subflow_ulp_clone,
805 };
806
807 static int subflow_ops_init(struct request_sock_ops *subflow_ops)
808 {
809         subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);
810         subflow_ops->slab_name = "request_sock_subflow";
811
812         subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
813                                               subflow_ops->obj_size, 0,
814                                               SLAB_ACCOUNT |
815                                               SLAB_TYPESAFE_BY_RCU,
816                                               NULL);
817         if (!subflow_ops->slab)
818                 return -ENOMEM;
819
820         subflow_ops->destructor = subflow_req_destructor;
821
822         return 0;
823 }
824
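/* Build the subflow request_sock and af_ops tables from the plain TCP ones,
 * overriding only the hooks MPTCP needs, and register the "mptcp" ULP. The
 * v6-mapped variant reuses the IPv4 transmit helpers.
 */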
825 void mptcp_subflow_init(void)
826 {
827         subflow_request_sock_ops = tcp_request_sock_ops;
828         if (subflow_ops_init(&subflow_request_sock_ops) != 0)
829                 panic("MPTCP: failed to init subflow request sock ops\n");
830
831         subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
832         subflow_request_sock_ipv4_ops.init_req = subflow_v4_init_req;
833
834         subflow_specific = ipv4_specific;
835         subflow_specific.conn_request = subflow_v4_conn_request;
836         subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
837         subflow_specific.sk_rx_dst_set = subflow_finish_connect;
838         subflow_specific.rebuild_header = subflow_rebuild_header;
839
840 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
841         subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
842         subflow_request_sock_ipv6_ops.init_req = subflow_v6_init_req;
843
844         subflow_v6_specific = ipv6_specific;
845         subflow_v6_specific.conn_request = subflow_v6_conn_request;
846         subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
847         subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
848         subflow_v6_specific.rebuild_header = subflow_rebuild_header;
849
850         subflow_v6m_specific = subflow_v6_specific;
851         subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
852         subflow_v6m_specific.send_check = ipv4_specific.send_check;
853         subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
854         subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
855         subflow_v6m_specific.net_frag_header_len = 0;
856 #endif
857
858         if (tcp_register_ulp(&subflow_ulp_ops) != 0)
859                 panic("MPTCP: failed to register subflows to ULP\n");
860 }