net/ipv4/tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allows both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95
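/* Derive the initial sequence number (and, below, the per-connection
 * timestamp offset) from the 4-tuple of the received SYN.  The address and
 * port arguments look swapped because the skb here is the packet we
 * received, so its destination is the local end.
 */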
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98         return secure_tcp_seq(ip_hdr(skb)->daddr,
99                               ip_hdr(skb)->saddr,
100                               tcp_hdr(skb)->dest,
101                               tcp_hdr(skb)->source);
102 }
103
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108
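/* Decide whether an existing TIME-WAIT socket (sktw) may be reused for a
 * new outgoing connection on the same 4-tuple.  On success an extra
 * reference is taken on sktw and 1 is returned; on failure 0 is returned
 * and the caller treats the 4-tuple as unavailable.
 */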
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111         int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112         const struct inet_timewait_sock *tw = inet_twsk(sktw);
113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114         struct tcp_sock *tp = tcp_sk(sk);
115
116         if (reuse == 2) {
117                 /* Still does not detect *everything* that goes through
118                  * lo, since we require a loopback src or dst address
119                  * or direct binding to 'lo' interface.
120                  */
121                 bool loopback = false;
122                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123                         loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125                 if (tw->tw_family == AF_INET6) {
126                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130                                 loopback = true;
131                 } else
132 #endif
133                 {
134                         if (ipv4_is_loopback(tw->tw_daddr) ||
135                             ipv4_is_loopback(tw->tw_rcv_saddr))
136                                 loopback = true;
137                 }
138                 if (!loopback)
139                         reuse = 0;
140         }
141
142         /* With PAWS, it is safe from the viewpoint
143            of data integrity. Even without PAWS it is safe provided sequence
144            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
145
 146            Actually, the idea is close to VJ's, only the timestamp cache is
 147            held not per host but per port pair, and the TW bucket is used as
 148            the state holder.
 149
 150            If the TW bucket has already been destroyed we fall back to VJ's
 151            scheme and use the initial timestamp retrieved from the peer table.
152          */
153         if (tcptw->tw_ts_recent_stamp &&
154             (!twp || (reuse && time_after32(ktime_get_seconds(),
155                                             tcptw->tw_ts_recent_stamp)))) {
156                 /* In case of repair and re-using TIME-WAIT sockets we still
157                  * want to be sure that it is safe as above but honor the
158                  * sequence numbers and time stamps set as part of the repair
159                  * process.
160                  *
161                  * Without this check re-using a TIME-WAIT socket with TCP
162                  * repair would accumulate a -1 on the repair assigned
163                  * sequence number. The first time it is reused the sequence
164                  * is -1, the second time -2, etc. This fixes that issue
165                  * without appearing to create any others.
166                  */
167                 if (likely(!tp->repair)) {
168                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169
170                         if (!seq)
171                                 seq = 1;
172                         WRITE_ONCE(tp->write_seq, seq);
173                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
174                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175                 }
176                 sock_hold(sktw);
177                 return 1;
178         }
179
180         return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185                               int addr_len)
186 {
187         /* This check is replicated from tcp_v4_connect() and intended to
 188          * prevent the BPF program called below from accessing bytes that are
 189          * outside the bound specified by the user in addr_len.
190          */
191         if (addr_len < sizeof(struct sockaddr_in))
192                 return -EINVAL;
193
194         sock_owned_by_me(sk);
195
196         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203         struct inet_timewait_death_row *tcp_death_row;
204         struct inet_sock *inet = inet_sk(sk);
205         struct tcp_sock *tp = tcp_sk(sk);
206         struct ip_options_rcu *inet_opt;
207         struct net *net = sock_net(sk);
208         __be16 orig_sport, orig_dport;
209         __be32 daddr, nexthop;
210         struct flowi4 *fl4;
211         struct rtable *rt;
212         int err;
213
214         if (addr_len < sizeof(struct sockaddr_in))
215                 return -EINVAL;
216
217         if (usin->sin_family != AF_INET)
218                 return -EAFNOSUPPORT;
219
220         nexthop = daddr = usin->sin_addr.s_addr;
221         inet_opt = rcu_dereference_protected(inet->inet_opt,
222                                              lockdep_sock_is_held(sk));
223         if (inet_opt && inet_opt->opt.srr) {
224                 if (!daddr)
225                         return -EINVAL;
226                 nexthop = inet_opt->opt.faddr;
227         }
228
229         orig_sport = inet->inet_sport;
230         orig_dport = usin->sin_port;
231         fl4 = &inet->cork.fl.u.ip4;
232         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
233                               sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
234                               orig_dport, sk);
235         if (IS_ERR(rt)) {
236                 err = PTR_ERR(rt);
237                 if (err == -ENETUNREACH)
238                         IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
239                 return err;
240         }
241
242         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
243                 ip_rt_put(rt);
244                 return -ENETUNREACH;
245         }
246
247         if (!inet_opt || !inet_opt->opt.srr)
248                 daddr = fl4->daddr;
249
250         tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
251
252         if (!inet->inet_saddr) {
253                 err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
254                 if (err) {
255                         ip_rt_put(rt);
256                         return err;
257                 }
258         } else {
259                 sk_rcv_saddr_set(sk, inet->inet_saddr);
260         }
261
262         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
263                 /* Reset inherited state */
264                 tp->rx_opt.ts_recent       = 0;
265                 tp->rx_opt.ts_recent_stamp = 0;
266                 if (likely(!tp->repair))
267                         WRITE_ONCE(tp->write_seq, 0);
268         }
269
270         inet->inet_dport = usin->sin_port;
271         sk_daddr_set(sk, daddr);
272
273         inet_csk(sk)->icsk_ext_hdr_len = 0;
274         if (inet_opt)
275                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
276
277         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
278
279         /* Socket identity is still unknown (sport may be zero).
 280          * However, we set the state to SYN-SENT and, without releasing the
 281          * socket lock, select a source port, enter ourselves into the hash
 282          * tables and complete initialization after this.
283          */
284         tcp_set_state(sk, TCP_SYN_SENT);
285         err = inet_hash_connect(tcp_death_row, sk);
286         if (err)
287                 goto failure;
288
289         sk_set_txhash(sk);
290
291         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
292                                inet->inet_sport, inet->inet_dport, sk);
293         if (IS_ERR(rt)) {
294                 err = PTR_ERR(rt);
295                 rt = NULL;
296                 goto failure;
297         }
298         /* OK, now commit destination to socket.  */
299         sk->sk_gso_type = SKB_GSO_TCPV4;
300         sk_setup_caps(sk, &rt->dst);
301         rt = NULL;
302
303         if (likely(!tp->repair)) {
304                 if (!tp->write_seq)
305                         WRITE_ONCE(tp->write_seq,
306                                    secure_tcp_seq(inet->inet_saddr,
307                                                   inet->inet_daddr,
308                                                   inet->inet_sport,
309                                                   usin->sin_port));
310                 WRITE_ONCE(tp->tsoffset,
311                            secure_tcp_ts_off(net, inet->inet_saddr,
312                                              inet->inet_daddr));
313         }
314
315         inet->inet_id = get_random_u16();
316
317         if (tcp_fastopen_defer_connect(sk, &err))
318                 return err;
319         if (err)
320                 goto failure;
321
322         err = tcp_connect(sk);
323
324         if (err)
325                 goto failure;
326
327         return 0;
328
329 failure:
330         /*
331          * This unhashes the socket and releases the local port,
332          * if necessary.
333          */
334         tcp_set_state(sk, TCP_CLOSE);
335         inet_bhash2_reset_saddr(sk);
336         ip_rt_put(rt);
337         sk->sk_route_caps = 0;
338         inet->inet_dport = 0;
339         return err;
340 }
341 EXPORT_SYMBOL(tcp_v4_connect);
342
343 /*
344  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
 345  * It can be called through tcp_release_cb() if the socket was owned by the user
 346  * at the time tcp_v4_err() was called to handle the ICMP message.
347  */
348 void tcp_v4_mtu_reduced(struct sock *sk)
349 {
350         struct inet_sock *inet = inet_sk(sk);
351         struct dst_entry *dst;
352         u32 mtu;
353
354         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
355                 return;
356         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
357         dst = inet_csk_update_pmtu(sk, mtu);
358         if (!dst)
359                 return;
360
 361         /* Something is about to go wrong... Remember the soft error
 362          * in case this connection is not able to recover.
363          */
364         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
365                 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
366
367         mtu = dst_mtu(dst);
368
369         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
370             ip_sk_accept_pmtu(sk) &&
371             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
372                 tcp_sync_mss(sk, mtu);
373
374                 /* Resend the TCP packet because it's
375                  * clear that the old packet has been
376                  * dropped. This is the new "fast" path mtu
377                  * discovery.
378                  */
379                 tcp_simple_retransmit(sk);
380         } /* else let the usual retransmit timer handle it */
381 }
382 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
383
384 static void do_redirect(struct sk_buff *skb, struct sock *sk)
385 {
386         struct dst_entry *dst = __sk_dst_check(sk, 0);
387
388         if (dst)
389                 dst->ops->redirect(dst, sk, skb);
390 }
391
392
393 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
394 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
395 {
396         struct request_sock *req = inet_reqsk(sk);
397         struct net *net = sock_net(sk);
398
399         /* ICMPs are not backlogged, hence we cannot get
400          * an established socket here.
401          */
402         if (seq != tcp_rsk(req)->snt_isn) {
403                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
404         } else if (abort) {
405                 /*
406                  * Still in SYN_RECV, just remove it silently.
407                  * There is no good way to pass the error to the newly
408                  * created socket, and POSIX does not want network
409                  * errors returned from accept().
410                  */
411                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
412                 tcp_listendrop(req->rsk_listener);
413         }
414         reqsk_put(req);
415 }
416 EXPORT_SYMBOL(tcp_req_err);
417
418 /* TCP-LD (RFC 6069) logic */
419 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
420 {
421         struct inet_connection_sock *icsk = inet_csk(sk);
422         struct tcp_sock *tp = tcp_sk(sk);
423         struct sk_buff *skb;
424         s32 remaining;
425         u32 delta_us;
426
427         if (sock_owned_by_user(sk))
428                 return;
429
430         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
431             !icsk->icsk_backoff)
432                 return;
433
434         skb = tcp_rtx_queue_head(sk);
435         if (WARN_ON_ONCE(!skb))
436                 return;
437
438         icsk->icsk_backoff--;
439         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
440         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
441
442         tcp_mstamp_refresh(tp);
443         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
444         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
445
446         if (remaining > 0) {
447                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
448                                           remaining, TCP_RTO_MAX);
449         } else {
450                 /* RTO revert clocked out retransmission.
451                  * Will retransmit now.
452                  */
453                 tcp_retransmit_timer(sk);
454         }
455 }
456 EXPORT_SYMBOL(tcp_ld_RTO_revert);
457
458 /*
459  * This routine is called by the ICMP module when it gets some
460  * sort of error condition.  If err < 0 then the socket should
461  * be closed and the error returned to the user.  If err > 0
462  * it's just the icmp type << 8 | icmp code.  After adjustment
463  * header points to the first 8 bytes of the tcp header.  We need
464  * to find the appropriate port.
465  *
466  * The locking strategy used here is very "optimistic". When
467  * someone else accesses the socket the ICMP is just dropped
468  * and for some paths there is no check at all.
469  * A more general error queue to queue errors for later handling
470  * is probably better.
471  *
472  */
473
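/* For example: an ICMP_DEST_UNREACH/ICMP_PORT_UNREACH error handled below
 * is converted via icmp_err_convert[] into ECONNREFUSED and then reported
 * through sk->sk_err, or only through sk_err_soft when the socket is
 * currently owned by the user or the application did not enable IP_RECVERR.
 */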
474 int tcp_v4_err(struct sk_buff *skb, u32 info)
475 {
476         const struct iphdr *iph = (const struct iphdr *)skb->data;
477         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
478         struct tcp_sock *tp;
479         struct inet_sock *inet;
480         const int type = icmp_hdr(skb)->type;
481         const int code = icmp_hdr(skb)->code;
482         struct sock *sk;
483         struct request_sock *fastopen;
484         u32 seq, snd_una;
485         int err;
486         struct net *net = dev_net(skb->dev);
487
488         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
489                                        iph->daddr, th->dest, iph->saddr,
490                                        ntohs(th->source), inet_iif(skb), 0);
491         if (!sk) {
492                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
493                 return -ENOENT;
494         }
495         if (sk->sk_state == TCP_TIME_WAIT) {
496                 inet_twsk_put(inet_twsk(sk));
497                 return 0;
498         }
499         seq = ntohl(th->seq);
500         if (sk->sk_state == TCP_NEW_SYN_RECV) {
501                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
502                                      type == ICMP_TIME_EXCEEDED ||
503                                      (type == ICMP_DEST_UNREACH &&
504                                       (code == ICMP_NET_UNREACH ||
505                                        code == ICMP_HOST_UNREACH)));
506                 return 0;
507         }
508
509         bh_lock_sock(sk);
510         /* If too many ICMPs get dropped on busy
511          * servers this needs to be solved differently.
 512          * We do take care of the PMTU discovery (RFC1191) special case:
 513          * we can receive locally generated ICMP messages while the socket is held.
514          */
515         if (sock_owned_by_user(sk)) {
516                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
517                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
518         }
519         if (sk->sk_state == TCP_CLOSE)
520                 goto out;
521
522         if (static_branch_unlikely(&ip4_min_ttl)) {
523                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
524                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
525                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
526                         goto out;
527                 }
528         }
529
530         tp = tcp_sk(sk);
 531         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
532         fastopen = rcu_dereference(tp->fastopen_rsk);
533         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
534         if (sk->sk_state != TCP_LISTEN &&
535             !between(seq, snd_una, tp->snd_nxt)) {
536                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
537                 goto out;
538         }
539
540         switch (type) {
541         case ICMP_REDIRECT:
542                 if (!sock_owned_by_user(sk))
543                         do_redirect(skb, sk);
544                 goto out;
545         case ICMP_SOURCE_QUENCH:
546                 /* Just silently ignore these. */
547                 goto out;
548         case ICMP_PARAMETERPROB:
549                 err = EPROTO;
550                 break;
551         case ICMP_DEST_UNREACH:
552                 if (code > NR_ICMP_UNREACH)
553                         goto out;
554
555                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
556                         /* We are not interested in TCP_LISTEN and open_requests
 557                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
558                          * they should go through unfragmented).
559                          */
560                         if (sk->sk_state == TCP_LISTEN)
561                                 goto out;
562
563                         WRITE_ONCE(tp->mtu_info, info);
564                         if (!sock_owned_by_user(sk)) {
565                                 tcp_v4_mtu_reduced(sk);
566                         } else {
567                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
568                                         sock_hold(sk);
569                         }
570                         goto out;
571                 }
572
573                 err = icmp_err_convert[code].errno;
574                 /* check if this ICMP message allows revert of backoff.
575                  * (see RFC 6069)
576                  */
577                 if (!fastopen &&
578                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
579                         tcp_ld_RTO_revert(sk, seq);
580                 break;
581         case ICMP_TIME_EXCEEDED:
582                 err = EHOSTUNREACH;
583                 break;
584         default:
585                 goto out;
586         }
587
588         switch (sk->sk_state) {
589         case TCP_SYN_SENT:
590         case TCP_SYN_RECV:
591                 /* Only in fast or simultaneous open. If a fast open socket is
592                  * already accepted it is treated as a connected one below.
593                  */
594                 if (fastopen && !fastopen->sk)
595                         break;
596
597                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
598
599                 if (!sock_owned_by_user(sk)) {
600                         WRITE_ONCE(sk->sk_err, err);
601
602                         sk_error_report(sk);
603
604                         tcp_done(sk);
605                 } else {
606                         WRITE_ONCE(sk->sk_err_soft, err);
607                 }
608                 goto out;
609         }
610
611         /* If we've already connected we will keep trying
612          * until we time out, or the user gives up.
613          *
 614          * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
 615          * to be considered hard errors (well, FRAG_FAILED too,
 616          * but it is obsoleted by PMTU discovery).
 617          *
 618          * Note that in the modern internet, where routing is unreliable
 619          * and broken firewalls sit in every dark corner sending random
 620          * errors ordered by their masters, even these two messages finally lose
621          * their original sense (even Linux sends invalid PORT_UNREACHs)
622          *
623          * Now we are in compliance with RFCs.
624          *                                                      --ANK (980905)
625          */
626
627         inet = inet_sk(sk);
628         if (!sock_owned_by_user(sk) && inet->recverr) {
629                 WRITE_ONCE(sk->sk_err, err);
630                 sk_error_report(sk);
631         } else  { /* Only an error on timeout */
632                 WRITE_ONCE(sk->sk_err_soft, err);
633         }
634
635 out:
636         bh_unlock_sock(sk);
637         sock_put(sk);
638         return 0;
639 }
640
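/* Pre-load th->check with the pseudo-header checksum and record where the
 * final checksum must be written (csum_start/csum_offset), so that the
 * device or the software fallback can complete it later (CHECKSUM_PARTIAL).
 */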
641 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
642 {
643         struct tcphdr *th = tcp_hdr(skb);
644
645         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
646         skb->csum_start = skb_transport_header(skb) - skb->head;
647         skb->csum_offset = offsetof(struct tcphdr, check);
648 }
649
650 /* This routine computes an IPv4 TCP checksum. */
651 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
652 {
653         const struct inet_sock *inet = inet_sk(sk);
654
655         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
656 }
657 EXPORT_SYMBOL(tcp_v4_send_check);
658
659 /*
660  *      This routine will send an RST to the other tcp.
661  *
662  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
663  *                    for reset.
 664  *      Answer: if a packet caused an RST, it is not for a socket
 665  *              existing in our system; if it is matched to a socket,
 666  *              it is just a duplicate segment or a bug in the other side's TCP.
 667  *              So we build the reply based only on parameters that
 668  *              arrived with the segment.
669  *      Exception: precedence violation. We do not implement it in any case.
670  */
671
672 #ifdef CONFIG_TCP_MD5SIG
673 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
674 #else
675 #define OPTION_BYTES sizeof(__be32)
676 #endif
677
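/* Room reserved for options in the reset reply: a full MD5 option when
 * TCP-MD5 is compiled in, otherwise a single 32-bit word that may carry
 * the MPTCP reset option (see the rep.opt[0] handling below).
 */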
678 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
679 {
680         const struct tcphdr *th = tcp_hdr(skb);
681         struct {
682                 struct tcphdr th;
683                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
684         } rep;
685         struct ip_reply_arg arg;
686 #ifdef CONFIG_TCP_MD5SIG
687         struct tcp_md5sig_key *key = NULL;
688         const __u8 *hash_location = NULL;
689         unsigned char newhash[16];
690         int genhash;
691         struct sock *sk1 = NULL;
692 #endif
693         u64 transmit_time = 0;
694         struct sock *ctl_sk;
695         struct net *net;
696         u32 txhash = 0;
697
698         /* Never send a reset in response to a reset. */
699         if (th->rst)
700                 return;
701
 702         /* If sk is not NULL, it means we did a successful lookup and the incoming
 703          * route had to be correct. prequeue might have dropped our dst.
704          */
705         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
706                 return;
707
708         /* Swap the send and the receive. */
709         memset(&rep, 0, sizeof(rep));
710         rep.th.dest   = th->source;
711         rep.th.source = th->dest;
712         rep.th.doff   = sizeof(struct tcphdr) / 4;
713         rep.th.rst    = 1;
714
715         if (th->ack) {
716                 rep.th.seq = th->ack_seq;
717         } else {
718                 rep.th.ack = 1;
719                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
720                                        skb->len - (th->doff << 2));
721         }
722
723         memset(&arg, 0, sizeof(arg));
724         arg.iov[0].iov_base = (unsigned char *)&rep;
725         arg.iov[0].iov_len  = sizeof(rep.th);
726
727         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
728 #ifdef CONFIG_TCP_MD5SIG
729         rcu_read_lock();
730         hash_location = tcp_parse_md5sig_option(th);
731         if (sk && sk_fullsock(sk)) {
732                 const union tcp_md5_addr *addr;
733                 int l3index;
734
735                 /* sdif set, means packet ingressed via a device
736                  * in an L3 domain and inet_iif is set to it.
737                  */
738                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
739                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
740                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
741         } else if (hash_location) {
742                 const union tcp_md5_addr *addr;
743                 int sdif = tcp_v4_sdif(skb);
744                 int dif = inet_iif(skb);
745                 int l3index;
746
747                 /*
 748                  * The active side is lost. Try to find the listening socket
 749                  * through the source port, and then find the md5 key through
 750                  * that listening socket. We are not loosening security here:
 751                  * the incoming packet is checked against the md5 hash of the
 752                  * key we find, and no RST is generated if the hash doesn't match.
753                  */
754                 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
755                                              NULL, 0, ip_hdr(skb)->saddr,
756                                              th->source, ip_hdr(skb)->daddr,
757                                              ntohs(th->source), dif, sdif);
758                 /* don't send rst if it can't find key */
759                 if (!sk1)
760                         goto out;
761
762                 /* sdif set, means packet ingressed via a device
763                  * in an L3 domain and dif is set to it.
764                  */
765                 l3index = sdif ? dif : 0;
766                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
767                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
768                 if (!key)
769                         goto out;
770
771
772                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
773                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
774                         goto out;
775
776         }
777
778         if (key) {
779                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
780                                    (TCPOPT_NOP << 16) |
781                                    (TCPOPT_MD5SIG << 8) |
782                                    TCPOLEN_MD5SIG);
783                 /* Update length and the length the header thinks exists */
784                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
785                 rep.th.doff = arg.iov[0].iov_len / 4;
786
787                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
788                                      key, ip_hdr(skb)->saddr,
789                                      ip_hdr(skb)->daddr, &rep.th);
790         }
791 #endif
792         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
793         if (rep.opt[0] == 0) {
794                 __be32 mrst = mptcp_reset_option(skb);
795
796                 if (mrst) {
797                         rep.opt[0] = mrst;
798                         arg.iov[0].iov_len += sizeof(mrst);
799                         rep.th.doff = arg.iov[0].iov_len / 4;
800                 }
801         }
802
803         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
804                                       ip_hdr(skb)->saddr, /* XXX */
805                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
806         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
807         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
808
 809         /* When the socket is gone, all binding information is lost.
 810          * Routing might fail in this case. No choice here: if we choose to force
 811          * the input interface, we will misroute in the case of an asymmetric route.
812          */
813         if (sk) {
814                 arg.bound_dev_if = sk->sk_bound_dev_if;
815                 if (sk_fullsock(sk))
816                         trace_tcp_send_reset(sk, skb);
817         }
818
819         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
820                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
821
822         arg.tos = ip_hdr(skb)->tos;
823         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
824         local_bh_disable();
825         ctl_sk = this_cpu_read(ipv4_tcp_sk);
826         sock_net_set(ctl_sk, net);
827         if (sk) {
828                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
829                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
830                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
831                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
832                 transmit_time = tcp_transmit_time(sk);
833                 xfrm_sk_clone_policy(ctl_sk, sk);
834                 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
835                          inet_twsk(sk)->tw_txhash : sk->sk_txhash;
836         } else {
837                 ctl_sk->sk_mark = 0;
838                 ctl_sk->sk_priority = 0;
839         }
840         ip_send_unicast_reply(ctl_sk,
841                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
842                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
843                               &arg, arg.iov[0].iov_len,
844                               transmit_time, txhash);
845
846         xfrm_sk_free_policy(ctl_sk);
847         sock_net_set(ctl_sk, &init_net);
848         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
849         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
850         local_bh_enable();
851
852 #ifdef CONFIG_TCP_MD5SIG
853 out:
854         rcu_read_unlock();
855 #endif
856 }
857
 858 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
 859    outside socket context, is certainly ugly. What can I do?
860  */
861
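/* Build and send a bare ACK from the per-CPU control socket.  Used for
 * the TIME-WAIT and SYN-RECV replies below, where no full socket context
 * is available; the caller supplies sequence numbers, window, timestamps
 * and an optional MD5 key explicitly.
 */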
862 static void tcp_v4_send_ack(const struct sock *sk,
863                             struct sk_buff *skb, u32 seq, u32 ack,
864                             u32 win, u32 tsval, u32 tsecr, int oif,
865                             struct tcp_md5sig_key *key,
866                             int reply_flags, u8 tos, u32 txhash)
867 {
868         const struct tcphdr *th = tcp_hdr(skb);
869         struct {
870                 struct tcphdr th;
871                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
872 #ifdef CONFIG_TCP_MD5SIG
873                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
874 #endif
875                         ];
876         } rep;
877         struct net *net = sock_net(sk);
878         struct ip_reply_arg arg;
879         struct sock *ctl_sk;
880         u64 transmit_time;
881
882         memset(&rep.th, 0, sizeof(struct tcphdr));
883         memset(&arg, 0, sizeof(arg));
884
885         arg.iov[0].iov_base = (unsigned char *)&rep;
886         arg.iov[0].iov_len  = sizeof(rep.th);
887         if (tsecr) {
888                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
889                                    (TCPOPT_TIMESTAMP << 8) |
890                                    TCPOLEN_TIMESTAMP);
891                 rep.opt[1] = htonl(tsval);
892                 rep.opt[2] = htonl(tsecr);
893                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
894         }
895
896         /* Swap the send and the receive. */
897         rep.th.dest    = th->source;
898         rep.th.source  = th->dest;
899         rep.th.doff    = arg.iov[0].iov_len / 4;
900         rep.th.seq     = htonl(seq);
901         rep.th.ack_seq = htonl(ack);
902         rep.th.ack     = 1;
903         rep.th.window  = htons(win);
904
905 #ifdef CONFIG_TCP_MD5SIG
906         if (key) {
907                 int offset = (tsecr) ? 3 : 0;
908
909                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
910                                           (TCPOPT_NOP << 16) |
911                                           (TCPOPT_MD5SIG << 8) |
912                                           TCPOLEN_MD5SIG);
913                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
914                 rep.th.doff = arg.iov[0].iov_len/4;
915
916                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
917                                     key, ip_hdr(skb)->saddr,
918                                     ip_hdr(skb)->daddr, &rep.th);
919         }
920 #endif
921         arg.flags = reply_flags;
922         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
923                                       ip_hdr(skb)->saddr, /* XXX */
924                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
925         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
926         if (oif)
927                 arg.bound_dev_if = oif;
928         arg.tos = tos;
929         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
930         local_bh_disable();
931         ctl_sk = this_cpu_read(ipv4_tcp_sk);
932         sock_net_set(ctl_sk, net);
933         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
934                            inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
935         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
936                            inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
937         transmit_time = tcp_transmit_time(sk);
938         ip_send_unicast_reply(ctl_sk,
939                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
940                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
941                               &arg, arg.iov[0].iov_len,
942                               transmit_time, txhash);
943
944         sock_net_set(ctl_sk, &init_net);
945         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
946         local_bh_enable();
947 }
948
949 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
950 {
951         struct inet_timewait_sock *tw = inet_twsk(sk);
952         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
953
954         tcp_v4_send_ack(sk, skb,
955                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
956                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
957                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
958                         tcptw->tw_ts_recent,
959                         tw->tw_bound_dev_if,
960                         tcp_twsk_md5_key(tcptw),
961                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
962                         tw->tw_tos,
963                         tw->tw_txhash
964                         );
965
966         inet_twsk_put(tw);
967 }
968
969 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
970                                   struct request_sock *req)
971 {
972         const union tcp_md5_addr *addr;
973         int l3index;
974
975         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
976          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
977          */
978         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
979                                              tcp_sk(sk)->snd_nxt;
980
981         /* RFC 7323 2.3
982          * The window field (SEG.WND) of every outgoing segment, with the
983          * exception of <SYN> segments, MUST be right-shifted by
984          * Rcv.Wind.Shift bits:
985          */
986         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
987         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
988         tcp_v4_send_ack(sk, skb, seq,
989                         tcp_rsk(req)->rcv_nxt,
990                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
991                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
992                         READ_ONCE(req->ts_recent),
993                         0,
994                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
995                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
996                         ip_hdr(skb)->tos,
997                         READ_ONCE(tcp_rsk(req)->txhash));
998 }
999
1000 /*
1001  *      Send a SYN-ACK after having received a SYN.
1002  *      This still operates on a request_sock only, not on a big
1003  *      socket.
1004  */
1005 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1006                               struct flowi *fl,
1007                               struct request_sock *req,
1008                               struct tcp_fastopen_cookie *foc,
1009                               enum tcp_synack_type synack_type,
1010                               struct sk_buff *syn_skb)
1011 {
1012         const struct inet_request_sock *ireq = inet_rsk(req);
1013         struct flowi4 fl4;
1014         int err = -1;
1015         struct sk_buff *skb;
1016         u8 tos;
1017
1018         /* First, grab a route. */
1019         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1020                 return -1;
1021
1022         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1023
1024         if (skb) {
1025                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1026
1027                 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1028                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1029                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1030                                 inet_sk(sk)->tos;
1031
1032                 if (!INET_ECN_is_capable(tos) &&
1033                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1034                         tos |= INET_ECN_ECT_0;
1035
1036                 rcu_read_lock();
1037                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1038                                             ireq->ir_rmt_addr,
1039                                             rcu_dereference(ireq->ireq_opt),
1040                                             tos);
1041                 rcu_read_unlock();
1042                 err = net_xmit_eval(err);
1043         }
1044
1045         return err;
1046 }
1047
1048 /*
1049  *      IPv4 request_sock destructor.
1050  */
1051 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1052 {
1053         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1054 }
1055
1056 #ifdef CONFIG_TCP_MD5SIG
1057 /*
1058  * RFC2385 MD5 checksumming requires a mapping of
1059  * IP address->MD5 Key.
1060  * We need to maintain these in the sk structure.
1061  */
1062
1063 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1064 EXPORT_SYMBOL(tcp_md5_needed);
1065
1066 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1067 {
1068         if (!old)
1069                 return true;
1070
1071         /* l3index always overrides non-l3index */
1072         if (old->l3index && new->l3index == 0)
1073                 return false;
1074         if (old->l3index == 0 && new->l3index)
1075                 return true;
1076
1077         return old->prefixlen < new->prefixlen;
1078 }
1079
1080 /* Find the Key structure for an address.  */
1081 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1082                                            const union tcp_md5_addr *addr,
1083                                            int family)
1084 {
1085         const struct tcp_sock *tp = tcp_sk(sk);
1086         struct tcp_md5sig_key *key;
1087         const struct tcp_md5sig_info *md5sig;
1088         __be32 mask;
1089         struct tcp_md5sig_key *best_match = NULL;
1090         bool match;
1091
1092         /* caller either holds rcu_read_lock() or socket lock */
1093         md5sig = rcu_dereference_check(tp->md5sig_info,
1094                                        lockdep_sock_is_held(sk));
1095         if (!md5sig)
1096                 return NULL;
1097
1098         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1099                                  lockdep_sock_is_held(sk)) {
1100                 if (key->family != family)
1101                         continue;
1102                 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1103                         continue;
1104                 if (family == AF_INET) {
1105                         mask = inet_make_mask(key->prefixlen);
1106                         match = (key->addr.a4.s_addr & mask) ==
1107                                 (addr->a4.s_addr & mask);
1108 #if IS_ENABLED(CONFIG_IPV6)
1109                 } else if (family == AF_INET6) {
1110                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1111                                                   key->prefixlen);
1112 #endif
1113                 } else {
1114                         match = false;
1115                 }
1116
1117                 if (match && better_md5_match(best_match, key))
1118                         best_match = key;
1119         }
1120         return best_match;
1121 }
1122 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1123
1124 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1125                                                       const union tcp_md5_addr *addr,
1126                                                       int family, u8 prefixlen,
1127                                                       int l3index, u8 flags)
1128 {
1129         const struct tcp_sock *tp = tcp_sk(sk);
1130         struct tcp_md5sig_key *key;
1131         unsigned int size = sizeof(struct in_addr);
1132         const struct tcp_md5sig_info *md5sig;
1133
1134         /* caller either holds rcu_read_lock() or socket lock */
1135         md5sig = rcu_dereference_check(tp->md5sig_info,
1136                                        lockdep_sock_is_held(sk));
1137         if (!md5sig)
1138                 return NULL;
1139 #if IS_ENABLED(CONFIG_IPV6)
1140         if (family == AF_INET6)
1141                 size = sizeof(struct in6_addr);
1142 #endif
1143         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1144                                  lockdep_sock_is_held(sk)) {
1145                 if (key->family != family)
1146                         continue;
1147                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1148                         continue;
1149                 if (key->l3index != l3index)
1150                         continue;
1151                 if (!memcmp(&key->addr, addr, size) &&
1152                     key->prefixlen == prefixlen)
1153                         return key;
1154         }
1155         return NULL;
1156 }
1157
1158 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1159                                          const struct sock *addr_sk)
1160 {
1161         const union tcp_md5_addr *addr;
1162         int l3index;
1163
1164         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1165                                                  addr_sk->sk_bound_dev_if);
1166         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1167         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1168 }
1169 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1170
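/* Allocate and publish the per-socket MD5 key list.  GSO is disabled
 * because the MD5 option must be computed for every segment, which
 * segmentation offload cannot do.
 */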
1171 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1172 {
1173         struct tcp_sock *tp = tcp_sk(sk);
1174         struct tcp_md5sig_info *md5sig;
1175
1176         md5sig = kmalloc(sizeof(*md5sig), gfp);
1177         if (!md5sig)
1178                 return -ENOMEM;
1179
1180         sk_gso_disable(sk);
1181         INIT_HLIST_HEAD(&md5sig->head);
1182         rcu_assign_pointer(tp->md5sig_info, md5sig);
1183         return 0;
1184 }
1185
1186 /* This can be called on a newly created socket, from other files */
1187 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1188                             int family, u8 prefixlen, int l3index, u8 flags,
1189                             const u8 *newkey, u8 newkeylen, gfp_t gfp)
1190 {
1191         /* Add Key to the list */
1192         struct tcp_md5sig_key *key;
1193         struct tcp_sock *tp = tcp_sk(sk);
1194         struct tcp_md5sig_info *md5sig;
1195
1196         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1197         if (key) {
1198                 /* Pre-existing entry - just update that one.
1199                  * Note that the key might be used concurrently.
1200                  * data_race() is telling KCSAN that we do not care about
1201                  * key mismatches, since changing MD5 key on live flows
1202                  * can lead to packet drops.
1203                  */
1204                 data_race(memcpy(key->key, newkey, newkeylen));
1205
1206                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1207                  * Also note that a reader could catch the new key->keylen value
1208                  * but the old key->key[]; this is the reason we use __GFP_ZERO
1209                  * at sock_kmalloc() time below these lines.
1210                  */
1211                 WRITE_ONCE(key->keylen, newkeylen);
1212
1213                 return 0;
1214         }
1215
1216         md5sig = rcu_dereference_protected(tp->md5sig_info,
1217                                            lockdep_sock_is_held(sk));
1218
1219         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1220         if (!key)
1221                 return -ENOMEM;
1222         if (!tcp_alloc_md5sig_pool()) {
1223                 sock_kfree_s(sk, key, sizeof(*key));
1224                 return -ENOMEM;
1225         }
1226
1227         memcpy(key->key, newkey, newkeylen);
1228         key->keylen = newkeylen;
1229         key->family = family;
1230         key->prefixlen = prefixlen;
1231         key->l3index = l3index;
1232         key->flags = flags;
1233         memcpy(&key->addr, addr,
1234                (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1235                                                                  sizeof(struct in_addr));
1236         hlist_add_head_rcu(&key->node, &md5sig->head);
1237         return 0;
1238 }
1239
1240 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1241                    int family, u8 prefixlen, int l3index, u8 flags,
1242                    const u8 *newkey, u8 newkeylen)
1243 {
1244         struct tcp_sock *tp = tcp_sk(sk);
1245
1246         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1247                 if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1248                         return -ENOMEM;
1249
1250                 if (!static_branch_inc(&tcp_md5_needed.key)) {
1251                         struct tcp_md5sig_info *md5sig;
1252
1253                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1254                         rcu_assign_pointer(tp->md5sig_info, NULL);
1255                         kfree_rcu(md5sig, rcu);
1256                         return -EUSERS;
1257                 }
1258         }
1259
1260         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1261                                 newkey, newkeylen, GFP_KERNEL);
1262 }
1263 EXPORT_SYMBOL(tcp_md5_do_add);
1264
1265 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1266                      int family, u8 prefixlen, int l3index,
1267                      struct tcp_md5sig_key *key)
1268 {
1269         struct tcp_sock *tp = tcp_sk(sk);
1270
1271         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1272                 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1273                         return -ENOMEM;
1274
1275                 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1276                         struct tcp_md5sig_info *md5sig;
1277
1278                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1279                         net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1280                         rcu_assign_pointer(tp->md5sig_info, NULL);
1281                         kfree_rcu(md5sig, rcu);
1282                         return -EUSERS;
1283                 }
1284         }
1285
1286         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1287                                 key->flags, key->key, key->keylen,
1288                                 sk_gfp_mask(sk, GFP_ATOMIC));
1289 }
1290 EXPORT_SYMBOL(tcp_md5_key_copy);
1291
1292 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1293                    u8 prefixlen, int l3index, u8 flags)
1294 {
1295         struct tcp_md5sig_key *key;
1296
1297         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1298         if (!key)
1299                 return -ENOENT;
1300         hlist_del_rcu(&key->node);
1301         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1302         kfree_rcu(key, rcu);
1303         return 0;
1304 }
1305 EXPORT_SYMBOL(tcp_md5_do_del);
1306
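/* Drop every configured MD5 key.  Used on the socket destruction path,
 * hence rcu_dereference_protected(..., 1): no concurrent writers can
 * exist at that point, while the keys themselves are still freed via RCU.
 */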
1307 static void tcp_clear_md5_list(struct sock *sk)
1308 {
1309         struct tcp_sock *tp = tcp_sk(sk);
1310         struct tcp_md5sig_key *key;
1311         struct hlist_node *n;
1312         struct tcp_md5sig_info *md5sig;
1313
1314         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1315
1316         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1317                 hlist_del_rcu(&key->node);
1318                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1319                 kfree_rcu(key, rcu);
1320         }
1321 }
1322
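/* setsockopt() handler for TCP_MD5SIG and TCP_MD5SIG_EXT on IPv4 sockets:
 * validate the user-supplied tcp_md5sig structure (address family,
 * optional prefix length and ifindex), then add the key, or delete it
 * when the key length is zero.
 */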
1323 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1324                                  sockptr_t optval, int optlen)
1325 {
1326         struct tcp_md5sig cmd;
1327         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1328         const union tcp_md5_addr *addr;
1329         u8 prefixlen = 32;
1330         int l3index = 0;
1331         u8 flags;
1332
1333         if (optlen < sizeof(cmd))
1334                 return -EINVAL;
1335
1336         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1337                 return -EFAULT;
1338
1339         if (sin->sin_family != AF_INET)
1340                 return -EINVAL;
1341
1342         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1343
1344         if (optname == TCP_MD5SIG_EXT &&
1345             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1346                 prefixlen = cmd.tcpm_prefixlen;
1347                 if (prefixlen > 32)
1348                         return -EINVAL;
1349         }
1350
1351         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1352             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1353                 struct net_device *dev;
1354
1355                 rcu_read_lock();
1356                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1357                 if (dev && netif_is_l3_master(dev))
1358                         l3index = dev->ifindex;
1359
1360                 rcu_read_unlock();
1361
1362                 /* ok to reference set/not set outside of rcu;
1363                  * right now device MUST be an L3 master
1364                  */
1365                 if (!dev || !l3index)
1366                         return -EINVAL;
1367         }
1368
1369         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1370
1371         if (!cmd.tcpm_keylen)
1372                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1373
1374         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1375                 return -EINVAL;
1376
1377         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1378                               cmd.tcpm_key, cmd.tcpm_keylen);
1379 }
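/* Illustrative note (not part of the original source): the setsockopt path
 * above is what a userspace endpoint hits when it installs a TCP-MD5 key.
 * Below is a minimal, hedged sketch of such a caller; the socket 'fd' and
 * peer address 'peer' are hypothetical, and only the UAPI struct tcp_md5sig
 * fields parsed by tcp_v4_parse_md5_keys() are used (depending on libc
 * version they may need <linux/tcp.h> instead of <netinet/tcp.h>):
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/tcp.h>	// struct tcp_md5sig, TCP_MD5SIG
 *
 *	static int set_tcp_md5_key(int fd, const struct sockaddr_in *peer,
 *				   const void *key, int keylen)
 *	{
 *		struct tcp_md5sig md5 = {};
 *
 *		memcpy(&md5.tcpm_addr, peer, sizeof(*peer));
 *		md5.tcpm_keylen = keylen;	// must be <= TCP_MD5SIG_MAXKEYLEN
 *		memcpy(md5.tcpm_key, key, keylen);
 *		// A zero tcpm_keylen would instead delete the key via
 *		// the tcp_md5_do_del() branch above.
 *		return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *	}
 *
 * TCP_MD5SIG_EXT with TCP_MD5SIG_FLAG_PREFIX / TCP_MD5SIG_FLAG_IFINDEX
 * follows the same shape, additionally filling tcpm_prefixlen and
 * tcpm_ifindex as handled above.
 */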
1380
1381 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1382                                    __be32 daddr, __be32 saddr,
1383                                    const struct tcphdr *th, int nbytes)
1384 {
1385         struct tcp4_pseudohdr *bp;
1386         struct scatterlist sg;
1387         struct tcphdr *_th;
1388
1389         bp = hp->scratch;
1390         bp->saddr = saddr;
1391         bp->daddr = daddr;
1392         bp->pad = 0;
1393         bp->protocol = IPPROTO_TCP;
1394         bp->len = cpu_to_be16(nbytes);
1395
1396         _th = (struct tcphdr *)(bp + 1);
1397         memcpy(_th, th, sizeof(*th));
1398         _th->check = 0;
1399
1400         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1401         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1402                                 sizeof(*bp) + sizeof(*th));
1403         return crypto_ahash_update(hp->md5_req);
1404 }
1405
1406 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1407                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1408 {
1409         struct tcp_md5sig_pool *hp;
1410         struct ahash_request *req;
1411
1412         hp = tcp_get_md5sig_pool();
1413         if (!hp)
1414                 goto clear_hash_noput;
1415         req = hp->md5_req;
1416
1417         if (crypto_ahash_init(req))
1418                 goto clear_hash;
1419         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1420                 goto clear_hash;
1421         if (tcp_md5_hash_key(hp, key))
1422                 goto clear_hash;
1423         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1424         if (crypto_ahash_final(req))
1425                 goto clear_hash;
1426
1427         tcp_put_md5sig_pool();
1428         return 0;
1429
1430 clear_hash:
1431         tcp_put_md5sig_pool();
1432 clear_hash_noput:
1433         memset(md5_hash, 0, 16);
1434         return 1;
1435 }
1436
1437 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1438                         const struct sock *sk,
1439                         const struct sk_buff *skb)
1440 {
1441         struct tcp_md5sig_pool *hp;
1442         struct ahash_request *req;
1443         const struct tcphdr *th = tcp_hdr(skb);
1444         __be32 saddr, daddr;
1445
1446         if (sk) { /* valid for establish/request sockets */
1447                 saddr = sk->sk_rcv_saddr;
1448                 daddr = sk->sk_daddr;
1449         } else {
1450                 const struct iphdr *iph = ip_hdr(skb);
1451                 saddr = iph->saddr;
1452                 daddr = iph->daddr;
1453         }
1454
1455         hp = tcp_get_md5sig_pool();
1456         if (!hp)
1457                 goto clear_hash_noput;
1458         req = hp->md5_req;
1459
1460         if (crypto_ahash_init(req))
1461                 goto clear_hash;
1462
1463         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1464                 goto clear_hash;
1465         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1466                 goto clear_hash;
1467         if (tcp_md5_hash_key(hp, key))
1468                 goto clear_hash;
1469         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1470         if (crypto_ahash_final(req))
1471                 goto clear_hash;
1472
1473         tcp_put_md5sig_pool();
1474         return 0;
1475
1476 clear_hash:
1477         tcp_put_md5sig_pool();
1478 clear_hash_noput:
1479         memset(md5_hash, 0, 16);
1480         return 1;
1481 }
1482 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1483
1484 #endif
1485
1486 static void tcp_v4_init_req(struct request_sock *req,
1487                             const struct sock *sk_listener,
1488                             struct sk_buff *skb)
1489 {
1490         struct inet_request_sock *ireq = inet_rsk(req);
1491         struct net *net = sock_net(sk_listener);
1492
1493         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1494         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1495         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1496 }
1497
1498 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1499                                           struct sk_buff *skb,
1500                                           struct flowi *fl,
1501                                           struct request_sock *req)
1502 {
1503         tcp_v4_init_req(req, sk, skb);
1504
1505         if (security_inet_conn_request(sk, skb, req))
1506                 return NULL;
1507
1508         return inet_csk_route_req(sk, &fl->u.ip4, req);
1509 }
1510
1511 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1512         .family         =       PF_INET,
1513         .obj_size       =       sizeof(struct tcp_request_sock),
1514         .rtx_syn_ack    =       tcp_rtx_synack,
1515         .send_ack       =       tcp_v4_reqsk_send_ack,
1516         .destructor     =       tcp_v4_reqsk_destructor,
1517         .send_reset     =       tcp_v4_send_reset,
1518         .syn_ack_timeout =      tcp_syn_ack_timeout,
1519 };
1520
1521 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1522         .mss_clamp      =       TCP_MSS_DEFAULT,
1523 #ifdef CONFIG_TCP_MD5SIG
1524         .req_md5_lookup =       tcp_v4_md5_lookup,
1525         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1526 #endif
1527 #ifdef CONFIG_SYN_COOKIES
1528         .cookie_init_seq =      cookie_v4_init_sequence,
1529 #endif
1530         .route_req      =       tcp_v4_route_req,
1531         .init_seq       =       tcp_v4_init_seq,
1532         .init_ts_off    =       tcp_v4_init_ts_off,
1533         .send_synack    =       tcp_v4_send_synack,
1534 };
1535
1536 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1537 {
1538         /* Never answer to SYNs sent to broadcast or multicast */
1539         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1540                 goto drop;
1541
1542         return tcp_conn_request(&tcp_request_sock_ops,
1543                                 &tcp_request_sock_ipv4_ops, sk, skb);
1544
1545 drop:
1546         tcp_listendrop(sk);
1547         return 0;
1548 }
1549 EXPORT_SYMBOL(tcp_v4_conn_request);
1550
1551
1552 /*
1553  * The three way handshake has completed - we got a valid synack -
1554  * now create the new socket.
1555  */
1556 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1557                                   struct request_sock *req,
1558                                   struct dst_entry *dst,
1559                                   struct request_sock *req_unhash,
1560                                   bool *own_req)
1561 {
1562         struct inet_request_sock *ireq;
1563         bool found_dup_sk = false;
1564         struct inet_sock *newinet;
1565         struct tcp_sock *newtp;
1566         struct sock *newsk;
1567 #ifdef CONFIG_TCP_MD5SIG
1568         const union tcp_md5_addr *addr;
1569         struct tcp_md5sig_key *key;
1570         int l3index;
1571 #endif
1572         struct ip_options_rcu *inet_opt;
1573
1574         if (sk_acceptq_is_full(sk))
1575                 goto exit_overflow;
1576
1577         newsk = tcp_create_openreq_child(sk, req, skb);
1578         if (!newsk)
1579                 goto exit_nonewsk;
1580
1581         newsk->sk_gso_type = SKB_GSO_TCPV4;
1582         inet_sk_rx_dst_set(newsk, skb);
1583
1584         newtp                 = tcp_sk(newsk);
1585         newinet               = inet_sk(newsk);
1586         ireq                  = inet_rsk(req);
1587         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1588         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1589         newsk->sk_bound_dev_if = ireq->ir_iif;
1590         newinet->inet_saddr   = ireq->ir_loc_addr;
1591         inet_opt              = rcu_dereference(ireq->ireq_opt);
1592         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1593         newinet->mc_index     = inet_iif(skb);
1594         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1595         newinet->rcv_tos      = ip_hdr(skb)->tos;
1596         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1597         if (inet_opt)
1598                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1599         newinet->inet_id = get_random_u16();
1600
1601         /* Set the ToS of the new socket based upon the value of the incoming SYN.
1602          * ECT bits are set later in tcp_init_transfer().
1603          */
1604         if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1605                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1606
1607         if (!dst) {
1608                 dst = inet_csk_route_child_sock(sk, newsk, req);
1609                 if (!dst)
1610                         goto put_and_exit;
1611         } else {
1612                 /* syncookie case: see end of cookie_v4_check() */
1613         }
1614         sk_setup_caps(newsk, dst);
1615
1616         tcp_ca_openreq_child(newsk, dst);
1617
1618         tcp_sync_mss(newsk, dst_mtu(dst));
1619         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1620
1621         tcp_initialize_rcv_mss(newsk);
1622
1623 #ifdef CONFIG_TCP_MD5SIG
1624         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1625         /* Copy over the MD5 key from the original socket */
1626         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1627         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1628         if (key) {
1629                 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1630                         goto put_and_exit;
1631                 sk_gso_disable(newsk);
1632         }
1633 #endif
1634
1635         if (__inet_inherit_port(sk, newsk) < 0)
1636                 goto put_and_exit;
1637         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1638                                        &found_dup_sk);
1639         if (likely(*own_req)) {
1640                 tcp_move_syn(newtp, req);
1641                 ireq->ireq_opt = NULL;
1642         } else {
1643                 newinet->inet_opt = NULL;
1644
1645                 if (!req_unhash && found_dup_sk) {
1646                         /* This code path should only be executed in the
1647                          * syncookie case.
1648                          */
1649                         bh_unlock_sock(newsk);
1650                         sock_put(newsk);
1651                         newsk = NULL;
1652                 }
1653         }
1654         return newsk;
1655
1656 exit_overflow:
1657         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1658 exit_nonewsk:
1659         dst_release(dst);
1660 exit:
1661         tcp_listendrop(sk);
1662         return NULL;
1663 put_and_exit:
1664         newinet->inet_opt = NULL;
1665         inet_csk_prepare_forced_close(newsk);
1666         tcp_done(newsk);
1667         goto exit;
1668 }
1669 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1670
1671 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1672 {
1673 #ifdef CONFIG_SYN_COOKIES
1674         const struct tcphdr *th = tcp_hdr(skb);
1675
1676         if (!th->syn)
1677                 sk = cookie_v4_check(sk, skb);
1678 #endif
1679         return sk;
1680 }
1681
1682 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1683                          struct tcphdr *th, u32 *cookie)
1684 {
1685         u16 mss = 0;
1686 #ifdef CONFIG_SYN_COOKIES
1687         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1688                                     &tcp_request_sock_ipv4_ops, sk, th);
1689         if (mss) {
1690                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1691                 tcp_synq_overflow(sk);
1692         }
1693 #endif
1694         return mss;
1695 }
1696
1697 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1698                                                            u32));
1699 /* The socket must have its spinlock held when we get
1700  * here, unless it is a TCP_LISTEN socket.
1701  *
1702  * We have a potential double-lock case here, so even when
1703  * doing backlog processing we use the BH locking scheme.
1704  * This is because we cannot sleep with the original spinlock
1705  * held.
1706  */
1707 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1708 {
1709         enum skb_drop_reason reason;
1710         struct sock *rsk;
1711
1712         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1713                 struct dst_entry *dst;
1714
1715                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1716                                                 lockdep_sock_is_held(sk));
1717
1718                 sock_rps_save_rxhash(sk, skb);
1719                 sk_mark_napi_id(sk, skb);
1720                 if (dst) {
1721                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1722                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1723                                              dst, 0)) {
1724                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1725                                 dst_release(dst);
1726                         }
1727                 }
1728                 tcp_rcv_established(sk, skb);
1729                 return 0;
1730         }
1731
1732         reason = SKB_DROP_REASON_NOT_SPECIFIED;
1733         if (tcp_checksum_complete(skb))
1734                 goto csum_err;
1735
1736         if (sk->sk_state == TCP_LISTEN) {
1737                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1738
1739                 if (!nsk)
1740                         goto discard;
1741                 if (nsk != sk) {
1742                         if (tcp_child_process(sk, nsk, skb)) {
1743                                 rsk = nsk;
1744                                 goto reset;
1745                         }
1746                         return 0;
1747                 }
1748         } else
1749                 sock_rps_save_rxhash(sk, skb);
1750
1751         if (tcp_rcv_state_process(sk, skb)) {
1752                 rsk = sk;
1753                 goto reset;
1754         }
1755         return 0;
1756
1757 reset:
1758         tcp_v4_send_reset(rsk, skb);
1759 discard:
1760         kfree_skb_reason(skb, reason);
1761         /* Be careful here. If this function gets more complicated and
1762          * gcc suffers from register pressure on the x86, sk (in %ebx)
1763          * might be destroyed here. This current version compiles correctly,
1764          * but you have been warned.
1765          */
1766         return 0;
1767
1768 csum_err:
1769         reason = SKB_DROP_REASON_TCP_CSUM;
1770         trace_tcp_bad_csum(skb);
1771         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1772         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1773         goto discard;
1774 }
1775 EXPORT_SYMBOL(tcp_v4_do_rcv);
1776
1777 int tcp_v4_early_demux(struct sk_buff *skb)
1778 {
1779         struct net *net = dev_net(skb->dev);
1780         const struct iphdr *iph;
1781         const struct tcphdr *th;
1782         struct sock *sk;
1783
1784         if (skb->pkt_type != PACKET_HOST)
1785                 return 0;
1786
1787         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1788                 return 0;
1789
1790         iph = ip_hdr(skb);
1791         th = tcp_hdr(skb);
1792
1793         if (th->doff < sizeof(struct tcphdr) / 4)
1794                 return 0;
1795
1796         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1797                                        iph->saddr, th->source,
1798                                        iph->daddr, ntohs(th->dest),
1799                                        skb->skb_iif, inet_sdif(skb));
1800         if (sk) {
1801                 skb->sk = sk;
1802                 skb->destructor = sock_edemux;
1803                 if (sk_fullsock(sk)) {
1804                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1805
1806                         if (dst)
1807                                 dst = dst_check(dst, 0);
1808                         if (dst &&
1809                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1810                                 skb_dst_set_noref(skb, dst);
1811                 }
1812         }
1813         return 0;
1814 }
1815
1816 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1817                      enum skb_drop_reason *reason)
1818 {
1819         u32 limit, tail_gso_size, tail_gso_segs;
1820         struct skb_shared_info *shinfo;
1821         const struct tcphdr *th;
1822         struct tcphdr *thtail;
1823         struct sk_buff *tail;
1824         unsigned int hdrlen;
1825         bool fragstolen;
1826         u32 gso_segs;
1827         u32 gso_size;
1828         int delta;
1829
1830         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1831          * we can fix skb->truesize to its real value to avoid future drops.
1832          * This is valid because skb is not yet charged to the socket.
1833          * It has been noticed that pure SACK packets were sometimes dropped
1834          * (if cooked by drivers without copybreak feature).
1835          */
1836         skb_condense(skb);
1837
1838         skb_dst_drop(skb);
1839
1840         if (unlikely(tcp_checksum_complete(skb))) {
1841                 bh_unlock_sock(sk);
1842                 trace_tcp_bad_csum(skb);
1843                 *reason = SKB_DROP_REASON_TCP_CSUM;
1844                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1845                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1846                 return true;
1847         }
1848
1849         /* Attempt coalescing to last skb in backlog, even if we are
1850          * above the limits.
1851          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1852          */
1853         th = (const struct tcphdr *)skb->data;
1854         hdrlen = th->doff * 4;
1855
1856         tail = sk->sk_backlog.tail;
1857         if (!tail)
1858                 goto no_coalesce;
1859         thtail = (struct tcphdr *)tail->data;
1860
1861         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1862             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1863             ((TCP_SKB_CB(tail)->tcp_flags |
1864               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1865             !((TCP_SKB_CB(tail)->tcp_flags &
1866               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1867             ((TCP_SKB_CB(tail)->tcp_flags ^
1868               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1869 #ifdef CONFIG_TLS_DEVICE
1870             tail->decrypted != skb->decrypted ||
1871 #endif
1872             thtail->doff != th->doff ||
1873             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1874                 goto no_coalesce;
1875
1876         __skb_pull(skb, hdrlen);
1877
1878         shinfo = skb_shinfo(skb);
1879         gso_size = shinfo->gso_size ?: skb->len;
1880         gso_segs = shinfo->gso_segs ?: 1;
1881
1882         shinfo = skb_shinfo(tail);
1883         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1884         tail_gso_segs = shinfo->gso_segs ?: 1;
1885
1886         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1887                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1888
1889                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1890                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1891                         thtail->window = th->window;
1892                 }
1893
1894                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1895                  * thtail->fin, so that the fast path in tcp_rcv_established()
1896                  * is not entered if we append a packet with a FIN.
1897                  * SYN, RST, URG are not present.
1898                  * ACK is set on both packets.
1899                  * PSH : we do not really care in TCP stack,
1900                  *       at least for 'GRO' packets.
1901                  */
1902                 thtail->fin |= th->fin;
1903                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1904
1905                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1906                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1907                         tail->tstamp = skb->tstamp;
1908                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1909                 }
1910
1911                 /* Not as strict as GRO. We only need to carry the max mss value */
1912                 shinfo->gso_size = max(gso_size, tail_gso_size);
1913                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1914
1915                 sk->sk_backlog.len += delta;
1916                 __NET_INC_STATS(sock_net(sk),
1917                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1918                 kfree_skb_partial(skb, fragstolen);
1919                 return false;
1920         }
1921         __skb_push(skb, hdrlen);
1922
1923 no_coalesce:
1924         limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1925
1926         /* Only the socket owner can try to collapse/prune rx queues
1927          * to reduce memory overhead, so add a little headroom here.
1928          * Only a few socket backlogs are likely to be non-empty concurrently.
1929          */
1930         limit += 64 * 1024;
1931
1932         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1933                 bh_unlock_sock(sk);
1934                 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1935                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1936                 return true;
1937         }
1938         return false;
1939 }
1940 EXPORT_SYMBOL(tcp_add_backlog);
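/* Worked example (illustrative values only): with a hypothetical sk_rcvbuf
 * of 131072 bytes and sk_sndbuf of 16384 bytes, the backlog ceiling computed
 * in tcp_add_backlog() above is
 *
 *	limit = 131072 + (16384 >> 1) + 64 * 1024
 *	      = 131072 + 8192 + 65536 = 204800 bytes
 *
 * beyond which sk_add_backlog() fails and the skb is dropped with
 * SKB_DROP_REASON_SOCKET_BACKLOG.
 */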
1941
1942 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1943 {
1944         struct tcphdr *th = (struct tcphdr *)skb->data;
1945
1946         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1947 }
1948 EXPORT_SYMBOL(tcp_filter);
1949
1950 static void tcp_v4_restore_cb(struct sk_buff *skb)
1951 {
1952         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1953                 sizeof(struct inet_skb_parm));
1954 }
1955
1956 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1957                            const struct tcphdr *th)
1958 {
1959         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1960          * barrier() makes sure the compiler won't play fool^Waliasing games.
1961          */
1962         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1963                 sizeof(struct inet_skb_parm));
1964         barrier();
1965
1966         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1967         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1968                                     skb->len - th->doff * 4);
1969         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1970         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1971         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1972         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1973         TCP_SKB_CB(skb)->sacked  = 0;
1974         TCP_SKB_CB(skb)->has_rxtstamp =
1975                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1976 }
1977
1978 /*
1979  *      From tcp_input.c
1980  */
1981
1982 int tcp_v4_rcv(struct sk_buff *skb)
1983 {
1984         struct net *net = dev_net(skb->dev);
1985         enum skb_drop_reason drop_reason;
1986         int sdif = inet_sdif(skb);
1987         int dif = inet_iif(skb);
1988         const struct iphdr *iph;
1989         const struct tcphdr *th;
1990         bool refcounted;
1991         struct sock *sk;
1992         int ret;
1993
1994         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1995         if (skb->pkt_type != PACKET_HOST)
1996                 goto discard_it;
1997
1998         /* Count it even if it's bad */
1999         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2000
2001         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2002                 goto discard_it;
2003
2004         th = (const struct tcphdr *)skb->data;
2005
2006         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2007                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2008                 goto bad_packet;
2009         }
2010         if (!pskb_may_pull(skb, th->doff * 4))
2011                 goto discard_it;
2012
2013         /* An explanation is required here, I think.
2014          * Packet length and doff are validated by header prediction,
2015          * provided the case of th->doff==0 is eliminated.
2016          * So, we defer the checks. */
2017
2018         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2019                 goto csum_error;
2020
2021         th = (const struct tcphdr *)skb->data;
2022         iph = ip_hdr(skb);
2023 lookup:
2024         sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2025                                skb, __tcp_hdrlen(th), th->source,
2026                                th->dest, sdif, &refcounted);
2027         if (!sk)
2028                 goto no_tcp_socket;
2029
2030 process:
2031         if (sk->sk_state == TCP_TIME_WAIT)
2032                 goto do_time_wait;
2033
2034         if (sk->sk_state == TCP_NEW_SYN_RECV) {
2035                 struct request_sock *req = inet_reqsk(sk);
2036                 bool req_stolen = false;
2037                 struct sock *nsk;
2038
2039                 sk = req->rsk_listener;
2040                 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2041                         drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2042                 else
2043                         drop_reason = tcp_inbound_md5_hash(sk, skb,
2044                                                    &iph->saddr, &iph->daddr,
2045                                                    AF_INET, dif, sdif);
2046                 if (unlikely(drop_reason)) {
2047                         sk_drops_add(sk, skb);
2048                         reqsk_put(req);
2049                         goto discard_it;
2050                 }
2051                 if (tcp_checksum_complete(skb)) {
2052                         reqsk_put(req);
2053                         goto csum_error;
2054                 }
2055                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2056                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2057                         if (!nsk) {
2058                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2059                                 goto lookup;
2060                         }
2061                         sk = nsk;
2062                         /* reuseport_migrate_sock() has already taken one sk_refcnt
2063                          * reference before returning.
2064                          */
2065                 } else {
2066                         /* We own a reference on the listener, increase it again
2067                          * as we might lose it too soon.
2068                          */
2069                         sock_hold(sk);
2070                 }
2071                 refcounted = true;
2072                 nsk = NULL;
2073                 if (!tcp_filter(sk, skb)) {
2074                         th = (const struct tcphdr *)skb->data;
2075                         iph = ip_hdr(skb);
2076                         tcp_v4_fill_cb(skb, iph, th);
2077                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2078                 } else {
2079                         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2080                 }
2081                 if (!nsk) {
2082                         reqsk_put(req);
2083                         if (req_stolen) {
2084                                 /* Another cpu got exclusive access to req
2085                                  * and created a full blown socket.
2086                                  * Try to feed this packet to this socket
2087                                  * instead of discarding it.
2088                                  */
2089                                 tcp_v4_restore_cb(skb);
2090                                 sock_put(sk);
2091                                 goto lookup;
2092                         }
2093                         goto discard_and_relse;
2094                 }
2095                 nf_reset_ct(skb);
2096                 if (nsk == sk) {
2097                         reqsk_put(req);
2098                         tcp_v4_restore_cb(skb);
2099                 } else if (tcp_child_process(sk, nsk, skb)) {
2100                         tcp_v4_send_reset(nsk, skb);
2101                         goto discard_and_relse;
2102                 } else {
2103                         sock_put(sk);
2104                         return 0;
2105                 }
2106         }
2107
2108         if (static_branch_unlikely(&ip4_min_ttl)) {
2109                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2110                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2111                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2112                         drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2113                         goto discard_and_relse;
2114                 }
2115         }
2116
2117         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2118                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2119                 goto discard_and_relse;
2120         }
2121
2122         drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2123                                            &iph->daddr, AF_INET, dif, sdif);
2124         if (drop_reason)
2125                 goto discard_and_relse;
2126
2127         nf_reset_ct(skb);
2128
2129         if (tcp_filter(sk, skb)) {
2130                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2131                 goto discard_and_relse;
2132         }
2133         th = (const struct tcphdr *)skb->data;
2134         iph = ip_hdr(skb);
2135         tcp_v4_fill_cb(skb, iph, th);
2136
2137         skb->dev = NULL;
2138
2139         if (sk->sk_state == TCP_LISTEN) {
2140                 ret = tcp_v4_do_rcv(sk, skb);
2141                 goto put_and_return;
2142         }
2143
2144         sk_incoming_cpu_update(sk);
2145
2146         bh_lock_sock_nested(sk);
2147         tcp_segs_in(tcp_sk(sk), skb);
2148         ret = 0;
2149         if (!sock_owned_by_user(sk)) {
2150                 ret = tcp_v4_do_rcv(sk, skb);
2151         } else {
2152                 if (tcp_add_backlog(sk, skb, &drop_reason))
2153                         goto discard_and_relse;
2154         }
2155         bh_unlock_sock(sk);
2156
2157 put_and_return:
2158         if (refcounted)
2159                 sock_put(sk);
2160
2161         return ret;
2162
2163 no_tcp_socket:
2164         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2165         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2166                 goto discard_it;
2167
2168         tcp_v4_fill_cb(skb, iph, th);
2169
2170         if (tcp_checksum_complete(skb)) {
2171 csum_error:
2172                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2173                 trace_tcp_bad_csum(skb);
2174                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2175 bad_packet:
2176                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2177         } else {
2178                 tcp_v4_send_reset(NULL, skb);
2179         }
2180
2181 discard_it:
2182         SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2183         /* Discard frame. */
2184         kfree_skb_reason(skb, drop_reason);
2185         return 0;
2186
2187 discard_and_relse:
2188         sk_drops_add(sk, skb);
2189         if (refcounted)
2190                 sock_put(sk);
2191         goto discard_it;
2192
2193 do_time_wait:
2194         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2195                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2196                 inet_twsk_put(inet_twsk(sk));
2197                 goto discard_it;
2198         }
2199
2200         tcp_v4_fill_cb(skb, iph, th);
2201
2202         if (tcp_checksum_complete(skb)) {
2203                 inet_twsk_put(inet_twsk(sk));
2204                 goto csum_error;
2205         }
2206         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2207         case TCP_TW_SYN: {
2208                 struct sock *sk2 = inet_lookup_listener(net,
2209                                                         net->ipv4.tcp_death_row.hashinfo,
2210                                                         skb, __tcp_hdrlen(th),
2211                                                         iph->saddr, th->source,
2212                                                         iph->daddr, th->dest,
2213                                                         inet_iif(skb),
2214                                                         sdif);
2215                 if (sk2) {
2216                         inet_twsk_deschedule_put(inet_twsk(sk));
2217                         sk = sk2;
2218                         tcp_v4_restore_cb(skb);
2219                         refcounted = false;
2220                         goto process;
2221                 }
2222         }
2223                 /* to ACK */
2224                 fallthrough;
2225         case TCP_TW_ACK:
2226                 tcp_v4_timewait_ack(sk, skb);
2227                 break;
2228         case TCP_TW_RST:
2229                 tcp_v4_send_reset(sk, skb);
2230                 inet_twsk_deschedule_put(inet_twsk(sk));
2231                 goto discard_it;
2232         case TCP_TW_SUCCESS:;
2233         }
2234         goto discard_it;
2235 }
2236
2237 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2238         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2239         .twsk_unique    = tcp_twsk_unique,
2240         .twsk_destructor= tcp_twsk_destructor,
2241 };
2242
2243 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2244 {
2245         struct dst_entry *dst = skb_dst(skb);
2246
2247         if (dst && dst_hold_safe(dst)) {
2248                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2249                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2250         }
2251 }
2252 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2253
2254 const struct inet_connection_sock_af_ops ipv4_specific = {
2255         .queue_xmit        = ip_queue_xmit,
2256         .send_check        = tcp_v4_send_check,
2257         .rebuild_header    = inet_sk_rebuild_header,
2258         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2259         .conn_request      = tcp_v4_conn_request,
2260         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2261         .net_header_len    = sizeof(struct iphdr),
2262         .setsockopt        = ip_setsockopt,
2263         .getsockopt        = ip_getsockopt,
2264         .addr2sockaddr     = inet_csk_addr2sockaddr,
2265         .sockaddr_len      = sizeof(struct sockaddr_in),
2266         .mtu_reduced       = tcp_v4_mtu_reduced,
2267 };
2268 EXPORT_SYMBOL(ipv4_specific);
2269
2270 #ifdef CONFIG_TCP_MD5SIG
2271 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2272         .md5_lookup             = tcp_v4_md5_lookup,
2273         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2274         .md5_parse              = tcp_v4_parse_md5_keys,
2275 };
2276 #endif
2277
2278 /* NOTE: A lot of things are set to zero explicitly by the call to
2279  *       sk_alloc(), so they need not be done here.
2280  */
2281 static int tcp_v4_init_sock(struct sock *sk)
2282 {
2283         struct inet_connection_sock *icsk = inet_csk(sk);
2284
2285         tcp_init_sock(sk);
2286
2287         icsk->icsk_af_ops = &ipv4_specific;
2288
2289 #ifdef CONFIG_TCP_MD5SIG
2290         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2291 #endif
2292
2293         return 0;
2294 }
2295
2296 void tcp_v4_destroy_sock(struct sock *sk)
2297 {
2298         struct tcp_sock *tp = tcp_sk(sk);
2299
2300         trace_tcp_destroy_sock(sk);
2301
2302         tcp_clear_xmit_timers(sk);
2303
2304         tcp_cleanup_congestion_control(sk);
2305
2306         tcp_cleanup_ulp(sk);
2307
2308         /* Clean up the write buffer. */
2309         tcp_write_queue_purge(sk);
2310
2311         /* Check if we want to disable active TFO */
2312         tcp_fastopen_active_disable_ofo_check(sk);
2313
2314         /* Cleans up our, hopefully empty, out_of_order_queue. */
2315         skb_rbtree_purge(&tp->out_of_order_queue);
2316
2317 #ifdef CONFIG_TCP_MD5SIG
2318         /* Clean up the MD5 key list, if any */
2319         if (tp->md5sig_info) {
2320                 tcp_clear_md5_list(sk);
2321                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2322                 tp->md5sig_info = NULL;
2323                 static_branch_slow_dec_deferred(&tcp_md5_needed);
2324         }
2325 #endif
2326
2327         /* Clean up a referenced TCP bind bucket. */
2328         if (inet_csk(sk)->icsk_bind_hash)
2329                 inet_put_port(sk);
2330
2331         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2332
2333         /* If socket is aborted during connect operation */
2334         tcp_free_fastopen_req(tp);
2335         tcp_fastopen_destroy_cipher(sk);
2336         tcp_saved_syn_free(tp);
2337
2338         sk_sockets_allocated_dec(sk);
2339 }
2340 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2341
2342 #ifdef CONFIG_PROC_FS
2343 /* Proc filesystem TCP sock list dumping. */
2344
2345 static unsigned short seq_file_family(const struct seq_file *seq);
2346
2347 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2348 {
2349         unsigned short family = seq_file_family(seq);
2350
2351         /* AF_UNSPEC is used as a match-all */
2352         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2353                 net_eq(sock_net(sk), seq_file_net(seq)));
2354 }
2355
2356 /* Find a non-empty bucket (starting from st->bucket)
2357  * and return the first sk from it.
2358  */
2359 static void *listening_get_first(struct seq_file *seq)
2360 {
2361         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2362         struct tcp_iter_state *st = seq->private;
2363
2364         st->offset = 0;
2365         for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2366                 struct inet_listen_hashbucket *ilb2;
2367                 struct hlist_nulls_node *node;
2368                 struct sock *sk;
2369
2370                 ilb2 = &hinfo->lhash2[st->bucket];
2371                 if (hlist_nulls_empty(&ilb2->nulls_head))
2372                         continue;
2373
2374                 spin_lock(&ilb2->lock);
2375                 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2376                         if (seq_sk_match(seq, sk))
2377                                 return sk;
2378                 }
2379                 spin_unlock(&ilb2->lock);
2380         }
2381
2382         return NULL;
2383 }
2384
2385 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2386  * If "cur" is the last one in the st->bucket,
2387  * call listening_get_first() to return the first sk of the next
2388  * non-empty bucket.
2389  */
2390 static void *listening_get_next(struct seq_file *seq, void *cur)
2391 {
2392         struct tcp_iter_state *st = seq->private;
2393         struct inet_listen_hashbucket *ilb2;
2394         struct hlist_nulls_node *node;
2395         struct inet_hashinfo *hinfo;
2396         struct sock *sk = cur;
2397
2398         ++st->num;
2399         ++st->offset;
2400
2401         sk = sk_nulls_next(sk);
2402         sk_nulls_for_each_from(sk, node) {
2403                 if (seq_sk_match(seq, sk))
2404                         return sk;
2405         }
2406
2407         hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2408         ilb2 = &hinfo->lhash2[st->bucket];
2409         spin_unlock(&ilb2->lock);
2410         ++st->bucket;
2411         return listening_get_first(seq);
2412 }
2413
2414 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2415 {
2416         struct tcp_iter_state *st = seq->private;
2417         void *rc;
2418
2419         st->bucket = 0;
2420         st->offset = 0;
2421         rc = listening_get_first(seq);
2422
2423         while (rc && *pos) {
2424                 rc = listening_get_next(seq, rc);
2425                 --*pos;
2426         }
2427         return rc;
2428 }
2429
2430 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2431                                 const struct tcp_iter_state *st)
2432 {
2433         return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2434 }
2435
2436 /*
2437  * Get first established socket starting from bucket given in st->bucket.
2438  * If st->bucket is zero, the very first socket in the hash is returned.
2439  */
2440 static void *established_get_first(struct seq_file *seq)
2441 {
2442         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2443         struct tcp_iter_state *st = seq->private;
2444
2445         st->offset = 0;
2446         for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2447                 struct sock *sk;
2448                 struct hlist_nulls_node *node;
2449                 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2450
2451                 /* Lockless fast path for the common case of empty buckets */
2452                 if (empty_bucket(hinfo, st))
2453                         continue;
2454
2455                 spin_lock_bh(lock);
2456                 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2457                         if (seq_sk_match(seq, sk))
2458                                 return sk;
2459                 }
2460                 spin_unlock_bh(lock);
2461         }
2462
2463         return NULL;
2464 }
2465
2466 static void *established_get_next(struct seq_file *seq, void *cur)
2467 {
2468         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2469         struct tcp_iter_state *st = seq->private;
2470         struct hlist_nulls_node *node;
2471         struct sock *sk = cur;
2472
2473         ++st->num;
2474         ++st->offset;
2475
2476         sk = sk_nulls_next(sk);
2477
2478         sk_nulls_for_each_from(sk, node) {
2479                 if (seq_sk_match(seq, sk))
2480                         return sk;
2481         }
2482
2483         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2484         ++st->bucket;
2485         return established_get_first(seq);
2486 }
2487
2488 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2489 {
2490         struct tcp_iter_state *st = seq->private;
2491         void *rc;
2492
2493         st->bucket = 0;
2494         rc = established_get_first(seq);
2495
2496         while (rc && pos) {
2497                 rc = established_get_next(seq, rc);
2498                 --pos;
2499         }
2500         return rc;
2501 }
2502
2503 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2504 {
2505         void *rc;
2506         struct tcp_iter_state *st = seq->private;
2507
2508         st->state = TCP_SEQ_STATE_LISTENING;
2509         rc        = listening_get_idx(seq, &pos);
2510
2511         if (!rc) {
2512                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2513                 rc        = established_get_idx(seq, pos);
2514         }
2515
2516         return rc;
2517 }
2518
2519 static void *tcp_seek_last_pos(struct seq_file *seq)
2520 {
2521         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2522         struct tcp_iter_state *st = seq->private;
2523         int bucket = st->bucket;
2524         int offset = st->offset;
2525         int orig_num = st->num;
2526         void *rc = NULL;
2527
2528         switch (st->state) {
2529         case TCP_SEQ_STATE_LISTENING:
2530                 if (st->bucket > hinfo->lhash2_mask)
2531                         break;
2532                 rc = listening_get_first(seq);
2533                 while (offset-- && rc && bucket == st->bucket)
2534                         rc = listening_get_next(seq, rc);
2535                 if (rc)
2536                         break;
2537                 st->bucket = 0;
2538                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2539                 fallthrough;
2540         case TCP_SEQ_STATE_ESTABLISHED:
2541                 if (st->bucket > hinfo->ehash_mask)
2542                         break;
2543                 rc = established_get_first(seq);
2544                 while (offset-- && rc && bucket == st->bucket)
2545                         rc = established_get_next(seq, rc);
2546         }
2547
2548         st->num = orig_num;
2549
2550         return rc;
2551 }
2552
2553 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2554 {
2555         struct tcp_iter_state *st = seq->private;
2556         void *rc;
2557
2558         if (*pos && *pos == st->last_pos) {
2559                 rc = tcp_seek_last_pos(seq);
2560                 if (rc)
2561                         goto out;
2562         }
2563
2564         st->state = TCP_SEQ_STATE_LISTENING;
2565         st->num = 0;
2566         st->bucket = 0;
2567         st->offset = 0;
2568         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2569
2570 out:
2571         st->last_pos = *pos;
2572         return rc;
2573 }
2574 EXPORT_SYMBOL(tcp_seq_start);
2575
2576 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2577 {
2578         struct tcp_iter_state *st = seq->private;
2579         void *rc = NULL;
2580
2581         if (v == SEQ_START_TOKEN) {
2582                 rc = tcp_get_idx(seq, 0);
2583                 goto out;
2584         }
2585
2586         switch (st->state) {
2587         case TCP_SEQ_STATE_LISTENING:
2588                 rc = listening_get_next(seq, v);
2589                 if (!rc) {
2590                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2591                         st->bucket = 0;
2592                         st->offset = 0;
2593                         rc        = established_get_first(seq);
2594                 }
2595                 break;
2596         case TCP_SEQ_STATE_ESTABLISHED:
2597                 rc = established_get_next(seq, v);
2598                 break;
2599         }
2600 out:
2601         ++*pos;
2602         st->last_pos = *pos;
2603         return rc;
2604 }
2605 EXPORT_SYMBOL(tcp_seq_next);
2606
2607 void tcp_seq_stop(struct seq_file *seq, void *v)
2608 {
2609         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2610         struct tcp_iter_state *st = seq->private;
2611
2612         switch (st->state) {
2613         case TCP_SEQ_STATE_LISTENING:
2614                 if (v != SEQ_START_TOKEN)
2615                         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2616                 break;
2617         case TCP_SEQ_STATE_ESTABLISHED:
2618                 if (v)
2619                         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2620                 break;
2621         }
2622 }
2623 EXPORT_SYMBOL(tcp_seq_stop);
2624
2625 static void get_openreq4(const struct request_sock *req,
2626                          struct seq_file *f, int i)
2627 {
2628         const struct inet_request_sock *ireq = inet_rsk(req);
2629         long delta = req->rsk_timer.expires - jiffies;
2630
2631         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2632                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2633                 i,
2634                 ireq->ir_loc_addr,
2635                 ireq->ir_num,
2636                 ireq->ir_rmt_addr,
2637                 ntohs(ireq->ir_rmt_port),
2638                 TCP_SYN_RECV,
2639                 0, 0, /* could print option size, but that is af dependent. */
2640                 1,    /* timers active (only the expire timer) */
2641                 jiffies_delta_to_clock_t(delta),
2642                 req->num_timeout,
2643                 from_kuid_munged(seq_user_ns(f),
2644                                  sock_i_uid(req->rsk_listener)),
2645                 0,  /* non-standard timer */
2646                 0, /* open_requests have no inode */
2647                 0,
2648                 req);
2649 }
2650
2651 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2652 {
2653         int timer_active;
2654         unsigned long timer_expires;
2655         const struct tcp_sock *tp = tcp_sk(sk);
2656         const struct inet_connection_sock *icsk = inet_csk(sk);
2657         const struct inet_sock *inet = inet_sk(sk);
2658         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2659         __be32 dest = inet->inet_daddr;
2660         __be32 src = inet->inet_rcv_saddr;
2661         __u16 destp = ntohs(inet->inet_dport);
2662         __u16 srcp = ntohs(inet->inet_sport);
2663         int rx_queue;
2664         int state;
2665
2666         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2667             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2668             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2669                 timer_active    = 1;
2670                 timer_expires   = icsk->icsk_timeout;
2671         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2672                 timer_active    = 4;
2673                 timer_expires   = icsk->icsk_timeout;
2674         } else if (timer_pending(&sk->sk_timer)) {
2675                 timer_active    = 2;
2676                 timer_expires   = sk->sk_timer.expires;
2677         } else {
2678                 timer_active    = 0;
2679                 timer_expires = jiffies;
2680         }
2681
2682         state = inet_sk_state_load(sk);
2683         if (state == TCP_LISTEN)
2684                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2685         else
2686                 /* Because we don't lock the socket,
2687                  * we might find a transient negative value.
2688                  */
2689                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2690                                       READ_ONCE(tp->copied_seq), 0);
2691
2692         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2693                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2694                 i, src, srcp, dest, destp, state,
2695                 READ_ONCE(tp->write_seq) - tp->snd_una,
2696                 rx_queue,
2697                 timer_active,
2698                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2699                 icsk->icsk_retransmits,
2700                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2701                 icsk->icsk_probes_out,
2702                 sock_i_ino(sk),
2703                 refcount_read(&sk->sk_refcnt), sk,
2704                 jiffies_to_clock_t(icsk->icsk_rto),
2705                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2706                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2707                 tcp_snd_cwnd(tp),
2708                 state == TCP_LISTEN ?
2709                     fastopenq->max_qlen :
2710                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2711 }
2712
2713 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2714                                struct seq_file *f, int i)
2715 {
2716         long delta = tw->tw_timer.expires - jiffies;
2717         __be32 dest, src;
2718         __u16 destp, srcp;
2719
2720         dest  = tw->tw_daddr;
2721         src   = tw->tw_rcv_saddr;
2722         destp = ntohs(tw->tw_dport);
2723         srcp  = ntohs(tw->tw_sport);
2724
2725         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2726                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2727                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2728                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2729                 refcount_read(&tw->tw_refcnt), tw);
2730 }
2731
2732 #define TMPSZ 150
2733
2734 static int tcp4_seq_show(struct seq_file *seq, void *v)
2735 {
2736         struct tcp_iter_state *st;
2737         struct sock *sk = v;
2738
2739         seq_setwidth(seq, TMPSZ - 1);
2740         if (v == SEQ_START_TOKEN) {
2741                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2742                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2743                            "inode");
2744                 goto out;
2745         }
2746         st = seq->private;
2747
2748         if (sk->sk_state == TCP_TIME_WAIT)
2749                 get_timewait4_sock(v, seq, st->num);
2750         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2751                 get_openreq4(v, seq, st->num);
2752         else
2753                 get_tcp4_sock(v, seq, st->num);
2754 out:
2755         seq_pad(seq, '\n');
2756         return 0;
2757 }
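/* Illustrative note (not part of the original source): with the three
 * helpers above, each /proc/net/tcp entry is one fixed-width line.  As a
 * hedged example (all values invented for illustration), a listening socket
 * bound to 127.0.0.1:22 would be rendered roughly like:
 *
 *	0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 1 0000000000000000 100 0 0 10 0
 *
 * i.e. slot, hex local and remote address:port, hex state (0A == TCP_LISTEN),
 * tx_queue:rx_queue, timer_active:expiry, retransmits, uid, the "timeout"
 * column (probes out), inode, and then the extra fields emitted by
 * get_tcp4_sock(): refcount, socket pointer, rto, ato, quickack/pingpong,
 * cwnd, and ssthresh (or max_qlen for listeners).
 */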
2758
2759 #ifdef CONFIG_BPF_SYSCALL
2760 struct bpf_tcp_iter_state {
2761         struct tcp_iter_state state;
2762         unsigned int cur_sk;
2763         unsigned int end_sk;
2764         unsigned int max_sk;
2765         struct sock **batch;
2766         bool st_bucket_done;
2767 };
2768
2769 struct bpf_iter__tcp {
2770         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2771         __bpf_md_ptr(struct sock_common *, sk_common);
2772         uid_t uid __aligned(8);
2773 };
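/* Illustrative note (not part of the original source): the context above is
 * what a BPF program attached to the "tcp" iterator receives per socket.
 * A minimal, hedged sketch of such a consumer is shown below; the program
 * name is hypothetical and BPF_SEQ_PRINTF is assumed to be provided by
 * libbpf's <bpf/bpf_tracing.h>:
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *sk_common = ctx->sk_common;
 *		struct seq_file *seq = ctx->meta->seq;
 *
 *		if (!sk_common)
 *			return 0;
 *		// One output line per socket; uid is filled in by
 *		// tcp_prog_seq_show() below.
 *		BPF_SEQ_PRINTF(seq, "family=%d uid=%u\n",
 *			       sk_common->skc_family, ctx->uid);
 *		return 0;
 *	}
 *
 * The resulting iterator link is then read (or pinned to bpffs and cat'ed),
 * reusing the socket batching done by bpf_iter_tcp_batch() below.
 */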
2774
2775 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2776                              struct sock_common *sk_common, uid_t uid)
2777 {
2778         struct bpf_iter__tcp ctx;
2779
2780         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2781         ctx.meta = meta;
2782         ctx.sk_common = sk_common;
2783         ctx.uid = uid;
2784         return bpf_iter_run_prog(prog, &ctx);
2785 }
2786
2787 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2788 {
2789         while (iter->cur_sk < iter->end_sk)
2790                 sock_gen_put(iter->batch[iter->cur_sk++]);
2791 }
2792
2793 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2794                                       unsigned int new_batch_sz)
2795 {
2796         struct sock **new_batch;
2797
2798         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2799                              GFP_USER | __GFP_NOWARN);
2800         if (!new_batch)
2801                 return -ENOMEM;
2802
2803         bpf_iter_tcp_put_batch(iter);
2804         kvfree(iter->batch);
2805         iter->batch = new_batch;
2806         iter->max_sk = new_batch_sz;
2807
2808         return 0;
2809 }
2810
2811 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2812                                                  struct sock *start_sk)
2813 {
2814         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2815         struct bpf_tcp_iter_state *iter = seq->private;
2816         struct tcp_iter_state *st = &iter->state;
2817         struct hlist_nulls_node *node;
2818         unsigned int expected = 1;
2819         struct sock *sk;
2820
2821         sock_hold(start_sk);
2822         iter->batch[iter->end_sk++] = start_sk;
2823
2824         sk = sk_nulls_next(start_sk);
2825         sk_nulls_for_each_from(sk, node) {
2826                 if (seq_sk_match(seq, sk)) {
2827                         if (iter->end_sk < iter->max_sk) {
2828                                 sock_hold(sk);
2829                                 iter->batch[iter->end_sk++] = sk;
2830                         }
2831                         expected++;
2832                 }
2833         }
2834         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2835
2836         return expected;
2837 }
2838
2839 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2840                                                    struct sock *start_sk)
2841 {
2842         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2843         struct bpf_tcp_iter_state *iter = seq->private;
2844         struct tcp_iter_state *st = &iter->state;
2845         struct hlist_nulls_node *node;
2846         unsigned int expected = 1;
2847         struct sock *sk;
2848
2849         sock_hold(start_sk);
2850         iter->batch[iter->end_sk++] = start_sk;
2851
2852         sk = sk_nulls_next(start_sk);
2853         sk_nulls_for_each_from(sk, node) {
2854                 if (seq_sk_match(seq, sk)) {
2855                         if (iter->end_sk < iter->max_sk) {
2856                                 sock_hold(sk);
2857                                 iter->batch[iter->end_sk++] = sk;
2858                         }
2859                         expected++;
2860                 }
2861         }
2862         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2863
2864         return expected;
2865 }
2866
2867 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2868 {
2869         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2870         struct bpf_tcp_iter_state *iter = seq->private;
2871         struct tcp_iter_state *st = &iter->state;
2872         unsigned int expected;
2873         bool resized = false;
2874         struct sock *sk;
2875
2876         /* The st->bucket is done.  Directly advance to the next
2877          * bucket instead of having tcp_seek_last_pos() skip the
2878          * sockets one by one in the current bucket, only to find
2879          * out that it has to advance to the next bucket.
2880          */
2881         if (iter->st_bucket_done) {
2882                 st->offset = 0;
2883                 st->bucket++;
2884                 if (st->state == TCP_SEQ_STATE_LISTENING &&
2885                     st->bucket > hinfo->lhash2_mask) {
2886                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2887                         st->bucket = 0;
2888                 }
2889         }
2890
2891 again:
2892         /* Get a new batch */
2893         iter->cur_sk = 0;
2894         iter->end_sk = 0;
2895         iter->st_bucket_done = false;
2896
2897         sk = tcp_seek_last_pos(seq);
2898         if (!sk)
2899                 return NULL; /* Done */
2900
2901         if (st->state == TCP_SEQ_STATE_LISTENING)
2902                 expected = bpf_iter_tcp_listening_batch(seq, sk);
2903         else
2904                 expected = bpf_iter_tcp_established_batch(seq, sk);
2905
2906         if (iter->end_sk == expected) {
2907                 iter->st_bucket_done = true;
2908                 return sk;
2909         }
2910
2911         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2912                 resized = true;
2913                 goto again;
2914         }
2915
2916         return sk;
2917 }
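
/* Illustrative walk-through (an annotation, not from the original source):
 * with max_sk == 16 and a bucket holding 40 matching sockets, the first
 * pass batches 16 of them but reports expected == 40, so the batch array
 * is reallocated to 40 * 3 / 2 == 60 entries and the bucket is walked
 * again (resized == true).  If even the second pass comes up short, the
 * partial batch is returned with st_bucket_done left false and the rest of
 * the bucket is picked up on the next call via tcp_seek_last_pos().
 */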
2918
2919 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2920 {
2921         /* bpf iter does not support lseek, so it always
2922          * continues from where it was stop()-ped.
2923          */
2924         if (*pos)
2925                 return bpf_iter_tcp_batch(seq);
2926
2927         return SEQ_START_TOKEN;
2928 }
2929
2930 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2931 {
2932         struct bpf_tcp_iter_state *iter = seq->private;
2933         struct tcp_iter_state *st = &iter->state;
2934         struct sock *sk;
2935
2936         /* Whenever seq_next() is called, the iter->cur_sk is
2937          * done with seq_show(), so advance to the next sk in
2938          * the batch.
2939          */
2940         if (iter->cur_sk < iter->end_sk) {
2941                 /* Keep st->num consistent with tcp_iter_state.
2942                  * bpf_iter_tcp itself does not use st->num;
2943                  * meta.seq_num is used instead.
2944                  */
2945                 st->num++;
2946                 /* Move st->offset to the next sk in the bucket such that
2947                  * the future start() will resume at st->offset in
2948                  * st->bucket.  See tcp_seek_last_pos().
2949                  */
2950                 st->offset++;
2951                 sock_gen_put(iter->batch[iter->cur_sk++]);
2952         }
2953
2954         if (iter->cur_sk < iter->end_sk)
2955                 sk = iter->batch[iter->cur_sk];
2956         else
2957                 sk = bpf_iter_tcp_batch(seq);
2958
2959         ++*pos;
2960         /* Keep st->last_pos consistent with tcp_iter_state.
2961          * bpf iter does not do lseek, so st->last_pos always equals *pos.
2962          */
2963         st->last_pos = *pos;
2964         return sk;
2965 }
2966
2967 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2968 {
2969         struct bpf_iter_meta meta;
2970         struct bpf_prog *prog;
2971         struct sock *sk = v;
2972         uid_t uid;
2973         int ret;
2974
2975         if (v == SEQ_START_TOKEN)
2976                 return 0;
2977
2978         if (sk_fullsock(sk))
2979                 lock_sock(sk);
2980
2981         if (unlikely(sk_unhashed(sk))) {
2982                 ret = SEQ_SKIP;
2983                 goto unlock;
2984         }
2985
2986         if (sk->sk_state == TCP_TIME_WAIT) {
2987                 uid = 0;
2988         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2989                 const struct request_sock *req = v;
2990
2991                 uid = from_kuid_munged(seq_user_ns(seq),
2992                                        sock_i_uid(req->rsk_listener));
2993         } else {
2994                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2995         }
2996
2997         meta.seq = seq;
2998         prog = bpf_iter_get_info(&meta, false);
2999         ret = tcp_prog_seq_show(prog, &meta, v, uid);
3000
3001 unlock:
3002         if (sk_fullsock(sk))
3003                 release_sock(sk);
3004         return ret;
3005
3006 }
3007
3008 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3009 {
3010         struct bpf_tcp_iter_state *iter = seq->private;
3011         struct bpf_iter_meta meta;
3012         struct bpf_prog *prog;
3013
3014         if (!v) {
3015                 meta.seq = seq;
3016                 prog = bpf_iter_get_info(&meta, true);
3017                 if (prog)
3018                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
3019         }
3020
3021         if (iter->cur_sk < iter->end_sk) {
3022                 bpf_iter_tcp_put_batch(iter);
3023                 iter->st_bucket_done = false;
3024         }
3025 }
3026
3027 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3028         .show           = bpf_iter_tcp_seq_show,
3029         .start          = bpf_iter_tcp_seq_start,
3030         .next           = bpf_iter_tcp_seq_next,
3031         .stop           = bpf_iter_tcp_seq_stop,
3032 };
3033 #endif /* CONFIG_BPF_SYSCALL */
3034 static unsigned short seq_file_family(const struct seq_file *seq)
3035 {
3036         const struct tcp_seq_afinfo *afinfo;
3037
3038 #ifdef CONFIG_BPF_SYSCALL
3039         /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3040         if (seq->op == &bpf_iter_tcp_seq_ops)
3041                 return AF_UNSPEC;
3042 #endif
3043
3044         /* Iterated from proc fs */
3045         afinfo = pde_data(file_inode(seq->file));
3046         return afinfo->family;
3047 }
3048
3049 static const struct seq_operations tcp4_seq_ops = {
3050         .show           = tcp4_seq_show,
3051         .start          = tcp_seq_start,
3052         .next           = tcp_seq_next,
3053         .stop           = tcp_seq_stop,
3054 };
3055
3056 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3057         .family         = AF_INET,
3058 };
3059
3060 static int __net_init tcp4_proc_init_net(struct net *net)
3061 {
3062         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3063                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3064                 return -ENOMEM;
3065         return 0;
3066 }
3067
3068 static void __net_exit tcp4_proc_exit_net(struct net *net)
3069 {
3070         remove_proc_entry("tcp", net->proc_net);
3071 }
3072
3073 static struct pernet_operations tcp4_net_ops = {
3074         .init = tcp4_proc_init_net,
3075         .exit = tcp4_proc_exit_net,
3076 };
3077
3078 int __init tcp4_proc_init(void)
3079 {
3080         return register_pernet_subsys(&tcp4_net_ops);
3081 }
3082
3083 void tcp4_proc_exit(void)
3084 {
3085         unregister_pernet_subsys(&tcp4_net_ops);
3086 }
3087 #endif /* CONFIG_PROC_FS */
3088
3089 /* @wake is one when sk_stream_write_space() calls us.
3090  * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3091  * This mimics the strategy used in sock_def_write_space().
3092  */
3093 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3094 {
3095         const struct tcp_sock *tp = tcp_sk(sk);
3096         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3097                             READ_ONCE(tp->snd_nxt);
3098
3099         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3100 }
3101 EXPORT_SYMBOL(tcp_stream_memory_free);
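
/* Worked example (an annotation, not from the original source): with the
 * limit at 128 KB and 96 KB not yet sent, a plain poll (wake == 0) sees
 * 96 KB < 128 KB and reports the socket writable, while the wakeup path
 * (wake == 1) checks 192 KB < 128 KB and stays silent until notsent_bytes
 * drops below half the limit.  The limit is the per-socket
 * TCP_NOTSENT_LOWAT option, falling back to net.ipv4.tcp_notsent_lowat,
 * e.g. from userspace:
 *
 *	int lowat = 128 * 1024;
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
 */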
3102
3103 struct proto tcp_prot = {
3104         .name                   = "TCP",
3105         .owner                  = THIS_MODULE,
3106         .close                  = tcp_close,
3107         .pre_connect            = tcp_v4_pre_connect,
3108         .connect                = tcp_v4_connect,
3109         .disconnect             = tcp_disconnect,
3110         .accept                 = inet_csk_accept,
3111         .ioctl                  = tcp_ioctl,
3112         .init                   = tcp_v4_init_sock,
3113         .destroy                = tcp_v4_destroy_sock,
3114         .shutdown               = tcp_shutdown,
3115         .setsockopt             = tcp_setsockopt,
3116         .getsockopt             = tcp_getsockopt,
3117         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3118         .keepalive              = tcp_set_keepalive,
3119         .recvmsg                = tcp_recvmsg,
3120         .sendmsg                = tcp_sendmsg,
3121         .splice_eof             = tcp_splice_eof,
3122         .backlog_rcv            = tcp_v4_do_rcv,
3123         .release_cb             = tcp_release_cb,
3124         .hash                   = inet_hash,
3125         .unhash                 = inet_unhash,
3126         .get_port               = inet_csk_get_port,
3127         .put_port               = inet_put_port,
3128 #ifdef CONFIG_BPF_SYSCALL
3129         .psock_update_sk_prot   = tcp_bpf_update_proto,
3130 #endif
3131         .enter_memory_pressure  = tcp_enter_memory_pressure,
3132         .leave_memory_pressure  = tcp_leave_memory_pressure,
3133         .stream_memory_free     = tcp_stream_memory_free,
3134         .sockets_allocated      = &tcp_sockets_allocated,
3135         .orphan_count           = &tcp_orphan_count,
3136
3137         .memory_allocated       = &tcp_memory_allocated,
3138         .per_cpu_fw_alloc       = &tcp_memory_per_cpu_fw_alloc,
3139
3140         .memory_pressure        = &tcp_memory_pressure,
3141         .sysctl_mem             = sysctl_tcp_mem,
3142         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3143         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3144         .max_header             = MAX_TCP_HEADER,
3145         .obj_size               = sizeof(struct tcp_sock),
3146         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3147         .twsk_prot              = &tcp_timewait_sock_ops,
3148         .rsk_prot               = &tcp_request_sock_ops,
3149         .h.hashinfo             = NULL,
3150         .no_autobind            = true,
3151         .diag_destroy           = tcp_abort,
3152 };
3153 EXPORT_SYMBOL(tcp_prot);
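
/* Usage note (an annotation, not from the original source): tcp_prot is
 * what an ordinary socket(AF_INET, SOCK_STREAM, IPPROTO_TCP) ends up bound
 * to via inet_create(), so connect(2) on such a socket reaches
 * tcp_v4_connect() through tcp_prot.connect, and sendmsg(2) reaches
 * tcp_sendmsg() through tcp_prot.sendmsg.
 */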
3154
3155 static void __net_exit tcp_sk_exit(struct net *net)
3156 {
3157         if (net->ipv4.tcp_congestion_control)
3158                 bpf_module_put(net->ipv4.tcp_congestion_control,
3159                                net->ipv4.tcp_congestion_control->owner);
3160 }
3161
3162 static void __net_init tcp_set_hashinfo(struct net *net)
3163 {
3164         struct inet_hashinfo *hinfo;
3165         unsigned int ehash_entries;
3166         struct net *old_net;
3167
3168         if (net_eq(net, &init_net))
3169                 goto fallback;
3170
3171         old_net = current->nsproxy->net_ns;
3172         ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3173         if (!ehash_entries)
3174                 goto fallback;
3175
3176         ehash_entries = roundup_pow_of_two(ehash_entries);
3177         hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3178         if (!hinfo) {
3179                 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3180                         "for a netns, falling back to the global one\n",
3181                         ehash_entries);
3182 fallback:
3183                 hinfo = &tcp_hashinfo;
3184                 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3185         }
3186
3187         net->ipv4.tcp_death_row.hashinfo = hinfo;
3188         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3189         net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3190 }
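
/* Worked example (an annotation, not from the original source): if the
 * parent netns has net.ipv4.tcp_child_ehash_entries set to 1000, a child
 * netns created from it gets its own ehash rounded up to 1024 entries,
 * with sysctl_max_tw_buckets == 512 and sysctl_max_syn_backlog ==
 * max(128, 8) == 128.  With the sysctl left at its default of 0, the child
 * shares the global tcp_hashinfo with init_net instead.
 */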
3191
3192 static int __net_init tcp_sk_init(struct net *net)
3193 {
3194         net->ipv4.sysctl_tcp_ecn = 2;
3195         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3196
3197         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3198         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3199         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3200         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3201         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3202
3203         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3204         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3205         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3206
3207         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3208         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3209         net->ipv4.sysctl_tcp_syncookies = 1;
3210         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3211         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3212         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3213         net->ipv4.sysctl_tcp_orphan_retries = 0;
3214         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3215         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3216         net->ipv4.sysctl_tcp_tw_reuse = 2;
3217         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3218
3219         refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3220         tcp_set_hashinfo(net);
3221
3222         net->ipv4.sysctl_tcp_sack = 1;
3223         net->ipv4.sysctl_tcp_window_scaling = 1;
3224         net->ipv4.sysctl_tcp_timestamps = 1;
3225         net->ipv4.sysctl_tcp_early_retrans = 3;
3226         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3227         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3228         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3229         net->ipv4.sysctl_tcp_max_reordering = 300;
3230         net->ipv4.sysctl_tcp_dsack = 1;
3231         net->ipv4.sysctl_tcp_app_win = 31;
3232         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3233         net->ipv4.sysctl_tcp_frto = 2;
3234         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3235         /* This limits the percentage of the congestion window which we
3236          * will allow a single TSO frame to consume.  Building TSO frames
3237          * which are too large can cause TCP streams to be bursty.
3238          */
3239         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3240         /* Default TSQ limit of 16 TSO segments */
3241         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3242
3243         /* RFC 5961 challenge ACK rate limiting, per netns, disabled by default. */
3244         net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3245
3246         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3247         net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3248         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3249         net->ipv4.sysctl_tcp_autocorking = 1;
3250         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3251         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3252         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3253         if (net != &init_net) {
3254                 memcpy(net->ipv4.sysctl_tcp_rmem,
3255                        init_net.ipv4.sysctl_tcp_rmem,
3256                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3257                 memcpy(net->ipv4.sysctl_tcp_wmem,
3258                        init_net.ipv4.sysctl_tcp_wmem,
3259                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3260         }
3261         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3262         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3263         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3264         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3265         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3266         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3267
3268         /* Set default values for PLB */
3269         net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3270         net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3271         net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3272         net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3273         /* Default congestion threshold for PLB to mark a round is 50% */
3274         net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3275
3276         /* Reno is always built in */
3277         if (!net_eq(net, &init_net) &&
3278             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3279                                init_net.ipv4.tcp_congestion_control->owner))
3280                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3281         else
3282                 net->ipv4.tcp_congestion_control = &tcp_reno;
3283
3284         net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3285         net->ipv4.sysctl_tcp_shrink_window = 0;
3286
3287         return 0;
3288 }
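
/* Annotation (not from the original source): the per-netns defaults set
 * above back the knobs under /proc/sys/net/ipv4/ (tcp_syncookies,
 * tcp_notsent_lowat, tcp_fastopen, tcp_plb_enabled, ...); the sysctl
 * tables themselves are registered separately in net/ipv4/sysctl_net_ipv4.c.
 */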
3289
3290 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3291 {
3292         struct net *net;
3293
3294         tcp_twsk_purge(net_exit_list, AF_INET);
3295
3296         list_for_each_entry(net, net_exit_list, exit_list) {
3297                 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3298                 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3299                 tcp_fastopen_ctx_destroy(net);
3300         }
3301 }
3302
3303 static struct pernet_operations __net_initdata tcp_sk_ops = {
3304        .init       = tcp_sk_init,
3305        .exit       = tcp_sk_exit,
3306        .exit_batch = tcp_sk_exit_batch,
3307 };
3308
3309 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3310 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3311                      struct sock_common *sk_common, uid_t uid)
3312
3313 #define INIT_BATCH_SZ 16
3314
3315 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3316 {
3317         struct bpf_tcp_iter_state *iter = priv_data;
3318         int err;
3319
3320         err = bpf_iter_init_seq_net(priv_data, aux);
3321         if (err)
3322                 return err;
3323
3324         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3325         if (err) {
3326                 bpf_iter_fini_seq_net(priv_data);
3327                 return err;
3328         }
3329
3330         return 0;
3331 }
3332
3333 static void bpf_iter_fini_tcp(void *priv_data)
3334 {
3335         struct bpf_tcp_iter_state *iter = priv_data;
3336
3337         bpf_iter_fini_seq_net(priv_data);
3338         kvfree(iter->batch);
3339 }
3340
3341 static const struct bpf_iter_seq_info tcp_seq_info = {
3342         .seq_ops                = &bpf_iter_tcp_seq_ops,
3343         .init_seq_private       = bpf_iter_init_tcp,
3344         .fini_seq_private       = bpf_iter_fini_tcp,
3345         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3346 };
3347
3348 static const struct bpf_func_proto *
3349 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3350                             const struct bpf_prog *prog)
3351 {
3352         switch (func_id) {
3353         case BPF_FUNC_setsockopt:
3354                 return &bpf_sk_setsockopt_proto;
3355         case BPF_FUNC_getsockopt:
3356                 return &bpf_sk_getsockopt_proto;
3357         default:
3358                 return NULL;
3359         }
3360 }
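
/* A minimal sketch (not from the original source) of why setsockopt is
 * exposed to this iterator: a tcp iterator program can walk every socket
 * and update it in place, e.g. switching congestion control as the
 * bpf_iter_setsockopt selftest does.  The program name and the chosen
 * option are illustrative assumptions:
 *
 *	SEC("iter/tcp")
 *	int set_cc(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *sk_common = ctx->sk_common;
 *		char cc[] = "cubic";
 *		struct tcp_sock *tp;
 *
 *		if (!sk_common)
 *			return 0;
 *		tp = bpf_skc_to_tcp_sock(sk_common);
 *		if (!tp)
 *			return 0;
 *		bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION, cc, sizeof(cc));
 *		return 0;
 *	}
 */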
3361
3362 static struct bpf_iter_reg tcp_reg_info = {
3363         .target                 = "tcp",
3364         .ctx_arg_info_size      = 1,
3365         .ctx_arg_info           = {
3366                 { offsetof(struct bpf_iter__tcp, sk_common),
3367                   PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3368         },
3369         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3370         .seq_info               = &tcp_seq_info,
3371 };
3372
3373 static void __init bpf_iter_register(void)
3374 {
3375         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3376         if (bpf_iter_reg_target(&tcp_reg_info))
3377                 pr_warn("Warning: could not register bpf iterator tcp\n");
3378 }
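
/* A minimal sketch (not from the original source) of driving the "tcp"
 * iterator registered above from userspace with libbpf, once a program
 * such as the sketches earlier in this file has been loaded as "prog";
 * error handling is omitted:
 *
 *	struct bpf_link *link = bpf_program__attach_iter(prog, NULL);
 *	int iter_fd = bpf_iter_create(bpf_link__fd(link));
 *	char buf[4096];
 *	ssize_t n;
 *
 *	while ((n = read(iter_fd, buf, sizeof(buf))) > 0)
 *		fwrite(buf, 1, n, stdout);
 */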
3379
3380 #endif /* CONFIG_BPF_SYSCALL && CONFIG_PROC_FS */
3381
3382 void __init tcp_v4_init(void)
3383 {
3384         int cpu, res;
3385
3386         for_each_possible_cpu(cpu) {
3387                 struct sock *sk;
3388
3389                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3390                                            IPPROTO_TCP, &init_net);
3391                 if (res)
3392                         panic("Failed to create the TCP control socket.\n");
3393                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3394
3395                 /* Enforce IP_DF and IPID==0 for RSTs and
3396                  * ACKs sent in SYN-RECV and TIME-WAIT state.
3397                  */
3398                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3399
3400                 per_cpu(ipv4_tcp_sk, cpu) = sk;
3401         }
3402         if (register_pernet_subsys(&tcp_sk_ops))
3403                 panic("Failed to register the TCP pernet operations.\n");
3404
3405 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3406         bpf_iter_register();
3407 #endif
3408 }