net/ipv4/tcp_ipv4.c  (platform/kernel/linux-rpi.git, commit 7a250ef9d1b7b5eedc10dbf4f599343d0fa464d2)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
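/* Per-CPU kernel control socket, used by tcp_v4_send_reset() and
 * tcp_v4_send_ack() below to transmit replies for packets that have no
 * (full) owning socket, without taking any socket lock.
 */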
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95
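/* The initial sequence number and the timestamp offset of a connection are
 * derived from the packet's address/port 4-tuple and a secret key
 * (secure_tcp_seq()/secure_tcp_ts_off()), so they cannot be predicted by an
 * off-path attacker.
 */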
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98         return secure_tcp_seq(ip_hdr(skb)->daddr,
99                               ip_hdr(skb)->saddr,
100                               tcp_hdr(skb)->dest,
101                               tcp_hdr(skb)->source);
102 }
103
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108
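/* Called while connect() is picking a local port and the chosen 4-tuple is
 * occupied by a TIME-WAIT socket: decide whether that socket may be reused
 * for the new connection.  Behaviour is governed by the net.ipv4.tcp_tw_reuse
 * sysctl (0: never reuse, 1: reuse when timestamps show it is safe,
 * 2: reuse for loopback traffic only, as checked below).
 */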
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111         int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112         const struct inet_timewait_sock *tw = inet_twsk(sktw);
113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114         struct tcp_sock *tp = tcp_sk(sk);
115
116         if (reuse == 2) {
117                 /* Still does not detect *everything* that goes through
118                  * lo, since we require a loopback src or dst address
119                  * or direct binding to 'lo' interface.
120                  */
121                 bool loopback = false;
122                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123                         loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125                 if (tw->tw_family == AF_INET6) {
126                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130                                 loopback = true;
131                 } else
132 #endif
133                 {
134                         if (ipv4_is_loopback(tw->tw_daddr) ||
135                             ipv4_is_loopback(tw->tw_rcv_saddr))
136                                 loopback = true;
137                 }
138                 if (!loopback)
139                         reuse = 0;
140         }
141
142         /* With PAWS, it is safe from the viewpoint
143            of data integrity. Even without PAWS it is safe provided sequence
144            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
145
146            Actually, the idea is close to VJ's, only the timestamp cache is
147            held not per host but per port pair, and the TW bucket is used as
148            the state holder.
149
150            If the TW bucket has already been destroyed we fall back to VJ's
151            scheme and use the initial timestamp retrieved from the peer table.
152          */
153         if (tcptw->tw_ts_recent_stamp &&
154             (!twp || (reuse && time_after32(ktime_get_seconds(),
155                                             tcptw->tw_ts_recent_stamp)))) {
156                 /* In case of repair and re-using TIME-WAIT sockets we still
157                  * want to be sure that it is safe as above but honor the
158                  * sequence numbers and time stamps set as part of the repair
159                  * process.
160                  *
161                  * Without this check re-using a TIME-WAIT socket with TCP
162                  * repair would accumulate a -1 on the repair assigned
163                  * sequence number. The first time it is reused the sequence
164                  * is -1, the second time -2, etc. This fixes that issue
165                  * without appearing to create any others.
166                  */
167                 if (likely(!tp->repair)) {
168                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169
170                         if (!seq)
171                                 seq = 1;
172                         WRITE_ONCE(tp->write_seq, seq);
173                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
174                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175                 }
176                 sock_hold(sktw);
177                 return 1;
178         }
179
180         return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185                               int addr_len)
186 {
187         /* This check is replicated from tcp_v4_connect() and intended to
188          * prevent the BPF program called below from accessing bytes that are
189          * out of the bounds specified by the user in addr_len.
190          */
191         if (addr_len < sizeof(struct sockaddr_in))
192                 return -EINVAL;
193
194         sock_owned_by_me(sk);
195
196         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202         struct inet_bind_hashbucket *prev_addr_hashbucket = NULL;
203         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204         struct inet_timewait_death_row *tcp_death_row;
205         __be32 daddr, nexthop, prev_sk_rcv_saddr;
206         struct inet_sock *inet = inet_sk(sk);
207         struct tcp_sock *tp = tcp_sk(sk);
208         struct ip_options_rcu *inet_opt;
209         struct net *net = sock_net(sk);
210         __be16 orig_sport, orig_dport;
211         struct flowi4 *fl4;
212         struct rtable *rt;
213         int err;
214
215         if (addr_len < sizeof(struct sockaddr_in))
216                 return -EINVAL;
217
218         if (usin->sin_family != AF_INET)
219                 return -EAFNOSUPPORT;
220
221         nexthop = daddr = usin->sin_addr.s_addr;
222         inet_opt = rcu_dereference_protected(inet->inet_opt,
223                                              lockdep_sock_is_held(sk));
224         if (inet_opt && inet_opt->opt.srr) {
225                 if (!daddr)
226                         return -EINVAL;
227                 nexthop = inet_opt->opt.faddr;
228         }
229
230         orig_sport = inet->inet_sport;
231         orig_dport = usin->sin_port;
232         fl4 = &inet->cork.fl.u.ip4;
233         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234                               sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
235                               orig_dport, sk);
236         if (IS_ERR(rt)) {
237                 err = PTR_ERR(rt);
238                 if (err == -ENETUNREACH)
239                         IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
240                 return err;
241         }
242
243         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
244                 ip_rt_put(rt);
245                 return -ENETUNREACH;
246         }
247
248         if (!inet_opt || !inet_opt->opt.srr)
249                 daddr = fl4->daddr;
250
251         tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
252
253         if (!inet->inet_saddr) {
254                 if (inet_csk(sk)->icsk_bind2_hash) {
255                         prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo,
256                                                                      sk, net, inet->inet_num);
257                         prev_sk_rcv_saddr = sk->sk_rcv_saddr;
258                 }
259                 inet->inet_saddr = fl4->saddr;
260         }
261
262         sk_rcv_saddr_set(sk, inet->inet_saddr);
263
264         if (prev_addr_hashbucket) {
265                 err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk);
266                 if (err) {
267                         inet->inet_saddr = 0;
268                         sk_rcv_saddr_set(sk, prev_sk_rcv_saddr);
269                         ip_rt_put(rt);
270                         return err;
271                 }
272         }
273
274         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
275                 /* Reset inherited state */
276                 tp->rx_opt.ts_recent       = 0;
277                 tp->rx_opt.ts_recent_stamp = 0;
278                 if (likely(!tp->repair))
279                         WRITE_ONCE(tp->write_seq, 0);
280         }
281
282         inet->inet_dport = usin->sin_port;
283         sk_daddr_set(sk, daddr);
284
285         inet_csk(sk)->icsk_ext_hdr_len = 0;
286         if (inet_opt)
287                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
288
289         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
290
291         /* Socket identity is still unknown (sport may be zero).
292          * However we set the state to SYN-SENT and, without releasing the
293          * socket lock, select a source port, enter ourselves into the hash
294          * tables and complete initialization after this.
295          */
296         tcp_set_state(sk, TCP_SYN_SENT);
297         err = inet_hash_connect(tcp_death_row, sk);
298         if (err)
299                 goto failure;
300
301         sk_set_txhash(sk);
302
303         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
304                                inet->inet_sport, inet->inet_dport, sk);
305         if (IS_ERR(rt)) {
306                 err = PTR_ERR(rt);
307                 rt = NULL;
308                 goto failure;
309         }
310         /* OK, now commit destination to socket.  */
311         sk->sk_gso_type = SKB_GSO_TCPV4;
312         sk_setup_caps(sk, &rt->dst);
313         rt = NULL;
314
315         if (likely(!tp->repair)) {
316                 if (!tp->write_seq)
317                         WRITE_ONCE(tp->write_seq,
318                                    secure_tcp_seq(inet->inet_saddr,
319                                                   inet->inet_daddr,
320                                                   inet->inet_sport,
321                                                   usin->sin_port));
322                 tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
323                                                  inet->inet_daddr);
324         }
325
326         inet->inet_id = get_random_u16();
327
328         if (tcp_fastopen_defer_connect(sk, &err))
329                 return err;
330         if (err)
331                 goto failure;
332
333         err = tcp_connect(sk);
334
335         if (err)
336                 goto failure;
337
338         return 0;
339
340 failure:
341         /*
342          * This unhashes the socket and releases the local port,
343          * if necessary.
344          */
345         tcp_set_state(sk, TCP_CLOSE);
346         ip_rt_put(rt);
347         sk->sk_route_caps = 0;
348         inet->inet_dport = 0;
349         return err;
350 }
351 EXPORT_SYMBOL(tcp_v4_connect);
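/* Illustrative userspace sketch (not part of this file): tcp_v4_connect() is
 * the .connect handler of tcp_prot and is reached, with the socket lock held,
 * from __inet_stream_connect() when an application does something like:
 *
 *     int fd = socket(AF_INET, SOCK_STREAM, 0);
 *     struct sockaddr_in dst = { .sin_family = AF_INET,
 *                                .sin_port   = htons(80) };
 *     inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *     connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */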
352
353 /*
354  * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
355  * It can be called through tcp_release_cb() if the socket was owned by the user
356  * at the time tcp_v4_err() was called to handle the ICMP message.
357  */
358 void tcp_v4_mtu_reduced(struct sock *sk)
359 {
360         struct inet_sock *inet = inet_sk(sk);
361         struct dst_entry *dst;
362         u32 mtu;
363
364         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
365                 return;
366         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
367         dst = inet_csk_update_pmtu(sk, mtu);
368         if (!dst)
369                 return;
370
371         /* Something is about to go wrong... Remember the soft error
372          * in case this connection is not able to recover.
373          */
374         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
375                 sk->sk_err_soft = EMSGSIZE;
376
377         mtu = dst_mtu(dst);
378
379         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
380             ip_sk_accept_pmtu(sk) &&
381             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
382                 tcp_sync_mss(sk, mtu);
383
384                 /* Resend the TCP packet because it's
385                  * clear that the old packet has been
386                  * dropped. This is the new "fast" path mtu
387                  * discovery.
388                  */
389                 tcp_simple_retransmit(sk);
390         } /* else let the usual retransmit timer handle it */
391 }
392 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
393
394 static void do_redirect(struct sk_buff *skb, struct sock *sk)
395 {
396         struct dst_entry *dst = __sk_dst_check(sk, 0);
397
398         if (dst)
399                 dst->ops->redirect(dst, sk, skb);
400 }
401
402
403 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
404 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
405 {
406         struct request_sock *req = inet_reqsk(sk);
407         struct net *net = sock_net(sk);
408
409         /* ICMPs are not backlogged, hence we cannot get
410          * an established socket here.
411          */
412         if (seq != tcp_rsk(req)->snt_isn) {
413                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
414         } else if (abort) {
415                 /*
416                  * Still in SYN_RECV, just remove it silently.
417                  * There is no good way to pass the error to the newly
418                  * created socket, and POSIX does not want network
419                  * errors returned from accept().
420                  */
421                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
422                 tcp_listendrop(req->rsk_listener);
423         }
424         reqsk_put(req);
425 }
426 EXPORT_SYMBOL(tcp_req_err);
427
428 /* TCP-LD (RFC 6069) logic */
429 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
430 {
431         struct inet_connection_sock *icsk = inet_csk(sk);
432         struct tcp_sock *tp = tcp_sk(sk);
433         struct sk_buff *skb;
434         s32 remaining;
435         u32 delta_us;
436
437         if (sock_owned_by_user(sk))
438                 return;
439
440         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
441             !icsk->icsk_backoff)
442                 return;
443
444         skb = tcp_rtx_queue_head(sk);
445         if (WARN_ON_ONCE(!skb))
446                 return;
447
448         icsk->icsk_backoff--;
449         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
450         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
451
452         tcp_mstamp_refresh(tp);
453         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
454         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
455
456         if (remaining > 0) {
457                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
458                                           remaining, TCP_RTO_MAX);
459         } else {
460                 /* RTO revert clocked out retransmission.
461                  * Will retransmit now.
462                  */
463                 tcp_retransmit_timer(sk);
464         }
465 }
466 EXPORT_SYMBOL(tcp_ld_RTO_revert);
467
468 /*
469  * This routine is called by the ICMP module when it gets some
470  * sort of error condition.  If err < 0 then the socket should
471  * be closed and the error returned to the user.  If err > 0
472  * it's just the icmp type << 8 | icmp code.  After adjustment,
473  * the header points to the first 8 bytes of the TCP header.  We need
474  * to find the appropriate port.
475  *
476  * The locking strategy used here is very "optimistic". When
477  * someone else accesses the socket the ICMP is just dropped,
478  * and for some paths there is no check at all.
479  * A more general error queue for queuing errors for later handling
480  * would probably be better.
481  *
482  */
483
484 int tcp_v4_err(struct sk_buff *skb, u32 info)
485 {
486         const struct iphdr *iph = (const struct iphdr *)skb->data;
487         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
488         struct tcp_sock *tp;
489         struct inet_sock *inet;
490         const int type = icmp_hdr(skb)->type;
491         const int code = icmp_hdr(skb)->code;
492         struct sock *sk;
493         struct request_sock *fastopen;
494         u32 seq, snd_una;
495         int err;
496         struct net *net = dev_net(skb->dev);
497
498         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
499                                        iph->daddr, th->dest, iph->saddr,
500                                        ntohs(th->source), inet_iif(skb), 0);
501         if (!sk) {
502                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
503                 return -ENOENT;
504         }
505         if (sk->sk_state == TCP_TIME_WAIT) {
506                 inet_twsk_put(inet_twsk(sk));
507                 return 0;
508         }
509         seq = ntohl(th->seq);
510         if (sk->sk_state == TCP_NEW_SYN_RECV) {
511                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
512                                      type == ICMP_TIME_EXCEEDED ||
513                                      (type == ICMP_DEST_UNREACH &&
514                                       (code == ICMP_NET_UNREACH ||
515                                        code == ICMP_HOST_UNREACH)));
516                 return 0;
517         }
518
519         bh_lock_sock(sk);
520         /* If too many ICMPs get dropped on busy
521          * servers this needs to be solved differently.
522          * We do take care of the PMTU discovery (RFC 1191) special case:
523          * we can receive locally generated ICMP messages while the socket is held.
524          */
525         if (sock_owned_by_user(sk)) {
526                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
527                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
528         }
529         if (sk->sk_state == TCP_CLOSE)
530                 goto out;
531
532         if (static_branch_unlikely(&ip4_min_ttl)) {
533                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
534                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
535                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
536                         goto out;
537                 }
538         }
539
540         tp = tcp_sk(sk);
541         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
542         fastopen = rcu_dereference(tp->fastopen_rsk);
543         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
544         if (sk->sk_state != TCP_LISTEN &&
545             !between(seq, snd_una, tp->snd_nxt)) {
546                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
547                 goto out;
548         }
549
550         switch (type) {
551         case ICMP_REDIRECT:
552                 if (!sock_owned_by_user(sk))
553                         do_redirect(skb, sk);
554                 goto out;
555         case ICMP_SOURCE_QUENCH:
556                 /* Just silently ignore these. */
557                 goto out;
558         case ICMP_PARAMETERPROB:
559                 err = EPROTO;
560                 break;
561         case ICMP_DEST_UNREACH:
562                 if (code > NR_ICMP_UNREACH)
563                         goto out;
564
565                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
566                         /* We are not interested in TCP_LISTEN and open_requests
567                          * (SYN-ACKs sent out by Linux are always < 576 bytes so
568                          * they should go through unfragmented).
569                          */
570                         if (sk->sk_state == TCP_LISTEN)
571                                 goto out;
572
573                         WRITE_ONCE(tp->mtu_info, info);
574                         if (!sock_owned_by_user(sk)) {
575                                 tcp_v4_mtu_reduced(sk);
576                         } else {
577                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
578                                         sock_hold(sk);
579                         }
580                         goto out;
581                 }
582
583                 err = icmp_err_convert[code].errno;
584                 /* check if this ICMP message allows revert of backoff.
585                  * (see RFC 6069)
586                  */
587                 if (!fastopen &&
588                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
589                         tcp_ld_RTO_revert(sk, seq);
590                 break;
591         case ICMP_TIME_EXCEEDED:
592                 err = EHOSTUNREACH;
593                 break;
594         default:
595                 goto out;
596         }
597
598         switch (sk->sk_state) {
599         case TCP_SYN_SENT:
600         case TCP_SYN_RECV:
601                 /* Only in fast or simultaneous open. If a fast open socket is
602                  * already accepted it is treated as a connected one below.
603                  */
604                 if (fastopen && !fastopen->sk)
605                         break;
606
607                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
608
609                 if (!sock_owned_by_user(sk)) {
610                         sk->sk_err = err;
611
612                         sk_error_report(sk);
613
614                         tcp_done(sk);
615                 } else {
616                         sk->sk_err_soft = err;
617                 }
618                 goto out;
619         }
620
621         /* If we've already connected we will keep trying
622          * until we time out, or the user gives up.
623          *
624          * RFC 1122 4.2.3.9 allows us to treat as hard errors
625          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
626          * but it is obsoleted by PMTU discovery).
627          *
628          * Note that on the modern internet, where routing is unreliable
629          * and broken firewalls sit in every dark corner sending random
630          * errors on their masters' orders, even these two messages have lost
631          * their original meaning (even Linux sends invalid PORT_UNREACHs).
632          *
633          * Now we are in compliance with the RFCs.
634          *                                                      --ANK (980905)
635          */
636
637         inet = inet_sk(sk);
638         if (!sock_owned_by_user(sk) && inet->recverr) {
639                 sk->sk_err = err;
640                 sk_error_report(sk);
641         } else  { /* Only an error on timeout */
642                 sk->sk_err_soft = err;
643         }
644
645 out:
646         bh_unlock_sock(sk);
647         sock_put(sk);
648         return 0;
649 }
650
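/* Prepare an outgoing segment for checksum offload: seed th->check with the
 * pseudo-header sum and record, via csum_start/csum_offset, where the device
 * (or the software fallback) must fold in the checksum over the TCP header
 * and payload (CHECKSUM_PARTIAL).
 */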
651 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
652 {
653         struct tcphdr *th = tcp_hdr(skb);
654
655         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
656         skb->csum_start = skb_transport_header(skb) - skb->head;
657         skb->csum_offset = offsetof(struct tcphdr, check);
658 }
659
660 /* This routine computes an IPv4 TCP checksum. */
661 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
662 {
663         const struct inet_sock *inet = inet_sk(sk);
664
665         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
666 }
667 EXPORT_SYMBOL(tcp_v4_send_check);
668
669 /*
670  *      This routine will send an RST to the other tcp.
671  *
672  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
673  *                    for the reset?
674  *      Answer: if a packet caused the RST, it is not for a socket
675  *              existing in our system; if it did match a socket,
676  *              it is just a duplicate segment or a bug in the other side's TCP.
677  *              So we build the reply based only on the parameters
678  *              that arrived with the segment.
679  *      Exception: precedence violation. We do not implement it in any case.
680  */
681
682 #ifdef CONFIG_TCP_MD5SIG
683 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
684 #else
685 #define OPTION_BYTES sizeof(__be32)
686 #endif
687
688 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
689 {
690         const struct tcphdr *th = tcp_hdr(skb);
691         struct {
692                 struct tcphdr th;
693                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
694         } rep;
695         struct ip_reply_arg arg;
696 #ifdef CONFIG_TCP_MD5SIG
697         struct tcp_md5sig_key *key = NULL;
698         const __u8 *hash_location = NULL;
699         unsigned char newhash[16];
700         int genhash;
701         struct sock *sk1 = NULL;
702 #endif
703         u64 transmit_time = 0;
704         struct sock *ctl_sk;
705         struct net *net;
706
707         /* Never send a reset in response to a reset. */
708         if (th->rst)
709                 return;
710
711         /* If sk is not NULL, it means we did a successful lookup and the
712          * incoming route had to be correct. prequeue might have dropped our dst.
713          */
714         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
715                 return;
716
717         /* Swap the send and the receive. */
718         memset(&rep, 0, sizeof(rep));
719         rep.th.dest   = th->source;
720         rep.th.source = th->dest;
721         rep.th.doff   = sizeof(struct tcphdr) / 4;
722         rep.th.rst    = 1;
723
724         if (th->ack) {
725                 rep.th.seq = th->ack_seq;
726         } else {
727                 rep.th.ack = 1;
728                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
729                                        skb->len - (th->doff << 2));
730         }
731
732         memset(&arg, 0, sizeof(arg));
733         arg.iov[0].iov_base = (unsigned char *)&rep;
734         arg.iov[0].iov_len  = sizeof(rep.th);
735
736         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
737 #ifdef CONFIG_TCP_MD5SIG
738         rcu_read_lock();
739         hash_location = tcp_parse_md5sig_option(th);
740         if (sk && sk_fullsock(sk)) {
741                 const union tcp_md5_addr *addr;
742                 int l3index;
743
744                 /* sdif set, means packet ingressed via a device
745                  * in an L3 domain and inet_iif is set to it.
746                  */
747                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
748                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
749                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
750         } else if (hash_location) {
751                 const union tcp_md5_addr *addr;
752                 int sdif = tcp_v4_sdif(skb);
753                 int dif = inet_iif(skb);
754                 int l3index;
755
756                 /*
757                  * The active side is lost. Try to find the listening socket through
758                  * the source port, and then find the md5 key through the listening socket.
759                  * We do not lose security here:
760                  * the incoming packet is checked against the md5 hash of the found key,
761                  * and no RST is generated if the md5 hash doesn't match.
762                  */
763                 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
764                                              NULL, 0, ip_hdr(skb)->saddr,
765                                              th->source, ip_hdr(skb)->daddr,
766                                              ntohs(th->source), dif, sdif);
767                 /* don't send rst if it can't find key */
768                 if (!sk1)
769                         goto out;
770
771                 /* sdif set, means packet ingressed via a device
772                  * in an L3 domain and dif is set to it.
773                  */
774                 l3index = sdif ? dif : 0;
775                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
776                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
777                 if (!key)
778                         goto out;
779
780
781                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
782                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
783                         goto out;
784
785         }
786
787         if (key) {
788                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
789                                    (TCPOPT_NOP << 16) |
790                                    (TCPOPT_MD5SIG << 8) |
791                                    TCPOLEN_MD5SIG);
792                 /* Update length and the length the header thinks exists */
793                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
794                 rep.th.doff = arg.iov[0].iov_len / 4;
795
796                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
797                                      key, ip_hdr(skb)->saddr,
798                                      ip_hdr(skb)->daddr, &rep.th);
799         }
800 #endif
801         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
802         if (rep.opt[0] == 0) {
803                 __be32 mrst = mptcp_reset_option(skb);
804
805                 if (mrst) {
806                         rep.opt[0] = mrst;
807                         arg.iov[0].iov_len += sizeof(mrst);
808                         rep.th.doff = arg.iov[0].iov_len / 4;
809                 }
810         }
811
812         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
813                                       ip_hdr(skb)->saddr, /* XXX */
814                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
815         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
816         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
817
818         /* When the socket is gone, all binding information is lost and
819          * routing might fail in this case. No choice here: if we choose to force
820          * the input interface, we will misroute in the case of an asymmetric route.
821          */
822         if (sk) {
823                 arg.bound_dev_if = sk->sk_bound_dev_if;
824                 if (sk_fullsock(sk))
825                         trace_tcp_send_reset(sk, skb);
826         }
827
828         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
829                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
830
831         arg.tos = ip_hdr(skb)->tos;
832         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
833         local_bh_disable();
834         ctl_sk = this_cpu_read(ipv4_tcp_sk);
835         sock_net_set(ctl_sk, net);
836         if (sk) {
837                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
838                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
839                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
840                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
841                 transmit_time = tcp_transmit_time(sk);
842                 xfrm_sk_clone_policy(ctl_sk, sk);
843         }
844         ip_send_unicast_reply(ctl_sk,
845                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
846                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
847                               &arg, arg.iov[0].iov_len,
848                               transmit_time);
849
850         ctl_sk->sk_mark = 0;
851         xfrm_sk_free_policy(ctl_sk);
852         sock_net_set(ctl_sk, &init_net);
853         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
854         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
855         local_bh_enable();
856
857 #ifdef CONFIG_TCP_MD5SIG
858 out:
859         rcu_read_unlock();
860 #endif
861 }
862
863 /* The code below, which sends ACKs in the SYN-RECV and TIME-WAIT states
864    outside of socket context, is certainly ugly. What can I do?
865  */
866
867 static void tcp_v4_send_ack(const struct sock *sk,
868                             struct sk_buff *skb, u32 seq, u32 ack,
869                             u32 win, u32 tsval, u32 tsecr, int oif,
870                             struct tcp_md5sig_key *key,
871                             int reply_flags, u8 tos)
872 {
873         const struct tcphdr *th = tcp_hdr(skb);
874         struct {
875                 struct tcphdr th;
876                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
877 #ifdef CONFIG_TCP_MD5SIG
878                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
879 #endif
880                         ];
881         } rep;
882         struct net *net = sock_net(sk);
883         struct ip_reply_arg arg;
884         struct sock *ctl_sk;
885         u64 transmit_time;
886
887         memset(&rep.th, 0, sizeof(struct tcphdr));
888         memset(&arg, 0, sizeof(arg));
889
890         arg.iov[0].iov_base = (unsigned char *)&rep;
891         arg.iov[0].iov_len  = sizeof(rep.th);
892         if (tsecr) {
893                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
894                                    (TCPOPT_TIMESTAMP << 8) |
895                                    TCPOLEN_TIMESTAMP);
896                 rep.opt[1] = htonl(tsval);
897                 rep.opt[2] = htonl(tsecr);
898                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
899         }
900
901         /* Swap the send and the receive. */
902         rep.th.dest    = th->source;
903         rep.th.source  = th->dest;
904         rep.th.doff    = arg.iov[0].iov_len / 4;
905         rep.th.seq     = htonl(seq);
906         rep.th.ack_seq = htonl(ack);
907         rep.th.ack     = 1;
908         rep.th.window  = htons(win);
909
910 #ifdef CONFIG_TCP_MD5SIG
911         if (key) {
912                 int offset = (tsecr) ? 3 : 0;
913
914                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
915                                           (TCPOPT_NOP << 16) |
916                                           (TCPOPT_MD5SIG << 8) |
917                                           TCPOLEN_MD5SIG);
918                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
919                 rep.th.doff = arg.iov[0].iov_len/4;
920
921                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
922                                     key, ip_hdr(skb)->saddr,
923                                     ip_hdr(skb)->daddr, &rep.th);
924         }
925 #endif
926         arg.flags = reply_flags;
927         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
928                                       ip_hdr(skb)->saddr, /* XXX */
929                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
930         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
931         if (oif)
932                 arg.bound_dev_if = oif;
933         arg.tos = tos;
934         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
935         local_bh_disable();
936         ctl_sk = this_cpu_read(ipv4_tcp_sk);
937         sock_net_set(ctl_sk, net);
938         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
939                            inet_twsk(sk)->tw_mark : sk->sk_mark;
940         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
941                            inet_twsk(sk)->tw_priority : sk->sk_priority;
942         transmit_time = tcp_transmit_time(sk);
943         ip_send_unicast_reply(ctl_sk,
944                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
945                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
946                               &arg, arg.iov[0].iov_len,
947                               transmit_time);
948
949         ctl_sk->sk_mark = 0;
950         sock_net_set(ctl_sk, &init_net);
951         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
952         local_bh_enable();
953 }
954
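/* Answer a segment that arrived for a TIME-WAIT socket: send an ACK carrying
 * the remembered sequence numbers, window and timestamps, then drop a
 * reference on the timewait socket.
 */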
955 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
956 {
957         struct inet_timewait_sock *tw = inet_twsk(sk);
958         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
959
960         tcp_v4_send_ack(sk, skb,
961                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
962                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
963                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
964                         tcptw->tw_ts_recent,
965                         tw->tw_bound_dev_if,
966                         tcp_twsk_md5_key(tcptw),
967                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
968                         tw->tw_tos
969                         );
970
971         inet_twsk_put(tw);
972 }
973
974 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
975                                   struct request_sock *req)
976 {
977         const union tcp_md5_addr *addr;
978         int l3index;
979
980         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
981          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
982          */
983         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
984                                              tcp_sk(sk)->snd_nxt;
985
986         /* RFC 7323 2.3
987          * The window field (SEG.WND) of every outgoing segment, with the
988          * exception of <SYN> segments, MUST be right-shifted by
989          * Rcv.Wind.Shift bits:
990          */
991         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
992         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
993         tcp_v4_send_ack(sk, skb, seq,
994                         tcp_rsk(req)->rcv_nxt,
995                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
996                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
997                         req->ts_recent,
998                         0,
999                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
1000                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
1001                         ip_hdr(skb)->tos);
1002 }
1003
1004 /*
1005  *      Send a SYN-ACK after having received a SYN.
1006  *      This still operates on a request_sock only, not on a big
1007  *      socket.
1008  */
1009 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1010                               struct flowi *fl,
1011                               struct request_sock *req,
1012                               struct tcp_fastopen_cookie *foc,
1013                               enum tcp_synack_type synack_type,
1014                               struct sk_buff *syn_skb)
1015 {
1016         const struct inet_request_sock *ireq = inet_rsk(req);
1017         struct flowi4 fl4;
1018         int err = -1;
1019         struct sk_buff *skb;
1020         u8 tos;
1021
1022         /* First, grab a route. */
1023         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1024                 return -1;
1025
1026         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1027
1028         if (skb) {
1029                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1030
1031                 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1032                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1033                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1034                                 inet_sk(sk)->tos;
1035
1036                 if (!INET_ECN_is_capable(tos) &&
1037                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1038                         tos |= INET_ECN_ECT_0;
1039
1040                 rcu_read_lock();
1041                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1042                                             ireq->ir_rmt_addr,
1043                                             rcu_dereference(ireq->ireq_opt),
1044                                             tos);
1045                 rcu_read_unlock();
1046                 err = net_xmit_eval(err);
1047         }
1048
1049         return err;
1050 }
1051
1052 /*
1053  *      IPv4 request_sock destructor.
1054  */
1055 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1056 {
1057         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1058 }
1059
1060 #ifdef CONFIG_TCP_MD5SIG
1061 /*
1062  * RFC2385 MD5 checksumming requires a mapping of
1063  * IP address->MD5 Key.
1064  * We need to maintain these in the sk structure.
1065  */
1066
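/* Static key: keeps MD5 option processing off the TCP fast path until at
 * least one MD5 key has been configured on the system.
 */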
1067 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1068 EXPORT_SYMBOL(tcp_md5_needed);
1069
1070 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1071 {
1072         if (!old)
1073                 return true;
1074
1075         /* l3index always overrides non-l3index */
1076         if (old->l3index && new->l3index == 0)
1077                 return false;
1078         if (old->l3index == 0 && new->l3index)
1079                 return true;
1080
1081         return old->prefixlen < new->prefixlen;
1082 }
1083
1084 /* Find the Key structure for an address.  */
1085 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1086                                            const union tcp_md5_addr *addr,
1087                                            int family)
1088 {
1089         const struct tcp_sock *tp = tcp_sk(sk);
1090         struct tcp_md5sig_key *key;
1091         const struct tcp_md5sig_info *md5sig;
1092         __be32 mask;
1093         struct tcp_md5sig_key *best_match = NULL;
1094         bool match;
1095
1096         /* caller either holds rcu_read_lock() or socket lock */
1097         md5sig = rcu_dereference_check(tp->md5sig_info,
1098                                        lockdep_sock_is_held(sk));
1099         if (!md5sig)
1100                 return NULL;
1101
1102         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1103                                  lockdep_sock_is_held(sk)) {
1104                 if (key->family != family)
1105                         continue;
1106                 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1107                         continue;
1108                 if (family == AF_INET) {
1109                         mask = inet_make_mask(key->prefixlen);
1110                         match = (key->addr.a4.s_addr & mask) ==
1111                                 (addr->a4.s_addr & mask);
1112 #if IS_ENABLED(CONFIG_IPV6)
1113                 } else if (family == AF_INET6) {
1114                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1115                                                   key->prefixlen);
1116 #endif
1117                 } else {
1118                         match = false;
1119                 }
1120
1121                 if (match && better_md5_match(best_match, key))
1122                         best_match = key;
1123         }
1124         return best_match;
1125 }
1126 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1127
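/* Exact-match variant of the lookup above, used when adding or deleting keys:
 * family, ifindex flag, l3index, prefix length and address must all match.
 */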
1128 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1129                                                       const union tcp_md5_addr *addr,
1130                                                       int family, u8 prefixlen,
1131                                                       int l3index, u8 flags)
1132 {
1133         const struct tcp_sock *tp = tcp_sk(sk);
1134         struct tcp_md5sig_key *key;
1135         unsigned int size = sizeof(struct in_addr);
1136         const struct tcp_md5sig_info *md5sig;
1137
1138         /* caller either holds rcu_read_lock() or socket lock */
1139         md5sig = rcu_dereference_check(tp->md5sig_info,
1140                                        lockdep_sock_is_held(sk));
1141         if (!md5sig)
1142                 return NULL;
1143 #if IS_ENABLED(CONFIG_IPV6)
1144         if (family == AF_INET6)
1145                 size = sizeof(struct in6_addr);
1146 #endif
1147         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1148                                  lockdep_sock_is_held(sk)) {
1149                 if (key->family != family)
1150                         continue;
1151                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1152                         continue;
1153                 if (key->l3index != l3index)
1154                         continue;
1155                 if (!memcmp(&key->addr, addr, size) &&
1156                     key->prefixlen == prefixlen)
1157                         return key;
1158         }
1159         return NULL;
1160 }
1161
1162 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1163                                          const struct sock *addr_sk)
1164 {
1165         const union tcp_md5_addr *addr;
1166         int l3index;
1167
1168         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1169                                                  addr_sk->sk_bound_dev_if);
1170         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1171         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1172 }
1173 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1174
1175 /* This can be called on a newly created socket, from other files */
1176 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1177                    int family, u8 prefixlen, int l3index, u8 flags,
1178                    const u8 *newkey, u8 newkeylen, gfp_t gfp)
1179 {
1180         /* Add Key to the list */
1181         struct tcp_md5sig_key *key;
1182         struct tcp_sock *tp = tcp_sk(sk);
1183         struct tcp_md5sig_info *md5sig;
1184
1185         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1186         if (key) {
1187                 /* Pre-existing entry - just update that one.
1188                  * Note that the key might be used concurrently.
1189                  * data_race() is telling kcsan that we do not care of
1190                  * key mismatches, since changing MD5 key on live flows
1191                  * can lead to packet drops.
1192                  */
1193                 data_race(memcpy(key->key, newkey, newkeylen));
1194
1195                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1196                  * Also note that a reader could catch new key->keylen value
1197                  * but old key->key[], this is the reason we use __GFP_ZERO
1198                  * at sock_kmalloc() time below these lines.
1199                  */
1200                 WRITE_ONCE(key->keylen, newkeylen);
1201
1202                 return 0;
1203         }
1204
1205         md5sig = rcu_dereference_protected(tp->md5sig_info,
1206                                            lockdep_sock_is_held(sk));
1207         if (!md5sig) {
1208                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1209                 if (!md5sig)
1210                         return -ENOMEM;
1211
1212                 sk_gso_disable(sk);
1213                 INIT_HLIST_HEAD(&md5sig->head);
1214                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1215         }
1216
1217         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1218         if (!key)
1219                 return -ENOMEM;
1220         if (!tcp_alloc_md5sig_pool()) {
1221                 sock_kfree_s(sk, key, sizeof(*key));
1222                 return -ENOMEM;
1223         }
1224
1225         memcpy(key->key, newkey, newkeylen);
1226         key->keylen = newkeylen;
1227         key->family = family;
1228         key->prefixlen = prefixlen;
1229         key->l3index = l3index;
1230         key->flags = flags;
1231         memcpy(&key->addr, addr,
1232                (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1233                                                                  sizeof(struct in_addr));
1234         hlist_add_head_rcu(&key->node, &md5sig->head);
1235         return 0;
1236 }
1237 EXPORT_SYMBOL(tcp_md5_do_add);
1238
1239 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1240                    u8 prefixlen, int l3index, u8 flags)
1241 {
1242         struct tcp_md5sig_key *key;
1243
1244         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1245         if (!key)
1246                 return -ENOENT;
1247         hlist_del_rcu(&key->node);
1248         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1249         kfree_rcu(key, rcu);
1250         return 0;
1251 }
1252 EXPORT_SYMBOL(tcp_md5_do_del);
1253
1254 static void tcp_clear_md5_list(struct sock *sk)
1255 {
1256         struct tcp_sock *tp = tcp_sk(sk);
1257         struct tcp_md5sig_key *key;
1258         struct hlist_node *n;
1259         struct tcp_md5sig_info *md5sig;
1260
1261         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1262
1263         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1264                 hlist_del_rcu(&key->node);
1265                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1266                 kfree_rcu(key, rcu);
1267         }
1268 }
1269
1270 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1271                                  sockptr_t optval, int optlen)
1272 {
1273         struct tcp_md5sig cmd;
1274         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1275         const union tcp_md5_addr *addr;
1276         u8 prefixlen = 32;
1277         int l3index = 0;
1278         u8 flags;
1279
1280         if (optlen < sizeof(cmd))
1281                 return -EINVAL;
1282
1283         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1284                 return -EFAULT;
1285
1286         if (sin->sin_family != AF_INET)
1287                 return -EINVAL;
1288
1289         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1290
1291         if (optname == TCP_MD5SIG_EXT &&
1292             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1293                 prefixlen = cmd.tcpm_prefixlen;
1294                 if (prefixlen > 32)
1295                         return -EINVAL;
1296         }
1297
1298         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1299             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1300                 struct net_device *dev;
1301
1302                 rcu_read_lock();
1303                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1304                 if (dev && netif_is_l3_master(dev))
1305                         l3index = dev->ifindex;
1306
1307                 rcu_read_unlock();
1308
1309                 /* ok to reference set/not set outside of rcu;
1310                  * right now device MUST be an L3 master
1311                  */
1312                 if (!dev || !l3index)
1313                         return -EINVAL;
1314         }
1315
1316         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1317
1318         if (!cmd.tcpm_keylen)
1319                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1320
1321         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1322                 return -EINVAL;
1323
1324         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1325                               cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1326 }
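/* Illustrative userspace sketch (not part of this file): the handler above is
 * reached when an application installs a TCP-MD5 key for a peer, e.g.:
 *
 *     struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *     struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *     sin->sin_family = AF_INET;
 *     inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *     memcpy(md5.tcpm_key, "secret", 6);
 *     setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */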
1327
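/* Feed the RFC 2385 pseudo-header (saddr, daddr, zero pad, IPPROTO_TCP,
 * segment length) followed by the TCP header with its checksum field zeroed
 * into the pending MD5 hash request.
 */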
1328 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1329                                    __be32 daddr, __be32 saddr,
1330                                    const struct tcphdr *th, int nbytes)
1331 {
1332         struct tcp4_pseudohdr *bp;
1333         struct scatterlist sg;
1334         struct tcphdr *_th;
1335
1336         bp = hp->scratch;
1337         bp->saddr = saddr;
1338         bp->daddr = daddr;
1339         bp->pad = 0;
1340         bp->protocol = IPPROTO_TCP;
1341         bp->len = cpu_to_be16(nbytes);
1342
1343         _th = (struct tcphdr *)(bp + 1);
1344         memcpy(_th, th, sizeof(*th));
1345         _th->check = 0;
1346
1347         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1348         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1349                                 sizeof(*bp) + sizeof(*th));
1350         return crypto_ahash_update(hp->md5_req);
1351 }
1352
1353 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1354                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1355 {
1356         struct tcp_md5sig_pool *hp;
1357         struct ahash_request *req;
1358
1359         hp = tcp_get_md5sig_pool();
1360         if (!hp)
1361                 goto clear_hash_noput;
1362         req = hp->md5_req;
1363
1364         if (crypto_ahash_init(req))
1365                 goto clear_hash;
1366         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1367                 goto clear_hash;
1368         if (tcp_md5_hash_key(hp, key))
1369                 goto clear_hash;
1370         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1371         if (crypto_ahash_final(req))
1372                 goto clear_hash;
1373
1374         tcp_put_md5sig_pool();
1375         return 0;
1376
1377 clear_hash:
1378         tcp_put_md5sig_pool();
1379 clear_hash_noput:
1380         memset(md5_hash, 0, 16);
1381         return 1;
1382 }
1383
1384 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1385                         const struct sock *sk,
1386                         const struct sk_buff *skb)
1387 {
1388         struct tcp_md5sig_pool *hp;
1389         struct ahash_request *req;
1390         const struct tcphdr *th = tcp_hdr(skb);
1391         __be32 saddr, daddr;
1392
1393         if (sk) { /* valid for establish/request sockets */
1394                 saddr = sk->sk_rcv_saddr;
1395                 daddr = sk->sk_daddr;
1396         } else {
1397                 const struct iphdr *iph = ip_hdr(skb);
1398                 saddr = iph->saddr;
1399                 daddr = iph->daddr;
1400         }
1401
1402         hp = tcp_get_md5sig_pool();
1403         if (!hp)
1404                 goto clear_hash_noput;
1405         req = hp->md5_req;
1406
1407         if (crypto_ahash_init(req))
1408                 goto clear_hash;
1409
1410         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1411                 goto clear_hash;
1412         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1413                 goto clear_hash;
1414         if (tcp_md5_hash_key(hp, key))
1415                 goto clear_hash;
1416         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1417         if (crypto_ahash_final(req))
1418                 goto clear_hash;
1419
1420         tcp_put_md5sig_pool();
1421         return 0;
1422
1423 clear_hash:
1424         tcp_put_md5sig_pool();
1425 clear_hash_noput:
1426         memset(md5_hash, 0, 16);
1427         return 1;
1428 }
1429 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1430
1431 #endif
1432
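/* Fill the request sock with the addresses taken from the incoming SYN
 * and save a copy of its IP options for the future child socket.
 */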
1433 static void tcp_v4_init_req(struct request_sock *req,
1434                             const struct sock *sk_listener,
1435                             struct sk_buff *skb)
1436 {
1437         struct inet_request_sock *ireq = inet_rsk(req);
1438         struct net *net = sock_net(sk_listener);
1439
1440         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1441         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1442         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1443 }
1444
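/* Initialize the request sock from the SYN, give the security module a
 * chance to refuse the connection, then compute the route that will be
 * used to send the SYN-ACK.
 */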
1445 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1446                                           struct sk_buff *skb,
1447                                           struct flowi *fl,
1448                                           struct request_sock *req)
1449 {
1450         tcp_v4_init_req(req, sk, skb);
1451
1452         if (security_inet_conn_request(sk, skb, req))
1453                 return NULL;
1454
1455         return inet_csk_route_req(sk, &fl->u.ip4, req);
1456 }
1457
1458 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1459         .family         =       PF_INET,
1460         .obj_size       =       sizeof(struct tcp_request_sock),
1461         .rtx_syn_ack    =       tcp_rtx_synack,
1462         .send_ack       =       tcp_v4_reqsk_send_ack,
1463         .destructor     =       tcp_v4_reqsk_destructor,
1464         .send_reset     =       tcp_v4_send_reset,
1465         .syn_ack_timeout =      tcp_syn_ack_timeout,
1466 };
1467
1468 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1469         .mss_clamp      =       TCP_MSS_DEFAULT,
1470 #ifdef CONFIG_TCP_MD5SIG
1471         .req_md5_lookup =       tcp_v4_md5_lookup,
1472         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1473 #endif
1474 #ifdef CONFIG_SYN_COOKIES
1475         .cookie_init_seq =      cookie_v4_init_sequence,
1476 #endif
1477         .route_req      =       tcp_v4_route_req,
1478         .init_seq       =       tcp_v4_init_seq,
1479         .init_ts_off    =       tcp_v4_init_ts_off,
1480         .send_synack    =       tcp_v4_send_synack,
1481 };
1482
1483 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1484 {
1485         /* Never answer SYNs sent to broadcast or multicast addresses */
1486         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1487                 goto drop;
1488
1489         return tcp_conn_request(&tcp_request_sock_ops,
1490                                 &tcp_request_sock_ipv4_ops, sk, skb);
1491
1492 drop:
1493         tcp_listendrop(sk);
1494         return 0;
1495 }
1496 EXPORT_SYMBOL(tcp_v4_conn_request);
1497
1498
1499 /*
1500  * The three-way handshake has completed - we received a valid ACK -
1501  * now create the new socket.
1502  */
1503 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1504                                   struct request_sock *req,
1505                                   struct dst_entry *dst,
1506                                   struct request_sock *req_unhash,
1507                                   bool *own_req)
1508 {
1509         struct inet_request_sock *ireq;
1510         bool found_dup_sk = false;
1511         struct inet_sock *newinet;
1512         struct tcp_sock *newtp;
1513         struct sock *newsk;
1514 #ifdef CONFIG_TCP_MD5SIG
1515         const union tcp_md5_addr *addr;
1516         struct tcp_md5sig_key *key;
1517         int l3index;
1518 #endif
1519         struct ip_options_rcu *inet_opt;
1520
1521         if (sk_acceptq_is_full(sk))
1522                 goto exit_overflow;
1523
1524         newsk = tcp_create_openreq_child(sk, req, skb);
1525         if (!newsk)
1526                 goto exit_nonewsk;
1527
1528         newsk->sk_gso_type = SKB_GSO_TCPV4;
1529         inet_sk_rx_dst_set(newsk, skb);
1530
1531         newtp                 = tcp_sk(newsk);
1532         newinet               = inet_sk(newsk);
1533         ireq                  = inet_rsk(req);
1534         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1535         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1536         newsk->sk_bound_dev_if = ireq->ir_iif;
1537         newinet->inet_saddr   = ireq->ir_loc_addr;
1538         inet_opt              = rcu_dereference(ireq->ireq_opt);
1539         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1540         newinet->mc_index     = inet_iif(skb);
1541         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1542         newinet->rcv_tos      = ip_hdr(skb)->tos;
1543         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1544         if (inet_opt)
1545                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1546         newinet->inet_id = get_random_u16();
1547
1548         /* Set ToS of the new socket based upon the value of incoming SYN.
1549          * ECT bits are set later in tcp_init_transfer().
1550          */
1551         if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1552                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1553
1554         if (!dst) {
1555                 dst = inet_csk_route_child_sock(sk, newsk, req);
1556                 if (!dst)
1557                         goto put_and_exit;
1558         } else {
1559                 /* syncookie case : see end of cookie_v4_check() */
1560         }
1561         sk_setup_caps(newsk, dst);
1562
1563         tcp_ca_openreq_child(newsk, dst);
1564
1565         tcp_sync_mss(newsk, dst_mtu(dst));
1566         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1567
1568         tcp_initialize_rcv_mss(newsk);
1569
1570 #ifdef CONFIG_TCP_MD5SIG
1571         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1572         /* Copy over the MD5 key from the original socket */
1573         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1574         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1575         if (key) {
1576                 /*
1577                  * We're using one, so create a matching key
1578                  * on the newsk structure. If we fail to get
1579                  * memory, then we end up not copying the key
1580                  * across. Shucks.
1581                  */
1582                 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1583                                key->key, key->keylen, GFP_ATOMIC);
1584                 sk_gso_disable(newsk);
1585         }
1586 #endif
1587
1588         if (__inet_inherit_port(sk, newsk) < 0)
1589                 goto put_and_exit;
1590         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1591                                        &found_dup_sk);
1592         if (likely(*own_req)) {
1593                 tcp_move_syn(newtp, req);
1594                 ireq->ireq_opt = NULL;
1595         } else {
1596                 newinet->inet_opt = NULL;
1597
1598                 if (!req_unhash && found_dup_sk) {
1599                         /* This code path should only be executed in the
1600                          * syncookie case
1601                          */
1602                         bh_unlock_sock(newsk);
1603                         sock_put(newsk);
1604                         newsk = NULL;
1605                 }
1606         }
1607         return newsk;
1608
1609 exit_overflow:
1610         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1611 exit_nonewsk:
1612         dst_release(dst);
1613 exit:
1614         tcp_listendrop(sk);
1615         return NULL;
1616 put_and_exit:
1617         newinet->inet_opt = NULL;
1618         inet_csk_prepare_forced_close(newsk);
1619         tcp_done(newsk);
1620         goto exit;
1621 }
1622 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1623
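/* On a listener, a non-SYN segment may carry a SYN cookie ACK: let
 * cookie_v4_check() validate it and, if it is valid, return the newly
 * created child socket.
 */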
1624 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1625 {
1626 #ifdef CONFIG_SYN_COOKIES
1627         const struct tcphdr *th = tcp_hdr(skb);
1628
1629         if (!th->syn)
1630                 sk = cookie_v4_check(sk, skb);
1631 #endif
1632         return sk;
1633 }
1634
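/* Generate a SYN cookie for a SYN aimed at @sk without allocating a
 * request sock.  Returns the clamped MSS to advertise, or 0 when no
 * cookie could be generated.
 */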
1635 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1636                          struct tcphdr *th, u32 *cookie)
1637 {
1638         u16 mss = 0;
1639 #ifdef CONFIG_SYN_COOKIES
1640         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1641                                     &tcp_request_sock_ipv4_ops, sk, th);
1642         if (mss) {
1643                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1644                 tcp_synq_overflow(sk);
1645         }
1646 #endif
1647         return mss;
1648 }
1649
1650 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1651                                                            u32));
1652 /* The socket must have its spinlock held when we get
1653  * here, unless it is a TCP_LISTEN socket.
1654  *
1655  * We have a potential double-lock case here, so even when
1656  * doing backlog processing we use the BH locking scheme.
1657  * This is because we cannot sleep with the original spinlock
1658  * held.
1659  */
1660 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1661 {
1662         enum skb_drop_reason reason;
1663         struct sock *rsk;
1664
1665         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1666                 struct dst_entry *dst;
1667
1668                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1669                                                 lockdep_sock_is_held(sk));
1670
1671                 sock_rps_save_rxhash(sk, skb);
1672                 sk_mark_napi_id(sk, skb);
1673                 if (dst) {
1674                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1675                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1676                                              dst, 0)) {
1677                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1678                                 dst_release(dst);
1679                         }
1680                 }
1681                 tcp_rcv_established(sk, skb);
1682                 return 0;
1683         }
1684
1685         reason = SKB_DROP_REASON_NOT_SPECIFIED;
1686         if (tcp_checksum_complete(skb))
1687                 goto csum_err;
1688
1689         if (sk->sk_state == TCP_LISTEN) {
1690                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1691
1692                 if (!nsk)
1693                         goto discard;
1694                 if (nsk != sk) {
1695                         if (tcp_child_process(sk, nsk, skb)) {
1696                                 rsk = nsk;
1697                                 goto reset;
1698                         }
1699                         return 0;
1700                 }
1701         } else
1702                 sock_rps_save_rxhash(sk, skb);
1703
1704         if (tcp_rcv_state_process(sk, skb)) {
1705                 rsk = sk;
1706                 goto reset;
1707         }
1708         return 0;
1709
1710 reset:
1711         tcp_v4_send_reset(rsk, skb);
1712 discard:
1713         kfree_skb_reason(skb, reason);
1714         /* Be careful here. If this function gets more complicated and
1715          * gcc suffers from register pressure on the x86, sk (in %ebx)
1716          * might be destroyed here. This current version compiles correctly,
1717          * but you have been warned.
1718          */
1719         return 0;
1720
1721 csum_err:
1722         reason = SKB_DROP_REASON_TCP_CSUM;
1723         trace_tcp_bad_csum(skb);
1724         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1725         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1726         goto discard;
1727 }
1728 EXPORT_SYMBOL(tcp_v4_do_rcv);
1729
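/* Early demux: look up an established socket for this segment and attach
 * it (and its cached rx dst, when still valid for this device) to the skb.
 */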
1730 int tcp_v4_early_demux(struct sk_buff *skb)
1731 {
1732         struct net *net = dev_net(skb->dev);
1733         const struct iphdr *iph;
1734         const struct tcphdr *th;
1735         struct sock *sk;
1736
1737         if (skb->pkt_type != PACKET_HOST)
1738                 return 0;
1739
1740         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1741                 return 0;
1742
1743         iph = ip_hdr(skb);
1744         th = tcp_hdr(skb);
1745
1746         if (th->doff < sizeof(struct tcphdr) / 4)
1747                 return 0;
1748
1749         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1750                                        iph->saddr, th->source,
1751                                        iph->daddr, ntohs(th->dest),
1752                                        skb->skb_iif, inet_sdif(skb));
1753         if (sk) {
1754                 skb->sk = sk;
1755                 skb->destructor = sock_edemux;
1756                 if (sk_fullsock(sk)) {
1757                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1758
1759                         if (dst)
1760                                 dst = dst_check(dst, 0);
1761                         if (dst &&
1762                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1763                                 skb_dst_set_noref(skb, dst);
1764                 }
1765         }
1766         return 0;
1767 }
1768
1769 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1770                      enum skb_drop_reason *reason)
1771 {
1772         u32 limit, tail_gso_size, tail_gso_segs;
1773         struct skb_shared_info *shinfo;
1774         const struct tcphdr *th;
1775         struct tcphdr *thtail;
1776         struct sk_buff *tail;
1777         unsigned int hdrlen;
1778         bool fragstolen;
1779         u32 gso_segs;
1780         u32 gso_size;
1781         int delta;
1782
1783         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1784          * we can fix skb->truesize to its real value to avoid future drops.
1785          * This is valid because skb is not yet charged to the socket.
1786          * It has been noticed that pure SACK packets were sometimes dropped
1787          * (if cooked by drivers without copybreak feature).
1788          */
1789         skb_condense(skb);
1790
1791         skb_dst_drop(skb);
1792
1793         if (unlikely(tcp_checksum_complete(skb))) {
1794                 bh_unlock_sock(sk);
1795                 trace_tcp_bad_csum(skb);
1796                 *reason = SKB_DROP_REASON_TCP_CSUM;
1797                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1798                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1799                 return true;
1800         }
1801
1802         /* Attempt coalescing to last skb in backlog, even if we are
1803          * above the limits.
1804          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1805          */
1806         th = (const struct tcphdr *)skb->data;
1807         hdrlen = th->doff * 4;
1808
1809         tail = sk->sk_backlog.tail;
1810         if (!tail)
1811                 goto no_coalesce;
1812         thtail = (struct tcphdr *)tail->data;
1813
1814         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1815             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1816             ((TCP_SKB_CB(tail)->tcp_flags |
1817               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1818             !((TCP_SKB_CB(tail)->tcp_flags &
1819               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1820             ((TCP_SKB_CB(tail)->tcp_flags ^
1821               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1822 #ifdef CONFIG_TLS_DEVICE
1823             tail->decrypted != skb->decrypted ||
1824 #endif
1825             thtail->doff != th->doff ||
1826             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1827                 goto no_coalesce;
1828
1829         __skb_pull(skb, hdrlen);
1830
1831         shinfo = skb_shinfo(skb);
1832         gso_size = shinfo->gso_size ?: skb->len;
1833         gso_segs = shinfo->gso_segs ?: 1;
1834
1835         shinfo = skb_shinfo(tail);
1836         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1837         tail_gso_segs = shinfo->gso_segs ?: 1;
1838
1839         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1840                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1841
1842                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1843                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1844                         thtail->window = th->window;
1845                 }
1846
1847                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1848                  * thtail->fin, so that the fast path in tcp_rcv_established()
1849                  * is not entered if we append a packet with a FIN.
1850                  * SYN, RST, URG are not present.
1851                  * ACK is set on both packets.
1852                  * PSH : we do not really care in TCP stack,
1853                  *       at least for 'GRO' packets.
1854                  */
1855                 thtail->fin |= th->fin;
1856                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1857
1858                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1859                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1860                         tail->tstamp = skb->tstamp;
1861                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1862                 }
1863
1864                 /* Not as strict as GRO. We only need to carry mss max value */
1865                 shinfo->gso_size = max(gso_size, tail_gso_size);
1866                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1867
1868                 sk->sk_backlog.len += delta;
1869                 __NET_INC_STATS(sock_net(sk),
1870                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1871                 kfree_skb_partial(skb, fragstolen);
1872                 return false;
1873         }
1874         __skb_push(skb, hdrlen);
1875
1876 no_coalesce:
1877         /* Only socket owner can try to collapse/prune rx queues
1878          * to reduce memory overhead, so add a little headroom here.
1879          * Only a few socket backlogs are likely to be non-empty concurrently.
1880          */
1881         limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1882
1883         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1884                 bh_unlock_sock(sk);
1885                 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1886                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1887                 return true;
1888         }
1889         return false;
1890 }
1891 EXPORT_SYMBOL(tcp_add_backlog);
1892
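/* Run the socket filter on the segment, trimming it to no less than the
 * TCP header length.
 */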
1893 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1894 {
1895         struct tcphdr *th = (struct tcphdr *)skb->data;
1896
1897         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1898 }
1899 EXPORT_SYMBOL(tcp_filter);
1900
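/* Undo tcp_v4_fill_cb(): move the IP control block back to its usual
 * place in skb->cb[].
 */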
1901 static void tcp_v4_restore_cb(struct sk_buff *skb)
1902 {
1903         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1904                 sizeof(struct inet_skb_parm));
1905 }
1906
1907 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1908                            const struct tcphdr *th)
1909 {
1910         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1911          * barrier() makes sure the compiler won't play fool^Waliasing games.
1912          */
1913         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1914                 sizeof(struct inet_skb_parm));
1915         barrier();
1916
1917         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1918         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1919                                     skb->len - th->doff * 4);
1920         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1921         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1922         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1923         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1924         TCP_SKB_CB(skb)->sacked  = 0;
1925         TCP_SKB_CB(skb)->has_rxtstamp =
1926                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1927 }
1928
1929 /*
1930  *      From tcp_input.c
1931  */
1932
1933 int tcp_v4_rcv(struct sk_buff *skb)
1934 {
1935         struct net *net = dev_net(skb->dev);
1936         enum skb_drop_reason drop_reason;
1937         int sdif = inet_sdif(skb);
1938         int dif = inet_iif(skb);
1939         const struct iphdr *iph;
1940         const struct tcphdr *th;
1941         bool refcounted;
1942         struct sock *sk;
1943         int ret;
1944
1945         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1946         if (skb->pkt_type != PACKET_HOST)
1947                 goto discard_it;
1948
1949         /* Count it even if it's bad */
1950         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1951
1952         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1953                 goto discard_it;
1954
1955         th = (const struct tcphdr *)skb->data;
1956
1957         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1958                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1959                 goto bad_packet;
1960         }
1961         if (!pskb_may_pull(skb, th->doff * 4))
1962                 goto discard_it;
1963
1964         /* An explanation is required here, I think.
1965          * Packet length and doff are validated by header prediction,
1966          * provided the case of th->doff==0 is eliminated.
1967          * So, we defer the checks. */
1968
1969         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1970                 goto csum_error;
1971
1972         th = (const struct tcphdr *)skb->data;
1973         iph = ip_hdr(skb);
1974 lookup:
1975         sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
1976                                skb, __tcp_hdrlen(th), th->source,
1977                                th->dest, sdif, &refcounted);
1978         if (!sk)
1979                 goto no_tcp_socket;
1980
1981 process:
1982         if (sk->sk_state == TCP_TIME_WAIT)
1983                 goto do_time_wait;
1984
1985         if (sk->sk_state == TCP_NEW_SYN_RECV) {
1986                 struct request_sock *req = inet_reqsk(sk);
1987                 bool req_stolen = false;
1988                 struct sock *nsk;
1989
1990                 sk = req->rsk_listener;
1991                 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1992                         drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1993                 else
1994                         drop_reason = tcp_inbound_md5_hash(sk, skb,
1995                                                    &iph->saddr, &iph->daddr,
1996                                                    AF_INET, dif, sdif);
1997                 if (unlikely(drop_reason)) {
1998                         sk_drops_add(sk, skb);
1999                         reqsk_put(req);
2000                         goto discard_it;
2001                 }
2002                 if (tcp_checksum_complete(skb)) {
2003                         reqsk_put(req);
2004                         goto csum_error;
2005                 }
2006                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2007                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2008                         if (!nsk) {
2009                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2010                                 goto lookup;
2011                         }
2012                         sk = nsk;
2013                         /* reuseport_migrate_sock() has already held one sk_refcnt
2014                          * before returning.
2015                          */
2016                 } else {
2017                         /* We own a reference on the listener, increase it again
2018                          * as we might lose it too soon.
2019                          */
2020                         sock_hold(sk);
2021                 }
2022                 refcounted = true;
2023                 nsk = NULL;
2024                 if (!tcp_filter(sk, skb)) {
2025                         th = (const struct tcphdr *)skb->data;
2026                         iph = ip_hdr(skb);
2027                         tcp_v4_fill_cb(skb, iph, th);
2028                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2029                 } else {
2030                         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2031                 }
2032                 if (!nsk) {
2033                         reqsk_put(req);
2034                         if (req_stolen) {
2035                                 /* Another cpu got exclusive access to req
2036                                  * and created a full blown socket.
2037                                  * Try to feed this packet to this socket
2038                                  * instead of discarding it.
2039                                  */
2040                                 tcp_v4_restore_cb(skb);
2041                                 sock_put(sk);
2042                                 goto lookup;
2043                         }
2044                         goto discard_and_relse;
2045                 }
2046                 nf_reset_ct(skb);
2047                 if (nsk == sk) {
2048                         reqsk_put(req);
2049                         tcp_v4_restore_cb(skb);
2050                 } else if (tcp_child_process(sk, nsk, skb)) {
2051                         tcp_v4_send_reset(nsk, skb);
2052                         goto discard_and_relse;
2053                 } else {
2054                         sock_put(sk);
2055                         return 0;
2056                 }
2057         }
2058
2059         if (static_branch_unlikely(&ip4_min_ttl)) {
2060                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2061                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2062                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2063                         goto discard_and_relse;
2064                 }
2065         }
2066
2067         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2068                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2069                 goto discard_and_relse;
2070         }
2071
2072         drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2073                                            &iph->daddr, AF_INET, dif, sdif);
2074         if (drop_reason)
2075                 goto discard_and_relse;
2076
2077         nf_reset_ct(skb);
2078
2079         if (tcp_filter(sk, skb)) {
2080                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2081                 goto discard_and_relse;
2082         }
2083         th = (const struct tcphdr *)skb->data;
2084         iph = ip_hdr(skb);
2085         tcp_v4_fill_cb(skb, iph, th);
2086
2087         skb->dev = NULL;
2088
2089         if (sk->sk_state == TCP_LISTEN) {
2090                 ret = tcp_v4_do_rcv(sk, skb);
2091                 goto put_and_return;
2092         }
2093
2094         sk_incoming_cpu_update(sk);
2095
2096         bh_lock_sock_nested(sk);
2097         tcp_segs_in(tcp_sk(sk), skb);
2098         ret = 0;
2099         if (!sock_owned_by_user(sk)) {
2100                 ret = tcp_v4_do_rcv(sk, skb);
2101         } else {
2102                 if (tcp_add_backlog(sk, skb, &drop_reason))
2103                         goto discard_and_relse;
2104         }
2105         bh_unlock_sock(sk);
2106
2107 put_and_return:
2108         if (refcounted)
2109                 sock_put(sk);
2110
2111         return ret;
2112
2113 no_tcp_socket:
2114         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2115         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2116                 goto discard_it;
2117
2118         tcp_v4_fill_cb(skb, iph, th);
2119
2120         if (tcp_checksum_complete(skb)) {
2121 csum_error:
2122                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2123                 trace_tcp_bad_csum(skb);
2124                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2125 bad_packet:
2126                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2127         } else {
2128                 tcp_v4_send_reset(NULL, skb);
2129         }
2130
2131 discard_it:
2132         SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2133         /* Discard frame. */
2134         kfree_skb_reason(skb, drop_reason);
2135         return 0;
2136
2137 discard_and_relse:
2138         sk_drops_add(sk, skb);
2139         if (refcounted)
2140                 sock_put(sk);
2141         goto discard_it;
2142
2143 do_time_wait:
2144         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2145                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2146                 inet_twsk_put(inet_twsk(sk));
2147                 goto discard_it;
2148         }
2149
2150         tcp_v4_fill_cb(skb, iph, th);
2151
2152         if (tcp_checksum_complete(skb)) {
2153                 inet_twsk_put(inet_twsk(sk));
2154                 goto csum_error;
2155         }
2156         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2157         case TCP_TW_SYN: {
2158                 struct sock *sk2 = inet_lookup_listener(net,
2159                                                         net->ipv4.tcp_death_row.hashinfo,
2160                                                         skb, __tcp_hdrlen(th),
2161                                                         iph->saddr, th->source,
2162                                                         iph->daddr, th->dest,
2163                                                         inet_iif(skb),
2164                                                         sdif);
2165                 if (sk2) {
2166                         inet_twsk_deschedule_put(inet_twsk(sk));
2167                         sk = sk2;
2168                         tcp_v4_restore_cb(skb);
2169                         refcounted = false;
2170                         goto process;
2171                 }
2172         }
2173                 /* to ACK */
2174                 fallthrough;
2175         case TCP_TW_ACK:
2176                 tcp_v4_timewait_ack(sk, skb);
2177                 break;
2178         case TCP_TW_RST:
2179                 tcp_v4_send_reset(sk, skb);
2180                 inet_twsk_deschedule_put(inet_twsk(sk));
2181                 goto discard_it;
2182         case TCP_TW_SUCCESS:;
2183         }
2184         goto discard_it;
2185 }
2186
2187 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2188         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2189         .twsk_unique    = tcp_twsk_unique,
2190         .twsk_destructor= tcp_twsk_destructor,
2191 };
2192
2193 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2194 {
2195         struct dst_entry *dst = skb_dst(skb);
2196
2197         if (dst && dst_hold_safe(dst)) {
2198                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2199                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2200         }
2201 }
2202 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2203
2204 const struct inet_connection_sock_af_ops ipv4_specific = {
2205         .queue_xmit        = ip_queue_xmit,
2206         .send_check        = tcp_v4_send_check,
2207         .rebuild_header    = inet_sk_rebuild_header,
2208         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2209         .conn_request      = tcp_v4_conn_request,
2210         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2211         .net_header_len    = sizeof(struct iphdr),
2212         .setsockopt        = ip_setsockopt,
2213         .getsockopt        = ip_getsockopt,
2214         .addr2sockaddr     = inet_csk_addr2sockaddr,
2215         .sockaddr_len      = sizeof(struct sockaddr_in),
2216         .mtu_reduced       = tcp_v4_mtu_reduced,
2217 };
2218 EXPORT_SYMBOL(ipv4_specific);
2219
2220 #ifdef CONFIG_TCP_MD5SIG
2221 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2222         .md5_lookup             = tcp_v4_md5_lookup,
2223         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2224         .md5_parse              = tcp_v4_parse_md5_keys,
2225 };
2226 #endif
2227
2228 /* NOTE: A lot of things are set to zero explicitly by the call to
2229  *       sk_alloc(), so they need not be done here.
2230  */
2231 static int tcp_v4_init_sock(struct sock *sk)
2232 {
2233         struct inet_connection_sock *icsk = inet_csk(sk);
2234
2235         tcp_init_sock(sk);
2236
2237         icsk->icsk_af_ops = &ipv4_specific;
2238
2239 #ifdef CONFIG_TCP_MD5SIG
2240         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2241 #endif
2242
2243         return 0;
2244 }
2245
2246 void tcp_v4_destroy_sock(struct sock *sk)
2247 {
2248         struct tcp_sock *tp = tcp_sk(sk);
2249
2250         trace_tcp_destroy_sock(sk);
2251
2252         tcp_clear_xmit_timers(sk);
2253
2254         tcp_cleanup_congestion_control(sk);
2255
2256         tcp_cleanup_ulp(sk);
2257
2258         /* Clean up the write buffer. */
2259         tcp_write_queue_purge(sk);
2260
2261         /* Check if we want to disable active TFO */
2262         tcp_fastopen_active_disable_ofo_check(sk);
2263
2264         /* Cleans up our, hopefully empty, out_of_order_queue. */
2265         skb_rbtree_purge(&tp->out_of_order_queue);
2266
2267 #ifdef CONFIG_TCP_MD5SIG
2268         /* Clean up the MD5 key list, if any */
2269         if (tp->md5sig_info) {
2270                 tcp_clear_md5_list(sk);
2271                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2272                 tp->md5sig_info = NULL;
2273         }
2274 #endif
2275
2276         /* Clean up a referenced TCP bind bucket. */
2277         if (inet_csk(sk)->icsk_bind_hash)
2278                 inet_put_port(sk);
2279
2280         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2281
2282         /* If socket is aborted during connect operation */
2283         tcp_free_fastopen_req(tp);
2284         tcp_fastopen_destroy_cipher(sk);
2285         tcp_saved_syn_free(tp);
2286
2287         sk_sockets_allocated_dec(sk);
2288 }
2289 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2290
2291 #ifdef CONFIG_PROC_FS
2292 /* Proc filesystem TCP sock list dumping. */
2293
2294 static unsigned short seq_file_family(const struct seq_file *seq);
2295
2296 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2297 {
2298         unsigned short family = seq_file_family(seq);
2299
2300         /* AF_UNSPEC is used as a match all */
2301         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2302                 net_eq(sock_net(sk), seq_file_net(seq)));
2303 }
2304
2305 /* Find a non-empty bucket (starting from st->bucket)
2306  * and return the first sk from it.
2307  */
2308 static void *listening_get_first(struct seq_file *seq)
2309 {
2310         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2311         struct tcp_iter_state *st = seq->private;
2312
2313         st->offset = 0;
2314         for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2315                 struct inet_listen_hashbucket *ilb2;
2316                 struct hlist_nulls_node *node;
2317                 struct sock *sk;
2318
2319                 ilb2 = &hinfo->lhash2[st->bucket];
2320                 if (hlist_nulls_empty(&ilb2->nulls_head))
2321                         continue;
2322
2323                 spin_lock(&ilb2->lock);
2324                 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2325                         if (seq_sk_match(seq, sk))
2326                                 return sk;
2327                 }
2328                 spin_unlock(&ilb2->lock);
2329         }
2330
2331         return NULL;
2332 }
2333
2334 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2335  * If "cur" is the last one in the st->bucket,
2336  * call listening_get_first() to return the first sk of the next
2337  * non-empty bucket.
2338  */
2339 static void *listening_get_next(struct seq_file *seq, void *cur)
2340 {
2341         struct tcp_iter_state *st = seq->private;
2342         struct inet_listen_hashbucket *ilb2;
2343         struct hlist_nulls_node *node;
2344         struct inet_hashinfo *hinfo;
2345         struct sock *sk = cur;
2346
2347         ++st->num;
2348         ++st->offset;
2349
2350         sk = sk_nulls_next(sk);
2351         sk_nulls_for_each_from(sk, node) {
2352                 if (seq_sk_match(seq, sk))
2353                         return sk;
2354         }
2355
2356         hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2357         ilb2 = &hinfo->lhash2[st->bucket];
2358         spin_unlock(&ilb2->lock);
2359         ++st->bucket;
2360         return listening_get_first(seq);
2361 }
2362
2363 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2364 {
2365         struct tcp_iter_state *st = seq->private;
2366         void *rc;
2367
2368         st->bucket = 0;
2369         st->offset = 0;
2370         rc = listening_get_first(seq);
2371
2372         while (rc && *pos) {
2373                 rc = listening_get_next(seq, rc);
2374                 --*pos;
2375         }
2376         return rc;
2377 }
2378
2379 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2380                                 const struct tcp_iter_state *st)
2381 {
2382         return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2383 }
2384
2385 /*
2386  * Get first established socket starting from bucket given in st->bucket.
2387  * If st->bucket is zero, the very first socket in the hash is returned.
2388  */
2389 static void *established_get_first(struct seq_file *seq)
2390 {
2391         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2392         struct tcp_iter_state *st = seq->private;
2393
2394         st->offset = 0;
2395         for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2396                 struct sock *sk;
2397                 struct hlist_nulls_node *node;
2398                 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2399
2400                 /* Lockless fast path for the common case of empty buckets */
2401                 if (empty_bucket(hinfo, st))
2402                         continue;
2403
2404                 spin_lock_bh(lock);
2405                 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2406                         if (seq_sk_match(seq, sk))
2407                                 return sk;
2408                 }
2409                 spin_unlock_bh(lock);
2410         }
2411
2412         return NULL;
2413 }
2414
2415 static void *established_get_next(struct seq_file *seq, void *cur)
2416 {
2417         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2418         struct tcp_iter_state *st = seq->private;
2419         struct hlist_nulls_node *node;
2420         struct sock *sk = cur;
2421
2422         ++st->num;
2423         ++st->offset;
2424
2425         sk = sk_nulls_next(sk);
2426
2427         sk_nulls_for_each_from(sk, node) {
2428                 if (seq_sk_match(seq, sk))
2429                         return sk;
2430         }
2431
2432         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2433         ++st->bucket;
2434         return established_get_first(seq);
2435 }
2436
2437 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2438 {
2439         struct tcp_iter_state *st = seq->private;
2440         void *rc;
2441
2442         st->bucket = 0;
2443         rc = established_get_first(seq);
2444
2445         while (rc && pos) {
2446                 rc = established_get_next(seq, rc);
2447                 --pos;
2448         }
2449         return rc;
2450 }
2451
2452 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2453 {
2454         void *rc;
2455         struct tcp_iter_state *st = seq->private;
2456
2457         st->state = TCP_SEQ_STATE_LISTENING;
2458         rc        = listening_get_idx(seq, &pos);
2459
2460         if (!rc) {
2461                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2462                 rc        = established_get_idx(seq, pos);
2463         }
2464
2465         return rc;
2466 }
2467
2468 static void *tcp_seek_last_pos(struct seq_file *seq)
2469 {
2470         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2471         struct tcp_iter_state *st = seq->private;
2472         int bucket = st->bucket;
2473         int offset = st->offset;
2474         int orig_num = st->num;
2475         void *rc = NULL;
2476
2477         switch (st->state) {
2478         case TCP_SEQ_STATE_LISTENING:
2479                 if (st->bucket > hinfo->lhash2_mask)
2480                         break;
2481                 st->state = TCP_SEQ_STATE_LISTENING;
2482                 rc = listening_get_first(seq);
2483                 while (offset-- && rc && bucket == st->bucket)
2484                         rc = listening_get_next(seq, rc);
2485                 if (rc)
2486                         break;
2487                 st->bucket = 0;
2488                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2489                 fallthrough;
2490         case TCP_SEQ_STATE_ESTABLISHED:
2491                 if (st->bucket > hinfo->ehash_mask)
2492                         break;
2493                 rc = established_get_first(seq);
2494                 while (offset-- && rc && bucket == st->bucket)
2495                         rc = established_get_next(seq, rc);
2496         }
2497
2498         st->num = orig_num;
2499
2500         return rc;
2501 }
2502
2503 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2504 {
2505         struct tcp_iter_state *st = seq->private;
2506         void *rc;
2507
2508         if (*pos && *pos == st->last_pos) {
2509                 rc = tcp_seek_last_pos(seq);
2510                 if (rc)
2511                         goto out;
2512         }
2513
2514         st->state = TCP_SEQ_STATE_LISTENING;
2515         st->num = 0;
2516         st->bucket = 0;
2517         st->offset = 0;
2518         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2519
2520 out:
2521         st->last_pos = *pos;
2522         return rc;
2523 }
2524 EXPORT_SYMBOL(tcp_seq_start);
2525
2526 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2527 {
2528         struct tcp_iter_state *st = seq->private;
2529         void *rc = NULL;
2530
2531         if (v == SEQ_START_TOKEN) {
2532                 rc = tcp_get_idx(seq, 0);
2533                 goto out;
2534         }
2535
2536         switch (st->state) {
2537         case TCP_SEQ_STATE_LISTENING:
2538                 rc = listening_get_next(seq, v);
2539                 if (!rc) {
2540                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2541                         st->bucket = 0;
2542                         st->offset = 0;
2543                         rc        = established_get_first(seq);
2544                 }
2545                 break;
2546         case TCP_SEQ_STATE_ESTABLISHED:
2547                 rc = established_get_next(seq, v);
2548                 break;
2549         }
2550 out:
2551         ++*pos;
2552         st->last_pos = *pos;
2553         return rc;
2554 }
2555 EXPORT_SYMBOL(tcp_seq_next);
2556
2557 void tcp_seq_stop(struct seq_file *seq, void *v)
2558 {
2559         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2560         struct tcp_iter_state *st = seq->private;
2561
2562         switch (st->state) {
2563         case TCP_SEQ_STATE_LISTENING:
2564                 if (v != SEQ_START_TOKEN)
2565                         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2566                 break;
2567         case TCP_SEQ_STATE_ESTABLISHED:
2568                 if (v)
2569                         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2570                 break;
2571         }
2572 }
2573 EXPORT_SYMBOL(tcp_seq_stop);
2574
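/* Format one /proc/net/tcp line for a request sock (SYN_RECV). */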
2575 static void get_openreq4(const struct request_sock *req,
2576                          struct seq_file *f, int i)
2577 {
2578         const struct inet_request_sock *ireq = inet_rsk(req);
2579         long delta = req->rsk_timer.expires - jiffies;
2580
2581         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2582                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2583                 i,
2584                 ireq->ir_loc_addr,
2585                 ireq->ir_num,
2586                 ireq->ir_rmt_addr,
2587                 ntohs(ireq->ir_rmt_port),
2588                 TCP_SYN_RECV,
2589                 0, 0, /* could print option size, but that is af dependent. */
2590                 1,    /* timers active (only the expire timer) */
2591                 jiffies_delta_to_clock_t(delta),
2592                 req->num_timeout,
2593                 from_kuid_munged(seq_user_ns(f),
2594                                  sock_i_uid(req->rsk_listener)),
2595                 0,  /* non standard timer */
2596                 0, /* open_requests have no inode */
2597                 0,
2598                 req);
2599 }
2600
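/* Format one /proc/net/tcp line for a full socket. */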
2601 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2602 {
2603         int timer_active;
2604         unsigned long timer_expires;
2605         const struct tcp_sock *tp = tcp_sk(sk);
2606         const struct inet_connection_sock *icsk = inet_csk(sk);
2607         const struct inet_sock *inet = inet_sk(sk);
2608         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2609         __be32 dest = inet->inet_daddr;
2610         __be32 src = inet->inet_rcv_saddr;
2611         __u16 destp = ntohs(inet->inet_dport);
2612         __u16 srcp = ntohs(inet->inet_sport);
2613         int rx_queue;
2614         int state;
2615
2616         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2617             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2618             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2619                 timer_active    = 1;
2620                 timer_expires   = icsk->icsk_timeout;
2621         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2622                 timer_active    = 4;
2623                 timer_expires   = icsk->icsk_timeout;
2624         } else if (timer_pending(&sk->sk_timer)) {
2625                 timer_active    = 2;
2626                 timer_expires   = sk->sk_timer.expires;
2627         } else {
2628                 timer_active    = 0;
2629                 timer_expires = jiffies;
2630         }
2631
2632         state = inet_sk_state_load(sk);
2633         if (state == TCP_LISTEN)
2634                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2635         else
2636                 /* Because we don't lock the socket,
2637                  * we might find a transient negative value.
2638                  */
2639                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2640                                       READ_ONCE(tp->copied_seq), 0);
2641
2642         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2643                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2644                 i, src, srcp, dest, destp, state,
2645                 READ_ONCE(tp->write_seq) - tp->snd_una,
2646                 rx_queue,
2647                 timer_active,
2648                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2649                 icsk->icsk_retransmits,
2650                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2651                 icsk->icsk_probes_out,
2652                 sock_i_ino(sk),
2653                 refcount_read(&sk->sk_refcnt), sk,
2654                 jiffies_to_clock_t(icsk->icsk_rto),
2655                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2656                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2657                 tcp_snd_cwnd(tp),
2658                 state == TCP_LISTEN ?
2659                     fastopenq->max_qlen :
2660                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2661 }
2662
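/* Format one /proc/net/tcp line for a TIME_WAIT socket. */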
2663 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2664                                struct seq_file *f, int i)
2665 {
2666         long delta = tw->tw_timer.expires - jiffies;
2667         __be32 dest, src;
2668         __u16 destp, srcp;
2669
2670         dest  = tw->tw_daddr;
2671         src   = tw->tw_rcv_saddr;
2672         destp = ntohs(tw->tw_dport);
2673         srcp  = ntohs(tw->tw_sport);
2674
2675         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2676                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2677                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2678                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2679                 refcount_read(&tw->tw_refcnt), tw);
2680 }
2681
2682 #define TMPSZ 150
2683
2684 static int tcp4_seq_show(struct seq_file *seq, void *v)
2685 {
2686         struct tcp_iter_state *st;
2687         struct sock *sk = v;
2688
2689         seq_setwidth(seq, TMPSZ - 1);
2690         if (v == SEQ_START_TOKEN) {
2691                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2692                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2693                            "inode");
2694                 goto out;
2695         }
2696         st = seq->private;
2697
2698         if (sk->sk_state == TCP_TIME_WAIT)
2699                 get_timewait4_sock(v, seq, st->num);
2700         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2701                 get_openreq4(v, seq, st->num);
2702         else
2703                 get_tcp4_sock(v, seq, st->num);
2704 out:
2705         seq_pad(seq, '\n');
2706         return 0;
2707 }
2708
2709 #ifdef CONFIG_BPF_SYSCALL
2710 struct bpf_tcp_iter_state {
2711         struct tcp_iter_state state;
2712         unsigned int cur_sk;
2713         unsigned int end_sk;
2714         unsigned int max_sk;
2715         struct sock **batch;
2716         bool st_bucket_done;
2717 };
2718
2719 struct bpf_iter__tcp {
2720         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2721         __bpf_md_ptr(struct sock_common *, sk_common);
2722         uid_t uid __aligned(8);
2723 };
2724
2725 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2726                              struct sock_common *sk_common, uid_t uid)
2727 {
2728         struct bpf_iter__tcp ctx;
2729
2730         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2731         ctx.meta = meta;
2732         ctx.sk_common = sk_common;
2733         ctx.uid = uid;
2734         return bpf_iter_run_prog(prog, &ctx);
2735 }
2736
2737 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2738 {
2739         while (iter->cur_sk < iter->end_sk)
2740                 sock_put(iter->batch[iter->cur_sk++]);
2741 }
2742
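/* Resize the batch array to @new_batch_sz entries, dropping any
 * references still held on the current batch first.
 */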
2743 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2744                                       unsigned int new_batch_sz)
2745 {
2746         struct sock **new_batch;
2747
2748         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2749                              GFP_USER | __GFP_NOWARN);
2750         if (!new_batch)
2751                 return -ENOMEM;
2752
2753         bpf_iter_tcp_put_batch(iter);
2754         kvfree(iter->batch);
2755         iter->batch = new_batch;
2756         iter->max_sk = new_batch_sz;
2757
2758         return 0;
2759 }
2760
2761 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2762                                                  struct sock *start_sk)
2763 {
2764         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2765         struct bpf_tcp_iter_state *iter = seq->private;
2766         struct tcp_iter_state *st = &iter->state;
2767         struct hlist_nulls_node *node;
2768         unsigned int expected = 1;
2769         struct sock *sk;
2770
2771         sock_hold(start_sk);
2772         iter->batch[iter->end_sk++] = start_sk;
2773
2774         sk = sk_nulls_next(start_sk);
2775         sk_nulls_for_each_from(sk, node) {
2776                 if (seq_sk_match(seq, sk)) {
2777                         if (iter->end_sk < iter->max_sk) {
2778                                 sock_hold(sk);
2779                                 iter->batch[iter->end_sk++] = sk;
2780                         }
2781                         expected++;
2782                 }
2783         }
2784         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2785
2786         return expected;
2787 }
2788
2789 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2790                                                    struct sock *start_sk)
2791 {
2792         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2793         struct bpf_tcp_iter_state *iter = seq->private;
2794         struct tcp_iter_state *st = &iter->state;
2795         struct hlist_nulls_node *node;
2796         unsigned int expected = 1;
2797         struct sock *sk;
2798
2799         sock_hold(start_sk);
2800         iter->batch[iter->end_sk++] = start_sk;
2801
2802         sk = sk_nulls_next(start_sk);
2803         sk_nulls_for_each_from(sk, node) {
2804                 if (seq_sk_match(seq, sk)) {
2805                         if (iter->end_sk < iter->max_sk) {
2806                                 sock_hold(sk);
2807                                 iter->batch[iter->end_sk++] = sk;
2808                         }
2809                         expected++;
2810                 }
2811         }
2812         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2813
2814         return expected;
2815 }
2816
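/* Fill a new batch: take a reference on every matching sk in the current
 * bucket so its lock can be dropped before the BPF program runs.  If the
 * batch array turns out to be too small, resize it once and retry the
 * same bucket.
 */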
2817 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2818 {
2819         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2820         struct bpf_tcp_iter_state *iter = seq->private;
2821         struct tcp_iter_state *st = &iter->state;
2822         unsigned int expected;
2823         bool resized = false;
2824         struct sock *sk;
2825
2826         /* The st->bucket is done.  Directly advance to the next
2827          * bucket instead of having tcp_seek_last_pos() skip entries
2828          * one by one in the current bucket, only to find out
2829          * it has to advance to the next bucket.
2830          */
2831         if (iter->st_bucket_done) {
2832                 st->offset = 0;
2833                 st->bucket++;
2834                 if (st->state == TCP_SEQ_STATE_LISTENING &&
2835                     st->bucket > hinfo->lhash2_mask) {
2836                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2837                         st->bucket = 0;
2838                 }
2839         }
2840
2841 again:
2842         /* Get a new batch */
2843         iter->cur_sk = 0;
2844         iter->end_sk = 0;
2845         iter->st_bucket_done = false;
2846
2847         sk = tcp_seek_last_pos(seq);
2848         if (!sk)
2849                 return NULL; /* Done */
2850
2851         if (st->state == TCP_SEQ_STATE_LISTENING)
2852                 expected = bpf_iter_tcp_listening_batch(seq, sk);
2853         else
2854                 expected = bpf_iter_tcp_established_batch(seq, sk);
2855
2856         if (iter->end_sk == expected) {
2857                 iter->st_bucket_done = true;
2858                 return sk;
2859         }
2860
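             /* The batch was too small for this bucket: grow it with
              * ~50% headroom and rescan the bucket once.  If it still
              * does not fit, return the partial batch; st_bucket_done
              * stays false, so the next call resumes in this bucket.
              */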
2861         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2862                 resized = true;
2863                 goto again;
2864         }
2865
2866         return sk;
2867 }
2868
2869 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2870 {
2871         /* bpf iter does not support lseek, so it always
2872          * continues from where it was stop()-ped.
2873          */
2874         if (*pos)
2875                 return bpf_iter_tcp_batch(seq);
2876
2877         return SEQ_START_TOKEN;
2878 }
2879
2880 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2881 {
2882         struct bpf_tcp_iter_state *iter = seq->private;
2883         struct tcp_iter_state *st = &iter->state;
2884         struct sock *sk;
2885
2886         /* Whenever seq_next() is called, the sk at iter->cur_sk has
2887          * already been shown by seq_show(), so release it and advance
2888          * to the next sk in the batch.
2889          */
2890         if (iter->cur_sk < iter->end_sk) {
2891                 /* Keep st->num consistent in tcp_iter_state.
2892                  * bpf_iter_tcp itself does not use st->num;
2893                  * meta.seq_num is used instead.
2894                  */
2895                 st->num++;
2896                 /* Move st->offset to the next sk in the bucket such that
2897                  * the future start() will resume at st->offset in
2898                  * st->bucket.  See tcp_seek_last_pos().
2899                  */
2900                 st->offset++;
2901                 sock_put(iter->batch[iter->cur_sk++]);
2902         }
2903
2904         if (iter->cur_sk < iter->end_sk)
2905                 sk = iter->batch[iter->cur_sk];
2906         else
2907                 sk = bpf_iter_tcp_batch(seq);
2908
2909         ++*pos;
2910         /* Keeping st->last_pos consistent in tcp_iter_state.
2911          * bpf iter does not do lseek, so st->last_pos always equals to *pos.
2912          * bpf iter does not do lseek, so st->last_pos always equals *pos.
2913         st->last_pos = *pos;
2914         return sk;
2915 }
2916
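     /* Run the attached bpf prog on one socket of the batch.  Full
      * sockets are protected with lock_sock_fast() while the prog runs;
      * TIME_WAIT and request sockets are not full sockets and are shown
      * without locking, with the uid taken from the listener for
      * SYN_RECV requests and 0 for TIME_WAIT.
      */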
2917 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2918 {
2919         struct bpf_iter_meta meta;
2920         struct bpf_prog *prog;
2921         struct sock *sk = v;
2922         bool slow;
2923         uid_t uid;
2924         int ret;
2925
2926         if (v == SEQ_START_TOKEN)
2927                 return 0;
2928
2929         if (sk_fullsock(sk))
2930                 slow = lock_sock_fast(sk);
2931
2932         if (unlikely(sk_unhashed(sk))) {
2933                 ret = SEQ_SKIP;
2934                 goto unlock;
2935         }
2936
2937         if (sk->sk_state == TCP_TIME_WAIT) {
2938                 uid = 0;
2939         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2940                 const struct request_sock *req = v;
2941
2942                 uid = from_kuid_munged(seq_user_ns(seq),
2943                                        sock_i_uid(req->rsk_listener));
2944         } else {
2945                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2946         }
2947
2948         meta.seq = seq;
2949         prog = bpf_iter_get_info(&meta, false);
2950         ret = tcp_prog_seq_show(prog, &meta, v, uid);
2951
2952 unlock:
2953         if (sk_fullsock(sk))
2954                 unlock_sock_fast(sk, slow);
2955         return ret;
2957 }
2958
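     /* stop() runs at the end of every read().  When the whole
      * iteration is done (v == NULL), the prog gets one final
      * invocation with a NULL socket.  Any references still held on
      * not-yet-shown sockets in the batch are dropped so that start()
      * re-batches the bucket on the next read().
      */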
2959 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2960 {
2961         struct bpf_tcp_iter_state *iter = seq->private;
2962         struct bpf_iter_meta meta;
2963         struct bpf_prog *prog;
2964
2965         if (!v) {
2966                 meta.seq = seq;
2967                 prog = bpf_iter_get_info(&meta, true);
2968                 if (prog)
2969                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
2970         }
2971
2972         if (iter->cur_sk < iter->end_sk) {
2973                 bpf_iter_tcp_put_batch(iter);
2974                 iter->st_bucket_done = false;
2975         }
2976 }
2977
2978 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2979         .show           = bpf_iter_tcp_seq_show,
2980         .start          = bpf_iter_tcp_seq_start,
2981         .next           = bpf_iter_tcp_seq_next,
2982         .stop           = bpf_iter_tcp_seq_stop,
2983 };
2984 #endif
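     /* Address family this seq_file is filtering on: AF_UNSPEC for the
      * bpf iterator (the bpf prog does its own filtering), otherwise
      * the family recorded in the procfs afinfo (AF_INET for
      * /proc/net/tcp).
      */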
2985 static unsigned short seq_file_family(const struct seq_file *seq)
2986 {
2987         const struct tcp_seq_afinfo *afinfo;
2988
2989 #ifdef CONFIG_BPF_SYSCALL
2990         /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
2991         if (seq->op == &bpf_iter_tcp_seq_ops)
2992                 return AF_UNSPEC;
2993 #endif
2994
2995         /* Iterated from proc fs */
2996         afinfo = pde_data(file_inode(seq->file));
2997         return afinfo->family;
2998 }
2999
3000 static const struct seq_operations tcp4_seq_ops = {
3001         .show           = tcp4_seq_show,
3002         .start          = tcp_seq_start,
3003         .next           = tcp_seq_next,
3004         .stop           = tcp_seq_stop,
3005 };
3006
3007 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3008         .family         = AF_INET,
3009 };
3010
3011 static int __net_init tcp4_proc_init_net(struct net *net)
3012 {
3013         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3014                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3015                 return -ENOMEM;
3016         return 0;
3017 }
3018
3019 static void __net_exit tcp4_proc_exit_net(struct net *net)
3020 {
3021         remove_proc_entry("tcp", net->proc_net);
3022 }
3023
3024 static struct pernet_operations tcp4_net_ops = {
3025         .init = tcp4_proc_init_net,
3026         .exit = tcp4_proc_exit_net,
3027 };
3028
3029 int __init tcp4_proc_init(void)
3030 {
3031         return register_pernet_subsys(&tcp4_net_ops);
3032 }
3033
3034 void tcp4_proc_exit(void)
3035 {
3036         unregister_pernet_subsys(&tcp4_net_ops);
3037 }
3038 #endif /* CONFIG_PROC_FS */
3039
3040 /* @wake is one when sk_stream_write_space() calls us.
3041  * This sends EPOLLOUT only if notsent_bytes is less than half the limit.
3042  * This mimics the strategy used in sock_def_write_space().
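      * For example, with a notsent_lowat of 128KB and @wake == 1,
      * EPOLLOUT is only reported once less than 64KB is left unsent.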
3043  */
3044 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3045 {
3046         const struct tcp_sock *tp = tcp_sk(sk);
3047         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3048                             READ_ONCE(tp->snd_nxt);
3049
3050         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3051 }
3052 EXPORT_SYMBOL(tcp_stream_memory_free);
3053
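     /* IPv4 TCP protocol operations.  Note that .h.hashinfo is left
      * NULL: TCP lookups use the per-netns hash table installed in
      * net->ipv4.tcp_death_row.hashinfo by tcp_set_hashinfo() below.
      */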
3054 struct proto tcp_prot = {
3055         .name                   = "TCP",
3056         .owner                  = THIS_MODULE,
3057         .close                  = tcp_close,
3058         .pre_connect            = tcp_v4_pre_connect,
3059         .connect                = tcp_v4_connect,
3060         .disconnect             = tcp_disconnect,
3061         .accept                 = inet_csk_accept,
3062         .ioctl                  = tcp_ioctl,
3063         .init                   = tcp_v4_init_sock,
3064         .destroy                = tcp_v4_destroy_sock,
3065         .shutdown               = tcp_shutdown,
3066         .setsockopt             = tcp_setsockopt,
3067         .getsockopt             = tcp_getsockopt,
3068         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3069         .keepalive              = tcp_set_keepalive,
3070         .recvmsg                = tcp_recvmsg,
3071         .sendmsg                = tcp_sendmsg,
3072         .sendpage               = tcp_sendpage,
3073         .backlog_rcv            = tcp_v4_do_rcv,
3074         .release_cb             = tcp_release_cb,
3075         .hash                   = inet_hash,
3076         .unhash                 = inet_unhash,
3077         .get_port               = inet_csk_get_port,
3078         .put_port               = inet_put_port,
3079 #ifdef CONFIG_BPF_SYSCALL
3080         .psock_update_sk_prot   = tcp_bpf_update_proto,
3081 #endif
3082         .enter_memory_pressure  = tcp_enter_memory_pressure,
3083         .leave_memory_pressure  = tcp_leave_memory_pressure,
3084         .stream_memory_free     = tcp_stream_memory_free,
3085         .sockets_allocated      = &tcp_sockets_allocated,
3086         .orphan_count           = &tcp_orphan_count,
3087
3088         .memory_allocated       = &tcp_memory_allocated,
3089         .per_cpu_fw_alloc       = &tcp_memory_per_cpu_fw_alloc,
3090
3091         .memory_pressure        = &tcp_memory_pressure,
3092         .sysctl_mem             = sysctl_tcp_mem,
3093         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3094         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3095         .max_header             = MAX_TCP_HEADER,
3096         .obj_size               = sizeof(struct tcp_sock),
3097         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3098         .twsk_prot              = &tcp_timewait_sock_ops,
3099         .rsk_prot               = &tcp_request_sock_ops,
3100         .h.hashinfo             = NULL,
3101         .no_autobind            = true,
3102         .diag_destroy           = tcp_abort,
3103 };
3104 EXPORT_SYMBOL(tcp_prot);
3105
3106 static void __net_exit tcp_sk_exit(struct net *net)
3107 {
3108         if (net->ipv4.tcp_congestion_control)
3109                 bpf_module_put(net->ipv4.tcp_congestion_control,
3110                                net->ipv4.tcp_congestion_control->owner);
3111 }
3112
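     /* Pick the ehash table for a new netns: either a private one,
      * sized by the creating netns' sysctl_tcp_child_ehash_entries
      * (rounded up to a power of two), or the global tcp_hashinfo.
      * max_tw_buckets and max_syn_backlog are then scaled to the
      * chosen ehash size.  E.g. (assuming the knob is exposed as
      * net.ipv4.tcp_child_ehash_entries):
      *
      *   sysctl -w net.ipv4.tcp_child_ehash_entries=16384
      *   unshare -n ...   # the new netns gets its own 16384-entry ehash
      */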
3113 static void __net_init tcp_set_hashinfo(struct net *net)
3114 {
3115         struct inet_hashinfo *hinfo;
3116         unsigned int ehash_entries;
3117         struct net *old_net;
3118
3119         if (net_eq(net, &init_net))
3120                 goto fallback;
3121
3122         old_net = current->nsproxy->net_ns;
3123         ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3124         if (!ehash_entries)
3125                 goto fallback;
3126
3127         ehash_entries = roundup_pow_of_two(ehash_entries);
3128         hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3129         if (!hinfo) {
3130                 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3131                         "for a netns, falling back to the global one\n",
3132                         ehash_entries);
3133 fallback:
3134                 hinfo = &tcp_hashinfo;
3135                 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3136         }
3137
3138         net->ipv4.tcp_death_row.hashinfo = hinfo;
3139         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3140         net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3141 }
3142
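     /* Per-netns TCP init: default values for all TCP sysctls, the
      * per-netns ehash/timewait setup (tcp_set_hashinfo()), and the
      * congestion control, inherited from init_net when possible and
      * falling back to the built-in reno otherwise.
      */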
3143 static int __net_init tcp_sk_init(struct net *net)
3144 {
3145         net->ipv4.sysctl_tcp_ecn = 2;
3146         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3147
3148         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3149         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3150         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3151         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3152         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3153
3154         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3155         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3156         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3157
3158         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3159         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3160         net->ipv4.sysctl_tcp_syncookies = 1;
3161         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3162         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3163         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3164         net->ipv4.sysctl_tcp_orphan_retries = 0;
3165         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3166         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3167         net->ipv4.sysctl_tcp_tw_reuse = 2;
3168         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3169
3170         refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3171         tcp_set_hashinfo(net);
3172
3173         net->ipv4.sysctl_tcp_sack = 1;
3174         net->ipv4.sysctl_tcp_window_scaling = 1;
3175         net->ipv4.sysctl_tcp_timestamps = 1;
3176         net->ipv4.sysctl_tcp_early_retrans = 3;
3177         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3178         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3179         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3180         net->ipv4.sysctl_tcp_max_reordering = 300;
3181         net->ipv4.sysctl_tcp_dsack = 1;
3182         net->ipv4.sysctl_tcp_app_win = 31;
3183         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3184         net->ipv4.sysctl_tcp_frto = 2;
3185         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3186         /* This limits the percentage of the congestion window which we
3187          * will allow a single TSO frame to consume.  Building TSO frames
3188          * which are too large can cause TCP streams to be bursty.
3189          */
3190         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3191         /* Default TSQ limit of 16 TSO segments */
3192         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3193
3194         /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3195         net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3196
3197         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3198         net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3199         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3200         net->ipv4.sysctl_tcp_autocorking = 1;
3201         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3202         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3203         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3204         if (net != &init_net) {
3205                 memcpy(net->ipv4.sysctl_tcp_rmem,
3206                        init_net.ipv4.sysctl_tcp_rmem,
3207                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3208                 memcpy(net->ipv4.sysctl_tcp_wmem,
3209                        init_net.ipv4.sysctl_tcp_wmem,
3210                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3211         }
3212         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3213         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3214         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3215         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3216         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3217         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3218
3219         /* Reno is always built in */
3220         if (!net_eq(net, &init_net) &&
3221             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3222                                init_net.ipv4.tcp_congestion_control->owner))
3223                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3224         else
3225                 net->ipv4.tcp_congestion_control = &tcp_reno;
3226
3227         return 0;
3228 }
3229
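     /* Batched netns teardown: purge the remaining TIME_WAIT sockets
      * first, then free any per-netns ehash table and the TCP fastopen
      * context of each netns on the list.
      */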
3230 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3231 {
3232         struct net *net;
3233
3234         tcp_twsk_purge(net_exit_list, AF_INET);
3235
3236         list_for_each_entry(net, net_exit_list, exit_list) {
3237                 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3238                 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3239                 tcp_fastopen_ctx_destroy(net);
3240         }
3241 }
3242
3243 static struct pernet_operations __net_initdata tcp_sk_ops = {
3244        .init       = tcp_sk_init,
3245        .exit       = tcp_sk_exit,
3246        .exit_batch = tcp_sk_exit_batch,
3247 };
3248
3249 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3250 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3251                      struct sock_common *sk_common, uid_t uid)
3252
3253 #define INIT_BATCH_SZ 16
3254
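     /* Set up the per-seq_file state of a bpf tcp iterator: the common
      * netns-aware seq setup plus an initial batch array of
      * INIT_BATCH_SZ sockets, which bpf_iter_tcp_batch() grows on
      * demand.
      */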
3255 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3256 {
3257         struct bpf_tcp_iter_state *iter = priv_data;
3258         int err;
3259
3260         err = bpf_iter_init_seq_net(priv_data, aux);
3261         if (err)
3262                 return err;
3263
3264         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3265         if (err) {
3266                 bpf_iter_fini_seq_net(priv_data);
3267                 return err;
3268         }
3269
3270         return 0;
3271 }
3272
3273 static void bpf_iter_fini_tcp(void *priv_data)
3274 {
3275         struct bpf_tcp_iter_state *iter = priv_data;
3276
3277         bpf_iter_fini_seq_net(priv_data);
3278         kvfree(iter->batch);
3279 }
3280
3281 static const struct bpf_iter_seq_info tcp_seq_info = {
3282         .seq_ops                = &bpf_iter_tcp_seq_ops,
3283         .init_seq_private       = bpf_iter_init_tcp,
3284         .fini_seq_private       = bpf_iter_fini_tcp,
3285         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3286 };
3287
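     /* Extra helpers available to bpf tcp iterator programs: they may
      * call bpf_setsockopt()/bpf_getsockopt() on the sockets being
      * iterated.
      */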
3288 static const struct bpf_func_proto *
3289 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3290                             const struct bpf_prog *prog)
3291 {
3292         switch (func_id) {
3293         case BPF_FUNC_setsockopt:
3294                 return &bpf_sk_setsockopt_proto;
3295         case BPF_FUNC_getsockopt:
3296                 return &bpf_sk_getsockopt_proto;
3297         default:
3298                 return NULL;
3299         }
3300 }
3301
3302 static struct bpf_iter_reg tcp_reg_info = {
3303         .target                 = "tcp",
3304         .ctx_arg_info_size      = 1,
3305         .ctx_arg_info           = {
3306                 { offsetof(struct bpf_iter__tcp, sk_common),
3307                   PTR_TO_BTF_ID_OR_NULL },
3308         },
3309         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3310         .seq_info               = &tcp_seq_info,
3311 };
3312
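     /* Register the "tcp" bpf_iter target.  The BTF id of the
      * sock_common ctx argument is only known at boot, so it is filled
      * in here rather than in the static initializer above.
      */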
3313 static void __init bpf_iter_register(void)
3314 {
3315         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3316         if (bpf_iter_reg_target(&tcp_reg_info))
3317                 pr_warn("Warning: could not register bpf iterator tcp\n");
3318 }
3319
3320 #endif
3321
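     /* Boot-time IPv4 TCP setup: create one control socket per possible
      * CPU (used by tcp_v4_send_reset()/tcp_v4_send_ack() to reply on
      * behalf of SYN_RECV and TIME_WAIT sockets) and register the
      * pernet operations; with BPF and procfs enabled, also register
      * the bpf "tcp" iterator.
      */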
3322 void __init tcp_v4_init(void)
3323 {
3324         int cpu, res;
3325
3326         for_each_possible_cpu(cpu) {
3327                 struct sock *sk;
3328
3329                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3330                                            IPPROTO_TCP, &init_net);
3331                 if (res)
3332                         panic("Failed to create the TCP control socket.\n");
3333                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3334
3335                 /* Enforce IP_DF and IPID==0 for the RSTs and ACKs
3336                  * sent on behalf of SYN_RECV and TIME_WAIT sockets.
3337                  */
3338                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3339
3340                 per_cpu(ipv4_tcp_sk, cpu) = sk;
3341         }
3342         if (register_pernet_subsys(&tcp_sk_ops))
3343                 panic("Failed to create the TCP control socket.\n");
3344
3345 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3346         bpf_iter_register();
3347 #endif
3348 }