net/ipv4/tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/inetdevice.h>
80 #include <linux/btf_ids.h>
81
82 #include <crypto/hash.h>
83 #include <linux/scatterlist.h>
84
85 #include <trace/events/tcp.h>
86
87 #ifdef CONFIG_TCP_MD5SIG
88 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
89                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
90 #endif
91
92 struct inet_hashinfo tcp_hashinfo;
93 EXPORT_SYMBOL(tcp_hashinfo);
94
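/* Per-CPU kernel control socket used by tcp_v4_send_reset() and
 * tcp_v4_send_ack() to transmit replies that are not tied to a full socket.
 */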
95 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96
97 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 {
99         return secure_tcp_seq(ip_hdr(skb)->daddr,
100                               ip_hdr(skb)->saddr,
101                               tcp_hdr(skb)->dest,
102                               tcp_hdr(skb)->source);
103 }
104
105 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 {
107         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
108 }
109
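/* Called from the connect() path when a TIME-WAIT socket already owns the
 * chosen 4-tuple: decide, based on net.ipv4.tcp_tw_reuse, whether that
 * TIME-WAIT socket may be taken over.  Returns 1 to allow reuse, 0 otherwise.
 */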
110 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 {
112         int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
113         const struct inet_timewait_sock *tw = inet_twsk(sktw);
114         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115         struct tcp_sock *tp = tcp_sk(sk);
116
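        /* tcp_tw_reuse == 2: reuse is allowed only for connections that use
         * loopback addresses or are bound directly to the loopback interface.
         */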
117         if (reuse == 2) {
118                 /* Still does not detect *everything* that goes through
119                  * lo, since we require a loopback src or dst address
120                  * or direct binding to 'lo' interface.
121                  */
122                 bool loopback = false;
123                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124                         loopback = true;
125 #if IS_ENABLED(CONFIG_IPV6)
126                 if (tw->tw_family == AF_INET6) {
127                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
128                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
129                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
130                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
131                                 loopback = true;
132                 } else
133 #endif
134                 {
135                         if (ipv4_is_loopback(tw->tw_daddr) ||
136                             ipv4_is_loopback(tw->tw_rcv_saddr))
137                                 loopback = true;
138                 }
139                 if (!loopback)
140                         reuse = 0;
141         }
142
143         /* With PAWS, it is safe from the viewpoint
144            of data integrity. Even without PAWS it is safe provided sequence
145            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
146
147            Actually, the idea is close to VJ's: the difference is that the
148            timestamp cache is held not per host but per port pair, and the TW
149            bucket is used as the state holder.
150
151            If the TW bucket has already been destroyed we fall back to VJ's
152            scheme and use the initial timestamp retrieved from the peer table.
153          */
154         if (tcptw->tw_ts_recent_stamp &&
155             (!twp || (reuse && time_after32(ktime_get_seconds(),
156                                             tcptw->tw_ts_recent_stamp)))) {
157                 /* In case of repair and re-using TIME-WAIT sockets we still
158                  * want to be sure that it is safe as above but honor the
159                  * sequence numbers and time stamps set as part of the repair
160                  * process.
161                  *
162                  * Without this check re-using a TIME-WAIT socket with TCP
163                  * repair would accumulate a -1 on the repair assigned
164                  * sequence number. The first time it is reused the sequence
165                  * is -1, the second time -2, etc. This fixes that issue
166                  * without appearing to create any others.
167                  */
168                 if (likely(!tp->repair)) {
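                        /* Start the new incarnation one full 64K window past
                         * tw_snd_nxt, so it begins beyond anything the old
                         * connection could still have in flight.  Avoid 0,
                         * since a zero write_seq is treated as "unset" by
                         * tcp_v4_connect() below.
                         */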
169                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
170
171                         if (!seq)
172                                 seq = 1;
173                         WRITE_ONCE(tp->write_seq, seq);
174                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
175                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
176                 }
177                 sock_hold(sktw);
178                 return 1;
179         }
180
181         return 0;
182 }
183 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
184
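/* ->pre_connect() hook, run by __inet_stream_connect() before tcp_v4_connect()
 * so that cgroup BPF connect programs can inspect (and possibly rewrite) the
 * destination address.
 */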
185 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
186                               int addr_len)
187 {
188         /* This check is replicated from tcp_v4_connect() and intended to
189          * prevent the BPF program called below from accessing bytes that are
190          * outside the bound specified by the user in addr_len.
191          */
192         if (addr_len < sizeof(struct sockaddr_in))
193                 return -EINVAL;
194
195         sock_owned_by_me(sk);
196
197         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
198 }
199
200 /* This will initiate an outgoing connection. */
201 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202 {
203         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204         struct inet_timewait_death_row *tcp_death_row;
205         struct inet_sock *inet = inet_sk(sk);
206         struct tcp_sock *tp = tcp_sk(sk);
207         struct ip_options_rcu *inet_opt;
208         struct net *net = sock_net(sk);
209         __be16 orig_sport, orig_dport;
210         __be32 daddr, nexthop;
211         struct flowi4 *fl4;
212         struct rtable *rt;
213         int err;
214
215         if (addr_len < sizeof(struct sockaddr_in))
216                 return -EINVAL;
217
218         if (usin->sin_family != AF_INET)
219                 return -EAFNOSUPPORT;
220
221         nexthop = daddr = usin->sin_addr.s_addr;
222         inet_opt = rcu_dereference_protected(inet->inet_opt,
223                                              lockdep_sock_is_held(sk));
224         if (inet_opt && inet_opt->opt.srr) {
225                 if (!daddr)
226                         return -EINVAL;
227                 nexthop = inet_opt->opt.faddr;
228         }
229
230         orig_sport = inet->inet_sport;
231         orig_dport = usin->sin_port;
232         fl4 = &inet->cork.fl.u.ip4;
233         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234                               sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
235                               orig_dport, sk);
236         if (IS_ERR(rt)) {
237                 err = PTR_ERR(rt);
238                 if (err == -ENETUNREACH)
239                         IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
240                 return err;
241         }
242
243         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
244                 ip_rt_put(rt);
245                 return -ENETUNREACH;
246         }
247
248         if (!inet_opt || !inet_opt->opt.srr)
249                 daddr = fl4->daddr;
250
251         tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
252
253         if (!inet->inet_saddr) {
254                 err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
255                 if (err) {
256                         ip_rt_put(rt);
257                         return err;
258                 }
259         } else {
260                 sk_rcv_saddr_set(sk, inet->inet_saddr);
261         }
262
263         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
264                 /* Reset inherited state */
265                 tp->rx_opt.ts_recent       = 0;
266                 tp->rx_opt.ts_recent_stamp = 0;
267                 if (likely(!tp->repair))
268                         WRITE_ONCE(tp->write_seq, 0);
269         }
270
271         inet->inet_dport = usin->sin_port;
272         sk_daddr_set(sk, daddr);
273
274         inet_csk(sk)->icsk_ext_hdr_len = 0;
275         if (inet_opt)
276                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
277
278         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
279
280         /* Socket identity is still unknown (sport may be zero).
281          * However we set the state to SYN-SENT and, without releasing the
282          * socket lock, select a source port, enter ourselves into the hash
283          * tables and complete initialization after this.
284          */
285         tcp_set_state(sk, TCP_SYN_SENT);
286         err = inet_hash_connect(tcp_death_row, sk);
287         if (err)
288                 goto failure;
289
290         sk_set_txhash(sk);
291
292         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
293                                inet->inet_sport, inet->inet_dport, sk);
294         if (IS_ERR(rt)) {
295                 err = PTR_ERR(rt);
296                 rt = NULL;
297                 goto failure;
298         }
299         /* OK, now commit destination to socket.  */
300         sk->sk_gso_type = SKB_GSO_TCPV4;
301         sk_setup_caps(sk, &rt->dst);
302         rt = NULL;
303
304         if (likely(!tp->repair)) {
305                 if (!tp->write_seq)
306                         WRITE_ONCE(tp->write_seq,
307                                    secure_tcp_seq(inet->inet_saddr,
308                                                   inet->inet_daddr,
309                                                   inet->inet_sport,
310                                                   usin->sin_port));
311                 WRITE_ONCE(tp->tsoffset,
312                            secure_tcp_ts_off(net, inet->inet_saddr,
313                                              inet->inet_daddr));
314         }
315
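        /* Seed the per-socket IP ID counter at a random point; a predictable
         * starting value could be used to infer how many packets the socket
         * has sent.
         */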
316         atomic_set(&inet->inet_id, get_random_u16());
317
318         if (tcp_fastopen_defer_connect(sk, &err))
319                 return err;
320         if (err)
321                 goto failure;
322
323         err = tcp_connect(sk);
324
325         if (err)
326                 goto failure;
327
328         return 0;
329
330 failure:
331         /*
332          * This unhashes the socket and releases the local port,
333          * if necessary.
334          */
335         tcp_set_state(sk, TCP_CLOSE);
336         inet_bhash2_reset_saddr(sk);
337         ip_rt_put(rt);
338         sk->sk_route_caps = 0;
339         inet->inet_dport = 0;
340         return err;
341 }
342 EXPORT_SYMBOL(tcp_v4_connect);
343
344 /*
345  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
346  * It can be called through tcp_release_cb() if the socket was owned by the
347  * user at the time tcp_v4_err() was called to handle the ICMP message.
348  */
349 void tcp_v4_mtu_reduced(struct sock *sk)
350 {
351         struct inet_sock *inet = inet_sk(sk);
352         struct dst_entry *dst;
353         u32 mtu;
354
355         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
356                 return;
357         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
358         dst = inet_csk_update_pmtu(sk, mtu);
359         if (!dst)
360                 return;
361
362         /* Something is about to go wrong... Remember the soft error
363          * in case this connection is not able to recover.
364          */
365         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
366                 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
367
368         mtu = dst_mtu(dst);
369
370         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
371             ip_sk_accept_pmtu(sk) &&
372             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
373                 tcp_sync_mss(sk, mtu);
374
375                 /* Resend the TCP packet because it's
376                  * clear that the old packet has been
377                  * dropped. This is the new "fast" path mtu
378                  * discovery.
379                  */
380                 tcp_simple_retransmit(sk);
381         } /* else let the usual retransmit timer handle it */
382 }
383 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
384
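/* Apply an ICMP redirect to the route cached on the socket, if one is still
 * attached.
 */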
385 static void do_redirect(struct sk_buff *skb, struct sock *sk)
386 {
387         struct dst_entry *dst = __sk_dst_check(sk, 0);
388
389         if (dst)
390                 dst->ops->redirect(dst, sk, skb);
391 }
392
393
394 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
395 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
396 {
397         struct request_sock *req = inet_reqsk(sk);
398         struct net *net = sock_net(sk);
399
400         /* ICMPs are not backlogged, hence we cannot get
401          * an established socket here.
402          */
403         if (seq != tcp_rsk(req)->snt_isn) {
404                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
405         } else if (abort) {
406                 /*
407                  * Still in SYN_RECV, just remove it silently.
408                  * There is no good way to pass the error to the newly
409                  * created socket, and POSIX does not want network
410                  * errors returned from accept().
411                  */
412                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
413                 tcp_listendrop(req->rsk_listener);
414         }
415         reqsk_put(req);
416 }
417 EXPORT_SYMBOL(tcp_req_err);
418
419 /* TCP-LD (RFC 6069): an ICMP destination-unreachable that matches the segment
 * currently being retransmitted suggests the RTO was caused by a transient
 * routing failure rather than by congestion, so revert one backoff step.
 */
420 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
421 {
422         struct inet_connection_sock *icsk = inet_csk(sk);
423         struct tcp_sock *tp = tcp_sk(sk);
424         struct sk_buff *skb;
425         s32 remaining;
426         u32 delta_us;
427
428         if (sock_owned_by_user(sk))
429                 return;
430
431         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
432             !icsk->icsk_backoff)
433                 return;
434
435         skb = tcp_rtx_queue_head(sk);
436         if (WARN_ON_ONCE(!skb))
437                 return;
438
439         icsk->icsk_backoff--;
440         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
441         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
442
443         tcp_mstamp_refresh(tp);
444         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
445         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
446
447         if (remaining > 0) {
448                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
449                                           remaining, TCP_RTO_MAX);
450         } else {
451                 /* RTO revert clocked out retransmission.
452                  * Will retransmit now.
453                  */
454                 tcp_retransmit_timer(sk);
455         }
456 }
457 EXPORT_SYMBOL(tcp_ld_RTO_revert);
458
459 /*
460  * This routine is called by the ICMP module when it gets some
461  * sort of error condition.  If err < 0 then the socket should
462  * be closed and the error returned to the user.  If err > 0
463  * it's just the icmp type << 8 | icmp code.  After adjustment
464  * header points to the first 8 bytes of the tcp header.  We need
465  * to find the appropriate port.
466  *
467  * The locking strategy used here is very "optimistic". When
468  * someone else accesses the socket the ICMP is just dropped
469  * and for some paths there is no check at all.
470  * A more general error queue to queue errors for later handling
471  * is probably better.
472  *
473  */
474
475 int tcp_v4_err(struct sk_buff *skb, u32 info)
476 {
477         const struct iphdr *iph = (const struct iphdr *)skb->data;
478         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
479         struct tcp_sock *tp;
480         const int type = icmp_hdr(skb)->type;
481         const int code = icmp_hdr(skb)->code;
482         struct sock *sk;
483         struct request_sock *fastopen;
484         u32 seq, snd_una;
485         int err;
486         struct net *net = dev_net(skb->dev);
487
488         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
489                                        iph->daddr, th->dest, iph->saddr,
490                                        ntohs(th->source), inet_iif(skb), 0);
491         if (!sk) {
492                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
493                 return -ENOENT;
494         }
495         if (sk->sk_state == TCP_TIME_WAIT) {
496                 inet_twsk_put(inet_twsk(sk));
497                 return 0;
498         }
499         seq = ntohl(th->seq);
500         if (sk->sk_state == TCP_NEW_SYN_RECV) {
501                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
502                                      type == ICMP_TIME_EXCEEDED ||
503                                      (type == ICMP_DEST_UNREACH &&
504                                       (code == ICMP_NET_UNREACH ||
505                                        code == ICMP_HOST_UNREACH)));
506                 return 0;
507         }
508
509         bh_lock_sock(sk);
510         /* If too many ICMPs get dropped on busy
511          * servers this needs to be solved differently.
512          * We do take care of the PMTU discovery (RFC 1191) special case:
513          * we can receive locally generated ICMP messages while the socket is held.
514          */
515         if (sock_owned_by_user(sk)) {
516                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
517                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
518         }
519         if (sk->sk_state == TCP_CLOSE)
520                 goto out;
521
522         if (static_branch_unlikely(&ip4_min_ttl)) {
523                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
524                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
525                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
526                         goto out;
527                 }
528         }
529
530         tp = tcp_sk(sk);
531         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
532         fastopen = rcu_dereference(tp->fastopen_rsk);
533         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
534         if (sk->sk_state != TCP_LISTEN &&
535             !between(seq, snd_una, tp->snd_nxt)) {
536                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
537                 goto out;
538         }
539
540         switch (type) {
541         case ICMP_REDIRECT:
542                 if (!sock_owned_by_user(sk))
543                         do_redirect(skb, sk);
544                 goto out;
545         case ICMP_SOURCE_QUENCH:
546                 /* Just silently ignore these. */
547                 goto out;
548         case ICMP_PARAMETERPROB:
549                 err = EPROTO;
550                 break;
551         case ICMP_DEST_UNREACH:
552                 if (code > NR_ICMP_UNREACH)
553                         goto out;
554
555                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
556                         /* We are not interested in TCP_LISTEN and open_requests
557                          * (SYN-ACKs sent out by Linux are always < 576 bytes, so
558                          * they should go through unfragmented).
559                          */
560                         if (sk->sk_state == TCP_LISTEN)
561                                 goto out;
562
563                         WRITE_ONCE(tp->mtu_info, info);
564                         if (!sock_owned_by_user(sk)) {
565                                 tcp_v4_mtu_reduced(sk);
566                         } else {
567                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
568                                         sock_hold(sk);
569                         }
570                         goto out;
571                 }
572
573                 err = icmp_err_convert[code].errno;
574                 /* check if this ICMP message allows revert of backoff.
575                  * (see RFC 6069)
576                  */
577                 if (!fastopen &&
578                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
579                         tcp_ld_RTO_revert(sk, seq);
580                 break;
581         case ICMP_TIME_EXCEEDED:
582                 err = EHOSTUNREACH;
583                 break;
584         default:
585                 goto out;
586         }
587
588         switch (sk->sk_state) {
589         case TCP_SYN_SENT:
590         case TCP_SYN_RECV:
591                 /* Only in fast or simultaneous open. If a fast open socket is
592                  * already accepted it is treated as a connected one below.
593                  */
594                 if (fastopen && !fastopen->sk)
595                         break;
596
597                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
598
599                 if (!sock_owned_by_user(sk)) {
600                         WRITE_ONCE(sk->sk_err, err);
601
602                         sk_error_report(sk);
603
604                         tcp_done(sk);
605                 } else {
606                         WRITE_ONCE(sk->sk_err_soft, err);
607                 }
608                 goto out;
609         }
610
611         /* If we've already connected we will keep trying
612          * until we time out, or the user gives up.
613          *
614          * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH to be
615          * considered hard errors (well, FRAG_FAILED too, but it is obsoleted
616          * by PMTU discovery).
617          *
618          * Note that in the modern internet, where routing is unreliable and
619          * broken firewalls sit in every dark corner sending random errors
620          * ordered by their masters, even these two messages have finally lost
621          * their original sense (even Linux sends invalid PORT_UNREACHs).
622          *
623          * Now we are in compliance with RFCs.
624          *                                                      --ANK (980905)
625          */
626
627         if (!sock_owned_by_user(sk) &&
628             inet_test_bit(RECVERR, sk)) {
629                 WRITE_ONCE(sk->sk_err, err);
630                 sk_error_report(sk);
631         } else  { /* Only an error on timeout */
632                 WRITE_ONCE(sk->sk_err_soft, err);
633         }
634
635 out:
636         bh_unlock_sock(sk);
637         sock_put(sk);
638         return 0;
639 }
640
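/* Prepare the transmit checksum for offload (CHECKSUM_PARTIAL): store the
 * pseudo-header sum in th->check and record where the device, or the software
 * fallback, must complete the checksum.
 */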
641 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
642 {
643         struct tcphdr *th = tcp_hdr(skb);
644
645         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
646         skb->csum_start = skb_transport_header(skb) - skb->head;
647         skb->csum_offset = offsetof(struct tcphdr, check);
648 }
649
650 /* This routine computes an IPv4 TCP checksum. */
651 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
652 {
653         const struct inet_sock *inet = inet_sk(sk);
654
655         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
656 }
657 EXPORT_SYMBOL(tcp_v4_send_check);
658
659 /*
660  *      This routine will send an RST to the other tcp.
661  *
662  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
663  *                    for the reset?
664  *      Answer: if a packet caused an RST, it is not for a socket
665  *              existing in our system; if it does match a socket,
666  *              it is just a duplicate segment or a bug in the other side's
667  *              TCP. So we build the reply based only on the parameters
668  *              that arrived with the segment.
669  *      Exception: precedence violation. We do not implement it in any case.
670  */
671
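/* Option space reserved in the RST reply: room for an MD5 signature option
 * when TCP-MD5 is compiled in, otherwise one 32-bit option word (enough for
 * the MPTCP reset option).
 */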
672 #ifdef CONFIG_TCP_MD5SIG
673 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
674 #else
675 #define OPTION_BYTES sizeof(__be32)
676 #endif
677
678 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
679 {
680         const struct tcphdr *th = tcp_hdr(skb);
681         struct {
682                 struct tcphdr th;
683                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
684         } rep;
685         struct ip_reply_arg arg;
686 #ifdef CONFIG_TCP_MD5SIG
687         struct tcp_md5sig_key *key = NULL;
688         const __u8 *hash_location = NULL;
689         unsigned char newhash[16];
690         int genhash;
691         struct sock *sk1 = NULL;
692 #endif
693         u64 transmit_time = 0;
694         struct sock *ctl_sk;
695         struct net *net;
696         u32 txhash = 0;
697
698         /* Never send a reset in response to a reset. */
699         if (th->rst)
700                 return;
701
702         /* If sk is not NULL, it means we did a successful lookup and the
703          * incoming route had to be correct. prequeue might have dropped our dst.
704          */
705         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
706                 return;
707
708         /* Swap the send and the receive. */
709         memset(&rep, 0, sizeof(rep));
710         rep.th.dest   = th->source;
711         rep.th.source = th->dest;
712         rep.th.doff   = sizeof(struct tcphdr) / 4;
713         rep.th.rst    = 1;
714
715         if (th->ack) {
716                 rep.th.seq = th->ack_seq;
717         } else {
718                 rep.th.ack = 1;
719                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
720                                        skb->len - (th->doff << 2));
721         }
722
723         memset(&arg, 0, sizeof(arg));
724         arg.iov[0].iov_base = (unsigned char *)&rep;
725         arg.iov[0].iov_len  = sizeof(rep.th);
726
727         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
728 #ifdef CONFIG_TCP_MD5SIG
729         rcu_read_lock();
730         hash_location = tcp_parse_md5sig_option(th);
731         if (sk && sk_fullsock(sk)) {
732                 const union tcp_md5_addr *addr;
733                 int l3index;
734
735                 /* If sdif is set, the packet ingressed via a device
736                  * in an L3 domain and inet_iif is set to it.
737                  */
738                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
739                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
740                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
741         } else if (hash_location) {
742                 const union tcp_md5_addr *addr;
743                 int sdif = tcp_v4_sdif(skb);
744                 int dif = inet_iif(skb);
745                 int l3index;
746
747                 /*
748                  * The active side is gone. Try to find the listening socket via
749                  * the source port, and then find the MD5 key via that listening
750                  * socket. We do not lose security here: the incoming packet is
751                  * checked against the MD5 hash of the key we find, and no RST is
752                  * generated if the hash doesn't match.
753                  */
754                 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
755                                              NULL, 0, ip_hdr(skb)->saddr,
756                                              th->source, ip_hdr(skb)->daddr,
757                                              ntohs(th->source), dif, sdif);
758                 /* don't send rst if it can't find key */
759                 if (!sk1)
760                         goto out;
761
762                 /* If sdif is set, the packet ingressed via a device
763                  * in an L3 domain and dif is set to it.
764                  */
765                 l3index = sdif ? dif : 0;
766                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
767                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
768                 if (!key)
769                         goto out;
770
771
772                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
773                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
774                         goto out;
775
776         }
777
778         if (key) {
779                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
780                                    (TCPOPT_NOP << 16) |
781                                    (TCPOPT_MD5SIG << 8) |
782                                    TCPOLEN_MD5SIG);
783                 /* Update length and the length the header thinks exists */
784                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
785                 rep.th.doff = arg.iov[0].iov_len / 4;
786
787                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
788                                      key, ip_hdr(skb)->saddr,
789                                      ip_hdr(skb)->daddr, &rep.th);
790         }
791 #endif
792         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
793         if (rep.opt[0] == 0) {
794                 __be32 mrst = mptcp_reset_option(skb);
795
796                 if (mrst) {
797                         rep.opt[0] = mrst;
798                         arg.iov[0].iov_len += sizeof(mrst);
799                         rep.th.doff = arg.iov[0].iov_len / 4;
800                 }
801         }
802
803         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
804                                       ip_hdr(skb)->saddr, /* XXX */
805                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
806         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
807         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
808
809         /* When the socket is gone, all binding information is lost and
810          * routing might fail. No choice here: if we chose to force the input
811          * interface, we would misroute in the case of an asymmetric route.
812          */
813         if (sk) {
814                 arg.bound_dev_if = sk->sk_bound_dev_if;
815                 if (sk_fullsock(sk))
816                         trace_tcp_send_reset(sk, skb);
817         }
818
819         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
820                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
821
822         arg.tos = ip_hdr(skb)->tos;
823         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
824         local_bh_disable();
825         ctl_sk = this_cpu_read(ipv4_tcp_sk);
826         sock_net_set(ctl_sk, net);
827         if (sk) {
828                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
829                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
830                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
831                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
832                 transmit_time = tcp_transmit_time(sk);
833                 xfrm_sk_clone_policy(ctl_sk, sk);
834                 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
835                          inet_twsk(sk)->tw_txhash : sk->sk_txhash;
836         } else {
837                 ctl_sk->sk_mark = 0;
838                 ctl_sk->sk_priority = 0;
839         }
840         ip_send_unicast_reply(ctl_sk,
841                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
842                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
843                               &arg, arg.iov[0].iov_len,
844                               transmit_time, txhash);
845
846         xfrm_sk_free_policy(ctl_sk);
847         sock_net_set(ctl_sk, &init_net);
848         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
849         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
850         local_bh_enable();
851
852 #ifdef CONFIG_TCP_MD5SIG
853 out:
854         rcu_read_unlock();
855 #endif
856 }
857
858 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states outside
859    of the socket context, is certainly ugly. What can I do?
860  */
861
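/* Build and send a bare ACK on the per-CPU control socket.  Used for replies
 * that have no full socket: TIME-WAIT ACKs (tcp_v4_timewait_ack) and ACKs sent
 * on behalf of request sockets (tcp_v4_reqsk_send_ack).
 */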
862 static void tcp_v4_send_ack(const struct sock *sk,
863                             struct sk_buff *skb, u32 seq, u32 ack,
864                             u32 win, u32 tsval, u32 tsecr, int oif,
865                             struct tcp_md5sig_key *key,
866                             int reply_flags, u8 tos, u32 txhash)
867 {
868         const struct tcphdr *th = tcp_hdr(skb);
869         struct {
870                 struct tcphdr th;
871                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
872 #ifdef CONFIG_TCP_MD5SIG
873                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
874 #endif
875                         ];
876         } rep;
877         struct net *net = sock_net(sk);
878         struct ip_reply_arg arg;
879         struct sock *ctl_sk;
880         u64 transmit_time;
881
882         memset(&rep.th, 0, sizeof(struct tcphdr));
883         memset(&arg, 0, sizeof(arg));
884
885         arg.iov[0].iov_base = (unsigned char *)&rep;
886         arg.iov[0].iov_len  = sizeof(rep.th);
887         if (tsecr) {
888                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
889                                    (TCPOPT_TIMESTAMP << 8) |
890                                    TCPOLEN_TIMESTAMP);
891                 rep.opt[1] = htonl(tsval);
892                 rep.opt[2] = htonl(tsecr);
893                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
894         }
895
896         /* Swap the send and the receive. */
897         rep.th.dest    = th->source;
898         rep.th.source  = th->dest;
899         rep.th.doff    = arg.iov[0].iov_len / 4;
900         rep.th.seq     = htonl(seq);
901         rep.th.ack_seq = htonl(ack);
902         rep.th.ack     = 1;
903         rep.th.window  = htons(win);
904
905 #ifdef CONFIG_TCP_MD5SIG
906         if (key) {
907                 int offset = (tsecr) ? 3 : 0;
908
909                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
910                                           (TCPOPT_NOP << 16) |
911                                           (TCPOPT_MD5SIG << 8) |
912                                           TCPOLEN_MD5SIG);
913                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
914                 rep.th.doff = arg.iov[0].iov_len/4;
915
916                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
917                                     key, ip_hdr(skb)->saddr,
918                                     ip_hdr(skb)->daddr, &rep.th);
919         }
920 #endif
921         arg.flags = reply_flags;
922         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
923                                       ip_hdr(skb)->saddr, /* XXX */
924                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
925         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
926         if (oif)
927                 arg.bound_dev_if = oif;
928         arg.tos = tos;
929         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
930         local_bh_disable();
931         ctl_sk = this_cpu_read(ipv4_tcp_sk);
932         sock_net_set(ctl_sk, net);
933         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
934                            inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
935         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
936                            inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
937         transmit_time = tcp_transmit_time(sk);
938         ip_send_unicast_reply(ctl_sk,
939                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
940                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
941                               &arg, arg.iov[0].iov_len,
942                               transmit_time, txhash);
943
944         sock_net_set(ctl_sk, &init_net);
945         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
946         local_bh_enable();
947 }
948
949 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
950 {
951         struct inet_timewait_sock *tw = inet_twsk(sk);
952         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
953
954         tcp_v4_send_ack(sk, skb,
955                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
956                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
957                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
958                         tcptw->tw_ts_recent,
959                         tw->tw_bound_dev_if,
960                         tcp_twsk_md5_key(tcptw),
961                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
962                         tw->tw_tos,
963                         tw->tw_txhash
964                         );
965
966         inet_twsk_put(tw);
967 }
968
969 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
970                                   struct request_sock *req)
971 {
972         const union tcp_md5_addr *addr;
973         int l3index;
974
975         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
976          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
977          */
978         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
979                                              tcp_sk(sk)->snd_nxt;
980
981         /* RFC 7323 2.3
982          * The window field (SEG.WND) of every outgoing segment, with the
983          * exception of <SYN> segments, MUST be right-shifted by
984          * Rcv.Wind.Shift bits:
985          */
986         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
987         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
988         tcp_v4_send_ack(sk, skb, seq,
989                         tcp_rsk(req)->rcv_nxt,
990                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
991                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
992                         READ_ONCE(req->ts_recent),
993                         0,
994                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
995                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
996                         ip_hdr(skb)->tos,
997                         READ_ONCE(tcp_rsk(req)->txhash));
998 }
999
1000 /*
1001  *      Send a SYN-ACK after having received a SYN.
1002  *      This still operates on a request_sock only, not on a big
1003  *      socket.
1004  */
1005 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1006                               struct flowi *fl,
1007                               struct request_sock *req,
1008                               struct tcp_fastopen_cookie *foc,
1009                               enum tcp_synack_type synack_type,
1010                               struct sk_buff *syn_skb)
1011 {
1012         const struct inet_request_sock *ireq = inet_rsk(req);
1013         struct flowi4 fl4;
1014         int err = -1;
1015         struct sk_buff *skb;
1016         u8 tos;
1017
1018         /* First, grab a route. */
1019         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1020                 return -1;
1021
1022         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1023
1024         if (skb) {
1025                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1026
1027                 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1028                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1029                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1030                                 inet_sk(sk)->tos;
1031
1032                 if (!INET_ECN_is_capable(tos) &&
1033                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1034                         tos |= INET_ECN_ECT_0;
1035
1036                 rcu_read_lock();
1037                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1038                                             ireq->ir_rmt_addr,
1039                                             rcu_dereference(ireq->ireq_opt),
1040                                             tos);
1041                 rcu_read_unlock();
1042                 err = net_xmit_eval(err);
1043         }
1044
1045         return err;
1046 }
1047
1048 /*
1049  *      IPv4 request_sock destructor.
1050  */
1051 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1052 {
1053         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1054 }
1055
1056 #ifdef CONFIG_TCP_MD5SIG
1057 /*
1058  * RFC2385 MD5 checksumming requires a mapping of
1059  * IP address->MD5 Key.
1060  * We need to maintain these in the sk structure.
1061  */
1062
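/* Static key guarding the MD5 code paths: the per-packet overhead is patched
 * in only once the first key is installed.  The deferred variant delays
 * switching the key back off (by the HZ argument) so short-lived keys do not
 * cause repeated code patching.
 */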
1063 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1064 EXPORT_SYMBOL(tcp_md5_needed);
1065
1066 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1067 {
1068         if (!old)
1069                 return true;
1070
1071         /* l3index always overrides non-l3index */
1072         if (old->l3index && new->l3index == 0)
1073                 return false;
1074         if (old->l3index == 0 && new->l3index)
1075                 return true;
1076
1077         return old->prefixlen < new->prefixlen;
1078 }
1079
1080 /* Find the best matching key for an address: a key bound to an L3 device
 * wins over an unbound one, then the longest prefix match wins.
 */
1081 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1082                                            const union tcp_md5_addr *addr,
1083                                            int family)
1084 {
1085         const struct tcp_sock *tp = tcp_sk(sk);
1086         struct tcp_md5sig_key *key;
1087         const struct tcp_md5sig_info *md5sig;
1088         __be32 mask;
1089         struct tcp_md5sig_key *best_match = NULL;
1090         bool match;
1091
1092         /* caller either holds rcu_read_lock() or socket lock */
1093         md5sig = rcu_dereference_check(tp->md5sig_info,
1094                                        lockdep_sock_is_held(sk));
1095         if (!md5sig)
1096                 return NULL;
1097
1098         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1099                                  lockdep_sock_is_held(sk)) {
1100                 if (key->family != family)
1101                         continue;
1102                 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1103                         continue;
1104                 if (family == AF_INET) {
1105                         mask = inet_make_mask(key->prefixlen);
1106                         match = (key->addr.a4.s_addr & mask) ==
1107                                 (addr->a4.s_addr & mask);
1108 #if IS_ENABLED(CONFIG_IPV6)
1109                 } else if (family == AF_INET6) {
1110                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1111                                                   key->prefixlen);
1112 #endif
1113                 } else {
1114                         match = false;
1115                 }
1116
1117                 if (match && better_md5_match(best_match, key))
1118                         best_match = key;
1119         }
1120         return best_match;
1121 }
1122 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1123
1124 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1125                                                       const union tcp_md5_addr *addr,
1126                                                       int family, u8 prefixlen,
1127                                                       int l3index, u8 flags)
1128 {
1129         const struct tcp_sock *tp = tcp_sk(sk);
1130         struct tcp_md5sig_key *key;
1131         unsigned int size = sizeof(struct in_addr);
1132         const struct tcp_md5sig_info *md5sig;
1133
1134         /* caller either holds rcu_read_lock() or socket lock */
1135         md5sig = rcu_dereference_check(tp->md5sig_info,
1136                                        lockdep_sock_is_held(sk));
1137         if (!md5sig)
1138                 return NULL;
1139 #if IS_ENABLED(CONFIG_IPV6)
1140         if (family == AF_INET6)
1141                 size = sizeof(struct in6_addr);
1142 #endif
1143         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1144                                  lockdep_sock_is_held(sk)) {
1145                 if (key->family != family)
1146                         continue;
1147                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1148                         continue;
1149                 if (key->l3index != l3index)
1150                         continue;
1151                 if (!memcmp(&key->addr, addr, size) &&
1152                     key->prefixlen == prefixlen)
1153                         return key;
1154         }
1155         return NULL;
1156 }
1157
1158 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1159                                          const struct sock *addr_sk)
1160 {
1161         const union tcp_md5_addr *addr;
1162         int l3index;
1163
1164         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1165                                                  addr_sk->sk_bound_dev_if);
1166         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1167         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1168 }
1169 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1170
1171 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1172 {
1173         struct tcp_sock *tp = tcp_sk(sk);
1174         struct tcp_md5sig_info *md5sig;
1175
1176         md5sig = kmalloc(sizeof(*md5sig), gfp);
1177         if (!md5sig)
1178                 return -ENOMEM;
1179
1180         sk_gso_disable(sk);
1181         INIT_HLIST_HEAD(&md5sig->head);
1182         rcu_assign_pointer(tp->md5sig_info, md5sig);
1183         return 0;
1184 }
1185
1186 /* This can be called on a newly created socket, from other files */
1187 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1188                             int family, u8 prefixlen, int l3index, u8 flags,
1189                             const u8 *newkey, u8 newkeylen, gfp_t gfp)
1190 {
1191         /* Add Key to the list */
1192         struct tcp_md5sig_key *key;
1193         struct tcp_sock *tp = tcp_sk(sk);
1194         struct tcp_md5sig_info *md5sig;
1195
1196         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1197         if (key) {
1198                 /* Pre-existing entry - just update that one.
1199                  * Note that the key might be used concurrently.
1200                  * data_race() tells KCSAN that we do not care about key
1201                  * mismatches, since changing the MD5 key on live flows
1202                  * can lead to packet drops.
1203                  */
1204                 data_race(memcpy(key->key, newkey, newkeylen));
1205
1206                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1207                  * Also note that a reader could catch the new key->keylen value
1208                  * but the old key->key[]; this is why we use __GFP_ZERO
1209                  * in sock_kmalloc() below these lines.
1210                  */
1211                 WRITE_ONCE(key->keylen, newkeylen);
1212
1213                 return 0;
1214         }
1215
1216         md5sig = rcu_dereference_protected(tp->md5sig_info,
1217                                            lockdep_sock_is_held(sk));
1218
1219         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1220         if (!key)
1221                 return -ENOMEM;
1222         if (!tcp_alloc_md5sig_pool()) {
1223                 sock_kfree_s(sk, key, sizeof(*key));
1224                 return -ENOMEM;
1225         }
1226
1227         memcpy(key->key, newkey, newkeylen);
1228         key->keylen = newkeylen;
1229         key->family = family;
1230         key->prefixlen = prefixlen;
1231         key->l3index = l3index;
1232         key->flags = flags;
1233         memcpy(&key->addr, addr,
1234                (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1235                                                                  sizeof(struct in_addr));
1236         hlist_add_head_rcu(&key->node, &md5sig->head);
1237         return 0;
1238 }
1239
1240 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1241                    int family, u8 prefixlen, int l3index, u8 flags,
1242                    const u8 *newkey, u8 newkeylen)
1243 {
1244         struct tcp_sock *tp = tcp_sk(sk);
1245
1246         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1247                 if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1248                         return -ENOMEM;
1249
1250                 if (!static_branch_inc(&tcp_md5_needed.key)) {
1251                         struct tcp_md5sig_info *md5sig;
1252
1253                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1254                         rcu_assign_pointer(tp->md5sig_info, NULL);
1255                         kfree_rcu(md5sig, rcu);
1256                         return -EUSERS;
1257                 }
1258         }
1259
1260         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1261                                 newkey, newkeylen, GFP_KERNEL);
1262 }
1263 EXPORT_SYMBOL(tcp_md5_do_add);
1264
1265 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1266                      int family, u8 prefixlen, int l3index,
1267                      struct tcp_md5sig_key *key)
1268 {
1269         struct tcp_sock *tp = tcp_sk(sk);
1270
1271         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1272                 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1273                         return -ENOMEM;
1274
1275                 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1276                         struct tcp_md5sig_info *md5sig;
1277
1278                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1279                         net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1280                         rcu_assign_pointer(tp->md5sig_info, NULL);
1281                         kfree_rcu(md5sig, rcu);
1282                         return -EUSERS;
1283                 }
1284         }
1285
1286         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1287                                 key->flags, key->key, key->keylen,
1288                                 sk_gfp_mask(sk, GFP_ATOMIC));
1289 }
1290 EXPORT_SYMBOL(tcp_md5_key_copy);
1291
1292 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1293                    u8 prefixlen, int l3index, u8 flags)
1294 {
1295         struct tcp_md5sig_key *key;
1296
1297         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1298         if (!key)
1299                 return -ENOENT;
1300         hlist_del_rcu(&key->node);
1301         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1302         kfree_rcu(key, rcu);
1303         return 0;
1304 }
1305 EXPORT_SYMBOL(tcp_md5_do_del);
1306
1307 static void tcp_clear_md5_list(struct sock *sk)
1308 {
1309         struct tcp_sock *tp = tcp_sk(sk);
1310         struct tcp_md5sig_key *key;
1311         struct hlist_node *n;
1312         struct tcp_md5sig_info *md5sig;
1313
1314         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1315
1316         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1317                 hlist_del_rcu(&key->node);
1318                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1319                 kfree_rcu(key, rcu);
1320         }
1321 }
1322
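/* setsockopt(TCP_MD5SIG / TCP_MD5SIG_EXT) handler.  A minimal userspace sketch
 * of installing a key for an IPv4 peer (illustrative only, error handling
 * omitted):
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */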
1323 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1324                                  sockptr_t optval, int optlen)
1325 {
1326         struct tcp_md5sig cmd;
1327         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1328         const union tcp_md5_addr *addr;
1329         u8 prefixlen = 32;
1330         int l3index = 0;
1331         u8 flags;
1332
1333         if (optlen < sizeof(cmd))
1334                 return -EINVAL;
1335
1336         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1337                 return -EFAULT;
1338
1339         if (sin->sin_family != AF_INET)
1340                 return -EINVAL;
1341
1342         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1343
1344         if (optname == TCP_MD5SIG_EXT &&
1345             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1346                 prefixlen = cmd.tcpm_prefixlen;
1347                 if (prefixlen > 32)
1348                         return -EINVAL;
1349         }
1350
1351         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1352             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1353                 struct net_device *dev;
1354
1355                 rcu_read_lock();
1356                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1357                 if (dev && netif_is_l3_master(dev))
1358                         l3index = dev->ifindex;
1359
1360                 rcu_read_unlock();
1361
1362                 /* OK to test whether these were set outside of the RCU section;
1363                  * right now the device MUST be an L3 master
1364                  */
1365                 if (!dev || !l3index)
1366                         return -EINVAL;
1367         }
1368
1369         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1370
1371         if (!cmd.tcpm_keylen)
1372                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1373
1374         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1375                 return -EINVAL;
1376
1377         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1378                               cmd.tcpm_key, cmd.tcpm_keylen);
1379 }
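/*
 * Illustrative userspace counterpart (a minimal sketch, not part of this file
 * and not compiled here): tcp_v4_parse_md5_keys() above is what services a
 * setsockopt(IPPROTO_TCP, TCP_MD5SIG) call.  The helper name set_tcp_md5_key()
 * and the peer/secret values are made up for the example; struct tcp_md5sig
 * and TCP_MD5SIG_MAXKEYLEN come from the UAPI <linux/tcp.h>.
 */
#if 0	/* userspace example only */
#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/tcp.h>		/* TCP_MD5SIG, struct tcp_md5sig */

static int set_tcp_md5_key(int fd, const char *peer, const char *secret)
{
	struct tcp_md5sig md5;
	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
	size_t len = strlen(secret);

	if (len > TCP_MD5SIG_MAXKEYLEN)
		return -1;
	memset(&md5, 0, sizeof(md5));
	sin->sin_family = AF_INET;
	if (inet_pton(AF_INET, peer, &sin->sin_addr) != 1)
		return -1;
	md5.tcpm_keylen = len;
	memcpy(md5.tcpm_key, secret, len);

	/* A zero tcpm_keylen would instead delete the key (tcp_md5_do_del()). */
	return setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
}
/* Usage sketch: set_tcp_md5_key(fd, "192.0.2.1", "bgp-secret") before connect(). */
#endif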
1380
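/*
 * Digest layout note (descriptive only): per RFC 2385, the TCP-MD5 digest
 * covers, in order, the IPv4 pseudo-header (struct tcp4_pseudohdr below),
 * the base TCP header with its checksum field zeroed (options excluded),
 * the TCP payload, and finally the key itself.  tcp_v4_md5_hash_headers()
 * feeds the first two pieces; tcp_v4_md5_hash_skb() then adds the payload
 * and the key, while tcp_v4_md5_hash_hdr() skips the payload for the
 * header-only case.
 */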
1381 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1382                                    __be32 daddr, __be32 saddr,
1383                                    const struct tcphdr *th, int nbytes)
1384 {
1385         struct tcp4_pseudohdr *bp;
1386         struct scatterlist sg;
1387         struct tcphdr *_th;
1388
1389         bp = hp->scratch;
1390         bp->saddr = saddr;
1391         bp->daddr = daddr;
1392         bp->pad = 0;
1393         bp->protocol = IPPROTO_TCP;
1394         bp->len = cpu_to_be16(nbytes);
1395
1396         _th = (struct tcphdr *)(bp + 1);
1397         memcpy(_th, th, sizeof(*th));
1398         _th->check = 0;
1399
1400         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1401         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1402                                 sizeof(*bp) + sizeof(*th));
1403         return crypto_ahash_update(hp->md5_req);
1404 }
1405
1406 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1407                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1408 {
1409         struct tcp_md5sig_pool *hp;
1410         struct ahash_request *req;
1411
1412         hp = tcp_get_md5sig_pool();
1413         if (!hp)
1414                 goto clear_hash_noput;
1415         req = hp->md5_req;
1416
1417         if (crypto_ahash_init(req))
1418                 goto clear_hash;
1419         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1420                 goto clear_hash;
1421         if (tcp_md5_hash_key(hp, key))
1422                 goto clear_hash;
1423         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1424         if (crypto_ahash_final(req))
1425                 goto clear_hash;
1426
1427         tcp_put_md5sig_pool();
1428         return 0;
1429
1430 clear_hash:
1431         tcp_put_md5sig_pool();
1432 clear_hash_noput:
1433         memset(md5_hash, 0, 16);
1434         return 1;
1435 }
1436
1437 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1438                         const struct sock *sk,
1439                         const struct sk_buff *skb)
1440 {
1441         struct tcp_md5sig_pool *hp;
1442         struct ahash_request *req;
1443         const struct tcphdr *th = tcp_hdr(skb);
1444         __be32 saddr, daddr;
1445
1446         if (sk) { /* valid for established/request sockets */
1447                 saddr = sk->sk_rcv_saddr;
1448                 daddr = sk->sk_daddr;
1449         } else {
1450                 const struct iphdr *iph = ip_hdr(skb);
1451                 saddr = iph->saddr;
1452                 daddr = iph->daddr;
1453         }
1454
1455         hp = tcp_get_md5sig_pool();
1456         if (!hp)
1457                 goto clear_hash_noput;
1458         req = hp->md5_req;
1459
1460         if (crypto_ahash_init(req))
1461                 goto clear_hash;
1462
1463         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1464                 goto clear_hash;
1465         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1466                 goto clear_hash;
1467         if (tcp_md5_hash_key(hp, key))
1468                 goto clear_hash;
1469         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1470         if (crypto_ahash_final(req))
1471                 goto clear_hash;
1472
1473         tcp_put_md5sig_pool();
1474         return 0;
1475
1476 clear_hash:
1477         tcp_put_md5sig_pool();
1478 clear_hash_noput:
1479         memset(md5_hash, 0, 16);
1480         return 1;
1481 }
1482 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1483
1484 #endif
1485
1486 static void tcp_v4_init_req(struct request_sock *req,
1487                             const struct sock *sk_listener,
1488                             struct sk_buff *skb)
1489 {
1490         struct inet_request_sock *ireq = inet_rsk(req);
1491         struct net *net = sock_net(sk_listener);
1492
1493         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1494         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1495         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1496 }
1497
1498 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1499                                           struct sk_buff *skb,
1500                                           struct flowi *fl,
1501                                           struct request_sock *req)
1502 {
1503         tcp_v4_init_req(req, sk, skb);
1504
1505         if (security_inet_conn_request(sk, skb, req))
1506                 return NULL;
1507
1508         return inet_csk_route_req(sk, &fl->u.ip4, req);
1509 }
1510
1511 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1512         .family         =       PF_INET,
1513         .obj_size       =       sizeof(struct tcp_request_sock),
1514         .rtx_syn_ack    =       tcp_rtx_synack,
1515         .send_ack       =       tcp_v4_reqsk_send_ack,
1516         .destructor     =       tcp_v4_reqsk_destructor,
1517         .send_reset     =       tcp_v4_send_reset,
1518         .syn_ack_timeout =      tcp_syn_ack_timeout,
1519 };
1520
1521 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1522         .mss_clamp      =       TCP_MSS_DEFAULT,
1523 #ifdef CONFIG_TCP_MD5SIG
1524         .req_md5_lookup =       tcp_v4_md5_lookup,
1525         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1526 #endif
1527 #ifdef CONFIG_SYN_COOKIES
1528         .cookie_init_seq =      cookie_v4_init_sequence,
1529 #endif
1530         .route_req      =       tcp_v4_route_req,
1531         .init_seq       =       tcp_v4_init_seq,
1532         .init_ts_off    =       tcp_v4_init_ts_off,
1533         .send_synack    =       tcp_v4_send_synack,
1534 };
1535
1536 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1537 {
1538         /* Never answer SYNs sent to broadcast or multicast */
1539         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1540                 goto drop;
1541
1542         return tcp_conn_request(&tcp_request_sock_ops,
1543                                 &tcp_request_sock_ipv4_ops, sk, skb);
1544
1545 drop:
1546         tcp_listendrop(sk);
1547         return 0;
1548 }
1549 EXPORT_SYMBOL(tcp_v4_conn_request);
1550
1551
1552 /*
1553  * The three-way handshake has completed - we received a valid ACK -
1554  * now create the new socket.
1555  */
1556 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1557                                   struct request_sock *req,
1558                                   struct dst_entry *dst,
1559                                   struct request_sock *req_unhash,
1560                                   bool *own_req)
1561 {
1562         struct inet_request_sock *ireq;
1563         bool found_dup_sk = false;
1564         struct inet_sock *newinet;
1565         struct tcp_sock *newtp;
1566         struct sock *newsk;
1567 #ifdef CONFIG_TCP_MD5SIG
1568         const union tcp_md5_addr *addr;
1569         struct tcp_md5sig_key *key;
1570         int l3index;
1571 #endif
1572         struct ip_options_rcu *inet_opt;
1573
1574         if (sk_acceptq_is_full(sk))
1575                 goto exit_overflow;
1576
1577         newsk = tcp_create_openreq_child(sk, req, skb);
1578         if (!newsk)
1579                 goto exit_nonewsk;
1580
1581         newsk->sk_gso_type = SKB_GSO_TCPV4;
1582         inet_sk_rx_dst_set(newsk, skb);
1583
1584         newtp                 = tcp_sk(newsk);
1585         newinet               = inet_sk(newsk);
1586         ireq                  = inet_rsk(req);
1587         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1588         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1589         newsk->sk_bound_dev_if = ireq->ir_iif;
1590         newinet->inet_saddr   = ireq->ir_loc_addr;
1591         inet_opt              = rcu_dereference(ireq->ireq_opt);
1592         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1593         newinet->mc_index     = inet_iif(skb);
1594         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1595         newinet->rcv_tos      = ip_hdr(skb)->tos;
1596         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1597         if (inet_opt)
1598                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1599         atomic_set(&newinet->inet_id, get_random_u16());
1600
1601         /* Set ToS of the new socket based upon the value of incoming SYN.
1602          * ECT bits are set later in tcp_init_transfer().
1603          */
1604         if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1605                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1606
1607         if (!dst) {
1608                 dst = inet_csk_route_child_sock(sk, newsk, req);
1609                 if (!dst)
1610                         goto put_and_exit;
1611         } else {
1612                 /* syncookie case: see end of cookie_v4_check() */
1613         }
1614         sk_setup_caps(newsk, dst);
1615
1616         tcp_ca_openreq_child(newsk, dst);
1617
1618         tcp_sync_mss(newsk, dst_mtu(dst));
1619         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1620
1621         tcp_initialize_rcv_mss(newsk);
1622
1623 #ifdef CONFIG_TCP_MD5SIG
1624         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1625         /* Copy over the MD5 key from the original socket */
1626         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1627         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1628         if (key) {
1629                 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1630                         goto put_and_exit;
1631                 sk_gso_disable(newsk);
1632         }
1633 #endif
1634
1635         if (__inet_inherit_port(sk, newsk) < 0)
1636                 goto put_and_exit;
1637         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1638                                        &found_dup_sk);
1639         if (likely(*own_req)) {
1640                 tcp_move_syn(newtp, req);
1641                 ireq->ireq_opt = NULL;
1642         } else {
1643                 newinet->inet_opt = NULL;
1644
1645                 if (!req_unhash && found_dup_sk) {
1646                         /* This code path should only be executed in the
1647                          * syncookie case
1648                          */
1649                         bh_unlock_sock(newsk);
1650                         sock_put(newsk);
1651                         newsk = NULL;
1652                 }
1653         }
1654         return newsk;
1655
1656 exit_overflow:
1657         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1658 exit_nonewsk:
1659         dst_release(dst);
1660 exit:
1661         tcp_listendrop(sk);
1662         return NULL;
1663 put_and_exit:
1664         newinet->inet_opt = NULL;
1665         inet_csk_prepare_forced_close(newsk);
1666         tcp_done(newsk);
1667         goto exit;
1668 }
1669 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1670
1671 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1672 {
1673 #ifdef CONFIG_SYN_COOKIES
1674         const struct tcphdr *th = tcp_hdr(skb);
1675
1676         if (!th->syn)
1677                 sk = cookie_v4_check(sk, skb);
1678 #endif
1679         return sk;
1680 }
1681
1682 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1683                          struct tcphdr *th, u32 *cookie)
1684 {
1685         u16 mss = 0;
1686 #ifdef CONFIG_SYN_COOKIES
1687         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1688                                     &tcp_request_sock_ipv4_ops, sk, th);
1689         if (mss) {
1690                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1691                 tcp_synq_overflow(sk);
1692         }
1693 #endif
1694         return mss;
1695 }
1696
1697 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1698                                                            u32));
1699 /* The socket must have its spinlock held when we get
1700  * here, unless it is a TCP_LISTEN socket.
1701  *
1702  * We have a potential double-lock case here, so even when
1703  * doing backlog processing we use the BH locking scheme.
1704  * This is because we cannot sleep with the original spinlock
1705  * held.
1706  */
1707 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1708 {
1709         enum skb_drop_reason reason;
1710         struct sock *rsk;
1711
1712         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1713                 struct dst_entry *dst;
1714
1715                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1716                                                 lockdep_sock_is_held(sk));
1717
1718                 sock_rps_save_rxhash(sk, skb);
1719                 sk_mark_napi_id(sk, skb);
1720                 if (dst) {
1721                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1722                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1723                                              dst, 0)) {
1724                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1725                                 dst_release(dst);
1726                         }
1727                 }
1728                 tcp_rcv_established(sk, skb);
1729                 return 0;
1730         }
1731
1732         reason = SKB_DROP_REASON_NOT_SPECIFIED;
1733         if (tcp_checksum_complete(skb))
1734                 goto csum_err;
1735
1736         if (sk->sk_state == TCP_LISTEN) {
1737                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1738
1739                 if (!nsk)
1740                         goto discard;
1741                 if (nsk != sk) {
1742                         if (tcp_child_process(sk, nsk, skb)) {
1743                                 rsk = nsk;
1744                                 goto reset;
1745                         }
1746                         return 0;
1747                 }
1748         } else
1749                 sock_rps_save_rxhash(sk, skb);
1750
1751         if (tcp_rcv_state_process(sk, skb)) {
1752                 rsk = sk;
1753                 goto reset;
1754         }
1755         return 0;
1756
1757 reset:
1758         tcp_v4_send_reset(rsk, skb);
1759 discard:
1760         kfree_skb_reason(skb, reason);
1761         /* Be careful here. If this function gets more complicated and
1762          * gcc suffers from register pressure on the x86, sk (in %ebx)
1763          * might be destroyed here. This current version compiles correctly,
1764          * but you have been warned.
1765          */
1766         return 0;
1767
1768 csum_err:
1769         reason = SKB_DROP_REASON_TCP_CSUM;
1770         trace_tcp_bad_csum(skb);
1771         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1772         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1773         goto discard;
1774 }
1775 EXPORT_SYMBOL(tcp_v4_do_rcv);
1776
1777 int tcp_v4_early_demux(struct sk_buff *skb)
1778 {
1779         struct net *net = dev_net(skb->dev);
1780         const struct iphdr *iph;
1781         const struct tcphdr *th;
1782         struct sock *sk;
1783
1784         if (skb->pkt_type != PACKET_HOST)
1785                 return 0;
1786
1787         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1788                 return 0;
1789
1790         iph = ip_hdr(skb);
1791         th = tcp_hdr(skb);
1792
1793         if (th->doff < sizeof(struct tcphdr) / 4)
1794                 return 0;
1795
1796         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1797                                        iph->saddr, th->source,
1798                                        iph->daddr, ntohs(th->dest),
1799                                        skb->skb_iif, inet_sdif(skb));
1800         if (sk) {
1801                 skb->sk = sk;
1802                 skb->destructor = sock_edemux;
1803                 if (sk_fullsock(sk)) {
1804                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1805
1806                         if (dst)
1807                                 dst = dst_check(dst, 0);
1808                         if (dst &&
1809                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1810                                 skb_dst_set_noref(skb, dst);
1811                 }
1812         }
1813         return 0;
1814 }
1815
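/*
 * Descriptive note: tcp_add_backlog() returns false when the skb was
 * coalesced into, or queued onto, the backlog (the socket is still locked
 * by the caller), and true when it had to drop the skb -- in which case it
 * has already done the bh_unlock_sock() and set *reason.
 */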
1816 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1817                      enum skb_drop_reason *reason)
1818 {
1819         u32 limit, tail_gso_size, tail_gso_segs;
1820         struct skb_shared_info *shinfo;
1821         const struct tcphdr *th;
1822         struct tcphdr *thtail;
1823         struct sk_buff *tail;
1824         unsigned int hdrlen;
1825         bool fragstolen;
1826         u32 gso_segs;
1827         u32 gso_size;
1828         int delta;
1829
1830         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1831          * we can fix skb->truesize to its real value to avoid future drops.
1832          * This is valid because skb is not yet charged to the socket.
1833          * It has been noticed that pure SACK packets were sometimes dropped
1834          * (if built by drivers without the copybreak feature).
1835          */
1836         skb_condense(skb);
1837
1838         skb_dst_drop(skb);
1839
1840         if (unlikely(tcp_checksum_complete(skb))) {
1841                 bh_unlock_sock(sk);
1842                 trace_tcp_bad_csum(skb);
1843                 *reason = SKB_DROP_REASON_TCP_CSUM;
1844                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1845                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1846                 return true;
1847         }
1848
1849         /* Attempt coalescing to last skb in backlog, even if we are
1850          * above the limits.
1851          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1852          */
1853         th = (const struct tcphdr *)skb->data;
1854         hdrlen = th->doff * 4;
1855
1856         tail = sk->sk_backlog.tail;
1857         if (!tail)
1858                 goto no_coalesce;
1859         thtail = (struct tcphdr *)tail->data;
1860
1861         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1862             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1863             ((TCP_SKB_CB(tail)->tcp_flags |
1864               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1865             !((TCP_SKB_CB(tail)->tcp_flags &
1866               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1867             ((TCP_SKB_CB(tail)->tcp_flags ^
1868               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1869 #ifdef CONFIG_TLS_DEVICE
1870             tail->decrypted != skb->decrypted ||
1871 #endif
1872             !mptcp_skb_can_collapse(tail, skb) ||
1873             thtail->doff != th->doff ||
1874             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1875                 goto no_coalesce;
1876
1877         __skb_pull(skb, hdrlen);
1878
1879         shinfo = skb_shinfo(skb);
1880         gso_size = shinfo->gso_size ?: skb->len;
1881         gso_segs = shinfo->gso_segs ?: 1;
1882
1883         shinfo = skb_shinfo(tail);
1884         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1885         tail_gso_segs = shinfo->gso_segs ?: 1;
1886
1887         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1888                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1889
1890                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1891                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1892                         thtail->window = th->window;
1893                 }
1894
1895                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1896                  * thtail->fin, so that the fast path in tcp_rcv_established()
1897                  * is not entered if we append a packet with a FIN.
1898                  * SYN, RST, URG are not present.
1899                  * ACK is set on both packets.
1900                  * PSH : we do not really care in TCP stack,
1901                  *       at least for 'GRO' packets.
1902                  */
1903                 thtail->fin |= th->fin;
1904                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1905
1906                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1907                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1908                         tail->tstamp = skb->tstamp;
1909                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1910                 }
1911
1912                 /* Not as strict as GRO. We only need to carry the max mss value */
1913                 shinfo->gso_size = max(gso_size, tail_gso_size);
1914                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1915
1916                 sk->sk_backlog.len += delta;
1917                 __NET_INC_STATS(sock_net(sk),
1918                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1919                 kfree_skb_partial(skb, fragstolen);
1920                 return false;
1921         }
1922         __skb_push(skb, hdrlen);
1923
1924 no_coalesce:
1925         limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1926
1927         /* Only the socket owner can try to collapse/prune rx queues
1928          * to reduce memory overhead, so add a little headroom here.
1929          * Only a few socket backlogs are likely to be non-empty concurrently.
1930          */
1931         limit += 64 * 1024;
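        /*
         * Worked example (assuming the default tcp_rmem[1] = 131072 and
         * tcp_wmem[1] = 16384, before autotuning has grown the buffers):
         * limit = 131072 + 16384 / 2 + 65536 = 204800 bytes of backlog.
         */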
1932
1933         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1934                 bh_unlock_sock(sk);
1935                 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1936                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1937                 return true;
1938         }
1939         return false;
1940 }
1941 EXPORT_SYMBOL(tcp_add_backlog);
1942
1943 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1944 {
1945         struct tcphdr *th = (struct tcphdr *)skb->data;
1946
1947         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1948 }
1949 EXPORT_SYMBOL(tcp_filter);
1950
1951 static void tcp_v4_restore_cb(struct sk_buff *skb)
1952 {
1953         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1954                 sizeof(struct inet_skb_parm));
1955 }
1956
1957 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1958                            const struct tcphdr *th)
1959 {
1960         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1961          * barrier() makes sure the compiler won't play fool^Waliasing games.
1962          */
1963         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1964                 sizeof(struct inet_skb_parm));
1965         barrier();
1966
1967         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1968         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1969                                     skb->len - th->doff * 4);
1970         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1971         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1972         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1973         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1974         TCP_SKB_CB(skb)->sacked  = 0;
1975         TCP_SKB_CB(skb)->has_rxtstamp =
1976                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1977 }
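/*
 * Quick illustration of the end_seq arithmetic above (descriptive only):
 * SYN and FIN each consume one unit of sequence space, so a bare SYN gets
 * end_seq = seq + 1, while a 100-byte data segment carrying a FIN gets
 * end_seq = seq + 100 + 1.
 */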
1978
1979 /*
1980  *      From tcp_input.c
1981  */
1982
1983 int tcp_v4_rcv(struct sk_buff *skb)
1984 {
1985         struct net *net = dev_net(skb->dev);
1986         enum skb_drop_reason drop_reason;
1987         int sdif = inet_sdif(skb);
1988         int dif = inet_iif(skb);
1989         const struct iphdr *iph;
1990         const struct tcphdr *th;
1991         bool refcounted;
1992         struct sock *sk;
1993         int ret;
1994
1995         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1996         if (skb->pkt_type != PACKET_HOST)
1997                 goto discard_it;
1998
1999         /* Count it even if it's bad */
2000         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2001
2002         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2003                 goto discard_it;
2004
2005         th = (const struct tcphdr *)skb->data;
2006
2007         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2008                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2009                 goto bad_packet;
2010         }
2011         if (!pskb_may_pull(skb, th->doff * 4))
2012                 goto discard_it;
2013
2014         /* An explanation is required here, I think.
2015          * Packet length and doff are validated by header prediction,
2016          * provided the case of th->doff == 0 is eliminated.
2017          * So, we defer the checks. */
2018
2019         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2020                 goto csum_error;
2021
2022         th = (const struct tcphdr *)skb->data;
2023         iph = ip_hdr(skb);
2024 lookup:
2025         sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2026                                skb, __tcp_hdrlen(th), th->source,
2027                                th->dest, sdif, &refcounted);
2028         if (!sk)
2029                 goto no_tcp_socket;
2030
2031 process:
2032         if (sk->sk_state == TCP_TIME_WAIT)
2033                 goto do_time_wait;
2034
2035         if (sk->sk_state == TCP_NEW_SYN_RECV) {
2036                 struct request_sock *req = inet_reqsk(sk);
2037                 bool req_stolen = false;
2038                 struct sock *nsk;
2039
2040                 sk = req->rsk_listener;
2041                 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2042                         drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2043                 else
2044                         drop_reason = tcp_inbound_md5_hash(sk, skb,
2045                                                    &iph->saddr, &iph->daddr,
2046                                                    AF_INET, dif, sdif);
2047                 if (unlikely(drop_reason)) {
2048                         sk_drops_add(sk, skb);
2049                         reqsk_put(req);
2050                         goto discard_it;
2051                 }
2052                 if (tcp_checksum_complete(skb)) {
2053                         reqsk_put(req);
2054                         goto csum_error;
2055                 }
2056                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2057                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2058                         if (!nsk) {
2059                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2060                                 goto lookup;
2061                         }
2062                         sk = nsk;
2063                         /* reuseport_migrate_sock() has already taken a reference
2064                          * (sk_refcnt) before returning.
2065                          */
2066                 } else {
2067                         /* We own a reference on the listener, increase it again
2068                          * as we might lose it too soon.
2069                          */
2070                         sock_hold(sk);
2071                 }
2072                 refcounted = true;
2073                 nsk = NULL;
2074                 if (!tcp_filter(sk, skb)) {
2075                         th = (const struct tcphdr *)skb->data;
2076                         iph = ip_hdr(skb);
2077                         tcp_v4_fill_cb(skb, iph, th);
2078                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2079                 } else {
2080                         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2081                 }
2082                 if (!nsk) {
2083                         reqsk_put(req);
2084                         if (req_stolen) {
2085                                 /* Another CPU got exclusive access to req
2086                                  * and created a full-blown socket.
2087                                  * Try to feed this packet to this socket
2088                                  * instead of discarding it.
2089                                  */
2090                                 tcp_v4_restore_cb(skb);
2091                                 sock_put(sk);
2092                                 goto lookup;
2093                         }
2094                         goto discard_and_relse;
2095                 }
2096                 nf_reset_ct(skb);
2097                 if (nsk == sk) {
2098                         reqsk_put(req);
2099                         tcp_v4_restore_cb(skb);
2100                 } else if (tcp_child_process(sk, nsk, skb)) {
2101                         tcp_v4_send_reset(nsk, skb);
2102                         goto discard_and_relse;
2103                 } else {
2104                         sock_put(sk);
2105                         return 0;
2106                 }
2107         }
2108
2109         if (static_branch_unlikely(&ip4_min_ttl)) {
2110                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2111                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2112                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2113                         drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2114                         goto discard_and_relse;
2115                 }
2116         }
2117
2118         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2119                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2120                 goto discard_and_relse;
2121         }
2122
2123         drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2124                                            &iph->daddr, AF_INET, dif, sdif);
2125         if (drop_reason)
2126                 goto discard_and_relse;
2127
2128         nf_reset_ct(skb);
2129
2130         if (tcp_filter(sk, skb)) {
2131                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2132                 goto discard_and_relse;
2133         }
2134         th = (const struct tcphdr *)skb->data;
2135         iph = ip_hdr(skb);
2136         tcp_v4_fill_cb(skb, iph, th);
2137
2138         skb->dev = NULL;
2139
2140         if (sk->sk_state == TCP_LISTEN) {
2141                 ret = tcp_v4_do_rcv(sk, skb);
2142                 goto put_and_return;
2143         }
2144
2145         sk_incoming_cpu_update(sk);
2146
2147         bh_lock_sock_nested(sk);
2148         tcp_segs_in(tcp_sk(sk), skb);
2149         ret = 0;
2150         if (!sock_owned_by_user(sk)) {
2151                 ret = tcp_v4_do_rcv(sk, skb);
2152         } else {
2153                 if (tcp_add_backlog(sk, skb, &drop_reason))
2154                         goto discard_and_relse;
2155         }
2156         bh_unlock_sock(sk);
2157
2158 put_and_return:
2159         if (refcounted)
2160                 sock_put(sk);
2161
2162         return ret;
2163
2164 no_tcp_socket:
2165         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2166         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2167                 goto discard_it;
2168
2169         tcp_v4_fill_cb(skb, iph, th);
2170
2171         if (tcp_checksum_complete(skb)) {
2172 csum_error:
2173                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2174                 trace_tcp_bad_csum(skb);
2175                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2176 bad_packet:
2177                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2178         } else {
2179                 tcp_v4_send_reset(NULL, skb);
2180         }
2181
2182 discard_it:
2183         SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2184         /* Discard frame. */
2185         kfree_skb_reason(skb, drop_reason);
2186         return 0;
2187
2188 discard_and_relse:
2189         sk_drops_add(sk, skb);
2190         if (refcounted)
2191                 sock_put(sk);
2192         goto discard_it;
2193
2194 do_time_wait:
2195         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2196                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2197                 inet_twsk_put(inet_twsk(sk));
2198                 goto discard_it;
2199         }
2200
2201         tcp_v4_fill_cb(skb, iph, th);
2202
2203         if (tcp_checksum_complete(skb)) {
2204                 inet_twsk_put(inet_twsk(sk));
2205                 goto csum_error;
2206         }
2207         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2208         case TCP_TW_SYN: {
2209                 struct sock *sk2 = inet_lookup_listener(net,
2210                                                         net->ipv4.tcp_death_row.hashinfo,
2211                                                         skb, __tcp_hdrlen(th),
2212                                                         iph->saddr, th->source,
2213                                                         iph->daddr, th->dest,
2214                                                         inet_iif(skb),
2215                                                         sdif);
2216                 if (sk2) {
2217                         inet_twsk_deschedule_put(inet_twsk(sk));
2218                         sk = sk2;
2219                         tcp_v4_restore_cb(skb);
2220                         refcounted = false;
2221                         goto process;
2222                 }
2223         }
2224                 /* to ACK */
2225                 fallthrough;
2226         case TCP_TW_ACK:
2227                 tcp_v4_timewait_ack(sk, skb);
2228                 break;
2229         case TCP_TW_RST:
2230                 tcp_v4_send_reset(sk, skb);
2231                 inet_twsk_deschedule_put(inet_twsk(sk));
2232                 goto discard_it;
2233         case TCP_TW_SUCCESS:;
2234         }
2235         goto discard_it;
2236 }
2237
2238 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2239         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2240         .twsk_unique    = tcp_twsk_unique,
2241         .twsk_destructor= tcp_twsk_destructor,
2242 };
2243
2244 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2245 {
2246         struct dst_entry *dst = skb_dst(skb);
2247
2248         if (dst && dst_hold_safe(dst)) {
2249                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2250                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2251         }
2252 }
2253 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2254
2255 const struct inet_connection_sock_af_ops ipv4_specific = {
2256         .queue_xmit        = ip_queue_xmit,
2257         .send_check        = tcp_v4_send_check,
2258         .rebuild_header    = inet_sk_rebuild_header,
2259         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2260         .conn_request      = tcp_v4_conn_request,
2261         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2262         .net_header_len    = sizeof(struct iphdr),
2263         .setsockopt        = ip_setsockopt,
2264         .getsockopt        = ip_getsockopt,
2265         .addr2sockaddr     = inet_csk_addr2sockaddr,
2266         .sockaddr_len      = sizeof(struct sockaddr_in),
2267         .mtu_reduced       = tcp_v4_mtu_reduced,
2268 };
2269 EXPORT_SYMBOL(ipv4_specific);
2270
2271 #ifdef CONFIG_TCP_MD5SIG
2272 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2273         .md5_lookup             = tcp_v4_md5_lookup,
2274         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2275         .md5_parse              = tcp_v4_parse_md5_keys,
2276 };
2277 #endif
2278
2279 /* NOTE: A lot of fields are set to zero explicitly by the call to
2280  *       sk_alloc(), so they need not be initialized here.
2281  */
2282 static int tcp_v4_init_sock(struct sock *sk)
2283 {
2284         struct inet_connection_sock *icsk = inet_csk(sk);
2285
2286         tcp_init_sock(sk);
2287
2288         icsk->icsk_af_ops = &ipv4_specific;
2289
2290 #ifdef CONFIG_TCP_MD5SIG
2291         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2292 #endif
2293
2294         return 0;
2295 }
2296
2297 void tcp_v4_destroy_sock(struct sock *sk)
2298 {
2299         struct tcp_sock *tp = tcp_sk(sk);
2300
2301         trace_tcp_destroy_sock(sk);
2302
2303         tcp_clear_xmit_timers(sk);
2304
2305         tcp_cleanup_congestion_control(sk);
2306
2307         tcp_cleanup_ulp(sk);
2308
2309         /* Clean up the write buffer. */
2310         tcp_write_queue_purge(sk);
2311
2312         /* Check if we want to disable active TFO */
2313         tcp_fastopen_active_disable_ofo_check(sk);
2314
2315         /* Cleans up our, hopefully empty, out_of_order_queue. */
2316         skb_rbtree_purge(&tp->out_of_order_queue);
2317
2318 #ifdef CONFIG_TCP_MD5SIG
2319         /* Clean up the MD5 key list, if any */
2320         if (tp->md5sig_info) {
2321                 tcp_clear_md5_list(sk);
2322                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2323                 tp->md5sig_info = NULL;
2324                 static_branch_slow_dec_deferred(&tcp_md5_needed);
2325         }
2326 #endif
2327
2328         /* Clean up a referenced TCP bind bucket. */
2329         if (inet_csk(sk)->icsk_bind_hash)
2330                 inet_put_port(sk);
2331
2332         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2333
2334         /* If the socket was aborted during the connect operation */
2335         tcp_free_fastopen_req(tp);
2336         tcp_fastopen_destroy_cipher(sk);
2337         tcp_saved_syn_free(tp);
2338
2339         sk_sockets_allocated_dec(sk);
2340 }
2341 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2342
2343 #ifdef CONFIG_PROC_FS
2344 /* Proc filesystem TCP sock list dumping. */
2345
2346 static unsigned short seq_file_family(const struct seq_file *seq);
2347
2348 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2349 {
2350         unsigned short family = seq_file_family(seq);
2351
2352         /* AF_UNSPEC is used as a match all */
2353         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2354                 net_eq(sock_net(sk), seq_file_net(seq)));
2355 }
2356
2357 /* Find a non-empty bucket (starting from st->bucket)
2358  * and return the first sk from it.
2359  */
2360 static void *listening_get_first(struct seq_file *seq)
2361 {
2362         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2363         struct tcp_iter_state *st = seq->private;
2364
2365         st->offset = 0;
2366         for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2367                 struct inet_listen_hashbucket *ilb2;
2368                 struct hlist_nulls_node *node;
2369                 struct sock *sk;
2370
2371                 ilb2 = &hinfo->lhash2[st->bucket];
2372                 if (hlist_nulls_empty(&ilb2->nulls_head))
2373                         continue;
2374
2375                 spin_lock(&ilb2->lock);
2376                 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2377                         if (seq_sk_match(seq, sk))
2378                                 return sk;
2379                 }
2380                 spin_unlock(&ilb2->lock);
2381         }
2382
2383         return NULL;
2384 }
2385
2386 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2387  * If "cur" is the last one in the st->bucket,
2388  * call listening_get_first() to return the first sk of the next
2389  * non-empty bucket.
2390  */
2391 static void *listening_get_next(struct seq_file *seq, void *cur)
2392 {
2393         struct tcp_iter_state *st = seq->private;
2394         struct inet_listen_hashbucket *ilb2;
2395         struct hlist_nulls_node *node;
2396         struct inet_hashinfo *hinfo;
2397         struct sock *sk = cur;
2398
2399         ++st->num;
2400         ++st->offset;
2401
2402         sk = sk_nulls_next(sk);
2403         sk_nulls_for_each_from(sk, node) {
2404                 if (seq_sk_match(seq, sk))
2405                         return sk;
2406         }
2407
2408         hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2409         ilb2 = &hinfo->lhash2[st->bucket];
2410         spin_unlock(&ilb2->lock);
2411         ++st->bucket;
2412         return listening_get_first(seq);
2413 }
2414
2415 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2416 {
2417         struct tcp_iter_state *st = seq->private;
2418         void *rc;
2419
2420         st->bucket = 0;
2421         st->offset = 0;
2422         rc = listening_get_first(seq);
2423
2424         while (rc && *pos) {
2425                 rc = listening_get_next(seq, rc);
2426                 --*pos;
2427         }
2428         return rc;
2429 }
2430
2431 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2432                                 const struct tcp_iter_state *st)
2433 {
2434         return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2435 }
2436
2437 /*
2438  * Get first established socket starting from bucket given in st->bucket.
2439  * If st->bucket is zero, the very first socket in the hash is returned.
2440  */
2441 static void *established_get_first(struct seq_file *seq)
2442 {
2443         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2444         struct tcp_iter_state *st = seq->private;
2445
2446         st->offset = 0;
2447         for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2448                 struct sock *sk;
2449                 struct hlist_nulls_node *node;
2450                 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2451
2452                 cond_resched();
2453
2454                 /* Lockless fast path for the common case of empty buckets */
2455                 if (empty_bucket(hinfo, st))
2456                         continue;
2457
2458                 spin_lock_bh(lock);
2459                 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2460                         if (seq_sk_match(seq, sk))
2461                                 return sk;
2462                 }
2463                 spin_unlock_bh(lock);
2464         }
2465
2466         return NULL;
2467 }
2468
2469 static void *established_get_next(struct seq_file *seq, void *cur)
2470 {
2471         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2472         struct tcp_iter_state *st = seq->private;
2473         struct hlist_nulls_node *node;
2474         struct sock *sk = cur;
2475
2476         ++st->num;
2477         ++st->offset;
2478
2479         sk = sk_nulls_next(sk);
2480
2481         sk_nulls_for_each_from(sk, node) {
2482                 if (seq_sk_match(seq, sk))
2483                         return sk;
2484         }
2485
2486         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2487         ++st->bucket;
2488         return established_get_first(seq);
2489 }
2490
2491 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2492 {
2493         struct tcp_iter_state *st = seq->private;
2494         void *rc;
2495
2496         st->bucket = 0;
2497         rc = established_get_first(seq);
2498
2499         while (rc && pos) {
2500                 rc = established_get_next(seq, rc);
2501                 --pos;
2502         }
2503         return rc;
2504 }
2505
2506 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2507 {
2508         void *rc;
2509         struct tcp_iter_state *st = seq->private;
2510
2511         st->state = TCP_SEQ_STATE_LISTENING;
2512         rc        = listening_get_idx(seq, &pos);
2513
2514         if (!rc) {
2515                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2516                 rc        = established_get_idx(seq, pos);
2517         }
2518
2519         return rc;
2520 }
2521
2522 static void *tcp_seek_last_pos(struct seq_file *seq)
2523 {
2524         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2525         struct tcp_iter_state *st = seq->private;
2526         int bucket = st->bucket;
2527         int offset = st->offset;
2528         int orig_num = st->num;
2529         void *rc = NULL;
2530
2531         switch (st->state) {
2532         case TCP_SEQ_STATE_LISTENING:
2533                 if (st->bucket > hinfo->lhash2_mask)
2534                         break;
2535                 rc = listening_get_first(seq);
2536                 while (offset-- && rc && bucket == st->bucket)
2537                         rc = listening_get_next(seq, rc);
2538                 if (rc)
2539                         break;
2540                 st->bucket = 0;
2541                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2542                 fallthrough;
2543         case TCP_SEQ_STATE_ESTABLISHED:
2544                 if (st->bucket > hinfo->ehash_mask)
2545                         break;
2546                 rc = established_get_first(seq);
2547                 while (offset-- && rc && bucket == st->bucket)
2548                         rc = established_get_next(seq, rc);
2549         }
2550
2551         st->num = orig_num;
2552
2553         return rc;
2554 }
2555
2556 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2557 {
2558         struct tcp_iter_state *st = seq->private;
2559         void *rc;
2560
2561         if (*pos && *pos == st->last_pos) {
2562                 rc = tcp_seek_last_pos(seq);
2563                 if (rc)
2564                         goto out;
2565         }
2566
2567         st->state = TCP_SEQ_STATE_LISTENING;
2568         st->num = 0;
2569         st->bucket = 0;
2570         st->offset = 0;
2571         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2572
2573 out:
2574         st->last_pos = *pos;
2575         return rc;
2576 }
2577 EXPORT_SYMBOL(tcp_seq_start);
2578
2579 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2580 {
2581         struct tcp_iter_state *st = seq->private;
2582         void *rc = NULL;
2583
2584         if (v == SEQ_START_TOKEN) {
2585                 rc = tcp_get_idx(seq, 0);
2586                 goto out;
2587         }
2588
2589         switch (st->state) {
2590         case TCP_SEQ_STATE_LISTENING:
2591                 rc = listening_get_next(seq, v);
2592                 if (!rc) {
2593                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2594                         st->bucket = 0;
2595                         st->offset = 0;
2596                         rc        = established_get_first(seq);
2597                 }
2598                 break;
2599         case TCP_SEQ_STATE_ESTABLISHED:
2600                 rc = established_get_next(seq, v);
2601                 break;
2602         }
2603 out:
2604         ++*pos;
2605         st->last_pos = *pos;
2606         return rc;
2607 }
2608 EXPORT_SYMBOL(tcp_seq_next);
2609
2610 void tcp_seq_stop(struct seq_file *seq, void *v)
2611 {
2612         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2613         struct tcp_iter_state *st = seq->private;
2614
2615         switch (st->state) {
2616         case TCP_SEQ_STATE_LISTENING:
2617                 if (v != SEQ_START_TOKEN)
2618                         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2619                 break;
2620         case TCP_SEQ_STATE_ESTABLISHED:
2621                 if (v)
2622                         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2623                 break;
2624         }
2625 }
2626 EXPORT_SYMBOL(tcp_seq_stop);
2627
2628 static void get_openreq4(const struct request_sock *req,
2629                          struct seq_file *f, int i)
2630 {
2631         const struct inet_request_sock *ireq = inet_rsk(req);
2632         long delta = req->rsk_timer.expires - jiffies;
2633
2634         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2635                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2636                 i,
2637                 ireq->ir_loc_addr,
2638                 ireq->ir_num,
2639                 ireq->ir_rmt_addr,
2640                 ntohs(ireq->ir_rmt_port),
2641                 TCP_SYN_RECV,
2642                 0, 0, /* could print option size, but that is af dependent. */
2643                 1,    /* timers active (only the expire timer) */
2644                 jiffies_delta_to_clock_t(delta),
2645                 req->num_timeout,
2646                 from_kuid_munged(seq_user_ns(f),
2647                                  sock_i_uid(req->rsk_listener)),
2648                 0,  /* non standard timer */
2649                 0, /* open_requests have no inode */
2650                 0,
2651                 req);
2652 }
2653
2654 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2655 {
2656         int timer_active;
2657         unsigned long timer_expires;
2658         const struct tcp_sock *tp = tcp_sk(sk);
2659         const struct inet_connection_sock *icsk = inet_csk(sk);
2660         const struct inet_sock *inet = inet_sk(sk);
2661         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2662         __be32 dest = inet->inet_daddr;
2663         __be32 src = inet->inet_rcv_saddr;
2664         __u16 destp = ntohs(inet->inet_dport);
2665         __u16 srcp = ntohs(inet->inet_sport);
2666         int rx_queue;
2667         int state;
2668
2669         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2670             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2671             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2672                 timer_active    = 1;
2673                 timer_expires   = icsk->icsk_timeout;
2674         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2675                 timer_active    = 4;
2676                 timer_expires   = icsk->icsk_timeout;
2677         } else if (timer_pending(&sk->sk_timer)) {
2678                 timer_active    = 2;
2679                 timer_expires   = sk->sk_timer.expires;
2680         } else {
2681                 timer_active    = 0;
2682                 timer_expires = jiffies;
2683         }
2684
2685         state = inet_sk_state_load(sk);
2686         if (state == TCP_LISTEN)
2687                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2688         else
2689                 /* Because we don't lock the socket,
2690                  * we might find a transient negative value.
2691                  */
2692                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2693                                       READ_ONCE(tp->copied_seq), 0);
2694
2695         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2696                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2697                 i, src, srcp, dest, destp, state,
2698                 READ_ONCE(tp->write_seq) - tp->snd_una,
2699                 rx_queue,
2700                 timer_active,
2701                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2702                 icsk->icsk_retransmits,
2703                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2704                 icsk->icsk_probes_out,
2705                 sock_i_ino(sk),
2706                 refcount_read(&sk->sk_refcnt), sk,
2707                 jiffies_to_clock_t(icsk->icsk_rto),
2708                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2709                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2710                 tcp_snd_cwnd(tp),
2711                 state == TCP_LISTEN ?
2712                     fastopenq->max_qlen :
2713                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2714 }
2715
2716 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2717                                struct seq_file *f, int i)
2718 {
2719         long delta = tw->tw_timer.expires - jiffies;
2720         __be32 dest, src;
2721         __u16 destp, srcp;
2722
2723         dest  = tw->tw_daddr;
2724         src   = tw->tw_rcv_saddr;
2725         destp = ntohs(tw->tw_dport);
2726         srcp  = ntohs(tw->tw_sport);
2727
2728         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2729                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2730                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2731                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2732                 refcount_read(&tw->tw_refcnt), tw);
2733 }
2734
2735 #define TMPSZ 150
2736
2737 static int tcp4_seq_show(struct seq_file *seq, void *v)
2738 {
2739         struct tcp_iter_state *st;
2740         struct sock *sk = v;
2741
2742         seq_setwidth(seq, TMPSZ - 1);
2743         if (v == SEQ_START_TOKEN) {
2744                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2745                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2746                            "inode");
2747                 goto out;
2748         }
2749         st = seq->private;
2750
2751         if (sk->sk_state == TCP_TIME_WAIT)
2752                 get_timewait4_sock(v, seq, st->num);
2753         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2754                 get_openreq4(v, seq, st->num);
2755         else
2756                 get_tcp4_sock(v, seq, st->num);
2757 out:
2758         seq_pad(seq, '\n');
2759         return 0;
2760 }
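/*
 * Example /proc/net/tcp line produced by the show functions above (the
 * values are illustrative only):
 *
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 12345 1 0000000000000000 100 0 0 10 0
 *
 * Addresses are raw __be32 values printed with %08X, so on a little-endian
 * host 0100007F decodes to 127.0.0.1; ports are already in host order, so
 * 0016 is 22.  State 0A is TCP_LISTEN.  The "tr" field is 1 for
 * retransmit/RTO-style timers, 2 when sk_timer (keepalive) is pending,
 * 3 for TIME_WAIT, 4 for the zero-window probe timer and 0 when no timer
 * is running.
 */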
2761
2762 #ifdef CONFIG_BPF_SYSCALL
2763 struct bpf_tcp_iter_state {
2764         struct tcp_iter_state state;
2765         unsigned int cur_sk;
2766         unsigned int end_sk;
2767         unsigned int max_sk;
2768         struct sock **batch;
2769         bool st_bucket_done;
2770 };
2771
2772 struct bpf_iter__tcp {
2773         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2774         __bpf_md_ptr(struct sock_common *, sk_common);
2775         uid_t uid __aligned(8);
2776 };
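/*
 * A bpf_iter__tcp context like the one above is what a SEC("iter/tcp")
 * program receives.  Minimal BPF-side sketch (a separate object built
 * against vmlinux.h with clang -target bpf; the program name and output
 * format are made up for the example):
 */
#if 0	/* BPF program example only */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

SEC("iter/tcp")
int dump_tcp(struct bpf_iter__tcp *ctx)
{
	struct sock_common *skc = ctx->sk_common;
	struct seq_file *seq = ctx->meta->seq;
	static const char fmt[] = "family %d uid %u\n";
	__u64 args[2];

	if (!skc)	/* NULL between buckets / at the end of iteration */
		return 0;

	args[0] = skc->skc_family;
	args[1] = ctx->uid;
	bpf_seq_printf(seq, fmt, sizeof(fmt), args, sizeof(args));
	return 0;
}

char _license[] SEC("license") = "GPL";
#endif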
2777
2778 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2779                              struct sock_common *sk_common, uid_t uid)
2780 {
2781         struct bpf_iter__tcp ctx;
2782
2783         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2784         ctx.meta = meta;
2785         ctx.sk_common = sk_common;
2786         ctx.uid = uid;
2787         return bpf_iter_run_prog(prog, &ctx);
2788 }
2789
2790 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2791 {
2792         while (iter->cur_sk < iter->end_sk)
2793                 sock_gen_put(iter->batch[iter->cur_sk++]);
2794 }
2795
2796 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2797                                       unsigned int new_batch_sz)
2798 {
2799         struct sock **new_batch;
2800
2801         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2802                              GFP_USER | __GFP_NOWARN);
2803         if (!new_batch)
2804                 return -ENOMEM;
2805
2806         bpf_iter_tcp_put_batch(iter);
2807         kvfree(iter->batch);
2808         iter->batch = new_batch;
2809         iter->max_sk = new_batch_sz;
2810
2811         return 0;
2812 }
2813
2814 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2815                                                  struct sock *start_sk)
2816 {
2817         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2818         struct bpf_tcp_iter_state *iter = seq->private;
2819         struct tcp_iter_state *st = &iter->state;
2820         struct hlist_nulls_node *node;
2821         unsigned int expected = 1;
2822         struct sock *sk;
2823
2824         sock_hold(start_sk);
2825         iter->batch[iter->end_sk++] = start_sk;
2826
2827         sk = sk_nulls_next(start_sk);
2828         sk_nulls_for_each_from(sk, node) {
2829                 if (seq_sk_match(seq, sk)) {
2830                         if (iter->end_sk < iter->max_sk) {
2831                                 sock_hold(sk);
2832                                 iter->batch[iter->end_sk++] = sk;
2833                         }
2834                         expected++;
2835                 }
2836         }
2837         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2838
2839         return expected;
2840 }
2841
2842 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2843                                                    struct sock *start_sk)
2844 {
2845         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2846         struct bpf_tcp_iter_state *iter = seq->private;
2847         struct tcp_iter_state *st = &iter->state;
2848         struct hlist_nulls_node *node;
2849         unsigned int expected = 1;
2850         struct sock *sk;
2851
2852         sock_hold(start_sk);
2853         iter->batch[iter->end_sk++] = start_sk;
2854
2855         sk = sk_nulls_next(start_sk);
2856         sk_nulls_for_each_from(sk, node) {
2857                 if (seq_sk_match(seq, sk)) {
2858                         if (iter->end_sk < iter->max_sk) {
2859                                 sock_hold(sk);
2860                                 iter->batch[iter->end_sk++] = sk;
2861                         }
2862                         expected++;
2863                 }
2864         }
2865         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2866
2867         return expected;
2868 }
2869
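/* Grab the next batch of sockets: seek to the current position with the
 * bucket lock held (tcp_seek_last_pos()), copy the bucket into iter->batch
 * while taking a reference on each socket, then drop the lock.  If the bucket
 * did not fit, resize the batch once (to 3/2 of the bucket size) and retry,
 * so the BPF program normally sees a whole bucket from a single snapshot.
 */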
2870 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2871 {
2872         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2873         struct bpf_tcp_iter_state *iter = seq->private;
2874         struct tcp_iter_state *st = &iter->state;
2875         unsigned int expected;
2876         bool resized = false;
2877         struct sock *sk;
2878
2879         /* The st->bucket is done.  Advance directly to the next
2880          * bucket instead of letting tcp_seek_last_pos() skip sockets
2881          * one by one in the current bucket, only to discover that it
2882          * has to advance to the next bucket anyway.
2883          */
2884         if (iter->st_bucket_done) {
2885                 st->offset = 0;
2886                 st->bucket++;
2887                 if (st->state == TCP_SEQ_STATE_LISTENING &&
2888                     st->bucket > hinfo->lhash2_mask) {
2889                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2890                         st->bucket = 0;
2891                 }
2892         }
2893
2894 again:
2895         /* Get a new batch */
2896         iter->cur_sk = 0;
2897         iter->end_sk = 0;
2898         iter->st_bucket_done = false;
2899
2900         sk = tcp_seek_last_pos(seq);
2901         if (!sk)
2902                 return NULL; /* Done */
2903
2904         if (st->state == TCP_SEQ_STATE_LISTENING)
2905                 expected = bpf_iter_tcp_listening_batch(seq, sk);
2906         else
2907                 expected = bpf_iter_tcp_established_batch(seq, sk);
2908
2909         if (iter->end_sk == expected) {
2910                 iter->st_bucket_done = true;
2911                 return sk;
2912         }
2913
2914         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2915                 resized = true;
2916                 goto again;
2917         }
2918
2919         return sk;
2920 }
2921
2922 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2923 {
2924         /* bpf iter does not support lseek, so it always
2925          * continues from where it was stop()-ped.
2926          */
2927         if (*pos)
2928                 return bpf_iter_tcp_batch(seq);
2929
2930         return SEQ_START_TOKEN;
2931 }
2932
2933 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2934 {
2935         struct bpf_tcp_iter_state *iter = seq->private;
2936         struct tcp_iter_state *st = &iter->state;
2937         struct sock *sk;
2938
2939         /* Whenever seq_next() is called, the sk at iter->cur_sk has
2940          * already been shown by seq_show(), so advance to the next
2941          * sk in the batch.
2942          */
2943         if (iter->cur_sk < iter->end_sk) {
2944                 /* Keep st->num consistent in tcp_iter_state even though
2945                  * bpf_iter_tcp does not use st->num;
2946                  * meta.seq_num is used instead.
2947                  */
2948                 st->num++;
2949                 /* Move st->offset to the next sk in the bucket such that
2950                  * the future start() will resume at st->offset in
2951                  * st->bucket.  See tcp_seek_last_pos().
2952                  */
2953                 st->offset++;
2954                 sock_gen_put(iter->batch[iter->cur_sk++]);
2955         }
2956
2957         if (iter->cur_sk < iter->end_sk)
2958                 sk = iter->batch[iter->cur_sk];
2959         else
2960                 sk = bpf_iter_tcp_batch(seq);
2961
2962         ++*pos;
2963         /* Keep st->last_pos consistent in tcp_iter_state.
2964          * bpf iter does not do lseek, so st->last_pos always equals *pos.
2965          */
2966         st->last_pos = *pos;
2967         return sk;
2968 }
2969
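/* Run the attached BPF program for one socket.  Full sockets are shown with
 * the socket lock held, which is what lets the program safely use the
 * setsockopt/getsockopt helpers exposed by bpf_iter_tcp_get_func_proto();
 * request and timewait sockets are shown without it.
 */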
2970 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2971 {
2972         struct bpf_iter_meta meta;
2973         struct bpf_prog *prog;
2974         struct sock *sk = v;
2975         uid_t uid;
2976         int ret;
2977
2978         if (v == SEQ_START_TOKEN)
2979                 return 0;
2980
2981         if (sk_fullsock(sk))
2982                 lock_sock(sk);
2983
2984         if (unlikely(sk_unhashed(sk))) {
2985                 ret = SEQ_SKIP;
2986                 goto unlock;
2987         }
2988
2989         if (sk->sk_state == TCP_TIME_WAIT) {
2990                 uid = 0;
2991         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2992                 const struct request_sock *req = v;
2993
2994                 uid = from_kuid_munged(seq_user_ns(seq),
2995                                        sock_i_uid(req->rsk_listener));
2996         } else {
2997                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2998         }
2999
3000         meta.seq = seq;
3001         prog = bpf_iter_get_info(&meta, false);
3002         ret = tcp_prog_seq_show(prog, &meta, v, uid);
3003
3004 unlock:
3005         if (sk_fullsock(sk))
3006                 release_sock(sk);
3007         return ret;
3008
3009 }
3010
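/* stop() runs at the end of every read() chunk and on error.  When the
 * iteration has really ended (@v == NULL), give the program one final
 * invocation with a NULL socket so it can emit trailing output, then drop the
 * references on any batched but unshown sockets so that a later start() will
 * re-batch the bucket.
 */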
3011 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3012 {
3013         struct bpf_tcp_iter_state *iter = seq->private;
3014         struct bpf_iter_meta meta;
3015         struct bpf_prog *prog;
3016
3017         if (!v) {
3018                 meta.seq = seq;
3019                 prog = bpf_iter_get_info(&meta, true);
3020                 if (prog)
3021                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
3022         }
3023
3024         if (iter->cur_sk < iter->end_sk) {
3025                 bpf_iter_tcp_put_batch(iter);
3026                 iter->st_bucket_done = false;
3027         }
3028 }
3029
3030 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3031         .show           = bpf_iter_tcp_seq_show,
3032         .start          = bpf_iter_tcp_seq_start,
3033         .next           = bpf_iter_tcp_seq_next,
3034         .stop           = bpf_iter_tcp_seq_stop,
3035 };
3036 #endif
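
/* Which address family this seq_file is restricted to: AF_UNSPEC for the BPF
 * iterator (the program does its own filtering), otherwise the family stored
 * in the /proc entry (AF_INET for /proc/net/tcp).
 */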
3037 static unsigned short seq_file_family(const struct seq_file *seq)
3038 {
3039         const struct tcp_seq_afinfo *afinfo;
3040
3041 #ifdef CONFIG_BPF_SYSCALL
3042         /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3043         if (seq->op == &bpf_iter_tcp_seq_ops)
3044                 return AF_UNSPEC;
3045 #endif
3046
3047         /* Iterated from proc fs */
3048         afinfo = pde_data(file_inode(seq->file));
3049         return afinfo->family;
3050 }
3051
3052 static const struct seq_operations tcp4_seq_ops = {
3053         .show           = tcp4_seq_show,
3054         .start          = tcp_seq_start,
3055         .next           = tcp_seq_next,
3056         .stop           = tcp_seq_stop,
3057 };
3058
3059 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3060         .family         = AF_INET,
3061 };
3062
3063 static int __net_init tcp4_proc_init_net(struct net *net)
3064 {
3065         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3066                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3067                 return -ENOMEM;
3068         return 0;
3069 }
3070
3071 static void __net_exit tcp4_proc_exit_net(struct net *net)
3072 {
3073         remove_proc_entry("tcp", net->proc_net);
3074 }
3075
3076 static struct pernet_operations tcp4_net_ops = {
3077         .init = tcp4_proc_init_net,
3078         .exit = tcp4_proc_exit_net,
3079 };
3080
3081 int __init tcp4_proc_init(void)
3082 {
3083         return register_pernet_subsys(&tcp4_net_ops);
3084 }
3085
3086 void tcp4_proc_exit(void)
3087 {
3088         unregister_pernet_subsys(&tcp4_net_ops);
3089 }
3090 #endif /* CONFIG_PROC_FS */
3091
3092 /* @wake is one when sk_stream_write_space() calls us.
3093  * This sends EPOLLOUT only once notsent_bytes drops below half the limit.
3094  * This mimics the strategy used in sock_def_write_space().
3095  */
3096 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3097 {
3098         const struct tcp_sock *tp = tcp_sk(sk);
3099         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3100                             READ_ONCE(tp->snd_nxt);
3101
3102         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3103 }
3104 EXPORT_SYMBOL(tcp_stream_memory_free);
3105
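/* The IPv4 TCP protocol descriptor hooked into the socket layer; IPv6 has its
 * own copy (tcpv6_prot) in net/ipv6/tcp_ipv6.c.
 */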
3106 struct proto tcp_prot = {
3107         .name                   = "TCP",
3108         .owner                  = THIS_MODULE,
3109         .close                  = tcp_close,
3110         .pre_connect            = tcp_v4_pre_connect,
3111         .connect                = tcp_v4_connect,
3112         .disconnect             = tcp_disconnect,
3113         .accept                 = inet_csk_accept,
3114         .ioctl                  = tcp_ioctl,
3115         .init                   = tcp_v4_init_sock,
3116         .destroy                = tcp_v4_destroy_sock,
3117         .shutdown               = tcp_shutdown,
3118         .setsockopt             = tcp_setsockopt,
3119         .getsockopt             = tcp_getsockopt,
3120         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3121         .keepalive              = tcp_set_keepalive,
3122         .recvmsg                = tcp_recvmsg,
3123         .sendmsg                = tcp_sendmsg,
3124         .splice_eof             = tcp_splice_eof,
3125         .backlog_rcv            = tcp_v4_do_rcv,
3126         .release_cb             = tcp_release_cb,
3127         .hash                   = inet_hash,
3128         .unhash                 = inet_unhash,
3129         .get_port               = inet_csk_get_port,
3130         .put_port               = inet_put_port,
3131 #ifdef CONFIG_BPF_SYSCALL
3132         .psock_update_sk_prot   = tcp_bpf_update_proto,
3133 #endif
3134         .enter_memory_pressure  = tcp_enter_memory_pressure,
3135         .leave_memory_pressure  = tcp_leave_memory_pressure,
3136         .stream_memory_free     = tcp_stream_memory_free,
3137         .sockets_allocated      = &tcp_sockets_allocated,
3138         .orphan_count           = &tcp_orphan_count,
3139
3140         .memory_allocated       = &tcp_memory_allocated,
3141         .per_cpu_fw_alloc       = &tcp_memory_per_cpu_fw_alloc,
3142
3143         .memory_pressure        = &tcp_memory_pressure,
3144         .sysctl_mem             = sysctl_tcp_mem,
3145         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3146         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3147         .max_header             = MAX_TCP_HEADER,
3148         .obj_size               = sizeof(struct tcp_sock),
3149         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3150         .twsk_prot              = &tcp_timewait_sock_ops,
3151         .rsk_prot               = &tcp_request_sock_ops,
3152         .h.hashinfo             = NULL,
3153         .no_autobind            = true,
3154         .diag_destroy           = tcp_abort,
3155 };
3156 EXPORT_SYMBOL(tcp_prot);
3157
3158 static void __net_exit tcp_sk_exit(struct net *net)
3159 {
3160         if (net->ipv4.tcp_congestion_control)
3161                 bpf_module_put(net->ipv4.tcp_congestion_control,
3162                                net->ipv4.tcp_congestion_control->owner);
3163 }
3164
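/* Choose the established-hash table for a new netns.  A child netns gets its
 * own ehash when the creating process's netns has set
 * net.ipv4.tcp_child_ehash_entries; otherwise it shares the global
 * tcp_hashinfo.  max_tw_buckets and max_syn_backlog are scaled from the
 * chosen table size.
 */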
3165 static void __net_init tcp_set_hashinfo(struct net *net)
3166 {
3167         struct inet_hashinfo *hinfo;
3168         unsigned int ehash_entries;
3169         struct net *old_net;
3170
3171         if (net_eq(net, &init_net))
3172                 goto fallback;
3173
3174         old_net = current->nsproxy->net_ns;
3175         ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3176         if (!ehash_entries)
3177                 goto fallback;
3178
3179         ehash_entries = roundup_pow_of_two(ehash_entries);
3180         hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3181         if (!hinfo) {
3182                 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3183                         "for a netns, falling back to the global one\n",
3184                         ehash_entries);
3185 fallback:
3186                 hinfo = &tcp_hashinfo;
3187                 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3188         }
3189
3190         net->ipv4.tcp_death_row.hashinfo = hinfo;
3191         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3192         net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3193 }
3194
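/* Per-netns TCP defaults; most of these are the initial values of the
 * corresponding net.ipv4.tcp_* sysctls.
 */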
3195 static int __net_init tcp_sk_init(struct net *net)
3196 {
3197         net->ipv4.sysctl_tcp_ecn = 2;
3198         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3199
3200         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3201         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3202         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3203         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3204         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3205
3206         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3207         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3208         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3209
3210         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3211         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3212         net->ipv4.sysctl_tcp_syncookies = 1;
3213         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3214         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3215         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3216         net->ipv4.sysctl_tcp_orphan_retries = 0;
3217         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3218         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3219         net->ipv4.sysctl_tcp_tw_reuse = 2;
3220         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3221
3222         refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3223         tcp_set_hashinfo(net);
3224
3225         net->ipv4.sysctl_tcp_sack = 1;
3226         net->ipv4.sysctl_tcp_window_scaling = 1;
3227         net->ipv4.sysctl_tcp_timestamps = 1;
3228         net->ipv4.sysctl_tcp_early_retrans = 3;
3229         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3230         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3231         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3232         net->ipv4.sysctl_tcp_max_reordering = 300;
3233         net->ipv4.sysctl_tcp_dsack = 1;
3234         net->ipv4.sysctl_tcp_app_win = 31;
3235         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3236         net->ipv4.sysctl_tcp_frto = 2;
3237         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3238         /* This limits the percentage of the congestion window which we
3239          * will allow a single TSO frame to consume.  Building TSO frames
3240          * which are too large can cause TCP streams to be bursty.
3241          */
3242         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3243         /* Default TSQ limit of 16 TSO segments */
3244         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3245
3246         /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3247         net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3248
3249         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3250         net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3251         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3252         net->ipv4.sysctl_tcp_autocorking = 1;
3253         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3254         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3255         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3256         if (net != &init_net) {
3257                 memcpy(net->ipv4.sysctl_tcp_rmem,
3258                        init_net.ipv4.sysctl_tcp_rmem,
3259                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3260                 memcpy(net->ipv4.sysctl_tcp_wmem,
3261                        init_net.ipv4.sysctl_tcp_wmem,
3262                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3263         }
3264         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3265         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3266         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3267         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3268         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3269         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3270
3271         /* Set default values for PLB */
3272         net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3273         net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3274         net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3275         net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3276         /* Default congestion threshold for PLB to mark a round is 50% */
3277         net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3278
3279         /* Reno is always built in */
3280         if (!net_eq(net, &init_net) &&
3281             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3282                                init_net.ipv4.tcp_congestion_control->owner))
3283                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3284         else
3285                 net->ipv4.tcp_congestion_control = &tcp_reno;
3286
3287         net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3288         net->ipv4.sysctl_tcp_shrink_window = 0;
3289
3290         return 0;
3291 }
3292
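/* Batched netns teardown: purge the remaining TIME_WAIT sockets first, then
 * free any per-netns ehash and the TCP fastopen context.  tw_refcount must be
 * back to one by now; the WARN catches leaked timewait sockets.
 */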
3293 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3294 {
3295         struct net *net;
3296
3297         tcp_twsk_purge(net_exit_list, AF_INET);
3298
3299         list_for_each_entry(net, net_exit_list, exit_list) {
3300                 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3301                 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3302                 tcp_fastopen_ctx_destroy(net);
3303         }
3304 }
3305
3306 static struct pernet_operations __net_initdata tcp_sk_ops = {
3307        .init       = tcp_sk_init,
3308        .exit       = tcp_sk_exit,
3309        .exit_batch = tcp_sk_exit_batch,
3310 };
3311
3312 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3313 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3314                      struct sock_common *sk_common, uid_t uid)
3315
3316 #define INIT_BATCH_SZ 16
3317
3318 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3319 {
3320         struct bpf_tcp_iter_state *iter = priv_data;
3321         int err;
3322
3323         err = bpf_iter_init_seq_net(priv_data, aux);
3324         if (err)
3325                 return err;
3326
3327         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3328         if (err) {
3329                 bpf_iter_fini_seq_net(priv_data);
3330                 return err;
3331         }
3332
3333         return 0;
3334 }
3335
3336 static void bpf_iter_fini_tcp(void *priv_data)
3337 {
3338         struct bpf_tcp_iter_state *iter = priv_data;
3339
3340         bpf_iter_fini_seq_net(priv_data);
3341         kvfree(iter->batch);
3342 }
3343
3344 static const struct bpf_iter_seq_info tcp_seq_info = {
3345         .seq_ops                = &bpf_iter_tcp_seq_ops,
3346         .init_seq_private       = bpf_iter_init_tcp,
3347         .fini_seq_private       = bpf_iter_fini_tcp,
3348         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3349 };
3350
3351 static const struct bpf_func_proto *
3352 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3353                             const struct bpf_prog *prog)
3354 {
3355         switch (func_id) {
3356         case BPF_FUNC_setsockopt:
3357                 return &bpf_sk_setsockopt_proto;
3358         case BPF_FUNC_getsockopt:
3359                 return &bpf_sk_getsockopt_proto;
3360         default:
3361                 return NULL;
3362         }
3363 }
3364
3365 static struct bpf_iter_reg tcp_reg_info = {
3366         .target                 = "tcp",
3367         .ctx_arg_info_size      = 1,
3368         .ctx_arg_info           = {
3369                 { offsetof(struct bpf_iter__tcp, sk_common),
3370                   PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3371         },
3372         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3373         .seq_info               = &tcp_seq_info,
3374 };
3375
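/* Register the "tcp" bpf_iter target.  From user space the iterator is
 * consumed by attaching a program to the target and read()ing the resulting
 * fd; a rough libbpf sketch (names such as skel and dump_tcp are
 * illustrative, not defined here):
 *
 *	struct bpf_link *link;
 *	char buf[4096];
 *	ssize_t len;
 *	int iter_fd;
 *
 *	link = bpf_program__attach_iter(skel->progs.dump_tcp, NULL);
 *	iter_fd = bpf_iter_create(bpf_link__fd(link));
 *	while ((len = read(iter_fd, buf, sizeof(buf))) > 0)
 *		write(STDOUT_FILENO, buf, len);
 */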
3376 static void __init bpf_iter_register(void)
3377 {
3378         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3379         if (bpf_iter_reg_target(&tcp_reg_info))
3380                 pr_warn("Warning: could not register bpf iterator tcp\n");
3381 }
3382
3383 #endif
3384
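/* Boot-time IPv4 TCP setup: create one raw control socket per possible CPU
 * (used by tcp_v4_send_reset()/tcp_v4_send_ack() to transmit RSTs and ACKs
 * on behalf of sockets we do not own), then register the per-netns hooks
 * and, when enabled, the BPF iterator.
 */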
3385 void __init tcp_v4_init(void)
3386 {
3387         int cpu, res;
3388
3389         for_each_possible_cpu(cpu) {
3390                 struct sock *sk;
3391
3392                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3393                                            IPPROTO_TCP, &init_net);
3394                 if (res)
3395                         panic("Failed to create the TCP control socket.\n");
3396                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3397
3398                 /* Please enforce IP_DF and IPID==0 for RST and
3399                  * ACK sent in SYN-RECV and TIME-WAIT state.
3400                  */
3401                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3402
3403                 per_cpu(ipv4_tcp_sk, cpu) = sk;
3404         }
3405         if (register_pernet_subsys(&tcp_sk_ops))
3406                 panic("Failed to register the TCP pernet operations.\n");
3407
3408 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3409         bpf_iter_register();
3410 #endif
3411 }