net/ipv4/tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60 #include <linux/sched.h>
61
62 #include <net/net_namespace.h>
63 #include <net/icmp.h>
64 #include <net/inet_hashtables.h>
65 #include <net/tcp.h>
66 #include <net/transp_v6.h>
67 #include <net/ipv6.h>
68 #include <net/inet_common.h>
69 #include <net/timewait_sock.h>
70 #include <net/xfrm.h>
71 #include <net/secure_seq.h>
72 #include <net/busy_poll.h>
73
74 #include <linux/inet.h>
75 #include <linux/ipv6.h>
76 #include <linux/stddef.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/inetdevice.h>
80 #include <linux/btf_ids.h>
81
82 #include <crypto/hash.h>
83 #include <linux/scatterlist.h>
84
85 #include <trace/events/tcp.h>
86
87 #ifdef CONFIG_TCP_MD5SIG
88 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
89                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
90 #endif
91
92 struct inet_hashinfo tcp_hashinfo;
93 EXPORT_SYMBOL(tcp_hashinfo);
94
95 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
96
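/* Derive the initial sequence number and the per-connection timestamp
 * offset for an incoming segment from the addresses and ports in its
 * headers, so that they are not predictable by off-path attackers.
 */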
97 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
98 {
99         return secure_tcp_seq(ip_hdr(skb)->daddr,
100                               ip_hdr(skb)->saddr,
101                               tcp_hdr(skb)->dest,
102                               tcp_hdr(skb)->source);
103 }
104
105 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
106 {
107         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
108 }
109
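/* Decide whether a TIME-WAIT socket may be reused for a new outgoing
 * connection (see the tcp_tw_reuse sysctl).  Returns 1 and takes a
 * reference on the TIME-WAIT socket if reuse is safe, 0 otherwise.
 */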
110 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
111 {
112         int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
113         const struct inet_timewait_sock *tw = inet_twsk(sktw);
114         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
115         struct tcp_sock *tp = tcp_sk(sk);
116
117         if (reuse == 2) {
118                 /* Still does not detect *everything* that goes through
119                  * lo, since we require a loopback src or dst address
120                  * or direct binding to 'lo' interface.
121                  */
122                 bool loopback = false;
123                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
124                         loopback = true;
125 #if IS_ENABLED(CONFIG_IPV6)
126                 if (tw->tw_family == AF_INET6) {
127                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
128                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
129                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
130                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
131                                 loopback = true;
132                 } else
133 #endif
134                 {
135                         if (ipv4_is_loopback(tw->tw_daddr) ||
136                             ipv4_is_loopback(tw->tw_rcv_saddr))
137                                 loopback = true;
138                 }
139                 if (!loopback)
140                         reuse = 0;
141         }
142
143         /* With PAWS, it is safe from the viewpoint
144            of data integrity. Even without PAWS it is safe provided sequence
145            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
146
147            The idea is close to VJ's: the timestamp cache is held not per
148            host but per port pair, and the TW bucket is used as the state
149            holder.
150
151            If the TW bucket has already been destroyed we fall back to VJ's
152            scheme and use the initial timestamp retrieved from the peer table.
153          */
154         if (tcptw->tw_ts_recent_stamp &&
155             (!twp || (reuse && time_after32(ktime_get_seconds(),
156                                             tcptw->tw_ts_recent_stamp)))) {
157                 /* In case of repair and re-using TIME-WAIT sockets we still
158                  * want to be sure that it is safe as above but honor the
159                  * sequence numbers and time stamps set as part of the repair
160                  * process.
161                  *
162                  * Without this check re-using a TIME-WAIT socket with TCP
163                  * repair would accumulate a -1 on the repair assigned
164                  * sequence number. The first time it is reused the sequence
165                  * is -1, the second time -2, etc. This fixes that issue
166                  * without appearing to create any others.
167                  */
168                 if (likely(!tp->repair)) {
169                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
170
171                         if (!seq)
172                                 seq = 1;
173                         WRITE_ONCE(tp->write_seq, seq);
174                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
175                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
176                 }
177                 sock_hold(sktw);
178                 return 1;
179         }
180
181         return 0;
182 }
183 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
184
185 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
186                               int addr_len)
187 {
188         /* This check is replicated from tcp_v4_connect() and intended to
189          * prevent the BPF program called below from accessing bytes that
190          * are outside the bound specified by the user in addr_len.
191          */
192         if (addr_len < sizeof(struct sockaddr_in))
193                 return -EINVAL;
194
195         sock_owned_by_me(sk);
196
197         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
198 }
199
200 /* This will initiate an outgoing connection. */
201 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
202 {
203         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
204         struct inet_timewait_death_row *tcp_death_row;
205         struct inet_sock *inet = inet_sk(sk);
206         struct tcp_sock *tp = tcp_sk(sk);
207         struct ip_options_rcu *inet_opt;
208         struct net *net = sock_net(sk);
209         __be16 orig_sport, orig_dport;
210         __be32 daddr, nexthop;
211         struct flowi4 *fl4;
212         struct rtable *rt;
213         int err;
214
215         if (addr_len < sizeof(struct sockaddr_in))
216                 return -EINVAL;
217
218         if (usin->sin_family != AF_INET)
219                 return -EAFNOSUPPORT;
220
221         nexthop = daddr = usin->sin_addr.s_addr;
222         inet_opt = rcu_dereference_protected(inet->inet_opt,
223                                              lockdep_sock_is_held(sk));
224         if (inet_opt && inet_opt->opt.srr) {
225                 if (!daddr)
226                         return -EINVAL;
227                 nexthop = inet_opt->opt.faddr;
228         }
229
230         orig_sport = inet->inet_sport;
231         orig_dport = usin->sin_port;
232         fl4 = &inet->cork.fl.u.ip4;
233         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
234                               sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
235                               orig_dport, sk);
236         if (IS_ERR(rt)) {
237                 err = PTR_ERR(rt);
238                 if (err == -ENETUNREACH)
239                         IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
240                 return err;
241         }
242
243         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
244                 ip_rt_put(rt);
245                 return -ENETUNREACH;
246         }
247
248         if (!inet_opt || !inet_opt->opt.srr)
249                 daddr = fl4->daddr;
250
251         tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
252
253         if (!inet->inet_saddr) {
254                 err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
255                 if (err) {
256                         ip_rt_put(rt);
257                         return err;
258                 }
259         } else {
260                 sk_rcv_saddr_set(sk, inet->inet_saddr);
261         }
262
263         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
264                 /* Reset inherited state */
265                 tp->rx_opt.ts_recent       = 0;
266                 tp->rx_opt.ts_recent_stamp = 0;
267                 if (likely(!tp->repair))
268                         WRITE_ONCE(tp->write_seq, 0);
269         }
270
271         inet->inet_dport = usin->sin_port;
272         sk_daddr_set(sk, daddr);
273
274         inet_csk(sk)->icsk_ext_hdr_len = 0;
275         if (inet_opt)
276                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
277
278         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
279
280         /* Socket identity is still unknown (sport may be zero).
281          * However we set the state to SYN-SENT and, without releasing the
282          * socket lock, select a source port, enter ourselves into the hash
283          * tables and complete initialization after this.
284          */
285         tcp_set_state(sk, TCP_SYN_SENT);
286         err = inet_hash_connect(tcp_death_row, sk);
287         if (err)
288                 goto failure;
289
290         sk_set_txhash(sk);
291
292         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
293                                inet->inet_sport, inet->inet_dport, sk);
294         if (IS_ERR(rt)) {
295                 err = PTR_ERR(rt);
296                 rt = NULL;
297                 goto failure;
298         }
299         /* OK, now commit destination to socket.  */
300         sk->sk_gso_type = SKB_GSO_TCPV4;
301         sk_setup_caps(sk, &rt->dst);
302         rt = NULL;
303
304         if (likely(!tp->repair)) {
305                 if (!tp->write_seq)
306                         WRITE_ONCE(tp->write_seq,
307                                    secure_tcp_seq(inet->inet_saddr,
308                                                   inet->inet_daddr,
309                                                   inet->inet_sport,
310                                                   usin->sin_port));
311                 WRITE_ONCE(tp->tsoffset,
312                            secure_tcp_ts_off(net, inet->inet_saddr,
313                                              inet->inet_daddr));
314         }
315
316         atomic_set(&inet->inet_id, get_random_u16());
317
318         if (tcp_fastopen_defer_connect(sk, &err))
319                 return err;
320         if (err)
321                 goto failure;
322
323         err = tcp_connect(sk);
324
325         if (err)
326                 goto failure;
327
328         return 0;
329
330 failure:
331         /*
332          * This unhashes the socket and releases the local port,
333          * if necessary.
334          */
335         tcp_set_state(sk, TCP_CLOSE);
336         inet_bhash2_reset_saddr(sk);
337         ip_rt_put(rt);
338         sk->sk_route_caps = 0;
339         inet->inet_dport = 0;
340         return err;
341 }
342 EXPORT_SYMBOL(tcp_v4_connect);
343
344 /*
345  * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
346  * It can be called through tcp_release_cb() if the socket was owned by the
347  * user at the time tcp_v4_err() was called to handle the ICMP message.
348  */
349 void tcp_v4_mtu_reduced(struct sock *sk)
350 {
351         struct inet_sock *inet = inet_sk(sk);
352         struct dst_entry *dst;
353         u32 mtu;
354
355         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
356                 return;
357         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
358         dst = inet_csk_update_pmtu(sk, mtu);
359         if (!dst)
360                 return;
361
362         /* Something is about to go wrong... Remember the soft error
363          * in case this connection is not able to recover.
364          */
365         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
366                 WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
367
368         mtu = dst_mtu(dst);
369
370         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
371             ip_sk_accept_pmtu(sk) &&
372             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
373                 tcp_sync_mss(sk, mtu);
374
375                 /* Resend the TCP packet because it's
376                  * clear that the old packet has been
377                  * dropped. This is the new "fast" path mtu
378                  * discovery.
379                  */
380                 tcp_simple_retransmit(sk);
381         } /* else let the usual retransmit timer handle it */
382 }
383 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
384
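/* Propagate an ICMP redirect to the cached route of this socket, if the
 * cached dst entry is still valid.
 */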
385 static void do_redirect(struct sk_buff *skb, struct sock *sk)
386 {
387         struct dst_entry *dst = __sk_dst_check(sk, 0);
388
389         if (dst)
390                 dst->ops->redirect(dst, sk, skb);
391 }
392
393
394 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
395 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
396 {
397         struct request_sock *req = inet_reqsk(sk);
398         struct net *net = sock_net(sk);
399
400         /* ICMPs are not backlogged, hence we cannot get
401          * an established socket here.
402          */
403         if (seq != tcp_rsk(req)->snt_isn) {
404                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
405         } else if (abort) {
406                 /*
407                  * Still in SYN_RECV, just remove it silently.
408                  * There is no good way to pass the error to the newly
409                  * created socket, and POSIX does not want network
410                  * errors returned from accept().
411                  */
412                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
413                 tcp_listendrop(req->rsk_listener);
414         }
415         reqsk_put(req);
416 }
417 EXPORT_SYMBOL(tcp_req_err);
418
419 /* TCP-LD (RFC 6069) logic */
420 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
421 {
422         struct inet_connection_sock *icsk = inet_csk(sk);
423         struct tcp_sock *tp = tcp_sk(sk);
424         struct sk_buff *skb;
425         s32 remaining;
426         u32 delta_us;
427
428         if (sock_owned_by_user(sk))
429                 return;
430
431         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
432             !icsk->icsk_backoff)
433                 return;
434
435         skb = tcp_rtx_queue_head(sk);
436         if (WARN_ON_ONCE(!skb))
437                 return;
438
439         icsk->icsk_backoff--;
440         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
441         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
442
443         tcp_mstamp_refresh(tp);
444         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
445         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
446
447         if (remaining > 0) {
448                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
449                                           remaining, TCP_RTO_MAX);
450         } else {
451                 /* RTO revert clocked out retransmission.
452                  * Will retransmit now.
453                  */
454                 tcp_retransmit_timer(sk);
455         }
456 }
457 EXPORT_SYMBOL(tcp_ld_RTO_revert);
458
459 /*
460  * This routine is called by the ICMP module when it gets some
461  * sort of error condition.  If err < 0 then the socket should
462  * be closed and the error returned to the user.  If err > 0
463  * it's just the icmp type << 8 | icmp code.  After adjustment the
464  * header points to the first 8 bytes of the TCP header.  We need
465  * to find the appropriate port.
466  *
467  * The locking strategy used here is very "optimistic". When
468  * someone else accesses the socket the ICMP is just dropped
469  * and for some paths there is no check at all.
470  * A more general error queue to queue errors for later handling
471  * is probably better.
472  *
473  */
474
475 int tcp_v4_err(struct sk_buff *skb, u32 info)
476 {
477         const struct iphdr *iph = (const struct iphdr *)skb->data;
478         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
479         struct tcp_sock *tp;
480         const int type = icmp_hdr(skb)->type;
481         const int code = icmp_hdr(skb)->code;
482         struct sock *sk;
483         struct request_sock *fastopen;
484         u32 seq, snd_una;
485         int err;
486         struct net *net = dev_net(skb->dev);
487
488         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
489                                        iph->daddr, th->dest, iph->saddr,
490                                        ntohs(th->source), inet_iif(skb), 0);
491         if (!sk) {
492                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
493                 return -ENOENT;
494         }
495         if (sk->sk_state == TCP_TIME_WAIT) {
496                 inet_twsk_put(inet_twsk(sk));
497                 return 0;
498         }
499         seq = ntohl(th->seq);
500         if (sk->sk_state == TCP_NEW_SYN_RECV) {
501                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
502                                      type == ICMP_TIME_EXCEEDED ||
503                                      (type == ICMP_DEST_UNREACH &&
504                                       (code == ICMP_NET_UNREACH ||
505                                        code == ICMP_HOST_UNREACH)));
506                 return 0;
507         }
508
509         bh_lock_sock(sk);
510         /* If too many ICMPs get dropped on busy
511          * servers this needs to be solved differently.
512          * We do take care of the PMTU discovery (RFC1191) special case:
513          * we can receive locally generated ICMP messages while the socket is held.
514          */
515         if (sock_owned_by_user(sk)) {
516                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
517                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
518         }
519         if (sk->sk_state == TCP_CLOSE)
520                 goto out;
521
522         if (static_branch_unlikely(&ip4_min_ttl)) {
523                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
524                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
525                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
526                         goto out;
527                 }
528         }
529
530         tp = tcp_sk(sk);
531         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child()) */
532         fastopen = rcu_dereference(tp->fastopen_rsk);
533         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
534         if (sk->sk_state != TCP_LISTEN &&
535             !between(seq, snd_una, tp->snd_nxt)) {
536                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
537                 goto out;
538         }
539
540         switch (type) {
541         case ICMP_REDIRECT:
542                 if (!sock_owned_by_user(sk))
543                         do_redirect(skb, sk);
544                 goto out;
545         case ICMP_SOURCE_QUENCH:
546                 /* Just silently ignore these. */
547                 goto out;
548         case ICMP_PARAMETERPROB:
549                 err = EPROTO;
550                 break;
551         case ICMP_DEST_UNREACH:
552                 if (code > NR_ICMP_UNREACH)
553                         goto out;
554
555                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
556                         /* We are not interested in TCP_LISTEN and open_requests
557                          * (SYN-ACKs sent out by Linux are always < 576 bytes,
558                          * so they should go through unfragmented).
559                          */
560                         if (sk->sk_state == TCP_LISTEN)
561                                 goto out;
562
563                         WRITE_ONCE(tp->mtu_info, info);
564                         if (!sock_owned_by_user(sk)) {
565                                 tcp_v4_mtu_reduced(sk);
566                         } else {
567                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
568                                         sock_hold(sk);
569                         }
570                         goto out;
571                 }
572
573                 err = icmp_err_convert[code].errno;
574                 /* Check if this ICMP message allows us to revert the backoff
575                  * (see RFC 6069).
576                  */
577                 if (!fastopen &&
578                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
579                         tcp_ld_RTO_revert(sk, seq);
580                 break;
581         case ICMP_TIME_EXCEEDED:
582                 err = EHOSTUNREACH;
583                 break;
584         default:
585                 goto out;
586         }
587
588         switch (sk->sk_state) {
589         case TCP_SYN_SENT:
590         case TCP_SYN_RECV:
591                 /* Only in fast or simultaneous open. If a fast open socket is
592                  * already accepted it is treated as a connected one below.
593                  */
594                 if (fastopen && !fastopen->sk)
595                         break;
596
597                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
598
599                 if (!sock_owned_by_user(sk)) {
600                         WRITE_ONCE(sk->sk_err, err);
601
602                         sk_error_report(sk);
603
604                         tcp_done(sk);
605                 } else {
606                         WRITE_ONCE(sk->sk_err_soft, err);
607                 }
608                 goto out;
609         }
610
611         /* If we've already connected we will keep trying
612          * until we time out, or the user gives up.
613          *
614          * rfc1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
615          * PORT_UNREACH as hard errors (well, FRAG_FAILED too, but it is
616          * obsoleted by PMTU discovery).
617          *
618          * Note that in the modern internet, where routing is unreliable and
619          * broken firewalls sit in every dark corner sending random errors
620          * ordered by their masters, even these two messages have finally
621          * lost their original sense (even Linux sends invalid PORT_UNREACHs).
622          *
623          * Now we are in compliance with RFCs.
624          *                                                      --ANK (980905)
625          */
626
627         if (!sock_owned_by_user(sk) &&
628             inet_test_bit(RECVERR, sk)) {
629                 WRITE_ONCE(sk->sk_err, err);
630                 sk_error_report(sk);
631         } else  { /* Only an error on timeout */
632                 WRITE_ONCE(sk->sk_err_soft, err);
633         }
634
635 out:
636         bh_unlock_sock(sk);
637         sock_put(sk);
638         return 0;
639 }
640
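/* Store the pseudo-header checksum in the TCP header and record where the
 * full checksum must be written, so checksum offload can complete it.
 */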
641 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
642 {
643         struct tcphdr *th = tcp_hdr(skb);
644
645         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
646         skb->csum_start = skb_transport_header(skb) - skb->head;
647         skb->csum_offset = offsetof(struct tcphdr, check);
648 }
649
650 /* This routine computes an IPv4 TCP checksum. */
651 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
652 {
653         const struct inet_sock *inet = inet_sk(sk);
654
655         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
656 }
657 EXPORT_SYMBOL(tcp_v4_send_check);
658
659 /*
660  *      This routine will send an RST to the other TCP.
661  *
662  *      Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
663  *                    for the reset?
664  *      Answer: if a packet caused an RST, it is not for a socket
665  *              existing in our system; if it is matched to a socket,
666  *              it is just a duplicate segment or a bug in the other
667  *              side's TCP.  So we build the reply based only on the
668  *              parameters that arrived with the segment.
669  *      Exception: precedence violation. We do not implement it in any case.
670  */
671
672 #ifdef CONFIG_TCP_MD5SIG
673 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
674 #else
675 #define OPTION_BYTES sizeof(__be32)
676 #endif
677
678 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
679 {
680         const struct tcphdr *th = tcp_hdr(skb);
681         struct {
682                 struct tcphdr th;
683                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
684         } rep;
685         struct ip_reply_arg arg;
686 #ifdef CONFIG_TCP_MD5SIG
687         struct tcp_md5sig_key *key = NULL;
688         const __u8 *hash_location = NULL;
689         unsigned char newhash[16];
690         int genhash;
691         struct sock *sk1 = NULL;
692 #endif
693         u64 transmit_time = 0;
694         struct sock *ctl_sk;
695         struct net *net;
696         u32 txhash = 0;
697
698         /* Never send a reset in response to a reset. */
699         if (th->rst)
700                 return;
701
702         /* If sk is not NULL, it means we did a successful lookup and the
703          * incoming route had to be correct. prequeue might have dropped our dst.
704          */
705         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
706                 return;
707
708         /* Swap the send and the receive. */
709         memset(&rep, 0, sizeof(rep));
710         rep.th.dest   = th->source;
711         rep.th.source = th->dest;
712         rep.th.doff   = sizeof(struct tcphdr) / 4;
713         rep.th.rst    = 1;
714
715         if (th->ack) {
716                 rep.th.seq = th->ack_seq;
717         } else {
718                 rep.th.ack = 1;
719                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
720                                        skb->len - (th->doff << 2));
721         }
722
723         memset(&arg, 0, sizeof(arg));
724         arg.iov[0].iov_base = (unsigned char *)&rep;
725         arg.iov[0].iov_len  = sizeof(rep.th);
726
727         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
728 #ifdef CONFIG_TCP_MD5SIG
729         rcu_read_lock();
730         hash_location = tcp_parse_md5sig_option(th);
731         if (sk && sk_fullsock(sk)) {
732                 const union tcp_md5_addr *addr;
733                 int l3index;
734
735                 /* sdif is set, which means the packet ingressed via a device
736                  * in an L3 domain and inet_iif is set to it.
737                  */
738                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
739                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
740                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
741         } else if (hash_location) {
742                 const union tcp_md5_addr *addr;
743                 int sdif = tcp_v4_sdif(skb);
744                 int dif = inet_iif(skb);
745                 int l3index;
746
747                 /*
748                  * The active side is lost. Try to find the listening socket
749                  * through the source port, and then find the md5 key through
750                  * the listening socket. We do not lose security here:
751                  * the incoming packet is checked with the md5 hash of the found
752                  * key, and no RST is generated if the md5 hash doesn't match.
753                  */
754                 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
755                                              NULL, 0, ip_hdr(skb)->saddr,
756                                              th->source, ip_hdr(skb)->daddr,
757                                              ntohs(th->source), dif, sdif);
758                 /* don't send rst if it can't find key */
759                 if (!sk1)
760                         goto out;
761
762                 /* sdif is set, which means the packet ingressed via a device
763                  * in an L3 domain and dif is set to it.
764                  */
765                 l3index = sdif ? dif : 0;
766                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
767                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
768                 if (!key)
769                         goto out;
770
771
772                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
773                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
774                         goto out;
775
776         }
777
778         if (key) {
779                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
780                                    (TCPOPT_NOP << 16) |
781                                    (TCPOPT_MD5SIG << 8) |
782                                    TCPOLEN_MD5SIG);
783                 /* Update length and the length the header thinks exists */
784                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
785                 rep.th.doff = arg.iov[0].iov_len / 4;
786
787                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
788                                      key, ip_hdr(skb)->saddr,
789                                      ip_hdr(skb)->daddr, &rep.th);
790         }
791 #endif
792         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
793         if (rep.opt[0] == 0) {
794                 __be32 mrst = mptcp_reset_option(skb);
795
796                 if (mrst) {
797                         rep.opt[0] = mrst;
798                         arg.iov[0].iov_len += sizeof(mrst);
799                         rep.th.doff = arg.iov[0].iov_len / 4;
800                 }
801         }
802
803         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
804                                       ip_hdr(skb)->saddr, /* XXX */
805                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
806         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
807         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
808
809         /* When the socket is gone, all binding information is lost and
810          * routing might fail in this case. No choice here: if we choose to force
811          * the input interface, we will misroute in the case of an asymmetric route.
812          */
813         if (sk) {
814                 arg.bound_dev_if = sk->sk_bound_dev_if;
815                 if (sk_fullsock(sk))
816                         trace_tcp_send_reset(sk, skb);
817         }
818
819         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
820                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
821
822         arg.tos = ip_hdr(skb)->tos;
823         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
824         local_bh_disable();
825         ctl_sk = this_cpu_read(ipv4_tcp_sk);
826         sock_net_set(ctl_sk, net);
827         if (sk) {
828                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
829                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
830                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
831                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
832                 transmit_time = tcp_transmit_time(sk);
833                 xfrm_sk_clone_policy(ctl_sk, sk);
834                 txhash = (sk->sk_state == TCP_TIME_WAIT) ?
835                          inet_twsk(sk)->tw_txhash : sk->sk_txhash;
836         } else {
837                 ctl_sk->sk_mark = 0;
838                 ctl_sk->sk_priority = 0;
839         }
840         ip_send_unicast_reply(ctl_sk,
841                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
842                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
843                               &arg, arg.iov[0].iov_len,
844                               transmit_time, txhash);
845
846         xfrm_sk_free_policy(ctl_sk);
847         sock_net_set(ctl_sk, &init_net);
848         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
849         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
850         local_bh_enable();
851
852 #ifdef CONFIG_TCP_MD5SIG
853 out:
854         rcu_read_unlock();
855 #endif
856 }
857
858 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
859    outside socket context, is certainly ugly. What can I do?
860  */
861
862 static void tcp_v4_send_ack(const struct sock *sk,
863                             struct sk_buff *skb, u32 seq, u32 ack,
864                             u32 win, u32 tsval, u32 tsecr, int oif,
865                             struct tcp_md5sig_key *key,
866                             int reply_flags, u8 tos, u32 txhash)
867 {
868         const struct tcphdr *th = tcp_hdr(skb);
869         struct {
870                 struct tcphdr th;
871                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
872 #ifdef CONFIG_TCP_MD5SIG
873                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
874 #endif
875                         ];
876         } rep;
877         struct net *net = sock_net(sk);
878         struct ip_reply_arg arg;
879         struct sock *ctl_sk;
880         u64 transmit_time;
881
882         memset(&rep.th, 0, sizeof(struct tcphdr));
883         memset(&arg, 0, sizeof(arg));
884
885         arg.iov[0].iov_base = (unsigned char *)&rep;
886         arg.iov[0].iov_len  = sizeof(rep.th);
887         if (tsecr) {
888                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
889                                    (TCPOPT_TIMESTAMP << 8) |
890                                    TCPOLEN_TIMESTAMP);
891                 rep.opt[1] = htonl(tsval);
892                 rep.opt[2] = htonl(tsecr);
893                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
894         }
895
896         /* Swap the send and the receive. */
897         rep.th.dest    = th->source;
898         rep.th.source  = th->dest;
899         rep.th.doff    = arg.iov[0].iov_len / 4;
900         rep.th.seq     = htonl(seq);
901         rep.th.ack_seq = htonl(ack);
902         rep.th.ack     = 1;
903         rep.th.window  = htons(win);
904
905 #ifdef CONFIG_TCP_MD5SIG
906         if (key) {
907                 int offset = (tsecr) ? 3 : 0;
908
909                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
910                                           (TCPOPT_NOP << 16) |
911                                           (TCPOPT_MD5SIG << 8) |
912                                           TCPOLEN_MD5SIG);
913                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
914                 rep.th.doff = arg.iov[0].iov_len/4;
915
916                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
917                                     key, ip_hdr(skb)->saddr,
918                                     ip_hdr(skb)->daddr, &rep.th);
919         }
920 #endif
921         arg.flags = reply_flags;
922         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
923                                       ip_hdr(skb)->saddr, /* XXX */
924                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
925         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
926         if (oif)
927                 arg.bound_dev_if = oif;
928         arg.tos = tos;
929         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
930         local_bh_disable();
931         ctl_sk = this_cpu_read(ipv4_tcp_sk);
932         sock_net_set(ctl_sk, net);
933         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
934                            inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark);
935         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
936                            inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority);
937         transmit_time = tcp_transmit_time(sk);
938         ip_send_unicast_reply(ctl_sk,
939                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
940                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
941                               &arg, arg.iov[0].iov_len,
942                               transmit_time, txhash);
943
944         sock_net_set(ctl_sk, &init_net);
945         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
946         local_bh_enable();
947 }
948
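/* Answer a segment received for a TIME-WAIT socket with an ACK carrying
 * our last advertised window and timestamp state.
 */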
949 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
950 {
951         struct inet_timewait_sock *tw = inet_twsk(sk);
952         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
953
954         tcp_v4_send_ack(sk, skb,
955                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
956                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
957                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
958                         tcptw->tw_ts_recent,
959                         tw->tw_bound_dev_if,
960                         tcp_twsk_md5_key(tcptw),
961                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
962                         tw->tw_tos,
963                         tw->tw_txhash
964                         );
965
966         inet_twsk_put(tw);
967 }
968
969 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
970                                   struct request_sock *req)
971 {
972         const union tcp_md5_addr *addr;
973         int l3index;
974
975         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
976          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
977          */
978         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
979                                              tcp_sk(sk)->snd_nxt;
980
981         /* RFC 7323 2.3
982          * The window field (SEG.WND) of every outgoing segment, with the
983          * exception of <SYN> segments, MUST be right-shifted by
984          * Rcv.Wind.Shift bits:
985          */
986         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
987         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
988         tcp_v4_send_ack(sk, skb, seq,
989                         tcp_rsk(req)->rcv_nxt,
990                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
991                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
992                         READ_ONCE(req->ts_recent),
993                         0,
994                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
995                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
996                         ip_hdr(skb)->tos,
997                         READ_ONCE(tcp_rsk(req)->txhash));
998 }
999
1000 /*
1001  *      Send a SYN-ACK after having received a SYN.
1002  *      This still operates on a request_sock only, not on a big
1003  *      socket.
1004  */
1005 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
1006                               struct flowi *fl,
1007                               struct request_sock *req,
1008                               struct tcp_fastopen_cookie *foc,
1009                               enum tcp_synack_type synack_type,
1010                               struct sk_buff *syn_skb)
1011 {
1012         const struct inet_request_sock *ireq = inet_rsk(req);
1013         struct flowi4 fl4;
1014         int err = -1;
1015         struct sk_buff *skb;
1016         u8 tos;
1017
1018         /* First, grab a route. */
1019         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1020                 return -1;
1021
1022         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1023
1024         if (skb) {
1025                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1026
1027                 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1028                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1029                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1030                                 inet_sk(sk)->tos;
1031
1032                 if (!INET_ECN_is_capable(tos) &&
1033                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1034                         tos |= INET_ECN_ECT_0;
1035
1036                 rcu_read_lock();
1037                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1038                                             ireq->ir_rmt_addr,
1039                                             rcu_dereference(ireq->ireq_opt),
1040                                             tos);
1041                 rcu_read_unlock();
1042                 err = net_xmit_eval(err);
1043         }
1044
1045         return err;
1046 }
1047
1048 /*
1049  *      IPv4 request_sock destructor.
1050  */
1051 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1052 {
1053         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1054 }
1055
1056 #ifdef CONFIG_TCP_MD5SIG
1057 /*
1058  * RFC2385 MD5 checksumming requires a mapping of
1059  * IP address->MD5 Key.
1060  * We need to maintain these in the sk structure.
1061  */
1062
1063 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1064 EXPORT_SYMBOL(tcp_md5_needed);
1065
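/* Prefer the more specific key: an L3-scoped key wins over an unscoped
 * one, and a longer address prefix wins over a shorter one.
 */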
1066 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1067 {
1068         if (!old)
1069                 return true;
1070
1071         /* l3index always overrides non-l3index */
1072         if (old->l3index && new->l3index == 0)
1073                 return false;
1074         if (old->l3index == 0 && new->l3index)
1075                 return true;
1076
1077         return old->prefixlen < new->prefixlen;
1078 }
1079
1080 /* Find the Key structure for an address.  */
1081 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1082                                            const union tcp_md5_addr *addr,
1083                                            int family)
1084 {
1085         const struct tcp_sock *tp = tcp_sk(sk);
1086         struct tcp_md5sig_key *key;
1087         const struct tcp_md5sig_info *md5sig;
1088         __be32 mask;
1089         struct tcp_md5sig_key *best_match = NULL;
1090         bool match;
1091
1092         /* caller either holds rcu_read_lock() or socket lock */
1093         md5sig = rcu_dereference_check(tp->md5sig_info,
1094                                        lockdep_sock_is_held(sk));
1095         if (!md5sig)
1096                 return NULL;
1097
1098         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1099                                  lockdep_sock_is_held(sk)) {
1100                 if (key->family != family)
1101                         continue;
1102                 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1103                         continue;
1104                 if (family == AF_INET) {
1105                         mask = inet_make_mask(key->prefixlen);
1106                         match = (key->addr.a4.s_addr & mask) ==
1107                                 (addr->a4.s_addr & mask);
1108 #if IS_ENABLED(CONFIG_IPV6)
1109                 } else if (family == AF_INET6) {
1110                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1111                                                   key->prefixlen);
1112 #endif
1113                 } else {
1114                         match = false;
1115                 }
1116
1117                 if (match && better_md5_match(best_match, key))
1118                         best_match = key;
1119         }
1120         return best_match;
1121 }
1122 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1123
1124 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1125                                                       const union tcp_md5_addr *addr,
1126                                                       int family, u8 prefixlen,
1127                                                       int l3index, u8 flags)
1128 {
1129         const struct tcp_sock *tp = tcp_sk(sk);
1130         struct tcp_md5sig_key *key;
1131         unsigned int size = sizeof(struct in_addr);
1132         const struct tcp_md5sig_info *md5sig;
1133
1134         /* caller either holds rcu_read_lock() or socket lock */
1135         md5sig = rcu_dereference_check(tp->md5sig_info,
1136                                        lockdep_sock_is_held(sk));
1137         if (!md5sig)
1138                 return NULL;
1139 #if IS_ENABLED(CONFIG_IPV6)
1140         if (family == AF_INET6)
1141                 size = sizeof(struct in6_addr);
1142 #endif
1143         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1144                                  lockdep_sock_is_held(sk)) {
1145                 if (key->family != family)
1146                         continue;
1147                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1148                         continue;
1149                 if (key->l3index != l3index)
1150                         continue;
1151                 if (!memcmp(&key->addr, addr, size) &&
1152                     key->prefixlen == prefixlen)
1153                         return key;
1154         }
1155         return NULL;
1156 }
1157
1158 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1159                                          const struct sock *addr_sk)
1160 {
1161         const union tcp_md5_addr *addr;
1162         int l3index;
1163
1164         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1165                                                  addr_sk->sk_bound_dev_if);
1166         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1167         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1168 }
1169 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1170
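/* Allocate the per-socket MD5 key list on first use.  GSO is disabled
 * because segments carrying the MD5 option cannot be built by
 * segmentation offload.
 */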
1171 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1172 {
1173         struct tcp_sock *tp = tcp_sk(sk);
1174         struct tcp_md5sig_info *md5sig;
1175
1176         md5sig = kmalloc(sizeof(*md5sig), gfp);
1177         if (!md5sig)
1178                 return -ENOMEM;
1179
1180         sk_gso_disable(sk);
1181         INIT_HLIST_HEAD(&md5sig->head);
1182         rcu_assign_pointer(tp->md5sig_info, md5sig);
1183         return 0;
1184 }
1185
1186 /* This can be called on a newly created socket, from other files */
1187 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1188                             int family, u8 prefixlen, int l3index, u8 flags,
1189                             const u8 *newkey, u8 newkeylen, gfp_t gfp)
1190 {
1191         /* Add Key to the list */
1192         struct tcp_md5sig_key *key;
1193         struct tcp_sock *tp = tcp_sk(sk);
1194         struct tcp_md5sig_info *md5sig;
1195
1196         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1197         if (key) {
1198                 /* Pre-existing entry - just update that one.
1199                  * Note that the key might be used concurrently.
1200                  * data_race() tells KCSAN that we do not care about
1201                  * key mismatches, since changing the MD5 key on live flows
1202                  * can lead to packet drops.
1203                  */
1204                 data_race(memcpy(key->key, newkey, newkeylen));
1205
1206                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1207                  * Also note that a reader could catch the new key->keylen value
1208                  * but the old key->key[]; this is the reason we use __GFP_ZERO
1209                  * at sock_kmalloc() time below these lines.
1210                  */
1211                 WRITE_ONCE(key->keylen, newkeylen);
1212
1213                 return 0;
1214         }
1215
1216         md5sig = rcu_dereference_protected(tp->md5sig_info,
1217                                            lockdep_sock_is_held(sk));
1218
1219         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1220         if (!key)
1221                 return -ENOMEM;
1222         if (!tcp_alloc_md5sig_pool()) {
1223                 sock_kfree_s(sk, key, sizeof(*key));
1224                 return -ENOMEM;
1225         }
1226
1227         memcpy(key->key, newkey, newkeylen);
1228         key->keylen = newkeylen;
1229         key->family = family;
1230         key->prefixlen = prefixlen;
1231         key->l3index = l3index;
1232         key->flags = flags;
1233         memcpy(&key->addr, addr,
1234                (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1235                                                                  sizeof(struct in_addr));
1236         hlist_add_head_rcu(&key->node, &md5sig->head);
1237         return 0;
1238 }
1239
1240 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1241                    int family, u8 prefixlen, int l3index, u8 flags,
1242                    const u8 *newkey, u8 newkeylen)
1243 {
1244         struct tcp_sock *tp = tcp_sk(sk);
1245
1246         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1247                 if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1248                         return -ENOMEM;
1249
1250                 if (!static_branch_inc(&tcp_md5_needed.key)) {
1251                         struct tcp_md5sig_info *md5sig;
1252
1253                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1254                         rcu_assign_pointer(tp->md5sig_info, NULL);
1255                         kfree_rcu(md5sig, rcu);
1256                         return -EUSERS;
1257                 }
1258         }
1259
1260         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1261                                 newkey, newkeylen, GFP_KERNEL);
1262 }
1263 EXPORT_SYMBOL(tcp_md5_do_add);
1264
1265 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1266                      int family, u8 prefixlen, int l3index,
1267                      struct tcp_md5sig_key *key)
1268 {
1269         struct tcp_sock *tp = tcp_sk(sk);
1270
1271         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1272                 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1273                         return -ENOMEM;
1274
1275                 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1276                         struct tcp_md5sig_info *md5sig;
1277
1278                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1279                         net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1280                         rcu_assign_pointer(tp->md5sig_info, NULL);
1281                         kfree_rcu(md5sig, rcu);
1282                         return -EUSERS;
1283                 }
1284         }
1285
1286         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1287                                 key->flags, key->key, key->keylen,
1288                                 sk_gfp_mask(sk, GFP_ATOMIC));
1289 }
1290 EXPORT_SYMBOL(tcp_md5_key_copy);
1291
1292 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1293                    u8 prefixlen, int l3index, u8 flags)
1294 {
1295         struct tcp_md5sig_key *key;
1296
1297         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1298         if (!key)
1299                 return -ENOENT;
1300         hlist_del_rcu(&key->node);
1301         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1302         kfree_rcu(key, rcu);
1303         return 0;
1304 }
1305 EXPORT_SYMBOL(tcp_md5_do_del);
1306
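/* Release every MD5 key attached to the socket. */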
1307 static void tcp_clear_md5_list(struct sock *sk)
1308 {
1309         struct tcp_sock *tp = tcp_sk(sk);
1310         struct tcp_md5sig_key *key;
1311         struct hlist_node *n;
1312         struct tcp_md5sig_info *md5sig;
1313
1314         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1315
1316         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1317                 hlist_del_rcu(&key->node);
1318                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1319                 kfree_rcu(key, rcu);
1320         }
1321 }
1322
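/* Parse the TCP_MD5SIG / TCP_MD5SIG_EXT socket options and add or delete
 * the corresponding key.  As a rough, hypothetical userspace sketch
 * (assuming a socket 'fd', a peer sockaddr_in 'peer' and a key 'secret'):
 *
 *	struct tcp_md5sig md5 = {};
 *
 *	memcpy(&md5.tcpm_addr, &peer, sizeof(peer));
 *	md5.tcpm_keylen = strlen(secret);
 *	memcpy(md5.tcpm_key, secret, md5.tcpm_keylen);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A tcpm_keylen of zero deletes the key for that address.
 */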
1323 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1324                                  sockptr_t optval, int optlen)
1325 {
1326         struct tcp_md5sig cmd;
1327         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1328         const union tcp_md5_addr *addr;
1329         u8 prefixlen = 32;
1330         int l3index = 0;
1331         u8 flags;
1332
1333         if (optlen < sizeof(cmd))
1334                 return -EINVAL;
1335
1336         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1337                 return -EFAULT;
1338
1339         if (sin->sin_family != AF_INET)
1340                 return -EINVAL;
1341
1342         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1343
1344         if (optname == TCP_MD5SIG_EXT &&
1345             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1346                 prefixlen = cmd.tcpm_prefixlen;
1347                 if (prefixlen > 32)
1348                         return -EINVAL;
1349         }
1350
1351         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1352             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1353                 struct net_device *dev;
1354
1355                 rcu_read_lock();
1356                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1357                 if (dev && netif_is_l3_master(dev))
1358                         l3index = dev->ifindex;
1359
1360                 rcu_read_unlock();
1361
1362                 /* ok to reference set/not set outside of rcu;
1363                  * right now device MUST be an L3 master
1364                  */
1365                 if (!dev || !l3index)
1366                         return -EINVAL;
1367         }
1368
1369         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1370
1371         if (!cmd.tcpm_keylen)
1372                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1373
1374         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1375                 return -EINVAL;
1376
1377         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1378                               cmd.tcpm_key, cmd.tcpm_keylen);
1379 }
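
/* Userspace installs these keys with setsockopt(TCP_MD5SIG) or, for the
 * prefix/ifindex extensions handled above, setsockopt(TCP_MD5SIG_EXT).
 * Minimal sketch (the address and key below are only examples):
 *
 *	struct tcp_md5sig md5 = { };
 *	struct sockaddr_in *sin = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	sin->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &sin->sin_addr);
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 deletes the key for that peer, which is why
 * tcp_v4_parse_md5_keys() above calls tcp_md5_do_del() in that case.
 */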
1380
1381 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1382                                    __be32 daddr, __be32 saddr,
1383                                    const struct tcphdr *th, int nbytes)
1384 {
1385         struct tcp4_pseudohdr *bp;
1386         struct scatterlist sg;
1387         struct tcphdr *_th;
1388
1389         bp = hp->scratch;
1390         bp->saddr = saddr;
1391         bp->daddr = daddr;
1392         bp->pad = 0;
1393         bp->protocol = IPPROTO_TCP;
1394         bp->len = cpu_to_be16(nbytes);
1395
1396         _th = (struct tcphdr *)(bp + 1);
1397         memcpy(_th, th, sizeof(*th));
1398         _th->check = 0;
1399
1400         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1401         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1402                                 sizeof(*bp) + sizeof(*th));
1403         return crypto_ahash_update(hp->md5_req);
1404 }
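
/* The RFC 2385 digest is computed over, in order: the IPv4 pseudo header
 * filled in above (saddr, daddr, zero pad, protocol, segment length), the
 * TCP header with its checksum field zeroed, the segment payload (added
 * only in the skb variant below) and finally the key itself, appended by
 * tcp_md5_hash_key() in the callers below.
 */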
1405
1406 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1407                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1408 {
1409         struct tcp_md5sig_pool *hp;
1410         struct ahash_request *req;
1411
1412         hp = tcp_get_md5sig_pool();
1413         if (!hp)
1414                 goto clear_hash_noput;
1415         req = hp->md5_req;
1416
1417         if (crypto_ahash_init(req))
1418                 goto clear_hash;
1419         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1420                 goto clear_hash;
1421         if (tcp_md5_hash_key(hp, key))
1422                 goto clear_hash;
1423         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1424         if (crypto_ahash_final(req))
1425                 goto clear_hash;
1426
1427         tcp_put_md5sig_pool();
1428         return 0;
1429
1430 clear_hash:
1431         tcp_put_md5sig_pool();
1432 clear_hash_noput:
1433         memset(md5_hash, 0, 16);
1434         return 1;
1435 }
1436
1437 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1438                         const struct sock *sk,
1439                         const struct sk_buff *skb)
1440 {
1441         struct tcp_md5sig_pool *hp;
1442         struct ahash_request *req;
1443         const struct tcphdr *th = tcp_hdr(skb);
1444         __be32 saddr, daddr;
1445
1446         if (sk) { /* valid for establish/request sockets */
1447                 saddr = sk->sk_rcv_saddr;
1448                 daddr = sk->sk_daddr;
1449         } else {
1450                 const struct iphdr *iph = ip_hdr(skb);
1451                 saddr = iph->saddr;
1452                 daddr = iph->daddr;
1453         }
1454
1455         hp = tcp_get_md5sig_pool();
1456         if (!hp)
1457                 goto clear_hash_noput;
1458         req = hp->md5_req;
1459
1460         if (crypto_ahash_init(req))
1461                 goto clear_hash;
1462
1463         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1464                 goto clear_hash;
1465         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1466                 goto clear_hash;
1467         if (tcp_md5_hash_key(hp, key))
1468                 goto clear_hash;
1469         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1470         if (crypto_ahash_final(req))
1471                 goto clear_hash;
1472
1473         tcp_put_md5sig_pool();
1474         return 0;
1475
1476 clear_hash:
1477         tcp_put_md5sig_pool();
1478 clear_hash_noput:
1479         memset(md5_hash, 0, 16);
1480         return 1;
1481 }
1482 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1483
1484 #endif
1485
1486 static void tcp_v4_init_req(struct request_sock *req,
1487                             const struct sock *sk_listener,
1488                             struct sk_buff *skb)
1489 {
1490         struct inet_request_sock *ireq = inet_rsk(req);
1491         struct net *net = sock_net(sk_listener);
1492
1493         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1494         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1495         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1496 }
1497
1498 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1499                                           struct sk_buff *skb,
1500                                           struct flowi *fl,
1501                                           struct request_sock *req)
1502 {
1503         tcp_v4_init_req(req, sk, skb);
1504
1505         if (security_inet_conn_request(sk, skb, req))
1506                 return NULL;
1507
1508         return inet_csk_route_req(sk, &fl->u.ip4, req);
1509 }
1510
1511 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1512         .family         =       PF_INET,
1513         .obj_size       =       sizeof(struct tcp_request_sock),
1514         .rtx_syn_ack    =       tcp_rtx_synack,
1515         .send_ack       =       tcp_v4_reqsk_send_ack,
1516         .destructor     =       tcp_v4_reqsk_destructor,
1517         .send_reset     =       tcp_v4_send_reset,
1518         .syn_ack_timeout =      tcp_syn_ack_timeout,
1519 };
1520
1521 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1522         .mss_clamp      =       TCP_MSS_DEFAULT,
1523 #ifdef CONFIG_TCP_MD5SIG
1524         .req_md5_lookup =       tcp_v4_md5_lookup,
1525         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1526 #endif
1527 #ifdef CONFIG_SYN_COOKIES
1528         .cookie_init_seq =      cookie_v4_init_sequence,
1529 #endif
1530         .route_req      =       tcp_v4_route_req,
1531         .init_seq       =       tcp_v4_init_seq,
1532         .init_ts_off    =       tcp_v4_init_ts_off,
1533         .send_synack    =       tcp_v4_send_synack,
1534 };
1535
1536 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1537 {
1538         /* Never answer SYNs sent to broadcast or multicast */
1539         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1540                 goto drop;
1541
1542         return tcp_conn_request(&tcp_request_sock_ops,
1543                                 &tcp_request_sock_ipv4_ops, sk, skb);
1544
1545 drop:
1546         tcp_listendrop(sk);
1547         return 0;
1548 }
1549 EXPORT_SYMBOL(tcp_v4_conn_request);
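
/* All of the real SYN handling (request sock allocation, syncookie
 * fallback, SYN-ACK transmission) happens in the protocol independent
 * tcp_conn_request(); the two ops tables above only supply the IPv4
 * specific hooks such as tcp_v4_route_req() and tcp_v4_send_synack().
 */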
1550
1551
1552 /*
1553  * The three-way handshake has completed - we received a valid ACK -
1554  * now create the new socket.
1555  */
1556 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1557                                   struct request_sock *req,
1558                                   struct dst_entry *dst,
1559                                   struct request_sock *req_unhash,
1560                                   bool *own_req)
1561 {
1562         struct inet_request_sock *ireq;
1563         bool found_dup_sk = false;
1564         struct inet_sock *newinet;
1565         struct tcp_sock *newtp;
1566         struct sock *newsk;
1567 #ifdef CONFIG_TCP_MD5SIG
1568         const union tcp_md5_addr *addr;
1569         struct tcp_md5sig_key *key;
1570         int l3index;
1571 #endif
1572         struct ip_options_rcu *inet_opt;
1573
1574         if (sk_acceptq_is_full(sk))
1575                 goto exit_overflow;
1576
1577         newsk = tcp_create_openreq_child(sk, req, skb);
1578         if (!newsk)
1579                 goto exit_nonewsk;
1580
1581         newsk->sk_gso_type = SKB_GSO_TCPV4;
1582         inet_sk_rx_dst_set(newsk, skb);
1583
1584         newtp                 = tcp_sk(newsk);
1585         newinet               = inet_sk(newsk);
1586         ireq                  = inet_rsk(req);
1587         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1588         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1589         newsk->sk_bound_dev_if = ireq->ir_iif;
1590         newinet->inet_saddr   = ireq->ir_loc_addr;
1591         inet_opt              = rcu_dereference(ireq->ireq_opt);
1592         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1593         newinet->mc_index     = inet_iif(skb);
1594         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1595         newinet->rcv_tos      = ip_hdr(skb)->tos;
1596         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1597         if (inet_opt)
1598                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1599         atomic_set(&newinet->inet_id, get_random_u16());
1600
1601         /* Set ToS of the new socket based upon the value of incoming SYN.
1602          * ECT bits are set later in tcp_init_transfer().
1603          */
1604         if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1605                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1606
1607         if (!dst) {
1608                 dst = inet_csk_route_child_sock(sk, newsk, req);
1609                 if (!dst)
1610                         goto put_and_exit;
1611         } else {
1612                 /* syncookie case : see end of cookie_v4_check() */
1613         }
1614         sk_setup_caps(newsk, dst);
1615
1616         tcp_ca_openreq_child(newsk, dst);
1617
1618         tcp_sync_mss(newsk, dst_mtu(dst));
1619         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1620
1621         tcp_initialize_rcv_mss(newsk);
1622
1623 #ifdef CONFIG_TCP_MD5SIG
1624         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1625         /* Copy over the MD5 key from the original socket */
1626         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1627         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1628         if (key) {
1629                 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1630                         goto put_and_exit;
1631                 sk_gso_disable(newsk);
1632         }
1633 #endif
1634
1635         if (__inet_inherit_port(sk, newsk) < 0)
1636                 goto put_and_exit;
1637         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1638                                        &found_dup_sk);
1639         if (likely(*own_req)) {
1640                 tcp_move_syn(newtp, req);
1641                 ireq->ireq_opt = NULL;
1642         } else {
1643                 newinet->inet_opt = NULL;
1644
1645                 if (!req_unhash && found_dup_sk) {
1646                         /* This code path should only be executed in the
1647                          * syncookie case.
1648                          */
1649                         bh_unlock_sock(newsk);
1650                         sock_put(newsk);
1651                         newsk = NULL;
1652                 }
1653         }
1654         return newsk;
1655
1656 exit_overflow:
1657         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1658 exit_nonewsk:
1659         dst_release(dst);
1660 exit:
1661         tcp_listendrop(sk);
1662         return NULL;
1663 put_and_exit:
1664         newinet->inet_opt = NULL;
1665         inet_csk_prepare_forced_close(newsk);
1666         tcp_done(newsk);
1667         goto exit;
1668 }
1669 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
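
/* In short, the child socket created above inherits addressing and IP
 * options from the request sock, gets its own route and congestion
 * control state, copies any matching MD5 key from the listener and is
 * finally hashed into the established table; *own_req tells the caller
 * whether this socket, rather than a concurrently created duplicate,
 * ended up owning the request.
 */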
1670
1671 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1672 {
1673 #ifdef CONFIG_SYN_COOKIES
1674         const struct tcphdr *th = tcp_hdr(skb);
1675
1676         if (!th->syn)
1677                 sk = cookie_v4_check(sk, skb);
1678 #endif
1679         return sk;
1680 }
1681
1682 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1683                          struct tcphdr *th, u32 *cookie)
1684 {
1685         u16 mss = 0;
1686 #ifdef CONFIG_SYN_COOKIES
1687         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1688                                     &tcp_request_sock_ipv4_ops, sk, th);
1689         if (mss) {
1690                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1691                 tcp_synq_overflow(sk);
1692         }
1693 #endif
1694         return mss;
1695 }
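
/* tcp_v4_get_syncookie() returns 0 when a syncookie cannot be generated
 * (e.g. CONFIG_SYN_COOKIES is off or the SYN does not qualify); otherwise
 * *cookie holds the encoded initial sequence number and the listener is
 * marked as having overflowed its SYN queue.
 */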
1696
1697 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1698                                                            u32));
1699 /* The socket must have its spinlock held when we get
1700  * here, unless it is a TCP_LISTEN socket.
1701  *
1702  * We have a potential double-lock case here, so even when
1703  * doing backlog processing we use the BH locking scheme.
1704  * This is because we cannot sleep with the original spinlock
1705  * held.
1706  */
1707 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1708 {
1709         enum skb_drop_reason reason;
1710         struct sock *rsk;
1711
1712         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1713                 struct dst_entry *dst;
1714
1715                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1716                                                 lockdep_sock_is_held(sk));
1717
1718                 sock_rps_save_rxhash(sk, skb);
1719                 sk_mark_napi_id(sk, skb);
1720                 if (dst) {
1721                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1722                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1723                                              dst, 0)) {
1724                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1725                                 dst_release(dst);
1726                         }
1727                 }
1728                 tcp_rcv_established(sk, skb);
1729                 return 0;
1730         }
1731
1732         reason = SKB_DROP_REASON_NOT_SPECIFIED;
1733         if (tcp_checksum_complete(skb))
1734                 goto csum_err;
1735
1736         if (sk->sk_state == TCP_LISTEN) {
1737                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1738
1739                 if (!nsk)
1740                         goto discard;
1741                 if (nsk != sk) {
1742                         if (tcp_child_process(sk, nsk, skb)) {
1743                                 rsk = nsk;
1744                                 goto reset;
1745                         }
1746                         return 0;
1747                 }
1748         } else
1749                 sock_rps_save_rxhash(sk, skb);
1750
1751         if (tcp_rcv_state_process(sk, skb)) {
1752                 rsk = sk;
1753                 goto reset;
1754         }
1755         return 0;
1756
1757 reset:
1758         tcp_v4_send_reset(rsk, skb);
1759 discard:
1760         kfree_skb_reason(skb, reason);
1761         /* Be careful here. If this function gets more complicated and
1762          * gcc suffers from register pressure on the x86, sk (in %ebx)
1763          * might be destroyed here. This current version compiles correctly,
1764          * but you have been warned.
1765          */
1766         return 0;
1767
1768 csum_err:
1769         reason = SKB_DROP_REASON_TCP_CSUM;
1770         trace_tcp_bad_csum(skb);
1771         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1772         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1773         goto discard;
1774 }
1775 EXPORT_SYMBOL(tcp_v4_do_rcv);
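
/* Three cases are handled above: ESTABLISHED sockets take the fast path
 * through tcp_rcv_established() after validating the cached rx dst,
 * LISTEN sockets first run the syncookie check and may hand the skb to a
 * freshly created child, and every other state goes through the generic
 * tcp_rcv_state_process().
 */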
1776
1777 int tcp_v4_early_demux(struct sk_buff *skb)
1778 {
1779         struct net *net = dev_net(skb->dev);
1780         const struct iphdr *iph;
1781         const struct tcphdr *th;
1782         struct sock *sk;
1783
1784         if (skb->pkt_type != PACKET_HOST)
1785                 return 0;
1786
1787         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1788                 return 0;
1789
1790         iph = ip_hdr(skb);
1791         th = tcp_hdr(skb);
1792
1793         if (th->doff < sizeof(struct tcphdr) / 4)
1794                 return 0;
1795
1796         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1797                                        iph->saddr, th->source,
1798                                        iph->daddr, ntohs(th->dest),
1799                                        skb->skb_iif, inet_sdif(skb));
1800         if (sk) {
1801                 skb->sk = sk;
1802                 skb->destructor = sock_edemux;
1803                 if (sk_fullsock(sk)) {
1804                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1805
1806                         if (dst)
1807                                 dst = dst_check(dst, 0);
1808                         if (dst &&
1809                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1810                                 skb_dst_set_noref(skb, dst);
1811                 }
1812         }
1813         return 0;
1814 }
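
/* Early demux runs before routing: if an established socket is found, its
 * cached rx dst (when still valid for the incoming interface) is attached
 * to the skb without taking a reference, saving a full route lookup for
 * every segment of an established flow.
 */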
1815
1816 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1817                      enum skb_drop_reason *reason)
1818 {
1819         u32 limit, tail_gso_size, tail_gso_segs;
1820         struct skb_shared_info *shinfo;
1821         const struct tcphdr *th;
1822         struct tcphdr *thtail;
1823         struct sk_buff *tail;
1824         unsigned int hdrlen;
1825         bool fragstolen;
1826         u32 gso_segs;
1827         u32 gso_size;
1828         int delta;
1829
1830         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1831          * we can fix skb->truesize to its real value to avoid future drops.
1832          * This is valid because skb is not yet charged to the socket.
1833          * It has been noticed that pure SACK packets were sometimes dropped
1834          * (if cooked by drivers without the copybreak feature).
1835          */
1836         skb_condense(skb);
1837
1838         skb_dst_drop(skb);
1839
1840         if (unlikely(tcp_checksum_complete(skb))) {
1841                 bh_unlock_sock(sk);
1842                 trace_tcp_bad_csum(skb);
1843                 *reason = SKB_DROP_REASON_TCP_CSUM;
1844                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1845                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1846                 return true;
1847         }
1848
1849         /* Attempt coalescing to last skb in backlog, even if we are
1850          * above the limits.
1851          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1852          */
1853         th = (const struct tcphdr *)skb->data;
1854         hdrlen = th->doff * 4;
1855
1856         tail = sk->sk_backlog.tail;
1857         if (!tail)
1858                 goto no_coalesce;
1859         thtail = (struct tcphdr *)tail->data;
1860
1861         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1862             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1863             ((TCP_SKB_CB(tail)->tcp_flags |
1864               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1865             !((TCP_SKB_CB(tail)->tcp_flags &
1866               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1867             ((TCP_SKB_CB(tail)->tcp_flags ^
1868               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1869 #ifdef CONFIG_TLS_DEVICE
1870             tail->decrypted != skb->decrypted ||
1871 #endif
1872             thtail->doff != th->doff ||
1873             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1874                 goto no_coalesce;
1875
1876         __skb_pull(skb, hdrlen);
1877
1878         shinfo = skb_shinfo(skb);
1879         gso_size = shinfo->gso_size ?: skb->len;
1880         gso_segs = shinfo->gso_segs ?: 1;
1881
1882         shinfo = skb_shinfo(tail);
1883         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1884         tail_gso_segs = shinfo->gso_segs ?: 1;
1885
1886         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1887                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1888
1889                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1890                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1891                         thtail->window = th->window;
1892                 }
1893
1894                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1895                  * thtail->fin, so that the fast path in tcp_rcv_established()
1896                  * is not entered if we append a packet with a FIN.
1897                  * SYN, RST, URG are not present.
1898                  * ACK is set on both packets.
1899                  * PSH : we do not really care in TCP stack,
1900                  *       at least for 'GRO' packets.
1901                  */
1902                 thtail->fin |= th->fin;
1903                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1904
1905                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1906                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1907                         tail->tstamp = skb->tstamp;
1908                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1909                 }
1910
1911                 /* Not as strict as GRO. We only need to carry the max mss value */
1912                 shinfo->gso_size = max(gso_size, tail_gso_size);
1913                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1914
1915                 sk->sk_backlog.len += delta;
1916                 __NET_INC_STATS(sock_net(sk),
1917                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1918                 kfree_skb_partial(skb, fragstolen);
1919                 return false;
1920         }
1921         __skb_push(skb, hdrlen);
1922
1923 no_coalesce:
1924         limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1925
1926         /* Only the socket owner can try to collapse/prune rx queues
1927          * to reduce memory overhead, so add a little headroom here.
1928          * Only a few socket backlogs are likely to be non-empty at the same time.
1929          */
1930         limit += 64 * 1024;
1931
1932         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1933                 bh_unlock_sock(sk);
1934                 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1935                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1936                 return true;
1937         }
1938         return false;
1939 }
1940 EXPORT_SYMBOL(tcp_add_backlog);
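
/* Rough numbers for the backlog limit used above: with the usual defaults
 * of tcp_rmem[1] = 128 KiB and tcp_wmem[1] = 16 KiB, a socket that has not
 * tuned its buffers may queue about 128K + 16K / 2 + 64K = 200 KiB of skb
 * truesize before segments are dropped with SKB_DROP_REASON_SOCKET_BACKLOG
 * (figures are only illustrative; the sysctls and autotuning change them).
 */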
1941
1942 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1943 {
1944         struct tcphdr *th = (struct tcphdr *)skb->data;
1945
1946         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1947 }
1948 EXPORT_SYMBOL(tcp_filter);
1949
1950 static void tcp_v4_restore_cb(struct sk_buff *skb)
1951 {
1952         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1953                 sizeof(struct inet_skb_parm));
1954 }
1955
1956 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1957                            const struct tcphdr *th)
1958 {
1959         /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB().
1960          * barrier() makes sure the compiler won't play aliasing games.
1961          */
1962         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1963                 sizeof(struct inet_skb_parm));
1964         barrier();
1965
1966         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1967         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1968                                     skb->len - th->doff * 4);
1969         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1970         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1971         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1972         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1973         TCP_SKB_CB(skb)->sacked  = 0;
1974         TCP_SKB_CB(skb)->has_rxtstamp =
1975                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1976 }
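
/* end_seq above counts SYN and FIN as one sequence number each, plus the
 * payload length.  For example, a pure SYN at seq 1000 yields end_seq 1001,
 * while a 100 byte segment carrying a FIN at seq 5000 yields end_seq 5101.
 */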
1977
1978 /*
1979  *      From tcp_input.c
1980  */
1981
1982 int tcp_v4_rcv(struct sk_buff *skb)
1983 {
1984         struct net *net = dev_net(skb->dev);
1985         enum skb_drop_reason drop_reason;
1986         int sdif = inet_sdif(skb);
1987         int dif = inet_iif(skb);
1988         const struct iphdr *iph;
1989         const struct tcphdr *th;
1990         bool refcounted;
1991         struct sock *sk;
1992         int ret;
1993
1994         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1995         if (skb->pkt_type != PACKET_HOST)
1996                 goto discard_it;
1997
1998         /* Count it even if it's bad */
1999         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
2000
2001         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
2002                 goto discard_it;
2003
2004         th = (const struct tcphdr *)skb->data;
2005
2006         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2007                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2008                 goto bad_packet;
2009         }
2010         if (!pskb_may_pull(skb, th->doff * 4))
2011                 goto discard_it;
2012
2013         /* An explanation is required here, I think.
2014          * Packet length and doff are validated by header prediction,
2015          * provided the case of th->doff == 0 is eliminated.
2016          * So, we defer the checks. */
2017
2018         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2019                 goto csum_error;
2020
2021         th = (const struct tcphdr *)skb->data;
2022         iph = ip_hdr(skb);
2023 lookup:
2024         sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2025                                skb, __tcp_hdrlen(th), th->source,
2026                                th->dest, sdif, &refcounted);
2027         if (!sk)
2028                 goto no_tcp_socket;
2029
2030 process:
2031         if (sk->sk_state == TCP_TIME_WAIT)
2032                 goto do_time_wait;
2033
2034         if (sk->sk_state == TCP_NEW_SYN_RECV) {
2035                 struct request_sock *req = inet_reqsk(sk);
2036                 bool req_stolen = false;
2037                 struct sock *nsk;
2038
2039                 sk = req->rsk_listener;
2040                 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2041                         drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2042                 else
2043                         drop_reason = tcp_inbound_md5_hash(sk, skb,
2044                                                    &iph->saddr, &iph->daddr,
2045                                                    AF_INET, dif, sdif);
2046                 if (unlikely(drop_reason)) {
2047                         sk_drops_add(sk, skb);
2048                         reqsk_put(req);
2049                         goto discard_it;
2050                 }
2051                 if (tcp_checksum_complete(skb)) {
2052                         reqsk_put(req);
2053                         goto csum_error;
2054                 }
2055                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2056                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2057                         if (!nsk) {
2058                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2059                                 goto lookup;
2060                         }
2061                         sk = nsk;
2062                         /* reuseport_migrate_sock() has already taken one sk_refcnt
2063                          * reference before returning.
2064                          */
2065                 } else {
2066                         /* We own a reference on the listener, increase it again
2067                          * as we might lose it too soon.
2068                          */
2069                         sock_hold(sk);
2070                 }
2071                 refcounted = true;
2072                 nsk = NULL;
2073                 if (!tcp_filter(sk, skb)) {
2074                         th = (const struct tcphdr *)skb->data;
2075                         iph = ip_hdr(skb);
2076                         tcp_v4_fill_cb(skb, iph, th);
2077                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2078                 } else {
2079                         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2080                 }
2081                 if (!nsk) {
2082                         reqsk_put(req);
2083                         if (req_stolen) {
2084                                 /* Another CPU got exclusive access to req
2085                                  * and created a full-blown socket.
2086                                  * Try to feed this packet to that socket
2087                                  * instead of discarding it.
2088                                  */
2089                                 tcp_v4_restore_cb(skb);
2090                                 sock_put(sk);
2091                                 goto lookup;
2092                         }
2093                         goto discard_and_relse;
2094                 }
2095                 nf_reset_ct(skb);
2096                 if (nsk == sk) {
2097                         reqsk_put(req);
2098                         tcp_v4_restore_cb(skb);
2099                 } else if (tcp_child_process(sk, nsk, skb)) {
2100                         tcp_v4_send_reset(nsk, skb);
2101                         goto discard_and_relse;
2102                 } else {
2103                         sock_put(sk);
2104                         return 0;
2105                 }
2106         }
2107
2108         if (static_branch_unlikely(&ip4_min_ttl)) {
2109                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2110                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2111                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2112                         drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2113                         goto discard_and_relse;
2114                 }
2115         }
2116
2117         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2118                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2119                 goto discard_and_relse;
2120         }
2121
2122         drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2123                                            &iph->daddr, AF_INET, dif, sdif);
2124         if (drop_reason)
2125                 goto discard_and_relse;
2126
2127         nf_reset_ct(skb);
2128
2129         if (tcp_filter(sk, skb)) {
2130                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2131                 goto discard_and_relse;
2132         }
2133         th = (const struct tcphdr *)skb->data;
2134         iph = ip_hdr(skb);
2135         tcp_v4_fill_cb(skb, iph, th);
2136
2137         skb->dev = NULL;
2138
2139         if (sk->sk_state == TCP_LISTEN) {
2140                 ret = tcp_v4_do_rcv(sk, skb);
2141                 goto put_and_return;
2142         }
2143
2144         sk_incoming_cpu_update(sk);
2145
2146         bh_lock_sock_nested(sk);
2147         tcp_segs_in(tcp_sk(sk), skb);
2148         ret = 0;
2149         if (!sock_owned_by_user(sk)) {
2150                 ret = tcp_v4_do_rcv(sk, skb);
2151         } else {
2152                 if (tcp_add_backlog(sk, skb, &drop_reason))
2153                         goto discard_and_relse;
2154         }
2155         bh_unlock_sock(sk);
2156
2157 put_and_return:
2158         if (refcounted)
2159                 sock_put(sk);
2160
2161         return ret;
2162
2163 no_tcp_socket:
2164         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2165         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2166                 goto discard_it;
2167
2168         tcp_v4_fill_cb(skb, iph, th);
2169
2170         if (tcp_checksum_complete(skb)) {
2171 csum_error:
2172                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2173                 trace_tcp_bad_csum(skb);
2174                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2175 bad_packet:
2176                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2177         } else {
2178                 tcp_v4_send_reset(NULL, skb);
2179         }
2180
2181 discard_it:
2182         SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2183         /* Discard frame. */
2184         kfree_skb_reason(skb, drop_reason);
2185         return 0;
2186
2187 discard_and_relse:
2188         sk_drops_add(sk, skb);
2189         if (refcounted)
2190                 sock_put(sk);
2191         goto discard_it;
2192
2193 do_time_wait:
2194         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2195                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2196                 inet_twsk_put(inet_twsk(sk));
2197                 goto discard_it;
2198         }
2199
2200         tcp_v4_fill_cb(skb, iph, th);
2201
2202         if (tcp_checksum_complete(skb)) {
2203                 inet_twsk_put(inet_twsk(sk));
2204                 goto csum_error;
2205         }
2206         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2207         case TCP_TW_SYN: {
2208                 struct sock *sk2 = inet_lookup_listener(net,
2209                                                         net->ipv4.tcp_death_row.hashinfo,
2210                                                         skb, __tcp_hdrlen(th),
2211                                                         iph->saddr, th->source,
2212                                                         iph->daddr, th->dest,
2213                                                         inet_iif(skb),
2214                                                         sdif);
2215                 if (sk2) {
2216                         inet_twsk_deschedule_put(inet_twsk(sk));
2217                         sk = sk2;
2218                         tcp_v4_restore_cb(skb);
2219                         refcounted = false;
2220                         goto process;
2221                 }
2222         }
2223                 /* to ACK */
2224                 fallthrough;
2225         case TCP_TW_ACK:
2226                 tcp_v4_timewait_ack(sk, skb);
2227                 break;
2228         case TCP_TW_RST:
2229                 tcp_v4_send_reset(sk, skb);
2230                 inet_twsk_deschedule_put(inet_twsk(sk));
2231                 goto discard_it;
2232         case TCP_TW_SUCCESS:;
2233         }
2234         goto discard_it;
2235 }
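
/* Overall flow of tcp_v4_rcv(): validate the header and checksum, look the
 * socket up in the established/listener tables, give NEW_SYN_RECV and
 * TIME_WAIT minisockets their special treatment, run XFRM policy, MD5 and
 * socket filter checks, then either process the segment directly under the
 * socket lock or queue it to the owner's backlog.
 */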
2236
2237 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2238         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2239         .twsk_unique    = tcp_twsk_unique,
2240         .twsk_destructor= tcp_twsk_destructor,
2241 };
2242
2243 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2244 {
2245         struct dst_entry *dst = skb_dst(skb);
2246
2247         if (dst && dst_hold_safe(dst)) {
2248                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2249                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2250         }
2251 }
2252 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2253
2254 const struct inet_connection_sock_af_ops ipv4_specific = {
2255         .queue_xmit        = ip_queue_xmit,
2256         .send_check        = tcp_v4_send_check,
2257         .rebuild_header    = inet_sk_rebuild_header,
2258         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2259         .conn_request      = tcp_v4_conn_request,
2260         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2261         .net_header_len    = sizeof(struct iphdr),
2262         .setsockopt        = ip_setsockopt,
2263         .getsockopt        = ip_getsockopt,
2264         .addr2sockaddr     = inet_csk_addr2sockaddr,
2265         .sockaddr_len      = sizeof(struct sockaddr_in),
2266         .mtu_reduced       = tcp_v4_mtu_reduced,
2267 };
2268 EXPORT_SYMBOL(ipv4_specific);
2269
2270 #ifdef CONFIG_TCP_MD5SIG
2271 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2272         .md5_lookup             = tcp_v4_md5_lookup,
2273         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2274         .md5_parse              = tcp_v4_parse_md5_keys,
2275 };
2276 #endif
2277
2278 /* NOTE: A lot of things are set to zero explicitly by the call to
2279  *       sk_alloc(), so they need not be done here.
2280  */
2281 static int tcp_v4_init_sock(struct sock *sk)
2282 {
2283         struct inet_connection_sock *icsk = inet_csk(sk);
2284
2285         tcp_init_sock(sk);
2286
2287         icsk->icsk_af_ops = &ipv4_specific;
2288
2289 #ifdef CONFIG_TCP_MD5SIG
2290         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2291 #endif
2292
2293         return 0;
2294 }
2295
2296 void tcp_v4_destroy_sock(struct sock *sk)
2297 {
2298         struct tcp_sock *tp = tcp_sk(sk);
2299
2300         trace_tcp_destroy_sock(sk);
2301
2302         tcp_clear_xmit_timers(sk);
2303
2304         tcp_cleanup_congestion_control(sk);
2305
2306         tcp_cleanup_ulp(sk);
2307
2308         /* Clean up the write buffer. */
2309         tcp_write_queue_purge(sk);
2310
2311         /* Check if we want to disable active TFO */
2312         tcp_fastopen_active_disable_ofo_check(sk);
2313
2314         /* Clean up our, hopefully empty, out_of_order_queue. */
2315         skb_rbtree_purge(&tp->out_of_order_queue);
2316
2317 #ifdef CONFIG_TCP_MD5SIG
2318         /* Clean up the MD5 key list, if any */
2319         if (tp->md5sig_info) {
2320                 tcp_clear_md5_list(sk);
2321                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2322                 tp->md5sig_info = NULL;
2323                 static_branch_slow_dec_deferred(&tcp_md5_needed);
2324         }
2325 #endif
2326
2327         /* Clean up a referenced TCP bind bucket. */
2328         if (inet_csk(sk)->icsk_bind_hash)
2329                 inet_put_port(sk);
2330
2331         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2332
2333         /* If socket is aborted during connect operation */
2334         tcp_free_fastopen_req(tp);
2335         tcp_fastopen_destroy_cipher(sk);
2336         tcp_saved_syn_free(tp);
2337
2338         sk_sockets_allocated_dec(sk);
2339 }
2340 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2341
2342 #ifdef CONFIG_PROC_FS
2343 /* Proc filesystem TCP sock list dumping. */
2344
2345 static unsigned short seq_file_family(const struct seq_file *seq);
2346
2347 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2348 {
2349         unsigned short family = seq_file_family(seq);
2350
2351         /* AF_UNSPEC is used as a match-all */
2352         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2353                 net_eq(sock_net(sk), seq_file_net(seq)));
2354 }
2355
2356 /* Find a non-empty bucket (starting from st->bucket)
2357  * and return the first sk from it.
2358  */
2359 static void *listening_get_first(struct seq_file *seq)
2360 {
2361         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2362         struct tcp_iter_state *st = seq->private;
2363
2364         st->offset = 0;
2365         for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2366                 struct inet_listen_hashbucket *ilb2;
2367                 struct hlist_nulls_node *node;
2368                 struct sock *sk;
2369
2370                 ilb2 = &hinfo->lhash2[st->bucket];
2371                 if (hlist_nulls_empty(&ilb2->nulls_head))
2372                         continue;
2373
2374                 spin_lock(&ilb2->lock);
2375                 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2376                         if (seq_sk_match(seq, sk))
2377                                 return sk;
2378                 }
2379                 spin_unlock(&ilb2->lock);
2380         }
2381
2382         return NULL;
2383 }
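
/* Note the locking protocol: when a socket is returned, the lhash2 bucket
 * lock is left held; it is released either by listening_get_next() when
 * the bucket is exhausted or by tcp_seq_stop().
 */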
2384
2385 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2386  * If "cur" is the last one in st->bucket,
2387  * call listening_get_first() to return the first sk of the next
2388  * non-empty bucket.
2389  */
2390 static void *listening_get_next(struct seq_file *seq, void *cur)
2391 {
2392         struct tcp_iter_state *st = seq->private;
2393         struct inet_listen_hashbucket *ilb2;
2394         struct hlist_nulls_node *node;
2395         struct inet_hashinfo *hinfo;
2396         struct sock *sk = cur;
2397
2398         ++st->num;
2399         ++st->offset;
2400
2401         sk = sk_nulls_next(sk);
2402         sk_nulls_for_each_from(sk, node) {
2403                 if (seq_sk_match(seq, sk))
2404                         return sk;
2405         }
2406
2407         hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2408         ilb2 = &hinfo->lhash2[st->bucket];
2409         spin_unlock(&ilb2->lock);
2410         ++st->bucket;
2411         return listening_get_first(seq);
2412 }
2413
2414 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2415 {
2416         struct tcp_iter_state *st = seq->private;
2417         void *rc;
2418
2419         st->bucket = 0;
2420         st->offset = 0;
2421         rc = listening_get_first(seq);
2422
2423         while (rc && *pos) {
2424                 rc = listening_get_next(seq, rc);
2425                 --*pos;
2426         }
2427         return rc;
2428 }
2429
2430 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2431                                 const struct tcp_iter_state *st)
2432 {
2433         return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2434 }
2435
2436 /*
2437  * Get first established socket starting from bucket given in st->bucket.
2438  * If st->bucket is zero, the very first socket in the hash is returned.
2439  */
2440 static void *established_get_first(struct seq_file *seq)
2441 {
2442         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2443         struct tcp_iter_state *st = seq->private;
2444
2445         st->offset = 0;
2446         for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2447                 struct sock *sk;
2448                 struct hlist_nulls_node *node;
2449                 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2450
2451                 cond_resched();
2452
2453                 /* Lockless fast path for the common case of empty buckets */
2454                 if (empty_bucket(hinfo, st))
2455                         continue;
2456
2457                 spin_lock_bh(lock);
2458                 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2459                         if (seq_sk_match(seq, sk))
2460                                 return sk;
2461                 }
2462                 spin_unlock_bh(lock);
2463         }
2464
2465         return NULL;
2466 }
2467
2468 static void *established_get_next(struct seq_file *seq, void *cur)
2469 {
2470         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2471         struct tcp_iter_state *st = seq->private;
2472         struct hlist_nulls_node *node;
2473         struct sock *sk = cur;
2474
2475         ++st->num;
2476         ++st->offset;
2477
2478         sk = sk_nulls_next(sk);
2479
2480         sk_nulls_for_each_from(sk, node) {
2481                 if (seq_sk_match(seq, sk))
2482                         return sk;
2483         }
2484
2485         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2486         ++st->bucket;
2487         return established_get_first(seq);
2488 }
2489
2490 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2491 {
2492         struct tcp_iter_state *st = seq->private;
2493         void *rc;
2494
2495         st->bucket = 0;
2496         rc = established_get_first(seq);
2497
2498         while (rc && pos) {
2499                 rc = established_get_next(seq, rc);
2500                 --pos;
2501         }
2502         return rc;
2503 }
2504
2505 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2506 {
2507         void *rc;
2508         struct tcp_iter_state *st = seq->private;
2509
2510         st->state = TCP_SEQ_STATE_LISTENING;
2511         rc        = listening_get_idx(seq, &pos);
2512
2513         if (!rc) {
2514                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2515                 rc        = established_get_idx(seq, pos);
2516         }
2517
2518         return rc;
2519 }
2520
2521 static void *tcp_seek_last_pos(struct seq_file *seq)
2522 {
2523         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2524         struct tcp_iter_state *st = seq->private;
2525         int bucket = st->bucket;
2526         int offset = st->offset;
2527         int orig_num = st->num;
2528         void *rc = NULL;
2529
2530         switch (st->state) {
2531         case TCP_SEQ_STATE_LISTENING:
2532                 if (st->bucket > hinfo->lhash2_mask)
2533                         break;
2534                 rc = listening_get_first(seq);
2535                 while (offset-- && rc && bucket == st->bucket)
2536                         rc = listening_get_next(seq, rc);
2537                 if (rc)
2538                         break;
2539                 st->bucket = 0;
2540                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2541                 fallthrough;
2542         case TCP_SEQ_STATE_ESTABLISHED:
2543                 if (st->bucket > hinfo->ehash_mask)
2544                         break;
2545                 rc = established_get_first(seq);
2546                 while (offset-- && rc && bucket == st->bucket)
2547                         rc = established_get_next(seq, rc);
2548         }
2549
2550         st->num = orig_num;
2551
2552         return rc;
2553 }
2554
2555 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2556 {
2557         struct tcp_iter_state *st = seq->private;
2558         void *rc;
2559
2560         if (*pos && *pos == st->last_pos) {
2561                 rc = tcp_seek_last_pos(seq);
2562                 if (rc)
2563                         goto out;
2564         }
2565
2566         st->state = TCP_SEQ_STATE_LISTENING;
2567         st->num = 0;
2568         st->bucket = 0;
2569         st->offset = 0;
2570         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2571
2572 out:
2573         st->last_pos = *pos;
2574         return rc;
2575 }
2576 EXPORT_SYMBOL(tcp_seq_start);
2577
2578 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2579 {
2580         struct tcp_iter_state *st = seq->private;
2581         void *rc = NULL;
2582
2583         if (v == SEQ_START_TOKEN) {
2584                 rc = tcp_get_idx(seq, 0);
2585                 goto out;
2586         }
2587
2588         switch (st->state) {
2589         case TCP_SEQ_STATE_LISTENING:
2590                 rc = listening_get_next(seq, v);
2591                 if (!rc) {
2592                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2593                         st->bucket = 0;
2594                         st->offset = 0;
2595                         rc        = established_get_first(seq);
2596                 }
2597                 break;
2598         case TCP_SEQ_STATE_ESTABLISHED:
2599                 rc = established_get_next(seq, v);
2600                 break;
2601         }
2602 out:
2603         ++*pos;
2604         st->last_pos = *pos;
2605         return rc;
2606 }
2607 EXPORT_SYMBOL(tcp_seq_next);
2608
2609 void tcp_seq_stop(struct seq_file *seq, void *v)
2610 {
2611         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2612         struct tcp_iter_state *st = seq->private;
2613
2614         switch (st->state) {
2615         case TCP_SEQ_STATE_LISTENING:
2616                 if (v != SEQ_START_TOKEN)
2617                         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2618                 break;
2619         case TCP_SEQ_STATE_ESTABLISHED:
2620                 if (v)
2621                         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2622                 break;
2623         }
2624 }
2625 EXPORT_SYMBOL(tcp_seq_stop);
2626
2627 static void get_openreq4(const struct request_sock *req,
2628                          struct seq_file *f, int i)
2629 {
2630         const struct inet_request_sock *ireq = inet_rsk(req);
2631         long delta = req->rsk_timer.expires - jiffies;
2632
2633         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2634                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2635                 i,
2636                 ireq->ir_loc_addr,
2637                 ireq->ir_num,
2638                 ireq->ir_rmt_addr,
2639                 ntohs(ireq->ir_rmt_port),
2640                 TCP_SYN_RECV,
2641                 0, 0, /* could print option size, but that is af dependent. */
2642                 1,    /* timers active (only the expire timer) */
2643                 jiffies_delta_to_clock_t(delta),
2644                 req->num_timeout,
2645                 from_kuid_munged(seq_user_ns(f),
2646                                  sock_i_uid(req->rsk_listener)),
2647                 0,  /* non standard timer */
2648                 0, /* open_requests have no inode */
2649                 0,
2650                 req);
2651 }
2652
2653 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2654 {
2655         int timer_active;
2656         unsigned long timer_expires;
2657         const struct tcp_sock *tp = tcp_sk(sk);
2658         const struct inet_connection_sock *icsk = inet_csk(sk);
2659         const struct inet_sock *inet = inet_sk(sk);
2660         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2661         __be32 dest = inet->inet_daddr;
2662         __be32 src = inet->inet_rcv_saddr;
2663         __u16 destp = ntohs(inet->inet_dport);
2664         __u16 srcp = ntohs(inet->inet_sport);
2665         int rx_queue;
2666         int state;
2667
2668         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2669             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2670             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2671                 timer_active    = 1;
2672                 timer_expires   = icsk->icsk_timeout;
2673         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2674                 timer_active    = 4;
2675                 timer_expires   = icsk->icsk_timeout;
2676         } else if (timer_pending(&sk->sk_timer)) {
2677                 timer_active    = 2;
2678                 timer_expires   = sk->sk_timer.expires;
2679         } else {
2680                 timer_active    = 0;
2681                 timer_expires = jiffies;
2682         }
2683
2684         state = inet_sk_state_load(sk);
2685         if (state == TCP_LISTEN)
2686                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2687         else
2688                 /* Because we don't lock the socket,
2689                  * we might find a transient negative value.
2690                  */
2691                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2692                                       READ_ONCE(tp->copied_seq), 0);
2693
2694         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2695                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2696                 i, src, srcp, dest, destp, state,
2697                 READ_ONCE(tp->write_seq) - tp->snd_una,
2698                 rx_queue,
2699                 timer_active,
2700                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2701                 icsk->icsk_retransmits,
2702                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2703                 icsk->icsk_probes_out,
2704                 sock_i_ino(sk),
2705                 refcount_read(&sk->sk_refcnt), sk,
2706                 jiffies_to_clock_t(icsk->icsk_rto),
2707                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2708                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2709                 tcp_snd_cwnd(tp),
2710                 state == TCP_LISTEN ?
2711                     fastopenq->max_qlen :
2712                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2713 }
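
/* The line printed above becomes one /proc/net/tcp entry: hexadecimal
 * local and remote address:port pairs, socket state, tx_queue:rx_queue,
 * timer type and expiry, retransmit count, uid, probe count, inode,
 * refcount, socket pointer, rto, delayed-ack ato, quick-ack/pingpong
 * flags, snd_cwnd and, for listeners, the fastopen queue limit (otherwise
 * ssthresh, or -1 while still in initial slow start).
 */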
2714
2715 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2716                                struct seq_file *f, int i)
2717 {
2718         long delta = tw->tw_timer.expires - jiffies;
2719         __be32 dest, src;
2720         __u16 destp, srcp;
2721
2722         dest  = tw->tw_daddr;
2723         src   = tw->tw_rcv_saddr;
2724         destp = ntohs(tw->tw_dport);
2725         srcp  = ntohs(tw->tw_sport);
2726
2727         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2728                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2729                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2730                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2731                 refcount_read(&tw->tw_refcnt), tw);
2732 }
2733
2734 #define TMPSZ 150
2735
2736 static int tcp4_seq_show(struct seq_file *seq, void *v)
2737 {
2738         struct tcp_iter_state *st;
2739         struct sock *sk = v;
2740
2741         seq_setwidth(seq, TMPSZ - 1);
2742         if (v == SEQ_START_TOKEN) {
2743                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2744                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2745                            "inode");
2746                 goto out;
2747         }
2748         st = seq->private;
2749
2750         if (sk->sk_state == TCP_TIME_WAIT)
2751                 get_timewait4_sock(v, seq, st->num);
2752         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2753                 get_openreq4(v, seq, st->num);
2754         else
2755                 get_tcp4_sock(v, seq, st->num);
2756 out:
2757         seq_pad(seq, '\n');
2758         return 0;
2759 }
2760
2761 #ifdef CONFIG_BPF_SYSCALL
2762 struct bpf_tcp_iter_state {
2763         struct tcp_iter_state state;
2764         unsigned int cur_sk;
2765         unsigned int end_sk;
2766         unsigned int max_sk;
2767         struct sock **batch;
2768         bool st_bucket_done;
2769 };
2770
2771 struct bpf_iter__tcp {
2772         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2773         __bpf_md_ptr(struct sock_common *, sk_common);
2774         uid_t uid __aligned(8);
2775 };
2776
2777 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2778                              struct sock_common *sk_common, uid_t uid)
2779 {
2780         struct bpf_iter__tcp ctx;
2781
2782         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2783         ctx.meta = meta;
2784         ctx.sk_common = sk_common;
2785         ctx.uid = uid;
2786         return bpf_iter_run_prog(prog, &ctx);
2787 }
2788
2789 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2790 {
2791         while (iter->cur_sk < iter->end_sk)
2792                 sock_gen_put(iter->batch[iter->cur_sk++]);
2793 }
2794
2795 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2796                                       unsigned int new_batch_sz)
2797 {
2798         struct sock **new_batch;
2799
2800         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2801                              GFP_USER | __GFP_NOWARN);
2802         if (!new_batch)
2803                 return -ENOMEM;
2804
2805         bpf_iter_tcp_put_batch(iter);
2806         kvfree(iter->batch);
2807         iter->batch = new_batch;
2808         iter->max_sk = new_batch_sz;
2809
2810         return 0;
2811 }
2812
2813 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2814                                                  struct sock *start_sk)
2815 {
2816         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2817         struct bpf_tcp_iter_state *iter = seq->private;
2818         struct tcp_iter_state *st = &iter->state;
2819         struct hlist_nulls_node *node;
2820         unsigned int expected = 1;
2821         struct sock *sk;
2822
2823         sock_hold(start_sk);
2824         iter->batch[iter->end_sk++] = start_sk;
2825
2826         sk = sk_nulls_next(start_sk);
2827         sk_nulls_for_each_from(sk, node) {
2828                 if (seq_sk_match(seq, sk)) {
2829                         if (iter->end_sk < iter->max_sk) {
2830                                 sock_hold(sk);
2831                                 iter->batch[iter->end_sk++] = sk;
2832                         }
2833                         expected++;
2834                 }
2835         }
2836         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2837
2838         return expected;
2839 }
2840
2841 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2842                                                    struct sock *start_sk)
2843 {
2844         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2845         struct bpf_tcp_iter_state *iter = seq->private;
2846         struct tcp_iter_state *st = &iter->state;
2847         struct hlist_nulls_node *node;
2848         unsigned int expected = 1;
2849         struct sock *sk;
2850
2851         sock_hold(start_sk);
2852         iter->batch[iter->end_sk++] = start_sk;
2853
2854         sk = sk_nulls_next(start_sk);
2855         sk_nulls_for_each_from(sk, node) {
2856                 if (seq_sk_match(seq, sk)) {
2857                         if (iter->end_sk < iter->max_sk) {
2858                                 sock_hold(sk);
2859                                 iter->batch[iter->end_sk++] = sk;
2860                         }
2861                         expected++;
2862                 }
2863         }
2864         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2865
2866         return expected;
2867 }
2868
2869 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2870 {
2871         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2872         struct bpf_tcp_iter_state *iter = seq->private;
2873         struct tcp_iter_state *st = &iter->state;
2874         unsigned int expected;
2875         bool resized = false;
2876         struct sock *sk;
2877
2878         /* The st->bucket is done.  Advance directly to the next
2879          * bucket instead of having tcp_seek_last_pos() skip the
2880          * sockets in the current bucket one by one, only to find
2881          * that it has to advance to the next bucket anyway.
2882          */
2883         if (iter->st_bucket_done) {
2884                 st->offset = 0;
2885                 st->bucket++;
2886                 if (st->state == TCP_SEQ_STATE_LISTENING &&
2887                     st->bucket > hinfo->lhash2_mask) {
2888                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2889                         st->bucket = 0;
2890                 }
2891         }
2892
2893 again:
2894         /* Get a new batch */
2895         iter->cur_sk = 0;
2896         iter->end_sk = 0;
2897         iter->st_bucket_done = false;
2898
2899         sk = tcp_seek_last_pos(seq);
2900         if (!sk)
2901                 return NULL; /* Done */
2902
2903         if (st->state == TCP_SEQ_STATE_LISTENING)
2904                 expected = bpf_iter_tcp_listening_batch(seq, sk);
2905         else
2906                 expected = bpf_iter_tcp_established_batch(seq, sk);
2907
2908         if (iter->end_sk == expected) {
2909                 iter->st_bucket_done = true;
2910                 return sk;
2911         }
2912
2913         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2914                 resized = true;
2915                 goto again;
2916         }
2917
2918         return sk;
2919 }
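/* Batching example for bpf_iter_tcp_batch() above (numbers purely
 * illustrative): with max_sk == 16 and a bucket holding 24 matching sockets,
 * the first walk batches 16 sockets and reports expected == 24.  Since
 * end_sk != expected, the batch is grown to 24 * 3 / 2 == 36 entries and the
 * bucket is walked once more; only a single retry is attempted (resized).
 * If the second walk is still short, st_bucket_done stays false and the next
 * batch resumes within the same bucket via st->offset.
 */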
2920
2921 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2922 {
2923         /* bpf iter does not support lseek, so it always
2924          * continues from where it was stop()-ped.
2925          */
2926         if (*pos)
2927                 return bpf_iter_tcp_batch(seq);
2928
2929         return SEQ_START_TOKEN;
2930 }
2931
2932 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2933 {
2934         struct bpf_tcp_iter_state *iter = seq->private;
2935         struct tcp_iter_state *st = &iter->state;
2936         struct sock *sk;
2937
2938         /* Whenever seq_next() is called, the sock at iter->cur_sk
2939          * has already been shown by seq_show(), so advance to the
2940          * next sk in the batch.
2941          */
2942         if (iter->cur_sk < iter->end_sk) {
2943                 /* Keep st->num consistent in tcp_iter_state;
2944                  * bpf_iter_tcp does not use st->num itself,
2945                  * meta.seq_num is used instead.
2946                  */
2947                 st->num++;
2948                 /* Move st->offset to the next sk in the bucket such that
2949                  * the future start() will resume at st->offset in
2950                  * st->bucket.  See tcp_seek_last_pos().
2951                  */
2952                 st->offset++;
2953                 sock_gen_put(iter->batch[iter->cur_sk++]);
2954         }
2955
2956         if (iter->cur_sk < iter->end_sk)
2957                 sk = iter->batch[iter->cur_sk];
2958         else
2959                 sk = bpf_iter_tcp_batch(seq);
2960
2961         ++*pos;
2962         /* Keep st->last_pos consistent in tcp_iter_state.
2963          * bpf iter does not do lseek, so st->last_pos always equals *pos.
2964          */
2965         st->last_pos = *pos;
2966         return sk;
2967 }
2968
2969 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2970 {
2971         struct bpf_iter_meta meta;
2972         struct bpf_prog *prog;
2973         struct sock *sk = v;
2974         uid_t uid;
2975         int ret;
2976
2977         if (v == SEQ_START_TOKEN)
2978                 return 0;
2979
2980         if (sk_fullsock(sk))
2981                 lock_sock(sk);
2982
2983         if (unlikely(sk_unhashed(sk))) {
2984                 ret = SEQ_SKIP;
2985                 goto unlock;
2986         }
2987
2988         if (sk->sk_state == TCP_TIME_WAIT) {
2989                 uid = 0;
2990         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2991                 const struct request_sock *req = v;
2992
2993                 uid = from_kuid_munged(seq_user_ns(seq),
2994                                        sock_i_uid(req->rsk_listener));
2995         } else {
2996                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2997         }
2998
2999         meta.seq = seq;
3000         prog = bpf_iter_get_info(&meta, false);
3001         ret = tcp_prog_seq_show(prog, &meta, v, uid);
3002
3003 unlock:
3004         if (sk_fullsock(sk))
3005                 release_sock(sk);
3006         return ret;
3007
3008 }
3009
3010 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3011 {
3012         struct bpf_tcp_iter_state *iter = seq->private;
3013         struct bpf_iter_meta meta;
3014         struct bpf_prog *prog;
3015
3016         if (!v) {
3017                 meta.seq = seq;
3018                 prog = bpf_iter_get_info(&meta, true);
3019                 if (prog)
3020                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
3021         }
3022
3023         if (iter->cur_sk < iter->end_sk) {
3024                 bpf_iter_tcp_put_batch(iter);
3025                 iter->st_bucket_done = false;
3026         }
3027 }
3028
3029 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3030         .show           = bpf_iter_tcp_seq_show,
3031         .start          = bpf_iter_tcp_seq_start,
3032         .next           = bpf_iter_tcp_seq_next,
3033         .stop           = bpf_iter_tcp_seq_stop,
3034 };
3035 #endif
3036 static unsigned short seq_file_family(const struct seq_file *seq)
3037 {
3038         const struct tcp_seq_afinfo *afinfo;
3039
3040 #ifdef CONFIG_BPF_SYSCALL
3041         /* Iterated from bpf_iter.  Let the bpf prog filter instead. */
3042         if (seq->op == &bpf_iter_tcp_seq_ops)
3043                 return AF_UNSPEC;
3044 #endif
3045
3046         /* Iterated from proc fs */
3047         afinfo = pde_data(file_inode(seq->file));
3048         return afinfo->family;
3049 }
3050
3051 static const struct seq_operations tcp4_seq_ops = {
3052         .show           = tcp4_seq_show,
3053         .start          = tcp_seq_start,
3054         .next           = tcp_seq_next,
3055         .stop           = tcp_seq_stop,
3056 };
3057
3058 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3059         .family         = AF_INET,
3060 };
3061
3062 static int __net_init tcp4_proc_init_net(struct net *net)
3063 {
3064         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3065                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3066                 return -ENOMEM;
3067         return 0;
3068 }
3069
3070 static void __net_exit tcp4_proc_exit_net(struct net *net)
3071 {
3072         remove_proc_entry("tcp", net->proc_net);
3073 }
3074
3075 static struct pernet_operations tcp4_net_ops = {
3076         .init = tcp4_proc_init_net,
3077         .exit = tcp4_proc_exit_net,
3078 };
3079
3080 int __init tcp4_proc_init(void)
3081 {
3082         return register_pernet_subsys(&tcp4_net_ops);
3083 }
3084
3085 void tcp4_proc_exit(void)
3086 {
3087         unregister_pernet_subsys(&tcp4_net_ops);
3088 }
3089 #endif /* CONFIG_PROC_FS */
3090
3091 /* @wake is one when sk_stream_write_space() calls us.
3092  * This reports EPOLLOUT only if notsent_bytes is below half the limit.
3093  * This mimics the strategy used in sock_def_write_space().
3094  */
3095 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3096 {
3097         const struct tcp_sock *tp = tcp_sk(sk);
3098         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3099                             READ_ONCE(tp->snd_nxt);
3100
3101         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3102 }
3103 EXPORT_SYMBOL(tcp_stream_memory_free);
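/* Worked example for tcp_stream_memory_free() above (values illustrative):
 * with tcp_notsent_lowat == 128 KB and 96 KB not yet sent, a plain poll
 * (wake == 0) sees 96 KB < 128 KB and treats the stream as writable, while
 * sk_stream_write_space() (wake == 1) computes 96 KB << 1 == 192 KB >= 128 KB
 * and therefore does not signal EPOLLOUT until the unsent backlog drops
 * below 64 KB, i.e. half the limit.
 */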
3104
3105 struct proto tcp_prot = {
3106         .name                   = "TCP",
3107         .owner                  = THIS_MODULE,
3108         .close                  = tcp_close,
3109         .pre_connect            = tcp_v4_pre_connect,
3110         .connect                = tcp_v4_connect,
3111         .disconnect             = tcp_disconnect,
3112         .accept                 = inet_csk_accept,
3113         .ioctl                  = tcp_ioctl,
3114         .init                   = tcp_v4_init_sock,
3115         .destroy                = tcp_v4_destroy_sock,
3116         .shutdown               = tcp_shutdown,
3117         .setsockopt             = tcp_setsockopt,
3118         .getsockopt             = tcp_getsockopt,
3119         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3120         .keepalive              = tcp_set_keepalive,
3121         .recvmsg                = tcp_recvmsg,
3122         .sendmsg                = tcp_sendmsg,
3123         .splice_eof             = tcp_splice_eof,
3124         .backlog_rcv            = tcp_v4_do_rcv,
3125         .release_cb             = tcp_release_cb,
3126         .hash                   = inet_hash,
3127         .unhash                 = inet_unhash,
3128         .get_port               = inet_csk_get_port,
3129         .put_port               = inet_put_port,
3130 #ifdef CONFIG_BPF_SYSCALL
3131         .psock_update_sk_prot   = tcp_bpf_update_proto,
3132 #endif
3133         .enter_memory_pressure  = tcp_enter_memory_pressure,
3134         .leave_memory_pressure  = tcp_leave_memory_pressure,
3135         .stream_memory_free     = tcp_stream_memory_free,
3136         .sockets_allocated      = &tcp_sockets_allocated,
3137         .orphan_count           = &tcp_orphan_count,
3138
3139         .memory_allocated       = &tcp_memory_allocated,
3140         .per_cpu_fw_alloc       = &tcp_memory_per_cpu_fw_alloc,
3141
3142         .memory_pressure        = &tcp_memory_pressure,
3143         .sysctl_mem             = sysctl_tcp_mem,
3144         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3145         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3146         .max_header             = MAX_TCP_HEADER,
3147         .obj_size               = sizeof(struct tcp_sock),
3148         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3149         .twsk_prot              = &tcp_timewait_sock_ops,
3150         .rsk_prot               = &tcp_request_sock_ops,
3151         .h.hashinfo             = NULL,
3152         .no_autobind            = true,
3153         .diag_destroy           = tcp_abort,
3154 };
3155 EXPORT_SYMBOL(tcp_prot);
3156
3157 static void __net_exit tcp_sk_exit(struct net *net)
3158 {
3159         if (net->ipv4.tcp_congestion_control)
3160                 bpf_module_put(net->ipv4.tcp_congestion_control,
3161                                net->ipv4.tcp_congestion_control->owner);
3162 }
3163
3164 static void __net_init tcp_set_hashinfo(struct net *net)
3165 {
3166         struct inet_hashinfo *hinfo;
3167         unsigned int ehash_entries;
3168         struct net *old_net;
3169
3170         if (net_eq(net, &init_net))
3171                 goto fallback;
3172
3173         old_net = current->nsproxy->net_ns;
3174         ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3175         if (!ehash_entries)
3176                 goto fallback;
3177
3178         ehash_entries = roundup_pow_of_two(ehash_entries);
3179         hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3180         if (!hinfo) {
3181                 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3182                         "for a netns, falling back to the global one\n",
3183                         ehash_entries);
3184 fallback:
3185                 hinfo = &tcp_hashinfo;
3186                 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3187         }
3188
3189         net->ipv4.tcp_death_row.hashinfo = hinfo;
3190         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3191         net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3192 }
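/* Sizing example for tcp_set_hashinfo() above (illustrative): a netns
 * created while net.ipv4.tcp_child_ehash_entries = 1000 is set in the
 * creating namespace gets roundup_pow_of_two(1000) == 1024 ehash entries,
 * sysctl_max_tw_buckets == 1024 / 2 == 512 and
 * sysctl_max_syn_backlog == max(128, 1024 / 128) == 128.  With the sysctl
 * left at 0, or for init_net itself, the global tcp_hashinfo is reused.
 */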
3193
3194 static int __net_init tcp_sk_init(struct net *net)
3195 {
3196         net->ipv4.sysctl_tcp_ecn = 2;
3197         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3198
3199         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3200         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3201         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3202         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3203         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3204
3205         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3206         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3207         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3208
3209         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3210         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3211         net->ipv4.sysctl_tcp_syncookies = 1;
3212         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3213         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3214         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3215         net->ipv4.sysctl_tcp_orphan_retries = 0;
3216         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3217         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3218         net->ipv4.sysctl_tcp_tw_reuse = 2;
3219         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3220
3221         refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3222         tcp_set_hashinfo(net);
3223
3224         net->ipv4.sysctl_tcp_sack = 1;
3225         net->ipv4.sysctl_tcp_window_scaling = 1;
3226         net->ipv4.sysctl_tcp_timestamps = 1;
3227         net->ipv4.sysctl_tcp_early_retrans = 3;
3228         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3229         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3230         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3231         net->ipv4.sysctl_tcp_max_reordering = 300;
3232         net->ipv4.sysctl_tcp_dsack = 1;
3233         net->ipv4.sysctl_tcp_app_win = 31;
3234         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3235         net->ipv4.sysctl_tcp_frto = 2;
3236         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3237         /* This limits the percentage of the congestion window which we
3238          * will allow a single TSO frame to consume.  Building TSO frames
3239          * which are too large can cause TCP streams to be bursty.
3240          */
3241         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3242         /* Default TSQ limit of 16 TSO segments */
3243         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3244
3245         /* RFC 5961 challenge ACK rate limiting, per netns, disabled by default. */
3246         net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3247
3248         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3249         net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3250         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3251         net->ipv4.sysctl_tcp_autocorking = 1;
3252         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3253         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3254         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3255         if (net != &init_net) {
3256                 memcpy(net->ipv4.sysctl_tcp_rmem,
3257                        init_net.ipv4.sysctl_tcp_rmem,
3258                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3259                 memcpy(net->ipv4.sysctl_tcp_wmem,
3260                        init_net.ipv4.sysctl_tcp_wmem,
3261                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3262         }
3263         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3264         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3265         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3266         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3267         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3268         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3269
3270         /* Set default values for PLB */
3271         net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3272         net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3273         net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3274         net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3275         /* Default congestion threshold for PLB to mark a round is 50% */
3276         net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3277
3278         /* Reno is always built in */
3279         if (!net_eq(net, &init_net) &&
3280             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3281                                init_net.ipv4.tcp_congestion_control->owner))
3282                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3283         else
3284                 net->ipv4.tcp_congestion_control = &tcp_reno;
3285
3286         net->ipv4.sysctl_tcp_syn_linear_timeouts = 4;
3287         net->ipv4.sysctl_tcp_shrink_window = 0;
3288
3289         return 0;
3290 }
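/* A few of the encoded defaults set in tcp_sk_init() above, spelled out:
 * tcp_limit_output_bytes defaults to 16 * 65536 == 1,048,576 bytes, i.e.
 * roughly sixteen full-size TSO frames queued below the socket; the INT_MAX
 * challenge-ack limit effectively disables the per-netns RFC 5961 rate
 * limit; and (assuming TCP_PLB_SCALE of 8, as defined in include/net/tcp.h)
 * tcp_plb_cong_thresh == 256 / 2 == 128, so a round with 50% CE-marked
 * packets counts as congested for PLB.
 */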
3291
3292 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3293 {
3294         struct net *net;
3295
3296         tcp_twsk_purge(net_exit_list, AF_INET);
3297
3298         list_for_each_entry(net, net_exit_list, exit_list) {
3299                 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3300                 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3301                 tcp_fastopen_ctx_destroy(net);
3302         }
3303 }
3304
3305 static struct pernet_operations __net_initdata tcp_sk_ops = {
3306         .init       = tcp_sk_init,
3307         .exit       = tcp_sk_exit,
3308         .exit_batch = tcp_sk_exit_batch,
3309 };
3310
3311 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3312 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3313                      struct sock_common *sk_common, uid_t uid)
3314
3315 #define INIT_BATCH_SZ 16
3316
3317 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3318 {
3319         struct bpf_tcp_iter_state *iter = priv_data;
3320         int err;
3321
3322         err = bpf_iter_init_seq_net(priv_data, aux);
3323         if (err)
3324                 return err;
3325
3326         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3327         if (err) {
3328                 bpf_iter_fini_seq_net(priv_data);
3329                 return err;
3330         }
3331
3332         return 0;
3333 }
3334
3335 static void bpf_iter_fini_tcp(void *priv_data)
3336 {
3337         struct bpf_tcp_iter_state *iter = priv_data;
3338
3339         bpf_iter_fini_seq_net(priv_data);
3340         kvfree(iter->batch);
3341 }
3342
3343 static const struct bpf_iter_seq_info tcp_seq_info = {
3344         .seq_ops                = &bpf_iter_tcp_seq_ops,
3345         .init_seq_private       = bpf_iter_init_tcp,
3346         .fini_seq_private       = bpf_iter_fini_tcp,
3347         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3348 };
3349
3350 static const struct bpf_func_proto *
3351 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3352                             const struct bpf_prog *prog)
3353 {
3354         switch (func_id) {
3355         case BPF_FUNC_setsockopt:
3356                 return &bpf_sk_setsockopt_proto;
3357         case BPF_FUNC_getsockopt:
3358                 return &bpf_sk_getsockopt_proto;
3359         default:
3360                 return NULL;
3361         }
3362 }
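/* Since the iterator exposes bpf_sk_setsockopt()/bpf_sk_getsockopt() through
 * the func_proto hook above, an attached program may tune every socket it
 * visits while the show path holds the socket lock.  A hedged sketch,
 * assuming a separate BPF object (names below are illustrative, not from
 * this file):
 *
 *	SEC("iter/tcp")
 *	int set_cc(struct bpf_iter__tcp *ctx)
 *	{
 *		char cc[] = "cubic";
 *		struct sock_common *skc = ctx->sk_common;
 *		struct tcp_sock *tp;
 *
 *		if (!skc)
 *			return 0;
 *		tp = bpf_skc_to_tcp_sock(skc);
 *		if (!tp)
 *			return 0;
 *		bpf_setsockopt(tp, SOL_TCP, TCP_CONGESTION, cc, sizeof(cc));
 *		return 0;
 *	}
 */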
3363
3364 static struct bpf_iter_reg tcp_reg_info = {
3365         .target                 = "tcp",
3366         .ctx_arg_info_size      = 1,
3367         .ctx_arg_info           = {
3368                 { offsetof(struct bpf_iter__tcp, sk_common),
3369                   PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED },
3370         },
3371         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3372         .seq_info               = &tcp_seq_info,
3373 };
3374
3375 static void __init bpf_iter_register(void)
3376 {
3377         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3378         if (bpf_iter_reg_target(&tcp_reg_info))
3379                 pr_warn("Warning: could not register bpf iterator tcp\n");
3380 }
3381
3382 #endif
3383
3384 void __init tcp_v4_init(void)
3385 {
3386         int cpu, res;
3387
3388         for_each_possible_cpu(cpu) {
3389                 struct sock *sk;
3390
3391                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3392                                            IPPROTO_TCP, &init_net);
3393                 if (res)
3394                         panic("Failed to create the TCP control socket.\n");
3395                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3396
3397                 /* Enforce IP_DF and IPID==0 for RST and ACK packets
3398                  * sent in SYN-RECV and TIME-WAIT state.
3399                  */
3400                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3401
3402                 per_cpu(ipv4_tcp_sk, cpu) = sk;
3403         }
3404         if (register_pernet_subsys(&tcp_sk_ops))
3405                 panic("Failed to create the TCP control socket.\n");
3406
3407 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3408         bpf_iter_register();
3409 #endif
3410 }