bpf: tcp: Use sock_gen_put instead of sock_put in bpf_iter_tcp
[platform/kernel/linux-starfive.git] / net / ipv4 / tcp_ipv4.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              Implementation of the Transmission Control Protocol(TCP).
8  *
9  *              IPv4 specific functions
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  */
18
19 /*
20  * Changes:
21  *              David S. Miller :       New socket lookup architecture.
22  *                                      This code is dedicated to John Dyson.
23  *              David S. Miller :       Change semantics of established hash,
24  *                                      half is devoted to TIME_WAIT sockets
25  *                                      and the rest go in the other half.
26  *              Andi Kleen :            Add support for syncookies and fixed
27  *                                      some bugs: ip options weren't passed to
28  *                                      the TCP layer, missed a check for an
29  *                                      ACK bit.
30  *              Andi Kleen :            Implemented fast path mtu discovery.
31  *                                      Fixed many serious bugs in the
32  *                                      request_sock handling and moved
33  *                                      most of it into the af independent code.
34  *                                      Added tail drop and some other bugfixes.
35  *                                      Added new listen semantics.
36  *              Mike McLagan    :       Routing by source
37  *      Juan Jose Ciarlante:            ip_dynaddr bits
38  *              Andi Kleen:             various fixes.
39  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
40  *                                      coma.
41  *      Andi Kleen              :       Fix new listen.
42  *      Andi Kleen              :       Fix accept error reporting.
43  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
44  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
45  *                                      a single port at the same time.
46  */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98         return secure_tcp_seq(ip_hdr(skb)->daddr,
99                               ip_hdr(skb)->saddr,
100                               tcp_hdr(skb)->dest,
101                               tcp_hdr(skb)->source);
102 }
103
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106         return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
108
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111         int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112         const struct inet_timewait_sock *tw = inet_twsk(sktw);
113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114         struct tcp_sock *tp = tcp_sk(sk);
115
116         if (reuse == 2) {
117                 /* Still does not detect *everything* that goes through
118                  * lo, since we require a loopback src or dst address
119                  * or direct binding to 'lo' interface.
120                  */
121                 bool loopback = false;
122                 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123                         loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125                 if (tw->tw_family == AF_INET6) {
126                         if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128                             ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129                             ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130                                 loopback = true;
131                 } else
132 #endif
133                 {
134                         if (ipv4_is_loopback(tw->tw_daddr) ||
135                             ipv4_is_loopback(tw->tw_rcv_saddr))
136                                 loopback = true;
137                 }
138                 if (!loopback)
139                         reuse = 0;
140         }
141
142         /* With PAWS, it is safe from the viewpoint
143            of data integrity. Even without PAWS it is safe provided sequence
144            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
145
146            Actually, the idea is close to VJ's one, only timestamp cache is
147            held not per host, but per port pair and TW bucket is used as state
148            holder.
149
150            If TW bucket has been already destroyed we fall back to VJ's scheme
151            and use initial timestamp retrieved from peer table.
152          */
153         if (tcptw->tw_ts_recent_stamp &&
154             (!twp || (reuse && time_after32(ktime_get_seconds(),
155                                             tcptw->tw_ts_recent_stamp)))) {
156                 /* In case of repair and re-using TIME-WAIT sockets we still
157                  * want to be sure that it is safe as above but honor the
158                  * sequence numbers and time stamps set as part of the repair
159                  * process.
160                  *
161                  * Without this check re-using a TIME-WAIT socket with TCP
162                  * repair would accumulate a -1 on the repair assigned
163                  * sequence number. The first time it is reused the sequence
164                  * is -1, the second time -2, etc. This fixes that issue
165                  * without appearing to create any others.
166                  */
167                 if (likely(!tp->repair)) {
168                         u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169
170                         if (!seq)
171                                 seq = 1;
172                         WRITE_ONCE(tp->write_seq, seq);
173                         tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
174                         tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175                 }
176                 sock_hold(sktw);
177                 return 1;
178         }
179
180         return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
183
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185                               int addr_len)
186 {
187         /* This check is replicated from tcp_v4_connect() and intended to
188          * prevent BPF program called below from accessing bytes that are out
189          * of the bound specified by user in addr_len.
190          */
191         if (addr_len < sizeof(struct sockaddr_in))
192                 return -EINVAL;
193
194         sock_owned_by_me(sk);
195
196         return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203         struct inet_timewait_death_row *tcp_death_row;
204         struct inet_sock *inet = inet_sk(sk);
205         struct tcp_sock *tp = tcp_sk(sk);
206         struct ip_options_rcu *inet_opt;
207         struct net *net = sock_net(sk);
208         __be16 orig_sport, orig_dport;
209         __be32 daddr, nexthop;
210         struct flowi4 *fl4;
211         struct rtable *rt;
212         int err;
213
214         if (addr_len < sizeof(struct sockaddr_in))
215                 return -EINVAL;
216
217         if (usin->sin_family != AF_INET)
218                 return -EAFNOSUPPORT;
219
220         nexthop = daddr = usin->sin_addr.s_addr;
221         inet_opt = rcu_dereference_protected(inet->inet_opt,
222                                              lockdep_sock_is_held(sk));
223         if (inet_opt && inet_opt->opt.srr) {
224                 if (!daddr)
225                         return -EINVAL;
226                 nexthop = inet_opt->opt.faddr;
227         }
228
229         orig_sport = inet->inet_sport;
230         orig_dport = usin->sin_port;
231         fl4 = &inet->cork.fl.u.ip4;
232         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
233                               sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
234                               orig_dport, sk);
235         if (IS_ERR(rt)) {
236                 err = PTR_ERR(rt);
237                 if (err == -ENETUNREACH)
238                         IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
239                 return err;
240         }
241
242         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
243                 ip_rt_put(rt);
244                 return -ENETUNREACH;
245         }
246
247         if (!inet_opt || !inet_opt->opt.srr)
248                 daddr = fl4->daddr;
249
250         tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
251
252         if (!inet->inet_saddr) {
253                 err = inet_bhash2_update_saddr(sk,  &fl4->saddr, AF_INET);
254                 if (err) {
255                         ip_rt_put(rt);
256                         return err;
257                 }
258         } else {
259                 sk_rcv_saddr_set(sk, inet->inet_saddr);
260         }
261
262         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
263                 /* Reset inherited state */
264                 tp->rx_opt.ts_recent       = 0;
265                 tp->rx_opt.ts_recent_stamp = 0;
266                 if (likely(!tp->repair))
267                         WRITE_ONCE(tp->write_seq, 0);
268         }
269
270         inet->inet_dport = usin->sin_port;
271         sk_daddr_set(sk, daddr);
272
273         inet_csk(sk)->icsk_ext_hdr_len = 0;
274         if (inet_opt)
275                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
276
277         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
278
279         /* Socket identity is still unknown (sport may be zero).
280          * However we set state to SYN-SENT and not releasing socket
281          * lock select source port, enter ourselves into the hash tables and
282          * complete initialization after this.
283          */
284         tcp_set_state(sk, TCP_SYN_SENT);
285         err = inet_hash_connect(tcp_death_row, sk);
286         if (err)
287                 goto failure;
288
289         sk_set_txhash(sk);
290
291         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
292                                inet->inet_sport, inet->inet_dport, sk);
293         if (IS_ERR(rt)) {
294                 err = PTR_ERR(rt);
295                 rt = NULL;
296                 goto failure;
297         }
298         /* OK, now commit destination to socket.  */
299         sk->sk_gso_type = SKB_GSO_TCPV4;
300         sk_setup_caps(sk, &rt->dst);
301         rt = NULL;
302
303         if (likely(!tp->repair)) {
304                 if (!tp->write_seq)
305                         WRITE_ONCE(tp->write_seq,
306                                    secure_tcp_seq(inet->inet_saddr,
307                                                   inet->inet_daddr,
308                                                   inet->inet_sport,
309                                                   usin->sin_port));
310                 tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr,
311                                                  inet->inet_daddr);
312         }
313
314         inet->inet_id = get_random_u16();
315
316         if (tcp_fastopen_defer_connect(sk, &err))
317                 return err;
318         if (err)
319                 goto failure;
320
321         err = tcp_connect(sk);
322
323         if (err)
324                 goto failure;
325
326         return 0;
327
328 failure:
329         /*
330          * This unhashes the socket and releases the local port,
331          * if necessary.
332          */
333         tcp_set_state(sk, TCP_CLOSE);
334         inet_bhash2_reset_saddr(sk);
335         ip_rt_put(rt);
336         sk->sk_route_caps = 0;
337         inet->inet_dport = 0;
338         return err;
339 }
340 EXPORT_SYMBOL(tcp_v4_connect);
341
342 /*
343  * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
344  * It can be called through tcp_release_cb() if socket was owned by user
345  * at the time tcp_v4_err() was called to handle ICMP message.
346  */
347 void tcp_v4_mtu_reduced(struct sock *sk)
348 {
349         struct inet_sock *inet = inet_sk(sk);
350         struct dst_entry *dst;
351         u32 mtu;
352
353         if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
354                 return;
355         mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
356         dst = inet_csk_update_pmtu(sk, mtu);
357         if (!dst)
358                 return;
359
360         /* Something is about to be wrong... Remember soft error
361          * for the case, if this connection will not able to recover.
362          */
363         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
364                 sk->sk_err_soft = EMSGSIZE;
365
366         mtu = dst_mtu(dst);
367
368         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
369             ip_sk_accept_pmtu(sk) &&
370             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
371                 tcp_sync_mss(sk, mtu);
372
373                 /* Resend the TCP packet because it's
374                  * clear that the old packet has been
375                  * dropped. This is the new "fast" path mtu
376                  * discovery.
377                  */
378                 tcp_simple_retransmit(sk);
379         } /* else let the usual retransmit timer handle it */
380 }
381 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
382
383 static void do_redirect(struct sk_buff *skb, struct sock *sk)
384 {
385         struct dst_entry *dst = __sk_dst_check(sk, 0);
386
387         if (dst)
388                 dst->ops->redirect(dst, sk, skb);
389 }
390
391
392 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
393 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
394 {
395         struct request_sock *req = inet_reqsk(sk);
396         struct net *net = sock_net(sk);
397
398         /* ICMPs are not backlogged, hence we cannot get
399          * an established socket here.
400          */
401         if (seq != tcp_rsk(req)->snt_isn) {
402                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
403         } else if (abort) {
404                 /*
405                  * Still in SYN_RECV, just remove it silently.
406                  * There is no good way to pass the error to the newly
407                  * created socket, and POSIX does not want network
408                  * errors returned from accept().
409                  */
410                 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
411                 tcp_listendrop(req->rsk_listener);
412         }
413         reqsk_put(req);
414 }
415 EXPORT_SYMBOL(tcp_req_err);
416
417 /* TCP-LD (RFC 6069) logic */
418 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
419 {
420         struct inet_connection_sock *icsk = inet_csk(sk);
421         struct tcp_sock *tp = tcp_sk(sk);
422         struct sk_buff *skb;
423         s32 remaining;
424         u32 delta_us;
425
426         if (sock_owned_by_user(sk))
427                 return;
428
429         if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
430             !icsk->icsk_backoff)
431                 return;
432
433         skb = tcp_rtx_queue_head(sk);
434         if (WARN_ON_ONCE(!skb))
435                 return;
436
437         icsk->icsk_backoff--;
438         icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
439         icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
440
441         tcp_mstamp_refresh(tp);
442         delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
443         remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
444
445         if (remaining > 0) {
446                 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
447                                           remaining, TCP_RTO_MAX);
448         } else {
449                 /* RTO revert clocked out retransmission.
450                  * Will retransmit now.
451                  */
452                 tcp_retransmit_timer(sk);
453         }
454 }
455 EXPORT_SYMBOL(tcp_ld_RTO_revert);
456
457 /*
458  * This routine is called by the ICMP module when it gets some
459  * sort of error condition.  If err < 0 then the socket should
460  * be closed and the error returned to the user.  If err > 0
461  * it's just the icmp type << 8 | icmp code.  After adjustment
462  * header points to the first 8 bytes of the tcp header.  We need
463  * to find the appropriate port.
464  *
465  * The locking strategy used here is very "optimistic". When
466  * someone else accesses the socket the ICMP is just dropped
467  * and for some paths there is no check at all.
468  * A more general error queue to queue errors for later handling
469  * is probably better.
470  *
471  */
472
473 int tcp_v4_err(struct sk_buff *skb, u32 info)
474 {
475         const struct iphdr *iph = (const struct iphdr *)skb->data;
476         struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
477         struct tcp_sock *tp;
478         struct inet_sock *inet;
479         const int type = icmp_hdr(skb)->type;
480         const int code = icmp_hdr(skb)->code;
481         struct sock *sk;
482         struct request_sock *fastopen;
483         u32 seq, snd_una;
484         int err;
485         struct net *net = dev_net(skb->dev);
486
487         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
488                                        iph->daddr, th->dest, iph->saddr,
489                                        ntohs(th->source), inet_iif(skb), 0);
490         if (!sk) {
491                 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
492                 return -ENOENT;
493         }
494         if (sk->sk_state == TCP_TIME_WAIT) {
495                 inet_twsk_put(inet_twsk(sk));
496                 return 0;
497         }
498         seq = ntohl(th->seq);
499         if (sk->sk_state == TCP_NEW_SYN_RECV) {
500                 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
501                                      type == ICMP_TIME_EXCEEDED ||
502                                      (type == ICMP_DEST_UNREACH &&
503                                       (code == ICMP_NET_UNREACH ||
504                                        code == ICMP_HOST_UNREACH)));
505                 return 0;
506         }
507
508         bh_lock_sock(sk);
509         /* If too many ICMPs get dropped on busy
510          * servers this needs to be solved differently.
511          * We do take care of PMTU discovery (RFC1191) special case :
512          * we can receive locally generated ICMP messages while socket is held.
513          */
514         if (sock_owned_by_user(sk)) {
515                 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
516                         __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
517         }
518         if (sk->sk_state == TCP_CLOSE)
519                 goto out;
520
521         if (static_branch_unlikely(&ip4_min_ttl)) {
522                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
523                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
524                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
525                         goto out;
526                 }
527         }
528
529         tp = tcp_sk(sk);
530         /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
531         fastopen = rcu_dereference(tp->fastopen_rsk);
532         snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
533         if (sk->sk_state != TCP_LISTEN &&
534             !between(seq, snd_una, tp->snd_nxt)) {
535                 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
536                 goto out;
537         }
538
539         switch (type) {
540         case ICMP_REDIRECT:
541                 if (!sock_owned_by_user(sk))
542                         do_redirect(skb, sk);
543                 goto out;
544         case ICMP_SOURCE_QUENCH:
545                 /* Just silently ignore these. */
546                 goto out;
547         case ICMP_PARAMETERPROB:
548                 err = EPROTO;
549                 break;
550         case ICMP_DEST_UNREACH:
551                 if (code > NR_ICMP_UNREACH)
552                         goto out;
553
554                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
555                         /* We are not interested in TCP_LISTEN and open_requests
556                          * (SYN-ACKs send out by Linux are always <576bytes so
557                          * they should go through unfragmented).
558                          */
559                         if (sk->sk_state == TCP_LISTEN)
560                                 goto out;
561
562                         WRITE_ONCE(tp->mtu_info, info);
563                         if (!sock_owned_by_user(sk)) {
564                                 tcp_v4_mtu_reduced(sk);
565                         } else {
566                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
567                                         sock_hold(sk);
568                         }
569                         goto out;
570                 }
571
572                 err = icmp_err_convert[code].errno;
573                 /* check if this ICMP message allows revert of backoff.
574                  * (see RFC 6069)
575                  */
576                 if (!fastopen &&
577                     (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
578                         tcp_ld_RTO_revert(sk, seq);
579                 break;
580         case ICMP_TIME_EXCEEDED:
581                 err = EHOSTUNREACH;
582                 break;
583         default:
584                 goto out;
585         }
586
587         switch (sk->sk_state) {
588         case TCP_SYN_SENT:
589         case TCP_SYN_RECV:
590                 /* Only in fast or simultaneous open. If a fast open socket is
591                  * already accepted it is treated as a connected one below.
592                  */
593                 if (fastopen && !fastopen->sk)
594                         break;
595
596                 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
597
598                 if (!sock_owned_by_user(sk)) {
599                         sk->sk_err = err;
600
601                         sk_error_report(sk);
602
603                         tcp_done(sk);
604                 } else {
605                         sk->sk_err_soft = err;
606                 }
607                 goto out;
608         }
609
610         /* If we've already connected we will keep trying
611          * until we time out, or the user gives up.
612          *
613          * rfc1122 4.2.3.9 allows to consider as hard errors
614          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
615          * but it is obsoleted by pmtu discovery).
616          *
617          * Note, that in modern internet, where routing is unreliable
618          * and in each dark corner broken firewalls sit, sending random
619          * errors ordered by their masters even this two messages finally lose
620          * their original sense (even Linux sends invalid PORT_UNREACHs)
621          *
622          * Now we are in compliance with RFCs.
623          *                                                      --ANK (980905)
624          */
625
626         inet = inet_sk(sk);
627         if (!sock_owned_by_user(sk) && inet->recverr) {
628                 sk->sk_err = err;
629                 sk_error_report(sk);
630         } else  { /* Only an error on timeout */
631                 sk->sk_err_soft = err;
632         }
633
634 out:
635         bh_unlock_sock(sk);
636         sock_put(sk);
637         return 0;
638 }
639
640 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
641 {
642         struct tcphdr *th = tcp_hdr(skb);
643
644         th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
645         skb->csum_start = skb_transport_header(skb) - skb->head;
646         skb->csum_offset = offsetof(struct tcphdr, check);
647 }
648
649 /* This routine computes an IPv4 TCP checksum. */
650 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
651 {
652         const struct inet_sock *inet = inet_sk(sk);
653
654         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
655 }
656 EXPORT_SYMBOL(tcp_v4_send_check);
657
658 /*
659  *      This routine will send an RST to the other tcp.
660  *
661  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
662  *                    for reset.
663  *      Answer: if a packet caused RST, it is not for a socket
664  *              existing in our system, if it is matched to a socket,
665  *              it is just duplicate segment or bug in other side's TCP.
666  *              So that we build reply only basing on parameters
667  *              arrived with segment.
668  *      Exception: precedence violation. We do not implement it in any case.
669  */
670
671 #ifdef CONFIG_TCP_MD5SIG
672 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
673 #else
674 #define OPTION_BYTES sizeof(__be32)
675 #endif
676
677 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
678 {
679         const struct tcphdr *th = tcp_hdr(skb);
680         struct {
681                 struct tcphdr th;
682                 __be32 opt[OPTION_BYTES / sizeof(__be32)];
683         } rep;
684         struct ip_reply_arg arg;
685 #ifdef CONFIG_TCP_MD5SIG
686         struct tcp_md5sig_key *key = NULL;
687         const __u8 *hash_location = NULL;
688         unsigned char newhash[16];
689         int genhash;
690         struct sock *sk1 = NULL;
691 #endif
692         u64 transmit_time = 0;
693         struct sock *ctl_sk;
694         struct net *net;
695
696         /* Never send a reset in response to a reset. */
697         if (th->rst)
698                 return;
699
700         /* If sk not NULL, it means we did a successful lookup and incoming
701          * route had to be correct. prequeue might have dropped our dst.
702          */
703         if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
704                 return;
705
706         /* Swap the send and the receive. */
707         memset(&rep, 0, sizeof(rep));
708         rep.th.dest   = th->source;
709         rep.th.source = th->dest;
710         rep.th.doff   = sizeof(struct tcphdr) / 4;
711         rep.th.rst    = 1;
712
713         if (th->ack) {
714                 rep.th.seq = th->ack_seq;
715         } else {
716                 rep.th.ack = 1;
717                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
718                                        skb->len - (th->doff << 2));
719         }
720
721         memset(&arg, 0, sizeof(arg));
722         arg.iov[0].iov_base = (unsigned char *)&rep;
723         arg.iov[0].iov_len  = sizeof(rep.th);
724
725         net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
726 #ifdef CONFIG_TCP_MD5SIG
727         rcu_read_lock();
728         hash_location = tcp_parse_md5sig_option(th);
729         if (sk && sk_fullsock(sk)) {
730                 const union tcp_md5_addr *addr;
731                 int l3index;
732
733                 /* sdif set, means packet ingressed via a device
734                  * in an L3 domain and inet_iif is set to it.
735                  */
736                 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
737                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
738                 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
739         } else if (hash_location) {
740                 const union tcp_md5_addr *addr;
741                 int sdif = tcp_v4_sdif(skb);
742                 int dif = inet_iif(skb);
743                 int l3index;
744
745                 /*
746                  * active side is lost. Try to find listening socket through
747                  * source port, and then find md5 key through listening socket.
748                  * we are not loose security here:
749                  * Incoming packet is checked with md5 hash with finding key,
750                  * no RST generated if md5 hash doesn't match.
751                  */
752                 sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
753                                              NULL, 0, ip_hdr(skb)->saddr,
754                                              th->source, ip_hdr(skb)->daddr,
755                                              ntohs(th->source), dif, sdif);
756                 /* don't send rst if it can't find key */
757                 if (!sk1)
758                         goto out;
759
760                 /* sdif set, means packet ingressed via a device
761                  * in an L3 domain and dif is set to it.
762                  */
763                 l3index = sdif ? dif : 0;
764                 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
765                 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
766                 if (!key)
767                         goto out;
768
769
770                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
771                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
772                         goto out;
773
774         }
775
776         if (key) {
777                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
778                                    (TCPOPT_NOP << 16) |
779                                    (TCPOPT_MD5SIG << 8) |
780                                    TCPOLEN_MD5SIG);
781                 /* Update length and the length the header thinks exists */
782                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
783                 rep.th.doff = arg.iov[0].iov_len / 4;
784
785                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
786                                      key, ip_hdr(skb)->saddr,
787                                      ip_hdr(skb)->daddr, &rep.th);
788         }
789 #endif
790         /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
791         if (rep.opt[0] == 0) {
792                 __be32 mrst = mptcp_reset_option(skb);
793
794                 if (mrst) {
795                         rep.opt[0] = mrst;
796                         arg.iov[0].iov_len += sizeof(mrst);
797                         rep.th.doff = arg.iov[0].iov_len / 4;
798                 }
799         }
800
801         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
802                                       ip_hdr(skb)->saddr, /* XXX */
803                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
804         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
805         arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
806
807         /* When socket is gone, all binding information is lost.
808          * routing might fail in this case. No choice here, if we choose to force
809          * input interface, we will misroute in case of asymmetric route.
810          */
811         if (sk) {
812                 arg.bound_dev_if = sk->sk_bound_dev_if;
813                 if (sk_fullsock(sk))
814                         trace_tcp_send_reset(sk, skb);
815         }
816
817         BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
818                      offsetof(struct inet_timewait_sock, tw_bound_dev_if));
819
820         arg.tos = ip_hdr(skb)->tos;
821         arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
822         local_bh_disable();
823         ctl_sk = this_cpu_read(ipv4_tcp_sk);
824         sock_net_set(ctl_sk, net);
825         if (sk) {
826                 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
827                                    inet_twsk(sk)->tw_mark : sk->sk_mark;
828                 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
829                                    inet_twsk(sk)->tw_priority : sk->sk_priority;
830                 transmit_time = tcp_transmit_time(sk);
831                 xfrm_sk_clone_policy(ctl_sk, sk);
832         }
833         ip_send_unicast_reply(ctl_sk,
834                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
835                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
836                               &arg, arg.iov[0].iov_len,
837                               transmit_time);
838
839         ctl_sk->sk_mark = 0;
840         xfrm_sk_free_policy(ctl_sk);
841         sock_net_set(ctl_sk, &init_net);
842         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
843         __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
844         local_bh_enable();
845
846 #ifdef CONFIG_TCP_MD5SIG
847 out:
848         rcu_read_unlock();
849 #endif
850 }
851
852 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
853    outside socket context is ugly, certainly. What can I do?
854  */
855
856 static void tcp_v4_send_ack(const struct sock *sk,
857                             struct sk_buff *skb, u32 seq, u32 ack,
858                             u32 win, u32 tsval, u32 tsecr, int oif,
859                             struct tcp_md5sig_key *key,
860                             int reply_flags, u8 tos)
861 {
862         const struct tcphdr *th = tcp_hdr(skb);
863         struct {
864                 struct tcphdr th;
865                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
866 #ifdef CONFIG_TCP_MD5SIG
867                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
868 #endif
869                         ];
870         } rep;
871         struct net *net = sock_net(sk);
872         struct ip_reply_arg arg;
873         struct sock *ctl_sk;
874         u64 transmit_time;
875
876         memset(&rep.th, 0, sizeof(struct tcphdr));
877         memset(&arg, 0, sizeof(arg));
878
879         arg.iov[0].iov_base = (unsigned char *)&rep;
880         arg.iov[0].iov_len  = sizeof(rep.th);
881         if (tsecr) {
882                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
883                                    (TCPOPT_TIMESTAMP << 8) |
884                                    TCPOLEN_TIMESTAMP);
885                 rep.opt[1] = htonl(tsval);
886                 rep.opt[2] = htonl(tsecr);
887                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
888         }
889
890         /* Swap the send and the receive. */
891         rep.th.dest    = th->source;
892         rep.th.source  = th->dest;
893         rep.th.doff    = arg.iov[0].iov_len / 4;
894         rep.th.seq     = htonl(seq);
895         rep.th.ack_seq = htonl(ack);
896         rep.th.ack     = 1;
897         rep.th.window  = htons(win);
898
899 #ifdef CONFIG_TCP_MD5SIG
900         if (key) {
901                 int offset = (tsecr) ? 3 : 0;
902
903                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
904                                           (TCPOPT_NOP << 16) |
905                                           (TCPOPT_MD5SIG << 8) |
906                                           TCPOLEN_MD5SIG);
907                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
908                 rep.th.doff = arg.iov[0].iov_len/4;
909
910                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
911                                     key, ip_hdr(skb)->saddr,
912                                     ip_hdr(skb)->daddr, &rep.th);
913         }
914 #endif
915         arg.flags = reply_flags;
916         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
917                                       ip_hdr(skb)->saddr, /* XXX */
918                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
919         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
920         if (oif)
921                 arg.bound_dev_if = oif;
922         arg.tos = tos;
923         arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
924         local_bh_disable();
925         ctl_sk = this_cpu_read(ipv4_tcp_sk);
926         sock_net_set(ctl_sk, net);
927         ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
928                            inet_twsk(sk)->tw_mark : sk->sk_mark;
929         ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
930                            inet_twsk(sk)->tw_priority : sk->sk_priority;
931         transmit_time = tcp_transmit_time(sk);
932         ip_send_unicast_reply(ctl_sk,
933                               skb, &TCP_SKB_CB(skb)->header.h4.opt,
934                               ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
935                               &arg, arg.iov[0].iov_len,
936                               transmit_time);
937
938         ctl_sk->sk_mark = 0;
939         sock_net_set(ctl_sk, &init_net);
940         __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
941         local_bh_enable();
942 }
943
944 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
945 {
946         struct inet_timewait_sock *tw = inet_twsk(sk);
947         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
948
949         tcp_v4_send_ack(sk, skb,
950                         tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
951                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
952                         tcp_time_stamp_raw() + tcptw->tw_ts_offset,
953                         tcptw->tw_ts_recent,
954                         tw->tw_bound_dev_if,
955                         tcp_twsk_md5_key(tcptw),
956                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
957                         tw->tw_tos
958                         );
959
960         inet_twsk_put(tw);
961 }
962
963 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
964                                   struct request_sock *req)
965 {
966         const union tcp_md5_addr *addr;
967         int l3index;
968
969         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
970          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
971          */
972         u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
973                                              tcp_sk(sk)->snd_nxt;
974
975         /* RFC 7323 2.3
976          * The window field (SEG.WND) of every outgoing segment, with the
977          * exception of <SYN> segments, MUST be right-shifted by
978          * Rcv.Wind.Shift bits:
979          */
980         addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
981         l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
982         tcp_v4_send_ack(sk, skb, seq,
983                         tcp_rsk(req)->rcv_nxt,
984                         req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
985                         tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
986                         req->ts_recent,
987                         0,
988                         tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
989                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
990                         ip_hdr(skb)->tos);
991 }
992
993 /*
994  *      Send a SYN-ACK after having received a SYN.
995  *      This still operates on a request_sock only, not on a big
996  *      socket.
997  */
998 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
999                               struct flowi *fl,
1000                               struct request_sock *req,
1001                               struct tcp_fastopen_cookie *foc,
1002                               enum tcp_synack_type synack_type,
1003                               struct sk_buff *syn_skb)
1004 {
1005         const struct inet_request_sock *ireq = inet_rsk(req);
1006         struct flowi4 fl4;
1007         int err = -1;
1008         struct sk_buff *skb;
1009         u8 tos;
1010
1011         /* First, grab a route. */
1012         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1013                 return -1;
1014
1015         skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1016
1017         if (skb) {
1018                 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1019
1020                 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1021                                 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1022                                 (inet_sk(sk)->tos & INET_ECN_MASK) :
1023                                 inet_sk(sk)->tos;
1024
1025                 if (!INET_ECN_is_capable(tos) &&
1026                     tcp_bpf_ca_needs_ecn((struct sock *)req))
1027                         tos |= INET_ECN_ECT_0;
1028
1029                 rcu_read_lock();
1030                 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1031                                             ireq->ir_rmt_addr,
1032                                             rcu_dereference(ireq->ireq_opt),
1033                                             tos);
1034                 rcu_read_unlock();
1035                 err = net_xmit_eval(err);
1036         }
1037
1038         return err;
1039 }
1040
1041 /*
1042  *      IPv4 request_sock destructor.
1043  */
1044 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1045 {
1046         kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1047 }
1048
1049 #ifdef CONFIG_TCP_MD5SIG
1050 /*
1051  * RFC2385 MD5 checksumming requires a mapping of
1052  * IP address->MD5 Key.
1053  * We need to maintain these in the sk structure.
1054  */
1055
1056 DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ);
1057 EXPORT_SYMBOL(tcp_md5_needed);
1058
1059 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1060 {
1061         if (!old)
1062                 return true;
1063
1064         /* l3index always overrides non-l3index */
1065         if (old->l3index && new->l3index == 0)
1066                 return false;
1067         if (old->l3index == 0 && new->l3index)
1068                 return true;
1069
1070         return old->prefixlen < new->prefixlen;
1071 }
1072
1073 /* Find the Key structure for an address.  */
1074 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1075                                            const union tcp_md5_addr *addr,
1076                                            int family)
1077 {
1078         const struct tcp_sock *tp = tcp_sk(sk);
1079         struct tcp_md5sig_key *key;
1080         const struct tcp_md5sig_info *md5sig;
1081         __be32 mask;
1082         struct tcp_md5sig_key *best_match = NULL;
1083         bool match;
1084
1085         /* caller either holds rcu_read_lock() or socket lock */
1086         md5sig = rcu_dereference_check(tp->md5sig_info,
1087                                        lockdep_sock_is_held(sk));
1088         if (!md5sig)
1089                 return NULL;
1090
1091         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1092                                  lockdep_sock_is_held(sk)) {
1093                 if (key->family != family)
1094                         continue;
1095                 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1096                         continue;
1097                 if (family == AF_INET) {
1098                         mask = inet_make_mask(key->prefixlen);
1099                         match = (key->addr.a4.s_addr & mask) ==
1100                                 (addr->a4.s_addr & mask);
1101 #if IS_ENABLED(CONFIG_IPV6)
1102                 } else if (family == AF_INET6) {
1103                         match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1104                                                   key->prefixlen);
1105 #endif
1106                 } else {
1107                         match = false;
1108                 }
1109
1110                 if (match && better_md5_match(best_match, key))
1111                         best_match = key;
1112         }
1113         return best_match;
1114 }
1115 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1116
1117 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1118                                                       const union tcp_md5_addr *addr,
1119                                                       int family, u8 prefixlen,
1120                                                       int l3index, u8 flags)
1121 {
1122         const struct tcp_sock *tp = tcp_sk(sk);
1123         struct tcp_md5sig_key *key;
1124         unsigned int size = sizeof(struct in_addr);
1125         const struct tcp_md5sig_info *md5sig;
1126
1127         /* caller either holds rcu_read_lock() or socket lock */
1128         md5sig = rcu_dereference_check(tp->md5sig_info,
1129                                        lockdep_sock_is_held(sk));
1130         if (!md5sig)
1131                 return NULL;
1132 #if IS_ENABLED(CONFIG_IPV6)
1133         if (family == AF_INET6)
1134                 size = sizeof(struct in6_addr);
1135 #endif
1136         hlist_for_each_entry_rcu(key, &md5sig->head, node,
1137                                  lockdep_sock_is_held(sk)) {
1138                 if (key->family != family)
1139                         continue;
1140                 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1141                         continue;
1142                 if (key->l3index != l3index)
1143                         continue;
1144                 if (!memcmp(&key->addr, addr, size) &&
1145                     key->prefixlen == prefixlen)
1146                         return key;
1147         }
1148         return NULL;
1149 }
1150
1151 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1152                                          const struct sock *addr_sk)
1153 {
1154         const union tcp_md5_addr *addr;
1155         int l3index;
1156
1157         l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1158                                                  addr_sk->sk_bound_dev_if);
1159         addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1160         return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1161 }
1162 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1163
1164 static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp)
1165 {
1166         struct tcp_sock *tp = tcp_sk(sk);
1167         struct tcp_md5sig_info *md5sig;
1168
1169         md5sig = kmalloc(sizeof(*md5sig), gfp);
1170         if (!md5sig)
1171                 return -ENOMEM;
1172
1173         sk_gso_disable(sk);
1174         INIT_HLIST_HEAD(&md5sig->head);
1175         rcu_assign_pointer(tp->md5sig_info, md5sig);
1176         return 0;
1177 }
1178
1179 /* This can be called on a newly created socket, from other files */
1180 static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1181                             int family, u8 prefixlen, int l3index, u8 flags,
1182                             const u8 *newkey, u8 newkeylen, gfp_t gfp)
1183 {
1184         /* Add Key to the list */
1185         struct tcp_md5sig_key *key;
1186         struct tcp_sock *tp = tcp_sk(sk);
1187         struct tcp_md5sig_info *md5sig;
1188
1189         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1190         if (key) {
1191                 /* Pre-existing entry - just update that one.
1192                  * Note that the key might be used concurrently.
1193                  * data_race() is telling kcsan that we do not care of
1194                  * key mismatches, since changing MD5 key on live flows
1195                  * can lead to packet drops.
1196                  */
1197                 data_race(memcpy(key->key, newkey, newkeylen));
1198
1199                 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1200                  * Also note that a reader could catch new key->keylen value
1201                  * but old key->key[], this is the reason we use __GFP_ZERO
1202                  * at sock_kmalloc() time below these lines.
1203                  */
1204                 WRITE_ONCE(key->keylen, newkeylen);
1205
1206                 return 0;
1207         }
1208
1209         md5sig = rcu_dereference_protected(tp->md5sig_info,
1210                                            lockdep_sock_is_held(sk));
1211
1212         key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1213         if (!key)
1214                 return -ENOMEM;
1215         if (!tcp_alloc_md5sig_pool()) {
1216                 sock_kfree_s(sk, key, sizeof(*key));
1217                 return -ENOMEM;
1218         }
1219
1220         memcpy(key->key, newkey, newkeylen);
1221         key->keylen = newkeylen;
1222         key->family = family;
1223         key->prefixlen = prefixlen;
1224         key->l3index = l3index;
1225         key->flags = flags;
1226         memcpy(&key->addr, addr,
1227                (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1228                                                                  sizeof(struct in_addr));
1229         hlist_add_head_rcu(&key->node, &md5sig->head);
1230         return 0;
1231 }
1232
1233 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1234                    int family, u8 prefixlen, int l3index, u8 flags,
1235                    const u8 *newkey, u8 newkeylen)
1236 {
1237         struct tcp_sock *tp = tcp_sk(sk);
1238
1239         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1240                 if (tcp_md5sig_info_add(sk, GFP_KERNEL))
1241                         return -ENOMEM;
1242
1243                 if (!static_branch_inc(&tcp_md5_needed.key)) {
1244                         struct tcp_md5sig_info *md5sig;
1245
1246                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1247                         rcu_assign_pointer(tp->md5sig_info, NULL);
1248                         kfree_rcu(md5sig, rcu);
1249                         return -EUSERS;
1250                 }
1251         }
1252
1253         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags,
1254                                 newkey, newkeylen, GFP_KERNEL);
1255 }
1256 EXPORT_SYMBOL(tcp_md5_do_add);
1257
1258 int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr,
1259                      int family, u8 prefixlen, int l3index,
1260                      struct tcp_md5sig_key *key)
1261 {
1262         struct tcp_sock *tp = tcp_sk(sk);
1263
1264         if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) {
1265                 if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC)))
1266                         return -ENOMEM;
1267
1268                 if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) {
1269                         struct tcp_md5sig_info *md5sig;
1270
1271                         md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk));
1272                         net_warn_ratelimited("Too many TCP-MD5 keys in the system\n");
1273                         rcu_assign_pointer(tp->md5sig_info, NULL);
1274                         kfree_rcu(md5sig, rcu);
1275                         return -EUSERS;
1276                 }
1277         }
1278
1279         return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index,
1280                                 key->flags, key->key, key->keylen,
1281                                 sk_gfp_mask(sk, GFP_ATOMIC));
1282 }
1283 EXPORT_SYMBOL(tcp_md5_key_copy);
1284
1285 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1286                    u8 prefixlen, int l3index, u8 flags)
1287 {
1288         struct tcp_md5sig_key *key;
1289
1290         key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1291         if (!key)
1292                 return -ENOENT;
1293         hlist_del_rcu(&key->node);
1294         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1295         kfree_rcu(key, rcu);
1296         return 0;
1297 }
1298 EXPORT_SYMBOL(tcp_md5_do_del);
1299
1300 static void tcp_clear_md5_list(struct sock *sk)
1301 {
1302         struct tcp_sock *tp = tcp_sk(sk);
1303         struct tcp_md5sig_key *key;
1304         struct hlist_node *n;
1305         struct tcp_md5sig_info *md5sig;
1306
1307         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1308
1309         hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1310                 hlist_del_rcu(&key->node);
1311                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1312                 kfree_rcu(key, rcu);
1313         }
1314 }
1315
1316 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1317                                  sockptr_t optval, int optlen)
1318 {
1319         struct tcp_md5sig cmd;
1320         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1321         const union tcp_md5_addr *addr;
1322         u8 prefixlen = 32;
1323         int l3index = 0;
1324         u8 flags;
1325
1326         if (optlen < sizeof(cmd))
1327                 return -EINVAL;
1328
1329         if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1330                 return -EFAULT;
1331
1332         if (sin->sin_family != AF_INET)
1333                 return -EINVAL;
1334
1335         flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1336
1337         if (optname == TCP_MD5SIG_EXT &&
1338             cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1339                 prefixlen = cmd.tcpm_prefixlen;
1340                 if (prefixlen > 32)
1341                         return -EINVAL;
1342         }
1343
1344         if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1345             cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1346                 struct net_device *dev;
1347
1348                 rcu_read_lock();
1349                 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1350                 if (dev && netif_is_l3_master(dev))
1351                         l3index = dev->ifindex;
1352
1353                 rcu_read_unlock();
1354
1355                 /* ok to reference set/not set outside of rcu;
1356                  * right now device MUST be an L3 master
1357                  */
1358                 if (!dev || !l3index)
1359                         return -EINVAL;
1360         }
1361
1362         addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1363
1364         if (!cmd.tcpm_keylen)
1365                 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1366
1367         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1368                 return -EINVAL;
1369
1370         return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1371                               cmd.tcpm_key, cmd.tcpm_keylen);
1372 }
1373
1374 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1375                                    __be32 daddr, __be32 saddr,
1376                                    const struct tcphdr *th, int nbytes)
1377 {
1378         struct tcp4_pseudohdr *bp;
1379         struct scatterlist sg;
1380         struct tcphdr *_th;
1381
1382         bp = hp->scratch;
1383         bp->saddr = saddr;
1384         bp->daddr = daddr;
1385         bp->pad = 0;
1386         bp->protocol = IPPROTO_TCP;
1387         bp->len = cpu_to_be16(nbytes);
1388
1389         _th = (struct tcphdr *)(bp + 1);
1390         memcpy(_th, th, sizeof(*th));
1391         _th->check = 0;
1392
1393         sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1394         ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1395                                 sizeof(*bp) + sizeof(*th));
1396         return crypto_ahash_update(hp->md5_req);
1397 }
1398
1399 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1400                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1401 {
1402         struct tcp_md5sig_pool *hp;
1403         struct ahash_request *req;
1404
1405         hp = tcp_get_md5sig_pool();
1406         if (!hp)
1407                 goto clear_hash_noput;
1408         req = hp->md5_req;
1409
1410         if (crypto_ahash_init(req))
1411                 goto clear_hash;
1412         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1413                 goto clear_hash;
1414         if (tcp_md5_hash_key(hp, key))
1415                 goto clear_hash;
1416         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1417         if (crypto_ahash_final(req))
1418                 goto clear_hash;
1419
1420         tcp_put_md5sig_pool();
1421         return 0;
1422
1423 clear_hash:
1424         tcp_put_md5sig_pool();
1425 clear_hash_noput:
1426         memset(md5_hash, 0, 16);
1427         return 1;
1428 }
1429
1430 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1431                         const struct sock *sk,
1432                         const struct sk_buff *skb)
1433 {
1434         struct tcp_md5sig_pool *hp;
1435         struct ahash_request *req;
1436         const struct tcphdr *th = tcp_hdr(skb);
1437         __be32 saddr, daddr;
1438
1439         if (sk) { /* valid for establish/request sockets */
1440                 saddr = sk->sk_rcv_saddr;
1441                 daddr = sk->sk_daddr;
1442         } else {
1443                 const struct iphdr *iph = ip_hdr(skb);
1444                 saddr = iph->saddr;
1445                 daddr = iph->daddr;
1446         }
1447
1448         hp = tcp_get_md5sig_pool();
1449         if (!hp)
1450                 goto clear_hash_noput;
1451         req = hp->md5_req;
1452
1453         if (crypto_ahash_init(req))
1454                 goto clear_hash;
1455
1456         if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1457                 goto clear_hash;
1458         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1459                 goto clear_hash;
1460         if (tcp_md5_hash_key(hp, key))
1461                 goto clear_hash;
1462         ahash_request_set_crypt(req, NULL, md5_hash, 0);
1463         if (crypto_ahash_final(req))
1464                 goto clear_hash;
1465
1466         tcp_put_md5sig_pool();
1467         return 0;
1468
1469 clear_hash:
1470         tcp_put_md5sig_pool();
1471 clear_hash_noput:
1472         memset(md5_hash, 0, 16);
1473         return 1;
1474 }
1475 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1476
1477 #endif
1478
1479 static void tcp_v4_init_req(struct request_sock *req,
1480                             const struct sock *sk_listener,
1481                             struct sk_buff *skb)
1482 {
1483         struct inet_request_sock *ireq = inet_rsk(req);
1484         struct net *net = sock_net(sk_listener);
1485
1486         sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1487         sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1488         RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1489 }
1490
1491 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1492                                           struct sk_buff *skb,
1493                                           struct flowi *fl,
1494                                           struct request_sock *req)
1495 {
1496         tcp_v4_init_req(req, sk, skb);
1497
1498         if (security_inet_conn_request(sk, skb, req))
1499                 return NULL;
1500
1501         return inet_csk_route_req(sk, &fl->u.ip4, req);
1502 }
1503
1504 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1505         .family         =       PF_INET,
1506         .obj_size       =       sizeof(struct tcp_request_sock),
1507         .rtx_syn_ack    =       tcp_rtx_synack,
1508         .send_ack       =       tcp_v4_reqsk_send_ack,
1509         .destructor     =       tcp_v4_reqsk_destructor,
1510         .send_reset     =       tcp_v4_send_reset,
1511         .syn_ack_timeout =      tcp_syn_ack_timeout,
1512 };
1513
1514 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1515         .mss_clamp      =       TCP_MSS_DEFAULT,
1516 #ifdef CONFIG_TCP_MD5SIG
1517         .req_md5_lookup =       tcp_v4_md5_lookup,
1518         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1519 #endif
1520 #ifdef CONFIG_SYN_COOKIES
1521         .cookie_init_seq =      cookie_v4_init_sequence,
1522 #endif
1523         .route_req      =       tcp_v4_route_req,
1524         .init_seq       =       tcp_v4_init_seq,
1525         .init_ts_off    =       tcp_v4_init_ts_off,
1526         .send_synack    =       tcp_v4_send_synack,
1527 };
1528
1529 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1530 {
1531         /* Never answer to SYNs send to broadcast or multicast */
1532         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1533                 goto drop;
1534
1535         return tcp_conn_request(&tcp_request_sock_ops,
1536                                 &tcp_request_sock_ipv4_ops, sk, skb);
1537
1538 drop:
1539         tcp_listendrop(sk);
1540         return 0;
1541 }
1542 EXPORT_SYMBOL(tcp_v4_conn_request);
1543
1544
1545 /*
1546  * The three way handshake has completed - we got a valid synack -
1547  * now create the new socket.
1548  */
1549 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1550                                   struct request_sock *req,
1551                                   struct dst_entry *dst,
1552                                   struct request_sock *req_unhash,
1553                                   bool *own_req)
1554 {
1555         struct inet_request_sock *ireq;
1556         bool found_dup_sk = false;
1557         struct inet_sock *newinet;
1558         struct tcp_sock *newtp;
1559         struct sock *newsk;
1560 #ifdef CONFIG_TCP_MD5SIG
1561         const union tcp_md5_addr *addr;
1562         struct tcp_md5sig_key *key;
1563         int l3index;
1564 #endif
1565         struct ip_options_rcu *inet_opt;
1566
1567         if (sk_acceptq_is_full(sk))
1568                 goto exit_overflow;
1569
1570         newsk = tcp_create_openreq_child(sk, req, skb);
1571         if (!newsk)
1572                 goto exit_nonewsk;
1573
1574         newsk->sk_gso_type = SKB_GSO_TCPV4;
1575         inet_sk_rx_dst_set(newsk, skb);
1576
1577         newtp                 = tcp_sk(newsk);
1578         newinet               = inet_sk(newsk);
1579         ireq                  = inet_rsk(req);
1580         sk_daddr_set(newsk, ireq->ir_rmt_addr);
1581         sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1582         newsk->sk_bound_dev_if = ireq->ir_iif;
1583         newinet->inet_saddr   = ireq->ir_loc_addr;
1584         inet_opt              = rcu_dereference(ireq->ireq_opt);
1585         RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1586         newinet->mc_index     = inet_iif(skb);
1587         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1588         newinet->rcv_tos      = ip_hdr(skb)->tos;
1589         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1590         if (inet_opt)
1591                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1592         newinet->inet_id = get_random_u16();
1593
1594         /* Set ToS of the new socket based upon the value of incoming SYN.
1595          * ECT bits are set later in tcp_init_transfer().
1596          */
1597         if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1598                 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1599
1600         if (!dst) {
1601                 dst = inet_csk_route_child_sock(sk, newsk, req);
1602                 if (!dst)
1603                         goto put_and_exit;
1604         } else {
1605                 /* syncookie case : see end of cookie_v4_check() */
1606         }
1607         sk_setup_caps(newsk, dst);
1608
1609         tcp_ca_openreq_child(newsk, dst);
1610
1611         tcp_sync_mss(newsk, dst_mtu(dst));
1612         newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1613
1614         tcp_initialize_rcv_mss(newsk);
1615
1616 #ifdef CONFIG_TCP_MD5SIG
1617         l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1618         /* Copy over the MD5 key from the original socket */
1619         addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1620         key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1621         if (key) {
1622                 if (tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key))
1623                         goto put_and_exit;
1624                 sk_gso_disable(newsk);
1625         }
1626 #endif
1627
1628         if (__inet_inherit_port(sk, newsk) < 0)
1629                 goto put_and_exit;
1630         *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1631                                        &found_dup_sk);
1632         if (likely(*own_req)) {
1633                 tcp_move_syn(newtp, req);
1634                 ireq->ireq_opt = NULL;
1635         } else {
1636                 newinet->inet_opt = NULL;
1637
1638                 if (!req_unhash && found_dup_sk) {
1639                         /* This code path should only be executed in the
1640                          * syncookie case only
1641                          */
1642                         bh_unlock_sock(newsk);
1643                         sock_put(newsk);
1644                         newsk = NULL;
1645                 }
1646         }
1647         return newsk;
1648
1649 exit_overflow:
1650         NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1651 exit_nonewsk:
1652         dst_release(dst);
1653 exit:
1654         tcp_listendrop(sk);
1655         return NULL;
1656 put_and_exit:
1657         newinet->inet_opt = NULL;
1658         inet_csk_prepare_forced_close(newsk);
1659         tcp_done(newsk);
1660         goto exit;
1661 }
1662 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1663
1664 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1665 {
1666 #ifdef CONFIG_SYN_COOKIES
1667         const struct tcphdr *th = tcp_hdr(skb);
1668
1669         if (!th->syn)
1670                 sk = cookie_v4_check(sk, skb);
1671 #endif
1672         return sk;
1673 }
1674
1675 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1676                          struct tcphdr *th, u32 *cookie)
1677 {
1678         u16 mss = 0;
1679 #ifdef CONFIG_SYN_COOKIES
1680         mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1681                                     &tcp_request_sock_ipv4_ops, sk, th);
1682         if (mss) {
1683                 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1684                 tcp_synq_overflow(sk);
1685         }
1686 #endif
1687         return mss;
1688 }
1689
1690 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1691                                                            u32));
1692 /* The socket must have it's spinlock held when we get
1693  * here, unless it is a TCP_LISTEN socket.
1694  *
1695  * We have a potential double-lock case here, so even when
1696  * doing backlog processing we use the BH locking scheme.
1697  * This is because we cannot sleep with the original spinlock
1698  * held.
1699  */
1700 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1701 {
1702         enum skb_drop_reason reason;
1703         struct sock *rsk;
1704
1705         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1706                 struct dst_entry *dst;
1707
1708                 dst = rcu_dereference_protected(sk->sk_rx_dst,
1709                                                 lockdep_sock_is_held(sk));
1710
1711                 sock_rps_save_rxhash(sk, skb);
1712                 sk_mark_napi_id(sk, skb);
1713                 if (dst) {
1714                         if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1715                             !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1716                                              dst, 0)) {
1717                                 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1718                                 dst_release(dst);
1719                         }
1720                 }
1721                 tcp_rcv_established(sk, skb);
1722                 return 0;
1723         }
1724
1725         reason = SKB_DROP_REASON_NOT_SPECIFIED;
1726         if (tcp_checksum_complete(skb))
1727                 goto csum_err;
1728
1729         if (sk->sk_state == TCP_LISTEN) {
1730                 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1731
1732                 if (!nsk)
1733                         goto discard;
1734                 if (nsk != sk) {
1735                         if (tcp_child_process(sk, nsk, skb)) {
1736                                 rsk = nsk;
1737                                 goto reset;
1738                         }
1739                         return 0;
1740                 }
1741         } else
1742                 sock_rps_save_rxhash(sk, skb);
1743
1744         if (tcp_rcv_state_process(sk, skb)) {
1745                 rsk = sk;
1746                 goto reset;
1747         }
1748         return 0;
1749
1750 reset:
1751         tcp_v4_send_reset(rsk, skb);
1752 discard:
1753         kfree_skb_reason(skb, reason);
1754         /* Be careful here. If this function gets more complicated and
1755          * gcc suffers from register pressure on the x86, sk (in %ebx)
1756          * might be destroyed here. This current version compiles correctly,
1757          * but you have been warned.
1758          */
1759         return 0;
1760
1761 csum_err:
1762         reason = SKB_DROP_REASON_TCP_CSUM;
1763         trace_tcp_bad_csum(skb);
1764         TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1765         TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1766         goto discard;
1767 }
1768 EXPORT_SYMBOL(tcp_v4_do_rcv);
1769
1770 int tcp_v4_early_demux(struct sk_buff *skb)
1771 {
1772         struct net *net = dev_net(skb->dev);
1773         const struct iphdr *iph;
1774         const struct tcphdr *th;
1775         struct sock *sk;
1776
1777         if (skb->pkt_type != PACKET_HOST)
1778                 return 0;
1779
1780         if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1781                 return 0;
1782
1783         iph = ip_hdr(skb);
1784         th = tcp_hdr(skb);
1785
1786         if (th->doff < sizeof(struct tcphdr) / 4)
1787                 return 0;
1788
1789         sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
1790                                        iph->saddr, th->source,
1791                                        iph->daddr, ntohs(th->dest),
1792                                        skb->skb_iif, inet_sdif(skb));
1793         if (sk) {
1794                 skb->sk = sk;
1795                 skb->destructor = sock_edemux;
1796                 if (sk_fullsock(sk)) {
1797                         struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1798
1799                         if (dst)
1800                                 dst = dst_check(dst, 0);
1801                         if (dst &&
1802                             sk->sk_rx_dst_ifindex == skb->skb_iif)
1803                                 skb_dst_set_noref(skb, dst);
1804                 }
1805         }
1806         return 0;
1807 }
1808
1809 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1810                      enum skb_drop_reason *reason)
1811 {
1812         u32 limit, tail_gso_size, tail_gso_segs;
1813         struct skb_shared_info *shinfo;
1814         const struct tcphdr *th;
1815         struct tcphdr *thtail;
1816         struct sk_buff *tail;
1817         unsigned int hdrlen;
1818         bool fragstolen;
1819         u32 gso_segs;
1820         u32 gso_size;
1821         int delta;
1822
1823         /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1824          * we can fix skb->truesize to its real value to avoid future drops.
1825          * This is valid because skb is not yet charged to the socket.
1826          * It has been noticed pure SACK packets were sometimes dropped
1827          * (if cooked by drivers without copybreak feature).
1828          */
1829         skb_condense(skb);
1830
1831         skb_dst_drop(skb);
1832
1833         if (unlikely(tcp_checksum_complete(skb))) {
1834                 bh_unlock_sock(sk);
1835                 trace_tcp_bad_csum(skb);
1836                 *reason = SKB_DROP_REASON_TCP_CSUM;
1837                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1838                 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1839                 return true;
1840         }
1841
1842         /* Attempt coalescing to last skb in backlog, even if we are
1843          * above the limits.
1844          * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1845          */
1846         th = (const struct tcphdr *)skb->data;
1847         hdrlen = th->doff * 4;
1848
1849         tail = sk->sk_backlog.tail;
1850         if (!tail)
1851                 goto no_coalesce;
1852         thtail = (struct tcphdr *)tail->data;
1853
1854         if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1855             TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1856             ((TCP_SKB_CB(tail)->tcp_flags |
1857               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1858             !((TCP_SKB_CB(tail)->tcp_flags &
1859               TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1860             ((TCP_SKB_CB(tail)->tcp_flags ^
1861               TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1862 #ifdef CONFIG_TLS_DEVICE
1863             tail->decrypted != skb->decrypted ||
1864 #endif
1865             thtail->doff != th->doff ||
1866             memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1867                 goto no_coalesce;
1868
1869         __skb_pull(skb, hdrlen);
1870
1871         shinfo = skb_shinfo(skb);
1872         gso_size = shinfo->gso_size ?: skb->len;
1873         gso_segs = shinfo->gso_segs ?: 1;
1874
1875         shinfo = skb_shinfo(tail);
1876         tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1877         tail_gso_segs = shinfo->gso_segs ?: 1;
1878
1879         if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1880                 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1881
1882                 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1883                         TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1884                         thtail->window = th->window;
1885                 }
1886
1887                 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1888                  * thtail->fin, so that the fast path in tcp_rcv_established()
1889                  * is not entered if we append a packet with a FIN.
1890                  * SYN, RST, URG are not present.
1891                  * ACK is set on both packets.
1892                  * PSH : we do not really care in TCP stack,
1893                  *       at least for 'GRO' packets.
1894                  */
1895                 thtail->fin |= th->fin;
1896                 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1897
1898                 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1899                         TCP_SKB_CB(tail)->has_rxtstamp = true;
1900                         tail->tstamp = skb->tstamp;
1901                         skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1902                 }
1903
1904                 /* Not as strict as GRO. We only need to carry mss max value */
1905                 shinfo->gso_size = max(gso_size, tail_gso_size);
1906                 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1907
1908                 sk->sk_backlog.len += delta;
1909                 __NET_INC_STATS(sock_net(sk),
1910                                 LINUX_MIB_TCPBACKLOGCOALESCE);
1911                 kfree_skb_partial(skb, fragstolen);
1912                 return false;
1913         }
1914         __skb_push(skb, hdrlen);
1915
1916 no_coalesce:
1917         limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1);
1918
1919         /* Only socket owner can try to collapse/prune rx queues
1920          * to reduce memory overhead, so add a little headroom here.
1921          * Few sockets backlog are possibly concurrently non empty.
1922          */
1923         limit += 64 * 1024;
1924
1925         if (unlikely(sk_add_backlog(sk, skb, limit))) {
1926                 bh_unlock_sock(sk);
1927                 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1928                 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1929                 return true;
1930         }
1931         return false;
1932 }
1933 EXPORT_SYMBOL(tcp_add_backlog);
1934
1935 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1936 {
1937         struct tcphdr *th = (struct tcphdr *)skb->data;
1938
1939         return sk_filter_trim_cap(sk, skb, th->doff * 4);
1940 }
1941 EXPORT_SYMBOL(tcp_filter);
1942
1943 static void tcp_v4_restore_cb(struct sk_buff *skb)
1944 {
1945         memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1946                 sizeof(struct inet_skb_parm));
1947 }
1948
1949 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1950                            const struct tcphdr *th)
1951 {
1952         /* This is tricky : We move IPCB at its correct location into TCP_SKB_CB()
1953          * barrier() makes sure compiler wont play fool^Waliasing games.
1954          */
1955         memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1956                 sizeof(struct inet_skb_parm));
1957         barrier();
1958
1959         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1960         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1961                                     skb->len - th->doff * 4);
1962         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1963         TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1964         TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1965         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1966         TCP_SKB_CB(skb)->sacked  = 0;
1967         TCP_SKB_CB(skb)->has_rxtstamp =
1968                         skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1969 }
1970
1971 /*
1972  *      From tcp_input.c
1973  */
1974
1975 int tcp_v4_rcv(struct sk_buff *skb)
1976 {
1977         struct net *net = dev_net(skb->dev);
1978         enum skb_drop_reason drop_reason;
1979         int sdif = inet_sdif(skb);
1980         int dif = inet_iif(skb);
1981         const struct iphdr *iph;
1982         const struct tcphdr *th;
1983         bool refcounted;
1984         struct sock *sk;
1985         int ret;
1986
1987         drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1988         if (skb->pkt_type != PACKET_HOST)
1989                 goto discard_it;
1990
1991         /* Count it even if it's bad */
1992         __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1993
1994         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1995                 goto discard_it;
1996
1997         th = (const struct tcphdr *)skb->data;
1998
1999         if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
2000                 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
2001                 goto bad_packet;
2002         }
2003         if (!pskb_may_pull(skb, th->doff * 4))
2004                 goto discard_it;
2005
2006         /* An explanation is required here, I think.
2007          * Packet length and doff are validated by header prediction,
2008          * provided case of th->doff==0 is eliminated.
2009          * So, we defer the checks. */
2010
2011         if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
2012                 goto csum_error;
2013
2014         th = (const struct tcphdr *)skb->data;
2015         iph = ip_hdr(skb);
2016 lookup:
2017         sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
2018                                skb, __tcp_hdrlen(th), th->source,
2019                                th->dest, sdif, &refcounted);
2020         if (!sk)
2021                 goto no_tcp_socket;
2022
2023 process:
2024         if (sk->sk_state == TCP_TIME_WAIT)
2025                 goto do_time_wait;
2026
2027         if (sk->sk_state == TCP_NEW_SYN_RECV) {
2028                 struct request_sock *req = inet_reqsk(sk);
2029                 bool req_stolen = false;
2030                 struct sock *nsk;
2031
2032                 sk = req->rsk_listener;
2033                 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2034                         drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2035                 else
2036                         drop_reason = tcp_inbound_md5_hash(sk, skb,
2037                                                    &iph->saddr, &iph->daddr,
2038                                                    AF_INET, dif, sdif);
2039                 if (unlikely(drop_reason)) {
2040                         sk_drops_add(sk, skb);
2041                         reqsk_put(req);
2042                         goto discard_it;
2043                 }
2044                 if (tcp_checksum_complete(skb)) {
2045                         reqsk_put(req);
2046                         goto csum_error;
2047                 }
2048                 if (unlikely(sk->sk_state != TCP_LISTEN)) {
2049                         nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
2050                         if (!nsk) {
2051                                 inet_csk_reqsk_queue_drop_and_put(sk, req);
2052                                 goto lookup;
2053                         }
2054                         sk = nsk;
2055                         /* reuseport_migrate_sock() has already held one sk_refcnt
2056                          * before returning.
2057                          */
2058                 } else {
2059                         /* We own a reference on the listener, increase it again
2060                          * as we might lose it too soon.
2061                          */
2062                         sock_hold(sk);
2063                 }
2064                 refcounted = true;
2065                 nsk = NULL;
2066                 if (!tcp_filter(sk, skb)) {
2067                         th = (const struct tcphdr *)skb->data;
2068                         iph = ip_hdr(skb);
2069                         tcp_v4_fill_cb(skb, iph, th);
2070                         nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2071                 } else {
2072                         drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2073                 }
2074                 if (!nsk) {
2075                         reqsk_put(req);
2076                         if (req_stolen) {
2077                                 /* Another cpu got exclusive access to req
2078                                  * and created a full blown socket.
2079                                  * Try to feed this packet to this socket
2080                                  * instead of discarding it.
2081                                  */
2082                                 tcp_v4_restore_cb(skb);
2083                                 sock_put(sk);
2084                                 goto lookup;
2085                         }
2086                         goto discard_and_relse;
2087                 }
2088                 nf_reset_ct(skb);
2089                 if (nsk == sk) {
2090                         reqsk_put(req);
2091                         tcp_v4_restore_cb(skb);
2092                 } else if (tcp_child_process(sk, nsk, skb)) {
2093                         tcp_v4_send_reset(nsk, skb);
2094                         goto discard_and_relse;
2095                 } else {
2096                         sock_put(sk);
2097                         return 0;
2098                 }
2099         }
2100
2101         if (static_branch_unlikely(&ip4_min_ttl)) {
2102                 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2103                 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2104                         __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2105                         drop_reason = SKB_DROP_REASON_TCP_MINTTL;
2106                         goto discard_and_relse;
2107                 }
2108         }
2109
2110         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2111                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2112                 goto discard_and_relse;
2113         }
2114
2115         drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2116                                            &iph->daddr, AF_INET, dif, sdif);
2117         if (drop_reason)
2118                 goto discard_and_relse;
2119
2120         nf_reset_ct(skb);
2121
2122         if (tcp_filter(sk, skb)) {
2123                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2124                 goto discard_and_relse;
2125         }
2126         th = (const struct tcphdr *)skb->data;
2127         iph = ip_hdr(skb);
2128         tcp_v4_fill_cb(skb, iph, th);
2129
2130         skb->dev = NULL;
2131
2132         if (sk->sk_state == TCP_LISTEN) {
2133                 ret = tcp_v4_do_rcv(sk, skb);
2134                 goto put_and_return;
2135         }
2136
2137         sk_incoming_cpu_update(sk);
2138
2139         bh_lock_sock_nested(sk);
2140         tcp_segs_in(tcp_sk(sk), skb);
2141         ret = 0;
2142         if (!sock_owned_by_user(sk)) {
2143                 ret = tcp_v4_do_rcv(sk, skb);
2144         } else {
2145                 if (tcp_add_backlog(sk, skb, &drop_reason))
2146                         goto discard_and_relse;
2147         }
2148         bh_unlock_sock(sk);
2149
2150 put_and_return:
2151         if (refcounted)
2152                 sock_put(sk);
2153
2154         return ret;
2155
2156 no_tcp_socket:
2157         drop_reason = SKB_DROP_REASON_NO_SOCKET;
2158         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2159                 goto discard_it;
2160
2161         tcp_v4_fill_cb(skb, iph, th);
2162
2163         if (tcp_checksum_complete(skb)) {
2164 csum_error:
2165                 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2166                 trace_tcp_bad_csum(skb);
2167                 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2168 bad_packet:
2169                 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2170         } else {
2171                 tcp_v4_send_reset(NULL, skb);
2172         }
2173
2174 discard_it:
2175         SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2176         /* Discard frame. */
2177         kfree_skb_reason(skb, drop_reason);
2178         return 0;
2179
2180 discard_and_relse:
2181         sk_drops_add(sk, skb);
2182         if (refcounted)
2183                 sock_put(sk);
2184         goto discard_it;
2185
2186 do_time_wait:
2187         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2188                 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2189                 inet_twsk_put(inet_twsk(sk));
2190                 goto discard_it;
2191         }
2192
2193         tcp_v4_fill_cb(skb, iph, th);
2194
2195         if (tcp_checksum_complete(skb)) {
2196                 inet_twsk_put(inet_twsk(sk));
2197                 goto csum_error;
2198         }
2199         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2200         case TCP_TW_SYN: {
2201                 struct sock *sk2 = inet_lookup_listener(net,
2202                                                         net->ipv4.tcp_death_row.hashinfo,
2203                                                         skb, __tcp_hdrlen(th),
2204                                                         iph->saddr, th->source,
2205                                                         iph->daddr, th->dest,
2206                                                         inet_iif(skb),
2207                                                         sdif);
2208                 if (sk2) {
2209                         inet_twsk_deschedule_put(inet_twsk(sk));
2210                         sk = sk2;
2211                         tcp_v4_restore_cb(skb);
2212                         refcounted = false;
2213                         goto process;
2214                 }
2215         }
2216                 /* to ACK */
2217                 fallthrough;
2218         case TCP_TW_ACK:
2219                 tcp_v4_timewait_ack(sk, skb);
2220                 break;
2221         case TCP_TW_RST:
2222                 tcp_v4_send_reset(sk, skb);
2223                 inet_twsk_deschedule_put(inet_twsk(sk));
2224                 goto discard_it;
2225         case TCP_TW_SUCCESS:;
2226         }
2227         goto discard_it;
2228 }
2229
2230 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2231         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2232         .twsk_unique    = tcp_twsk_unique,
2233         .twsk_destructor= tcp_twsk_destructor,
2234 };
2235
2236 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2237 {
2238         struct dst_entry *dst = skb_dst(skb);
2239
2240         if (dst && dst_hold_safe(dst)) {
2241                 rcu_assign_pointer(sk->sk_rx_dst, dst);
2242                 sk->sk_rx_dst_ifindex = skb->skb_iif;
2243         }
2244 }
2245 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2246
2247 const struct inet_connection_sock_af_ops ipv4_specific = {
2248         .queue_xmit        = ip_queue_xmit,
2249         .send_check        = tcp_v4_send_check,
2250         .rebuild_header    = inet_sk_rebuild_header,
2251         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2252         .conn_request      = tcp_v4_conn_request,
2253         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2254         .net_header_len    = sizeof(struct iphdr),
2255         .setsockopt        = ip_setsockopt,
2256         .getsockopt        = ip_getsockopt,
2257         .addr2sockaddr     = inet_csk_addr2sockaddr,
2258         .sockaddr_len      = sizeof(struct sockaddr_in),
2259         .mtu_reduced       = tcp_v4_mtu_reduced,
2260 };
2261 EXPORT_SYMBOL(ipv4_specific);
2262
2263 #ifdef CONFIG_TCP_MD5SIG
2264 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2265         .md5_lookup             = tcp_v4_md5_lookup,
2266         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2267         .md5_parse              = tcp_v4_parse_md5_keys,
2268 };
2269 #endif
2270
2271 /* NOTE: A lot of things set to zero explicitly by call to
2272  *       sk_alloc() so need not be done here.
2273  */
2274 static int tcp_v4_init_sock(struct sock *sk)
2275 {
2276         struct inet_connection_sock *icsk = inet_csk(sk);
2277
2278         tcp_init_sock(sk);
2279
2280         icsk->icsk_af_ops = &ipv4_specific;
2281
2282 #ifdef CONFIG_TCP_MD5SIG
2283         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2284 #endif
2285
2286         return 0;
2287 }
2288
2289 void tcp_v4_destroy_sock(struct sock *sk)
2290 {
2291         struct tcp_sock *tp = tcp_sk(sk);
2292
2293         trace_tcp_destroy_sock(sk);
2294
2295         tcp_clear_xmit_timers(sk);
2296
2297         tcp_cleanup_congestion_control(sk);
2298
2299         tcp_cleanup_ulp(sk);
2300
2301         /* Cleanup up the write buffer. */
2302         tcp_write_queue_purge(sk);
2303
2304         /* Check if we want to disable active TFO */
2305         tcp_fastopen_active_disable_ofo_check(sk);
2306
2307         /* Cleans up our, hopefully empty, out_of_order_queue. */
2308         skb_rbtree_purge(&tp->out_of_order_queue);
2309
2310 #ifdef CONFIG_TCP_MD5SIG
2311         /* Clean up the MD5 key list, if any */
2312         if (tp->md5sig_info) {
2313                 tcp_clear_md5_list(sk);
2314                 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2315                 tp->md5sig_info = NULL;
2316                 static_branch_slow_dec_deferred(&tcp_md5_needed);
2317         }
2318 #endif
2319
2320         /* Clean up a referenced TCP bind bucket. */
2321         if (inet_csk(sk)->icsk_bind_hash)
2322                 inet_put_port(sk);
2323
2324         BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2325
2326         /* If socket is aborted during connect operation */
2327         tcp_free_fastopen_req(tp);
2328         tcp_fastopen_destroy_cipher(sk);
2329         tcp_saved_syn_free(tp);
2330
2331         sk_sockets_allocated_dec(sk);
2332 }
2333 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2334
2335 #ifdef CONFIG_PROC_FS
2336 /* Proc filesystem TCP sock list dumping. */
2337
2338 static unsigned short seq_file_family(const struct seq_file *seq);
2339
2340 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2341 {
2342         unsigned short family = seq_file_family(seq);
2343
2344         /* AF_UNSPEC is used as a match all */
2345         return ((family == AF_UNSPEC || family == sk->sk_family) &&
2346                 net_eq(sock_net(sk), seq_file_net(seq)));
2347 }
2348
2349 /* Find a non empty bucket (starting from st->bucket)
2350  * and return the first sk from it.
2351  */
2352 static void *listening_get_first(struct seq_file *seq)
2353 {
2354         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2355         struct tcp_iter_state *st = seq->private;
2356
2357         st->offset = 0;
2358         for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) {
2359                 struct inet_listen_hashbucket *ilb2;
2360                 struct hlist_nulls_node *node;
2361                 struct sock *sk;
2362
2363                 ilb2 = &hinfo->lhash2[st->bucket];
2364                 if (hlist_nulls_empty(&ilb2->nulls_head))
2365                         continue;
2366
2367                 spin_lock(&ilb2->lock);
2368                 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2369                         if (seq_sk_match(seq, sk))
2370                                 return sk;
2371                 }
2372                 spin_unlock(&ilb2->lock);
2373         }
2374
2375         return NULL;
2376 }
2377
2378 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2379  * If "cur" is the last one in the st->bucket,
2380  * call listening_get_first() to return the first sk of the next
2381  * non empty bucket.
2382  */
2383 static void *listening_get_next(struct seq_file *seq, void *cur)
2384 {
2385         struct tcp_iter_state *st = seq->private;
2386         struct inet_listen_hashbucket *ilb2;
2387         struct hlist_nulls_node *node;
2388         struct inet_hashinfo *hinfo;
2389         struct sock *sk = cur;
2390
2391         ++st->num;
2392         ++st->offset;
2393
2394         sk = sk_nulls_next(sk);
2395         sk_nulls_for_each_from(sk, node) {
2396                 if (seq_sk_match(seq, sk))
2397                         return sk;
2398         }
2399
2400         hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2401         ilb2 = &hinfo->lhash2[st->bucket];
2402         spin_unlock(&ilb2->lock);
2403         ++st->bucket;
2404         return listening_get_first(seq);
2405 }
2406
2407 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2408 {
2409         struct tcp_iter_state *st = seq->private;
2410         void *rc;
2411
2412         st->bucket = 0;
2413         st->offset = 0;
2414         rc = listening_get_first(seq);
2415
2416         while (rc && *pos) {
2417                 rc = listening_get_next(seq, rc);
2418                 --*pos;
2419         }
2420         return rc;
2421 }
2422
2423 static inline bool empty_bucket(struct inet_hashinfo *hinfo,
2424                                 const struct tcp_iter_state *st)
2425 {
2426         return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain);
2427 }
2428
2429 /*
2430  * Get first established socket starting from bucket given in st->bucket.
2431  * If st->bucket is zero, the very first socket in the hash is returned.
2432  */
2433 static void *established_get_first(struct seq_file *seq)
2434 {
2435         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2436         struct tcp_iter_state *st = seq->private;
2437
2438         st->offset = 0;
2439         for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) {
2440                 struct sock *sk;
2441                 struct hlist_nulls_node *node;
2442                 spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket);
2443
2444                 /* Lockless fast path for the common case of empty buckets */
2445                 if (empty_bucket(hinfo, st))
2446                         continue;
2447
2448                 spin_lock_bh(lock);
2449                 sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) {
2450                         if (seq_sk_match(seq, sk))
2451                                 return sk;
2452                 }
2453                 spin_unlock_bh(lock);
2454         }
2455
2456         return NULL;
2457 }
2458
2459 static void *established_get_next(struct seq_file *seq, void *cur)
2460 {
2461         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2462         struct tcp_iter_state *st = seq->private;
2463         struct hlist_nulls_node *node;
2464         struct sock *sk = cur;
2465
2466         ++st->num;
2467         ++st->offset;
2468
2469         sk = sk_nulls_next(sk);
2470
2471         sk_nulls_for_each_from(sk, node) {
2472                 if (seq_sk_match(seq, sk))
2473                         return sk;
2474         }
2475
2476         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2477         ++st->bucket;
2478         return established_get_first(seq);
2479 }
2480
2481 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2482 {
2483         struct tcp_iter_state *st = seq->private;
2484         void *rc;
2485
2486         st->bucket = 0;
2487         rc = established_get_first(seq);
2488
2489         while (rc && pos) {
2490                 rc = established_get_next(seq, rc);
2491                 --pos;
2492         }
2493         return rc;
2494 }
2495
2496 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2497 {
2498         void *rc;
2499         struct tcp_iter_state *st = seq->private;
2500
2501         st->state = TCP_SEQ_STATE_LISTENING;
2502         rc        = listening_get_idx(seq, &pos);
2503
2504         if (!rc) {
2505                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2506                 rc        = established_get_idx(seq, pos);
2507         }
2508
2509         return rc;
2510 }
2511
2512 static void *tcp_seek_last_pos(struct seq_file *seq)
2513 {
2514         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2515         struct tcp_iter_state *st = seq->private;
2516         int bucket = st->bucket;
2517         int offset = st->offset;
2518         int orig_num = st->num;
2519         void *rc = NULL;
2520
2521         switch (st->state) {
2522         case TCP_SEQ_STATE_LISTENING:
2523                 if (st->bucket > hinfo->lhash2_mask)
2524                         break;
2525                 rc = listening_get_first(seq);
2526                 while (offset-- && rc && bucket == st->bucket)
2527                         rc = listening_get_next(seq, rc);
2528                 if (rc)
2529                         break;
2530                 st->bucket = 0;
2531                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2532                 fallthrough;
2533         case TCP_SEQ_STATE_ESTABLISHED:
2534                 if (st->bucket > hinfo->ehash_mask)
2535                         break;
2536                 rc = established_get_first(seq);
2537                 while (offset-- && rc && bucket == st->bucket)
2538                         rc = established_get_next(seq, rc);
2539         }
2540
2541         st->num = orig_num;
2542
2543         return rc;
2544 }
2545
2546 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2547 {
2548         struct tcp_iter_state *st = seq->private;
2549         void *rc;
2550
2551         if (*pos && *pos == st->last_pos) {
2552                 rc = tcp_seek_last_pos(seq);
2553                 if (rc)
2554                         goto out;
2555         }
2556
2557         st->state = TCP_SEQ_STATE_LISTENING;
2558         st->num = 0;
2559         st->bucket = 0;
2560         st->offset = 0;
2561         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2562
2563 out:
2564         st->last_pos = *pos;
2565         return rc;
2566 }
2567 EXPORT_SYMBOL(tcp_seq_start);
2568
2569 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2570 {
2571         struct tcp_iter_state *st = seq->private;
2572         void *rc = NULL;
2573
2574         if (v == SEQ_START_TOKEN) {
2575                 rc = tcp_get_idx(seq, 0);
2576                 goto out;
2577         }
2578
2579         switch (st->state) {
2580         case TCP_SEQ_STATE_LISTENING:
2581                 rc = listening_get_next(seq, v);
2582                 if (!rc) {
2583                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2584                         st->bucket = 0;
2585                         st->offset = 0;
2586                         rc        = established_get_first(seq);
2587                 }
2588                 break;
2589         case TCP_SEQ_STATE_ESTABLISHED:
2590                 rc = established_get_next(seq, v);
2591                 break;
2592         }
2593 out:
2594         ++*pos;
2595         st->last_pos = *pos;
2596         return rc;
2597 }
2598 EXPORT_SYMBOL(tcp_seq_next);
2599
2600 void tcp_seq_stop(struct seq_file *seq, void *v)
2601 {
2602         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2603         struct tcp_iter_state *st = seq->private;
2604
2605         switch (st->state) {
2606         case TCP_SEQ_STATE_LISTENING:
2607                 if (v != SEQ_START_TOKEN)
2608                         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2609                 break;
2610         case TCP_SEQ_STATE_ESTABLISHED:
2611                 if (v)
2612                         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2613                 break;
2614         }
2615 }
2616 EXPORT_SYMBOL(tcp_seq_stop);
2617
2618 static void get_openreq4(const struct request_sock *req,
2619                          struct seq_file *f, int i)
2620 {
2621         const struct inet_request_sock *ireq = inet_rsk(req);
2622         long delta = req->rsk_timer.expires - jiffies;
2623
2624         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2625                 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2626                 i,
2627                 ireq->ir_loc_addr,
2628                 ireq->ir_num,
2629                 ireq->ir_rmt_addr,
2630                 ntohs(ireq->ir_rmt_port),
2631                 TCP_SYN_RECV,
2632                 0, 0, /* could print option size, but that is af dependent. */
2633                 1,    /* timers active (only the expire timer) */
2634                 jiffies_delta_to_clock_t(delta),
2635                 req->num_timeout,
2636                 from_kuid_munged(seq_user_ns(f),
2637                                  sock_i_uid(req->rsk_listener)),
2638                 0,  /* non standard timer */
2639                 0, /* open_requests have no inode */
2640                 0,
2641                 req);
2642 }
2643
2644 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2645 {
2646         int timer_active;
2647         unsigned long timer_expires;
2648         const struct tcp_sock *tp = tcp_sk(sk);
2649         const struct inet_connection_sock *icsk = inet_csk(sk);
2650         const struct inet_sock *inet = inet_sk(sk);
2651         const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2652         __be32 dest = inet->inet_daddr;
2653         __be32 src = inet->inet_rcv_saddr;
2654         __u16 destp = ntohs(inet->inet_dport);
2655         __u16 srcp = ntohs(inet->inet_sport);
2656         int rx_queue;
2657         int state;
2658
2659         if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2660             icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2661             icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2662                 timer_active    = 1;
2663                 timer_expires   = icsk->icsk_timeout;
2664         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2665                 timer_active    = 4;
2666                 timer_expires   = icsk->icsk_timeout;
2667         } else if (timer_pending(&sk->sk_timer)) {
2668                 timer_active    = 2;
2669                 timer_expires   = sk->sk_timer.expires;
2670         } else {
2671                 timer_active    = 0;
2672                 timer_expires = jiffies;
2673         }
2674
2675         state = inet_sk_state_load(sk);
2676         if (state == TCP_LISTEN)
2677                 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2678         else
2679                 /* Because we don't lock the socket,
2680                  * we might find a transient negative value.
2681                  */
2682                 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2683                                       READ_ONCE(tp->copied_seq), 0);
2684
2685         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2686                         "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2687                 i, src, srcp, dest, destp, state,
2688                 READ_ONCE(tp->write_seq) - tp->snd_una,
2689                 rx_queue,
2690                 timer_active,
2691                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2692                 icsk->icsk_retransmits,
2693                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2694                 icsk->icsk_probes_out,
2695                 sock_i_ino(sk),
2696                 refcount_read(&sk->sk_refcnt), sk,
2697                 jiffies_to_clock_t(icsk->icsk_rto),
2698                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2699                 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2700                 tcp_snd_cwnd(tp),
2701                 state == TCP_LISTEN ?
2702                     fastopenq->max_qlen :
2703                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2704 }
2705
2706 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2707                                struct seq_file *f, int i)
2708 {
2709         long delta = tw->tw_timer.expires - jiffies;
2710         __be32 dest, src;
2711         __u16 destp, srcp;
2712
2713         dest  = tw->tw_daddr;
2714         src   = tw->tw_rcv_saddr;
2715         destp = ntohs(tw->tw_dport);
2716         srcp  = ntohs(tw->tw_sport);
2717
2718         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2719                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2720                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2721                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2722                 refcount_read(&tw->tw_refcnt), tw);
2723 }
2724
2725 #define TMPSZ 150
2726
2727 static int tcp4_seq_show(struct seq_file *seq, void *v)
2728 {
2729         struct tcp_iter_state *st;
2730         struct sock *sk = v;
2731
2732         seq_setwidth(seq, TMPSZ - 1);
2733         if (v == SEQ_START_TOKEN) {
2734                 seq_puts(seq, "  sl  local_address rem_address   st tx_queue "
2735                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2736                            "inode");
2737                 goto out;
2738         }
2739         st = seq->private;
2740
2741         if (sk->sk_state == TCP_TIME_WAIT)
2742                 get_timewait4_sock(v, seq, st->num);
2743         else if (sk->sk_state == TCP_NEW_SYN_RECV)
2744                 get_openreq4(v, seq, st->num);
2745         else
2746                 get_tcp4_sock(v, seq, st->num);
2747 out:
2748         seq_pad(seq, '\n');
2749         return 0;
2750 }
2751
2752 #ifdef CONFIG_BPF_SYSCALL
2753 struct bpf_tcp_iter_state {
2754         struct tcp_iter_state state;
2755         unsigned int cur_sk;
2756         unsigned int end_sk;
2757         unsigned int max_sk;
2758         struct sock **batch;
2759         bool st_bucket_done;
2760 };
2761
2762 struct bpf_iter__tcp {
2763         __bpf_md_ptr(struct bpf_iter_meta *, meta);
2764         __bpf_md_ptr(struct sock_common *, sk_common);
2765         uid_t uid __aligned(8);
2766 };
2767
2768 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2769                              struct sock_common *sk_common, uid_t uid)
2770 {
2771         struct bpf_iter__tcp ctx;
2772
2773         meta->seq_num--;  /* skip SEQ_START_TOKEN */
2774         ctx.meta = meta;
2775         ctx.sk_common = sk_common;
2776         ctx.uid = uid;
2777         return bpf_iter_run_prog(prog, &ctx);
2778 }
2779
2780 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2781 {
2782         while (iter->cur_sk < iter->end_sk)
2783                 sock_gen_put(iter->batch[iter->cur_sk++]);
2784 }
2785
2786 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2787                                       unsigned int new_batch_sz)
2788 {
2789         struct sock **new_batch;
2790
2791         new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2792                              GFP_USER | __GFP_NOWARN);
2793         if (!new_batch)
2794                 return -ENOMEM;
2795
2796         bpf_iter_tcp_put_batch(iter);
2797         kvfree(iter->batch);
2798         iter->batch = new_batch;
2799         iter->max_sk = new_batch_sz;
2800
2801         return 0;
2802 }
2803
2804 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2805                                                  struct sock *start_sk)
2806 {
2807         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2808         struct bpf_tcp_iter_state *iter = seq->private;
2809         struct tcp_iter_state *st = &iter->state;
2810         struct hlist_nulls_node *node;
2811         unsigned int expected = 1;
2812         struct sock *sk;
2813
2814         sock_hold(start_sk);
2815         iter->batch[iter->end_sk++] = start_sk;
2816
2817         sk = sk_nulls_next(start_sk);
2818         sk_nulls_for_each_from(sk, node) {
2819                 if (seq_sk_match(seq, sk)) {
2820                         if (iter->end_sk < iter->max_sk) {
2821                                 sock_hold(sk);
2822                                 iter->batch[iter->end_sk++] = sk;
2823                         }
2824                         expected++;
2825                 }
2826         }
2827         spin_unlock(&hinfo->lhash2[st->bucket].lock);
2828
2829         return expected;
2830 }
2831
2832 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2833                                                    struct sock *start_sk)
2834 {
2835         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2836         struct bpf_tcp_iter_state *iter = seq->private;
2837         struct tcp_iter_state *st = &iter->state;
2838         struct hlist_nulls_node *node;
2839         unsigned int expected = 1;
2840         struct sock *sk;
2841
2842         sock_hold(start_sk);
2843         iter->batch[iter->end_sk++] = start_sk;
2844
2845         sk = sk_nulls_next(start_sk);
2846         sk_nulls_for_each_from(sk, node) {
2847                 if (seq_sk_match(seq, sk)) {
2848                         if (iter->end_sk < iter->max_sk) {
2849                                 sock_hold(sk);
2850                                 iter->batch[iter->end_sk++] = sk;
2851                         }
2852                         expected++;
2853                 }
2854         }
2855         spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket));
2856
2857         return expected;
2858 }
2859
2860 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2861 {
2862         struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo;
2863         struct bpf_tcp_iter_state *iter = seq->private;
2864         struct tcp_iter_state *st = &iter->state;
2865         unsigned int expected;
2866         bool resized = false;
2867         struct sock *sk;
2868
2869         /* The st->bucket is done.  Directly advance to the next
2870          * bucket instead of having the tcp_seek_last_pos() to skip
2871          * one by one in the current bucket and eventually find out
2872          * it has to advance to the next bucket.
2873          */
2874         if (iter->st_bucket_done) {
2875                 st->offset = 0;
2876                 st->bucket++;
2877                 if (st->state == TCP_SEQ_STATE_LISTENING &&
2878                     st->bucket > hinfo->lhash2_mask) {
2879                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2880                         st->bucket = 0;
2881                 }
2882         }
2883
2884 again:
2885         /* Get a new batch */
2886         iter->cur_sk = 0;
2887         iter->end_sk = 0;
2888         iter->st_bucket_done = false;
2889
2890         sk = tcp_seek_last_pos(seq);
2891         if (!sk)
2892                 return NULL; /* Done */
2893
2894         if (st->state == TCP_SEQ_STATE_LISTENING)
2895                 expected = bpf_iter_tcp_listening_batch(seq, sk);
2896         else
2897                 expected = bpf_iter_tcp_established_batch(seq, sk);
2898
2899         if (iter->end_sk == expected) {
2900                 iter->st_bucket_done = true;
2901                 return sk;
2902         }
2903
2904         if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2905                 resized = true;
2906                 goto again;
2907         }
2908
2909         return sk;
2910 }
2911
2912 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2913 {
2914         /* bpf iter does not support lseek, so it always
2915          * continue from where it was stop()-ped.
2916          */
2917         if (*pos)
2918                 return bpf_iter_tcp_batch(seq);
2919
2920         return SEQ_START_TOKEN;
2921 }
2922
2923 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2924 {
2925         struct bpf_tcp_iter_state *iter = seq->private;
2926         struct tcp_iter_state *st = &iter->state;
2927         struct sock *sk;
2928
2929         /* Whenever seq_next() is called, the iter->cur_sk is
2930          * done with seq_show(), so advance to the next sk in
2931          * the batch.
2932          */
2933         if (iter->cur_sk < iter->end_sk) {
2934                 /* Keeping st->num consistent in tcp_iter_state.
2935                  * bpf_iter_tcp does not use st->num.
2936                  * meta.seq_num is used instead.
2937                  */
2938                 st->num++;
2939                 /* Move st->offset to the next sk in the bucket such that
2940                  * the future start() will resume at st->offset in
2941                  * st->bucket.  See tcp_seek_last_pos().
2942                  */
2943                 st->offset++;
2944                 sock_gen_put(iter->batch[iter->cur_sk++]);
2945         }
2946
2947         if (iter->cur_sk < iter->end_sk)
2948                 sk = iter->batch[iter->cur_sk];
2949         else
2950                 sk = bpf_iter_tcp_batch(seq);
2951
2952         ++*pos;
2953         /* Keeping st->last_pos consistent in tcp_iter_state.
2954          * bpf iter does not do lseek, so st->last_pos always equals to *pos.
2955          */
2956         st->last_pos = *pos;
2957         return sk;
2958 }
2959
2960 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2961 {
2962         struct bpf_iter_meta meta;
2963         struct bpf_prog *prog;
2964         struct sock *sk = v;
2965         bool slow;
2966         uid_t uid;
2967         int ret;
2968
2969         if (v == SEQ_START_TOKEN)
2970                 return 0;
2971
2972         if (sk_fullsock(sk))
2973                 slow = lock_sock_fast(sk);
2974
2975         if (unlikely(sk_unhashed(sk))) {
2976                 ret = SEQ_SKIP;
2977                 goto unlock;
2978         }
2979
2980         if (sk->sk_state == TCP_TIME_WAIT) {
2981                 uid = 0;
2982         } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2983                 const struct request_sock *req = v;
2984
2985                 uid = from_kuid_munged(seq_user_ns(seq),
2986                                        sock_i_uid(req->rsk_listener));
2987         } else {
2988                 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2989         }
2990
2991         meta.seq = seq;
2992         prog = bpf_iter_get_info(&meta, false);
2993         ret = tcp_prog_seq_show(prog, &meta, v, uid);
2994
2995 unlock:
2996         if (sk_fullsock(sk))
2997                 unlock_sock_fast(sk, slow);
2998         return ret;
2999
3000 }
3001
3002 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
3003 {
3004         struct bpf_tcp_iter_state *iter = seq->private;
3005         struct bpf_iter_meta meta;
3006         struct bpf_prog *prog;
3007
3008         if (!v) {
3009                 meta.seq = seq;
3010                 prog = bpf_iter_get_info(&meta, true);
3011                 if (prog)
3012                         (void)tcp_prog_seq_show(prog, &meta, v, 0);
3013         }
3014
3015         if (iter->cur_sk < iter->end_sk) {
3016                 bpf_iter_tcp_put_batch(iter);
3017                 iter->st_bucket_done = false;
3018         }
3019 }
3020
3021 static const struct seq_operations bpf_iter_tcp_seq_ops = {
3022         .show           = bpf_iter_tcp_seq_show,
3023         .start          = bpf_iter_tcp_seq_start,
3024         .next           = bpf_iter_tcp_seq_next,
3025         .stop           = bpf_iter_tcp_seq_stop,
3026 };
3027 #endif
3028 static unsigned short seq_file_family(const struct seq_file *seq)
3029 {
3030         const struct tcp_seq_afinfo *afinfo;
3031
3032 #ifdef CONFIG_BPF_SYSCALL
3033         /* Iterated from bpf_iter.  Let the bpf prog to filter instead. */
3034         if (seq->op == &bpf_iter_tcp_seq_ops)
3035                 return AF_UNSPEC;
3036 #endif
3037
3038         /* Iterated from proc fs */
3039         afinfo = pde_data(file_inode(seq->file));
3040         return afinfo->family;
3041 }
3042
3043 static const struct seq_operations tcp4_seq_ops = {
3044         .show           = tcp4_seq_show,
3045         .start          = tcp_seq_start,
3046         .next           = tcp_seq_next,
3047         .stop           = tcp_seq_stop,
3048 };
3049
3050 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
3051         .family         = AF_INET,
3052 };
3053
3054 static int __net_init tcp4_proc_init_net(struct net *net)
3055 {
3056         if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
3057                         sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
3058                 return -ENOMEM;
3059         return 0;
3060 }
3061
3062 static void __net_exit tcp4_proc_exit_net(struct net *net)
3063 {
3064         remove_proc_entry("tcp", net->proc_net);
3065 }
3066
3067 static struct pernet_operations tcp4_net_ops = {
3068         .init = tcp4_proc_init_net,
3069         .exit = tcp4_proc_exit_net,
3070 };
3071
3072 int __init tcp4_proc_init(void)
3073 {
3074         return register_pernet_subsys(&tcp4_net_ops);
3075 }
3076
3077 void tcp4_proc_exit(void)
3078 {
3079         unregister_pernet_subsys(&tcp4_net_ops);
3080 }
3081 #endif /* CONFIG_PROC_FS */
3082
3083 /* @wake is one when sk_stream_write_space() calls us.
3084  * This sends EPOLLOUT only if notsent_bytes is half the limit.
3085  * This mimics the strategy used in sock_def_write_space().
3086  */
3087 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3088 {
3089         const struct tcp_sock *tp = tcp_sk(sk);
3090         u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3091                             READ_ONCE(tp->snd_nxt);
3092
3093         return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3094 }
3095 EXPORT_SYMBOL(tcp_stream_memory_free);
3096
3097 struct proto tcp_prot = {
3098         .name                   = "TCP",
3099         .owner                  = THIS_MODULE,
3100         .close                  = tcp_close,
3101         .pre_connect            = tcp_v4_pre_connect,
3102         .connect                = tcp_v4_connect,
3103         .disconnect             = tcp_disconnect,
3104         .accept                 = inet_csk_accept,
3105         .ioctl                  = tcp_ioctl,
3106         .init                   = tcp_v4_init_sock,
3107         .destroy                = tcp_v4_destroy_sock,
3108         .shutdown               = tcp_shutdown,
3109         .setsockopt             = tcp_setsockopt,
3110         .getsockopt             = tcp_getsockopt,
3111         .bpf_bypass_getsockopt  = tcp_bpf_bypass_getsockopt,
3112         .keepalive              = tcp_set_keepalive,
3113         .recvmsg                = tcp_recvmsg,
3114         .sendmsg                = tcp_sendmsg,
3115         .sendpage               = tcp_sendpage,
3116         .backlog_rcv            = tcp_v4_do_rcv,
3117         .release_cb             = tcp_release_cb,
3118         .hash                   = inet_hash,
3119         .unhash                 = inet_unhash,
3120         .get_port               = inet_csk_get_port,
3121         .put_port               = inet_put_port,
3122 #ifdef CONFIG_BPF_SYSCALL
3123         .psock_update_sk_prot   = tcp_bpf_update_proto,
3124 #endif
3125         .enter_memory_pressure  = tcp_enter_memory_pressure,
3126         .leave_memory_pressure  = tcp_leave_memory_pressure,
3127         .stream_memory_free     = tcp_stream_memory_free,
3128         .sockets_allocated      = &tcp_sockets_allocated,
3129         .orphan_count           = &tcp_orphan_count,
3130
3131         .memory_allocated       = &tcp_memory_allocated,
3132         .per_cpu_fw_alloc       = &tcp_memory_per_cpu_fw_alloc,
3133
3134         .memory_pressure        = &tcp_memory_pressure,
3135         .sysctl_mem             = sysctl_tcp_mem,
3136         .sysctl_wmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3137         .sysctl_rmem_offset     = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3138         .max_header             = MAX_TCP_HEADER,
3139         .obj_size               = sizeof(struct tcp_sock),
3140         .slab_flags             = SLAB_TYPESAFE_BY_RCU,
3141         .twsk_prot              = &tcp_timewait_sock_ops,
3142         .rsk_prot               = &tcp_request_sock_ops,
3143         .h.hashinfo             = NULL,
3144         .no_autobind            = true,
3145         .diag_destroy           = tcp_abort,
3146 };
3147 EXPORT_SYMBOL(tcp_prot);
3148
3149 static void __net_exit tcp_sk_exit(struct net *net)
3150 {
3151         if (net->ipv4.tcp_congestion_control)
3152                 bpf_module_put(net->ipv4.tcp_congestion_control,
3153                                net->ipv4.tcp_congestion_control->owner);
3154 }
3155
3156 static void __net_init tcp_set_hashinfo(struct net *net)
3157 {
3158         struct inet_hashinfo *hinfo;
3159         unsigned int ehash_entries;
3160         struct net *old_net;
3161
3162         if (net_eq(net, &init_net))
3163                 goto fallback;
3164
3165         old_net = current->nsproxy->net_ns;
3166         ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries);
3167         if (!ehash_entries)
3168                 goto fallback;
3169
3170         ehash_entries = roundup_pow_of_two(ehash_entries);
3171         hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries);
3172         if (!hinfo) {
3173                 pr_warn("Failed to allocate TCP ehash (entries: %u) "
3174                         "for a netns, fallback to the global one\n",
3175                         ehash_entries);
3176 fallback:
3177                 hinfo = &tcp_hashinfo;
3178                 ehash_entries = tcp_hashinfo.ehash_mask + 1;
3179         }
3180
3181         net->ipv4.tcp_death_row.hashinfo = hinfo;
3182         net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2;
3183         net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128);
3184 }
3185
3186 static int __net_init tcp_sk_init(struct net *net)
3187 {
3188         net->ipv4.sysctl_tcp_ecn = 2;
3189         net->ipv4.sysctl_tcp_ecn_fallback = 1;
3190
3191         net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3192         net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3193         net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3194         net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3195         net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3196
3197         net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3198         net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3199         net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3200
3201         net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3202         net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3203         net->ipv4.sysctl_tcp_syncookies = 1;
3204         net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3205         net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3206         net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3207         net->ipv4.sysctl_tcp_orphan_retries = 0;
3208         net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3209         net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3210         net->ipv4.sysctl_tcp_tw_reuse = 2;
3211         net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3212
3213         refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
3214         tcp_set_hashinfo(net);
3215
3216         net->ipv4.sysctl_tcp_sack = 1;
3217         net->ipv4.sysctl_tcp_window_scaling = 1;
3218         net->ipv4.sysctl_tcp_timestamps = 1;
3219         net->ipv4.sysctl_tcp_early_retrans = 3;
3220         net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3221         net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior.  */
3222         net->ipv4.sysctl_tcp_retrans_collapse = 1;
3223         net->ipv4.sysctl_tcp_max_reordering = 300;
3224         net->ipv4.sysctl_tcp_dsack = 1;
3225         net->ipv4.sysctl_tcp_app_win = 31;
3226         net->ipv4.sysctl_tcp_adv_win_scale = 1;
3227         net->ipv4.sysctl_tcp_frto = 2;
3228         net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3229         /* This limits the percentage of the congestion window which we
3230          * will allow a single TSO frame to consume.  Building TSO frames
3231          * which are too large can cause TCP streams to be bursty.
3232          */
3233         net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3234         /* Default TSQ limit of 16 TSO segments */
3235         net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3236
3237         /* rfc5961 challenge ack rate limiting, per net-ns, disabled by default. */
3238         net->ipv4.sysctl_tcp_challenge_ack_limit = INT_MAX;
3239
3240         net->ipv4.sysctl_tcp_min_tso_segs = 2;
3241         net->ipv4.sysctl_tcp_tso_rtt_log = 9;  /* 2^9 = 512 usec */
3242         net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3243         net->ipv4.sysctl_tcp_autocorking = 1;
3244         net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3245         net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3246         net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3247         if (net != &init_net) {
3248                 memcpy(net->ipv4.sysctl_tcp_rmem,
3249                        init_net.ipv4.sysctl_tcp_rmem,
3250                        sizeof(init_net.ipv4.sysctl_tcp_rmem));
3251                 memcpy(net->ipv4.sysctl_tcp_wmem,
3252                        init_net.ipv4.sysctl_tcp_wmem,
3253                        sizeof(init_net.ipv4.sysctl_tcp_wmem));
3254         }
3255         net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3256         net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3257         net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3258         net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3259         net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3260         atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3261
3262         /* Set default values for PLB */
3263         net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */
3264         net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3;
3265         net->ipv4.sysctl_tcp_plb_rehash_rounds = 12;
3266         net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60;
3267         /* Default congestion threshold for PLB to mark a round is 50% */
3268         net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2;
3269
3270         /* Reno is always built in */
3271         if (!net_eq(net, &init_net) &&
3272             bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3273                                init_net.ipv4.tcp_congestion_control->owner))
3274                 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3275         else
3276                 net->ipv4.tcp_congestion_control = &tcp_reno;
3277
3278         return 0;
3279 }
3280
3281 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3282 {
3283         struct net *net;
3284
3285         tcp_twsk_purge(net_exit_list, AF_INET);
3286
3287         list_for_each_entry(net, net_exit_list, exit_list) {
3288                 inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
3289                 WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
3290                 tcp_fastopen_ctx_destroy(net);
3291         }
3292 }
3293
3294 static struct pernet_operations __net_initdata tcp_sk_ops = {
3295        .init       = tcp_sk_init,
3296        .exit       = tcp_sk_exit,
3297        .exit_batch = tcp_sk_exit_batch,
3298 };
3299
3300 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3301 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3302                      struct sock_common *sk_common, uid_t uid)
3303
3304 #define INIT_BATCH_SZ 16
3305
3306 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3307 {
3308         struct bpf_tcp_iter_state *iter = priv_data;
3309         int err;
3310
3311         err = bpf_iter_init_seq_net(priv_data, aux);
3312         if (err)
3313                 return err;
3314
3315         err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3316         if (err) {
3317                 bpf_iter_fini_seq_net(priv_data);
3318                 return err;
3319         }
3320
3321         return 0;
3322 }
3323
3324 static void bpf_iter_fini_tcp(void *priv_data)
3325 {
3326         struct bpf_tcp_iter_state *iter = priv_data;
3327
3328         bpf_iter_fini_seq_net(priv_data);
3329         kvfree(iter->batch);
3330 }
3331
3332 static const struct bpf_iter_seq_info tcp_seq_info = {
3333         .seq_ops                = &bpf_iter_tcp_seq_ops,
3334         .init_seq_private       = bpf_iter_init_tcp,
3335         .fini_seq_private       = bpf_iter_fini_tcp,
3336         .seq_priv_size          = sizeof(struct bpf_tcp_iter_state),
3337 };
3338
3339 static const struct bpf_func_proto *
3340 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3341                             const struct bpf_prog *prog)
3342 {
3343         switch (func_id) {
3344         case BPF_FUNC_setsockopt:
3345                 return &bpf_sk_setsockopt_proto;
3346         case BPF_FUNC_getsockopt:
3347                 return &bpf_sk_getsockopt_proto;
3348         default:
3349                 return NULL;
3350         }
3351 }
3352
3353 static struct bpf_iter_reg tcp_reg_info = {
3354         .target                 = "tcp",
3355         .ctx_arg_info_size      = 1,
3356         .ctx_arg_info           = {
3357                 { offsetof(struct bpf_iter__tcp, sk_common),
3358                   PTR_TO_BTF_ID_OR_NULL },
3359         },
3360         .get_func_proto         = bpf_iter_tcp_get_func_proto,
3361         .seq_info               = &tcp_seq_info,
3362 };
3363
3364 static void __init bpf_iter_register(void)
3365 {
3366         tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3367         if (bpf_iter_reg_target(&tcp_reg_info))
3368                 pr_warn("Warning: could not register bpf iterator tcp\n");
3369 }
3370
3371 #endif
3372
3373 void __init tcp_v4_init(void)
3374 {
3375         int cpu, res;
3376
3377         for_each_possible_cpu(cpu) {
3378                 struct sock *sk;
3379
3380                 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3381                                            IPPROTO_TCP, &init_net);
3382                 if (res)
3383                         panic("Failed to create the TCP control socket.\n");
3384                 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3385
3386                 /* Please enforce IP_DF and IPID==0 for RST and
3387                  * ACK sent in SYN-RECV and TIME-WAIT state.
3388                  */
3389                 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3390
3391                 per_cpu(ipv4_tcp_sk, cpu) = sk;
3392         }
3393         if (register_pernet_subsys(&tcp_sk_ops))
3394                 panic("Failed to create the TCP control socket.\n");
3395
3396 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3397         bpf_iter_register();
3398 #endif
3399 }